In [1]:
import pandas as pd
import ast
import os
import math

# Work from the project root so that the relative data path below resolves.
# NOTE(review): this is an absolute, machine-specific path — confirm before running elsewhere.
os.chdir("/Users/zaza/Valorant Match Predictor")
df = pd.read_csv("data/processed/R2_processed/R2_unified_dataset.csv")

# The per-player columns are read from the CSV as strings holding list literals;
# turn each one back into a real Python list.
for list_col in ('all_player_ratings', 'all_player_teams', 'all_players'):
    df[list_col] = df[list_col].apply(ast.literal_eval)

In [2]:
### Removes duplicate data collection (error in data collection)
def remove_double_back_and_forth(L):
    """Collapse a sequence that consists of the same run written twice.

    If ``L`` has an even length and its first half equals its second half,
    the first half is returned. In every other case ``L`` itself is returned
    unchanged.
    """
    midpoint, leftover = divmod(len(L), 2)
    if leftover:
        # odd length can never be an exact duplication
        return L
    first_half, second_half = L[:midpoint], L[midpoint:]
    return first_half if first_half == second_half else L

# De-duplicate every per-player list column
list_columns = ['all_players', 'all_player_teams', 'all_player_ratings']
for col in list_columns:
    df[col] = df[col].apply(remove_double_back_and_forth)

# Sanity check: list lengths of the first few rows, one column per list column
lengths = df[list_columns].map(len).head()
print(lengths)

# Top 43 VCT teams from VLR as of July 2025
# Maps a team's full name (looked up via the `team_name` column) to the list of short
# codes that are matched against the entries of `all_player_teams`.
# A few teams list more than one code — presumably alternate/older tags; TODO confirm.
team_abbreviations = {
    "Sentinels":          ["SEN"],
    "100 Thieves":        ["100T"],
    "Cloud9":             ["C9"],
    "NRG":                ["NRG"],
    "Evil Geniuses":      ["EG"],
    "G2 Esports":         ["G2"],
    "LOUD":               ["LOUD"],
    "MIBR":               ["MIBR"],
    "FURIA":              ["FUR"],
    "KRÜ Esports":        ["KRÜ"],
    "LEVIATÁN":           ["LEV"],
    "FNATIC":             ["FNC"],
    "Team Liquid":        ["TL"],
    "Team Heretics":      ["TH"],
    "BBL Esports":        ["BBL"],
    "FUT Esports":        ["FUT"],
    "Karmine Corp":       ["KC"],
    "Team Vitality":      ["VIT"],
    "Natus Vincere":      ["NAVI"],
    "Gentle Mates":       ["M8"],
    "Apeks":              ["APK"],
    "Paper Rex":          ["PRX"],
    "DRX":                ["DRX"],
    "Gen.G":              ["GEN"],
    "T1":                 ["T1"],
    "Rex Regum Qeon":     ["RRQ"],
    "TALON":              ["TLN"],
    "Team Secret":        ["TS"],
    "ZETA DIVISION":      ["ZETA","FL"],
    "DetonatioN FocusMe": ["DFM","CR"],
    "Global Esports":     ["GE"],
    "EDward Gaming":      ["EDG"],
    "Bilibili Gaming":    ["BLG"],
    "FunPlus Phoenix":    ["FPX"],
    "Dragon Ranger Gaming":["DRG"],
    "Wolves Esports":     ["WOL"],
    "Trace Esports":      ["TE"],
    "Titan Esports Club": ["TEC"],
    "Nova Esports":       ["NOVA"],
    "All Gamers":         ["AG"],
    "TYLOO":              ["TYL"],
    "JDG Esports":        ["JDG"],
    "Rare Atom":          ["RA"],
}

# Every known short code, flattened into one set for fast membership tests
valid_codes = set().union(*team_abbreviations.values())

# True when at least one code in the row's team list is a known code
df['has_valid_team'] = df['all_player_teams'].apply(
    lambda teams: not valid_codes.isdisjoint(teams)
)

# Print invalid matches (no valid code) (Debugging)
print("Matches with no recognized team code:")
unrecognized_rows = df.loc[~df['has_valid_team']]
for _, match in unrecognized_rows.iterrows():
    print(f"{match['team_name']} — {match['match_url']}")

   all_players  all_player_teams  all_player_ratings
0           10                10                  10
1           10                10                  10
2           10                10                  10
3           10                10                  10
4           10                10                  10
Matches with no recognized team code:


In [3]:
### Calculate mean ratings for each team
### Both teams' player R2 scores were collected into a single list per match, so they are split by team code before averaging

def split_mean_ratings(row):
    """Average the player ratings of a match separately for each side.

    Ratings whose team code is one of the codes registered for
    ``row['team_name']`` in ``team_abbreviations`` count as "my" team; all
    other ratings count as the opponent.

    Returns a Series with ``mean_my_rating`` and ``mean_opp_rating``; a side
    with no ratings gets NaN. Raises KeyError when the team name is not in
    ``team_abbreviations``.
    """
    team_codes = row['all_player_teams']
    player_ratings = row['all_player_ratings']
    own_codes = team_abbreviations[row['team_name']]

    mine, theirs = [], []
    for code, rating in zip(team_codes, player_ratings):
        bucket = mine if code in own_codes else theirs
        bucket.append(rating)

    def _mean_or_nan(values):
        # an empty side has no defined mean
        return sum(values) / len(values) if values else math.nan

    return pd.Series({
        'mean_my_rating':    _mean_or_nan(mine),
        'mean_opp_rating':   _mean_or_nan(theirs)
    })

# Compute both side means row by row, then attach them as two new columns
team_means = df.apply(split_mean_ratings, axis=1)
df[['mean_my_rating', 'mean_opp_rating']] = team_means

In [4]:
###=== R2 COMBAT SCORE ROLLING AVERAGES FEATURE ===###
# For each of the specified rolling periods (in days), calculate the rolling average R2 (R2 Combat Score) of both teams


#Yes, current function is sloppy, will update eventually

def add_rolling_averages(df, rolling_periods=[15, 20, 21, 22, 23, 24, 25, 30, 45, 60, 80, 90, 100, 120, 150, 200], date_format='%Y/%m/%d', min_periods=2):
    """Add per-team, time-windowed rolling means of the match ratings.

    For every ``period`` in ``rolling_periods`` two float64 columns are added:
    ``rolling_{period}d_my`` (from ``mean_my_rating``) and
    ``rolling_{period}d_opp`` (from ``mean_opp_rating``).

    Each value is the ``{period}D`` rolling mean taken at the team's
    *previous* row (``.shift(1)``), so a match never sees its own ratings.
    Note that the window is therefore anchored at the previous match's date,
    not at the current one.

    Args:
        df: frame with ``team_id``, ``date``, ``mean_my_rating`` and
            ``mean_opp_rating`` columns. It is not modified.
        rolling_periods: window lengths in days (the default list is only
            iterated, never mutated).
        date_format: format passed to ``pd.to_datetime`` for ``date``.
        min_periods: minimum observations in a window to produce a value.

    Returns:
        A copy of ``df`` sorted by ``team_id`` and ``date`` with a fresh
        0..n-1 index and the rolling columns appended. Rows without enough
        history — and rows whose ``team_id`` is missing — hold NaN.
    """
    # copy to keep original
    df = df.copy()

    # Convert date and sort; after this every team's rows are contiguous and chronological
    df['date'] = pd.to_datetime(df['date'], format=date_format)
    df = df.sort_values(['team_id', 'date']).reset_index(drop=True)

    rating_sources = (('my', 'mean_my_rating'), ('opp', 'mean_opp_rating'))

    # Pre-create the output columns as float NaN. (Filling them with pd.NA made them
    # object columns, and any row left untouched — e.g. a missing team_id — then broke
    # the float conversion.)
    for period in rolling_periods:
        for side, _ in rating_sources:
            df[f'rolling_{period}d_{side}'] = float('nan')

    # groupby skips rows with a missing team_id, so those simply keep NaN
    for _, team_rows in df.groupby('team_id', sort=False):
        # rows are already date-ordered within the team thanks to the sort above
        team_by_date = team_rows.set_index('date')

        for period in rolling_periods:
            window = f'{period}D'
            for side, source_col in rating_sources:
                prior_mean = (
                    team_by_date[source_col]
                    .rolling(window, min_periods=min_periods)
                    .mean()
                    .shift(1)  # exclude the current match from its own feature
                )
                # write back by row label; order matches team_rows exactly
                df.loc[team_rows.index, f'rolling_{period}d_{side}'] = prior_mean.to_numpy()

    return df


df = add_rolling_averages(df)

# R2 advantage = own 30-day rolling rating minus the opponent's;
# rows where either side has no 30-day value are treated as neutral (0).
rating_gap_30d = df['rolling_30d_my'] - df['rolling_30d_opp']
df['r2_advantage'] = rating_gap_30d.fillna(0)


In [5]:
###=== MAP POOL ADVANTAGE FEATURE ===###
import numpy as np
import pandas as pd
import ast

# parse for string maps
def _parse_list_cell(cell):
    """Return ``cell`` as-is when it is already a list, otherwise literal-eval it."""
    return cell if isinstance(cell, list) else ast.literal_eval(cell)

for col in ('maps_played', 'map_results'):
    # only object-typed columns that actually exist need parsing
    if col not in df.columns or df[col].dtype != object:
        continue
    df[col] = df[col].apply(_parse_list_cell)

# make a "long" version where each row is a single map a team played
if {'maps_played', 'map_results'}.issubset(df.columns):
    maps_long = (
        df[['team_name', 'date', 'maps_played', 'map_results']].copy()
          .explode(['maps_played', 'map_results'])
          .rename(columns={'maps_played': 'map_name', 'map_results': 'map_result'})
    )
    maps_long['map_win'] = (maps_long['map_result'] == 'W').astype(float)
    maps_long['date'] = pd.to_datetime(maps_long['date'])

    # Calculate overall map pool strength on ALL past maps
    def decayed_map_pool_strength(hist_df, team, current_date,
                                  days_limit=180, tau=70, min_rows=2, missing=0):
        """Exponentially decayed map win rate of ``team`` before ``current_date``.

        Uses every row of ``hist_df`` for ``team`` dated strictly before
        ``current_date`` and at most ``days_limit`` days old, weighting each
        map by ``exp(-age_in_days / tau)``.

        Returns ``missing`` (default 0, the historical behaviour) when fewer
        than ``min_rows`` such rows exist.
        """
        past = hist_df[
            (hist_df['team_name'] == team) &
            (hist_df['date'] < current_date) &
            (hist_df['date'] >= current_date - pd.Timedelta(days=days_limit))
        ]
        # No filtering on the current match's maps (that would leak) - ALL past maps are used
        if len(past) < min_rows:
            return missing
        ages = (current_date - past['date']) / np.timedelta64(1, 'D')
        weights = np.exp(-ages / tau)
        return np.average(past['map_win'].values, weights=weights)

    # for each match, get our decayed map winrate and theirs, then take the diff.
    # NaN is requested for "not enough history" so that it can be told apart from a
    # genuine 0% win rate when building has_map_pool_data below.
    map_pool_my = df.apply(
        lambda r: decayed_map_pool_strength(maps_long, r['team_name'], r['date'], missing=np.nan),
        axis=1
    )
    map_pool_opp = df.apply(
        lambda r: decayed_map_pool_strength(maps_long, r['opponent'], r['date'], missing=np.nan),
        axis=1
    )
    # the stored features keep the original convention: missing history counts as 0
    df['decayed_map_pool_my'] = map_pool_my.fillna(0)
    df['decayed_map_pool_opp'] = map_pool_opp.fillna(0)
    df['map_pool_advantage'] = df['decayed_map_pool_my'] - df['decayed_map_pool_opp']

    # to prevent neutral matches skewing the results:
    # 1 when at least one side had enough map history, even if its win rate was 0
    df['has_map_pool_data'] = (map_pool_my.notna() | map_pool_opp.notna()).astype(int)

else:
    # if no map pool history data, filling both strengths with 0.5
    df['decayed_map_pool_my'] = 0.5
    df['decayed_map_pool_opp'] = 0.5
    # equal strengths mean no advantage either way (0.5 - 0.5)
    df['map_pool_advantage'] = 0.0
    # keep the column set identical to the branch above
    df['has_map_pool_data'] = 0



def get_past_winrate(df, team, current_date, last_n_matches=5):
    """Win rate of ``team`` over its most recent matches before ``current_date``.

    Only rows with ``team_name == team`` and ``date`` strictly earlier than
    ``current_date`` count. The newest ``last_n_matches`` of them (or fewer
    if that is all there is) are used. Returns 0.5 when there is no history.
    """
    is_team = df['team_name'] == team
    is_earlier = df['date'] < current_date
    history = df[is_team & is_earlier].sort_values('date', ascending=False)

    if history.empty:
        # No history, assume average
        return 0.5

    latest = history.iloc[:last_n_matches]
    won = (latest['result'] == 'W').sum()
    return won / len(latest)

# Calculate for both teams
def _past_winrate_of(team_column):
    """Per-row past win rate of the team named in ``team_column``."""
    return df.apply(
        lambda row: get_past_winrate(df, row[team_column], row['date']), axis=1
    )

df['my_past_winrate'] = _past_winrate_of('team_name')
df['opp_past_winrate'] = _past_winrate_of('opponent')

# Simple winrate advantage
df['winrate_advantage'] = df['my_past_winrate'] - df['opp_past_winrate']



In [6]:
###=== ROLLING DIFFERENTIAL FEATURE ===###


import pandas as pd
import numpy as np
import ast

# Ensure date is datetime
df['date'] = pd.to_datetime(df['date'])

# The score columns are stored as stringified lists. Parse only the values that are not
# lists already, so re-running this cell does not fail on data parsed by an earlier run
# (same guard as used for the map columns).
for score_col in ('our_scores', 'their_scores'):
    df[score_col] = df[score_col].apply(
        lambda x: x if isinstance(x, list) else ast.literal_eval(x)
    )

# Calculate round differential (difference in sum of all round scores in a game)
df['round_differential'] = df['our_scores'].apply(sum) - df['their_scores'].apply(sum)

# Sort for group-wise time-based operations
df = df.sort_values(by=['team_name', 'date'])

def compute_rolling_round_diff(group):
    """Rolling 15-day mean of ``round_differential``, lagged by one row.

    ``group`` is indexed by its ``date`` column for the time-based window; the
    rolling mean (at least one observation) is then shifted down one row so a
    row never includes its own value. The result is re-indexed 0..n-1.
    """
    by_date = group.set_index('date')
    window_mean = by_date['round_differential'].rolling('15D', min_periods=1).mean()
    previous_only = window_mean.shift(1)
    return previous_only.reset_index(drop=True)
    
# Calculate 15-day rolling round differential.
# Each team's result is written back by row position. The previous
# groupby(...).apply(...).values form returned a 2-D table whenever every team yielded an
# identically indexed Series (e.g. a single team, or equal match counts), and it also
# triggered pandas' "apply operated on the grouping columns" deprecation warning.
rolling_round_diff = np.full(len(df), np.nan)
for team_positions in df.groupby('team_name', sort=False).indices.values():
    # positions are ascending, so the team's rows keep the date order set by the sort above
    team_rows = df.iloc[team_positions]
    rolling_round_diff[team_positions] = compute_rolling_round_diff(team_rows).to_numpy()
df['rolling_round_diff'] = rolling_round_diff

# drop rows that have no rolling_round_diff value (no earlier match to average over)
df = df.dropna(subset=['rolling_round_diff'])

  .apply(compute_rolling_round_diff)


In [7]:
import numpy as np
import pandas as pd

### STAR PLAYER ADVANTAGE + CONSISTENCY ADVANTAGE ###

# Safely calculate std with fallback if data is NaN or malformed
def safe_std(x):
    """Standard deviation of the first five and of the last five ratings.

    ``x`` is expected to hold exactly 10 values; positions 0-4 are treated as
    "my" team and 5-9 as the opponent.
    NOTE(review): this relies on the list being ordered own-team-first — confirm
    against how ``all_player_ratings`` is collected.

    Returns ``(my_std, opp_std)``. A NaN or infinite std is replaced by 0.0,
    and ``(0.0, 0.0)`` is returned when ``x`` does not have 10 entries or
    cannot be processed at all.
    """
    try:
        arr = np.array(x)
        if len(arr) != 10:
            return 0.0, 0.0
        spreads = []
        for half in (arr[:5], arr[5:]):
            spread = np.std(half)
            # Return default if NaN/inf
            spreads.append(0.0 if pd.isna(spread) or np.isinf(spread) else spread)
        return spreads[0], spreads[1]
    except Exception:
        # Best-effort fallback for malformed cells. `Exception` (rather than a bare
        # except) lets KeyboardInterrupt / SystemExit propagate.
        return 0.0, 0.0  # Default values instead of NaN

df['my_team_rating_std'], df['opp_team_rating_std'] = zip(*df['all_player_ratings'].apply(safe_std))

# Rolling 20-day mean of the per-match rating std, per team, lagged by one match.
# Everything is done by row position: each team's rows are put in date order explicitly
# and the results are written back to exactly those rows. (Writing `.values` back through
# a boolean mask silently misaligns whenever a team_id's rows are not already in date
# order inside df — df is sorted by team_name, not team_id, at this point.)
row_dates = df['date'].to_numpy()
team_id_col = df['team_id']
consistency_inputs = (
    ('rolling_my_consistency', df['my_team_rating_std'].to_numpy(dtype='float64')),
    ('rolling_opp_consistency', df['opp_team_rating_std'].to_numpy(dtype='float64')),
)
# Default to 0.0 for rows that never receive a rolling value
consistency_outputs = {target: np.zeros(len(df)) for target, _ in consistency_inputs}

for team_id in team_id_col.unique():
    team_pos = np.flatnonzero((team_id_col == team_id).to_numpy())
    if team_pos.size == 0:
        # e.g. a missing team_id never equals itself; nothing to fill
        continue
    # stable sort keeps the existing order of same-day matches
    chrono_pos = team_pos[np.argsort(row_dates[team_pos], kind='stable')]
    chrono_dates = pd.DatetimeIndex(row_dates[chrono_pos])

    for target, per_match_std in consistency_inputs:
        prior_mean = (
            pd.Series(per_match_std[chrono_pos], index=chrono_dates)
            .rolling('20D', min_periods=2)  # needs at least 2 matches in the window
            .mean()
            .shift(1)       # exclude the current match
            .fillna(0.0)    # no usable history -> 0
        )
        consistency_outputs[target][chrono_pos] = prior_mean.to_numpy()

for target, _ in consistency_inputs:
    df[target] = consistency_outputs[target]

# Creating consistency advantage (lower std is better) - 0 means neutral
df['consistency_advantage'] = df['rolling_opp_consistency'] - df['rolling_my_consistency']


### STAR PLAYER ADVANTAGE - FAILED FEATURE (hurt model performance)###

# extract best players
def safe_max(x):
    """Best rating among the first five and among the last five entries.

    ``x`` is expected to hold exactly 10 values; positions 0-4 are treated as
    "my" team and 5-9 as the opponent.
    NOTE(review): this relies on the list being ordered own-team-first — confirm
    against how ``all_player_ratings`` is collected.

    NaN entries are ignored when picking the maximum, so the result no longer
    depends on where a NaN sits in the list (builtin ``max`` returned NaN only
    when the NaN came first). A half that is entirely NaN, or whose maximum is
    infinite, yields the neutral default 1.0.

    Returns ``(my_max, opp_max)``; ``(1.0, 1.0)`` when ``x`` is not a flat
    10-element sequence or cannot be processed at all.
    """
    try:
        arr = np.array(x)
        if arr.ndim != 1 or len(arr) != 10:
            return 1.0, 1.0
        best = []
        for half in (arr[:5], arr[5:]):
            known = half[~np.isnan(half)]
            top = known.max() if known.size else np.nan
            # Return default if NaN/inf (use reasonable Valorant rating default)
            best.append(1.0 if pd.isna(top) or np.isinf(top) else top)
        return best[0], best[1]
    except Exception:
        # Best-effort fallback for malformed cells. `Exception` (rather than a bare
        # except) lets KeyboardInterrupt / SystemExit propagate.
        return 1.0, 1.0  # Default neutral Valorant rating instead of NaN

df['my_best_player'], df['opp_best_player'] = zip(*df['all_player_ratings'].apply(safe_max))
df['star_player_diff'] = df['my_best_player'] - df['opp_best_player']

# Rolling 20-day mean of each side's best-player rating, per team, lagged by one match.
# Everything is done by row position: each team's rows are put in date order explicitly
# and the results are written back to exactly those rows. (Writing `.values` back through
# a boolean mask silently misaligns whenever a team_id's rows are not already in date
# order inside df.)
star_row_dates = df['date'].to_numpy()
star_team_ids = df['team_id']
star_inputs = (
    ('rolling_my_star', df['my_best_player'].to_numpy(dtype='float64')),
    ('rolling_opp_star', df['opp_best_player'].to_numpy(dtype='float64')),
)
# Default to the neutral Valorant rating 1.0 for rows that never receive a rolling value
star_outputs = {target: np.ones(len(df)) for target, _ in star_inputs}

for team_id in star_team_ids.unique():
    team_pos = np.flatnonzero((star_team_ids == team_id).to_numpy())
    if team_pos.size == 0:
        # e.g. a missing team_id never equals itself; nothing to fill
        continue
    # stable sort keeps the existing order of same-day matches
    chrono_pos = team_pos[np.argsort(star_row_dates[team_pos], kind='stable')]
    chrono_dates = pd.DatetimeIndex(star_row_dates[chrono_pos])

    for target, best_ratings in star_inputs:
        prior_mean = (
            pd.Series(best_ratings[chrono_pos], index=chrono_dates)
            .rolling('20D', min_periods=1)  # a single match in the window is enough
            .mean()
            .shift(1)       # exclude the current match
            .fillna(1.0)    # no usable history -> neutral rating
        )
        star_outputs[target][chrono_pos] = prior_mean.to_numpy()

for target, _ in star_inputs:
    df[target] = star_outputs[target]

# Star advantage (positive = better player) - 0 means neutral
df['star_player_advantage'] = df['rolling_my_star'] - df['rolling_opp_star']

# Final safety check - ensure no NaN values in key columns
key_columns = ['consistency_advantage', 'star_player_advantage', 'rolling_my_consistency', 
               'rolling_opp_consistency', 'rolling_my_star', 'rolling_opp_star']

for col in key_columns:
    if col not in df.columns:
        continue
    # advantage / consistency columns are neutral at 0.0, star ratings at 1.0
    neutral = 0.0 if ('advantage' in col or 'consistency' in col) else 1.0
    df[col] = df[col].fillna(neutral)
    print(f"{col}: {df[col].isna().sum()} NaN values remaining")

print(f"\nFinal check - any NaN in key columns: {df[key_columns].isna().any().any()}")

consistency_advantage: 0 NaN values remaining
star_player_advantage: 0 NaN values remaining
rolling_my_consistency: 0 NaN values remaining
rolling_opp_consistency: 0 NaN values remaining
rolling_my_star: 0 NaN values remaining
rolling_opp_star: 0 NaN values remaining

Final check - any NaN in key columns: False


In [8]:
## RECENT FORM FEATURE ###

import numpy as np

def get_decayed_recent_form(df, team_id, current_date, n_games=10, days_limit=30, tau=90):
    """Exponentially decayed win rate over a team's recent matches.

    Considers rows of ``df`` for ``team_id`` dated strictly before
    ``current_date`` and no older than ``days_limit`` days, keeps the latest
    ``n_games`` of them, and averages win (1) / loss (0) with weights
    ``exp(-days_ago / tau)``.

    Returns 0.5 when no match falls inside the window. ``df`` is only read;
    no helper columns are written onto it or onto slices of it.
    """
    # Filter matches before the current date and within the time window
    window_start = current_date - pd.Timedelta(days=days_limit)
    in_window = (
        (df['team_id'] == team_id)
        & (df['date'] < current_date)
        & (df['date'] >= window_start)
    )

    # Take the last n games within that window
    recent = df.loc[in_window].sort_values(by='date').tail(n_games)

    if recent.empty:
        return 0.5

    # Work on plain arrays instead of adding columns to `recent`, which is a slice of the
    # caller's frame (assigning to it triggers pandas' SettingWithCopyWarning).
    days_ago = (current_date - recent['date']).dt.days.to_numpy()

    # exponential decay weights (use tau to modify)
    decay = np.exp(-days_ago / tau)

    # Win = 1, Loss = 0
    wins = (recent['result'] == 'W').to_numpy(dtype=float)

    # Weighted average using decay
    return np.average(wins, weights=decay)

# Apply with default: 30-day window, 10 games max, tau = 90 for decay
def _recent_form_for_row(row):
    """Decayed recent form of the row's team as of the row's date."""
    return get_decayed_recent_form(df, row['team_id'], row['date'])

df['recent_form'] = df.apply(_recent_form_for_row, axis=1)


In [9]:
### HEAD-TO-HEAD ADVANTAGE FEATURE ### - FAILED - LACK OF H2H DATA
## Calculates the winrate against a specific opponent before the current match date


def get_h2h_winrate(df, team1, opponent, current_date):
    """Win rate of ``team1`` in its last two meetings with ``opponent``.

    Only rows dated strictly before ``current_date`` count. With fewer than
    two earlier meetings the neutral value 0.5 is returned; otherwise the
    result is 0, 0.5 or 1.
    """
    earlier_meetings = df[
        (df['team_name'] == team1)
        & (df['opponent'] == opponent)
        & (df['date'] < current_date)
    ].sort_values('date', ascending=False)

    if len(earlier_meetings) < 2:
        # Neutral if no rivalry exists
        return 0.5

    # Only the two most recent meetings are considered
    latest_pair = earlier_meetings.iloc[:2]
    return (latest_pair['result'] == 'W').sum() / 2

def _h2h_for_row(row):
    """Head-to-head win rate of the row's team against the row's opponent, as of the row's date."""
    return get_h2h_winrate(df, row['team_name'], row['opponent'], row['date'])

df['h2h_advantage'] = df.apply(_h2h_for_row, axis=1)

In [10]:

# Persist the engineered dataset (row index is not written)
output_path = 'notebooks/processed_valorant_dataset.csv'
df.to_csv(output_path, index=False)