In [66]:
import pandas as pd
import ast
import os
import math

# Project root. Defaults to the original checkout location, but can be
# overridden with the VALORANT_PROJECT_ROOT environment variable so the
# notebook runs on another machine without editing this cell.
PROJECT_ROOT = os.environ.get("VALORANT_PROJECT_ROOT", "/Users/zaza/Valorant Match Predictor")
os.chdir(PROJECT_ROOT)
df = pd.read_csv("data/processed/R2_processed/R2_unified_dataset.csv")

# These columns hold Python list literals stored as text in the CSV;
# turn each cell back into a real list.
for list_col in ('all_player_ratings', 'all_player_teams', 'all_players'):
    df[list_col] = df[list_col].apply(ast.literal_eval)

In [67]:
### Undo a data-collection error that stored some lists twice back-to-back
def remove_double_back_and_forth(L):
    """Return the first half of `L` when `L` is exactly that half repeated twice.

    Sequences of odd length, or whose two halves differ, are returned
    unchanged (the very same object, not a copy).
    """
    half, leftover = divmod(len(L), 2)
    if leftover:
        # odd length: cannot be two equal halves
        return L
    front, back = L[:half], L[half:]
    return front if front == back else L

# De-duplicate every per-player list column
list_columns = ['all_players', 'all_player_teams', 'all_player_ratings']
for col in list_columns:
    df[col] = df[col].apply(remove_double_back_and_forth)

# Sanity check: the three lists of a row should have the same length.
# Series.map is applied per column because DataFrame.applymap is deprecated
# in recent pandas releases (it emitted a FutureWarning here).
lengths = df[list_columns].head().apply(lambda column: column.map(len))
print(lengths)

# Top 43 VCT teams from VLR as of July 2025
# Maps each team's full name (looked up with the `team_name` column) to the
# list of team codes that can appear for it in `all_player_teams`.
# A few teams list more than one code.
team_abbreviations = {
    "Sentinels":          ["SEN"],
    "100 Thieves":        ["100T"],
    "Cloud9":             ["C9"],
    "NRG":                ["NRG"],
    "Evil Geniuses":      ["EG"],
    "G2 Esports":         ["G2"],
    "LOUD":               ["LOUD"],
    "MIBR":               ["MIBR"],
    "FURIA":              ["FUR"],
    "KRÜ Esports":        ["KRÜ"],
    "LEVIATÁN":           ["LEV"],
    "FNATIC":             ["FNC"],
    "Team Liquid":        ["TL"],
    "Team Heretics":      ["TH"],
    "BBL Esports":        ["BBL"],
    "FUT Esports":        ["FUT"],
    "Karmine Corp":       ["KC"],
    "Team Vitality":      ["VIT"],
    "Natus Vincere":      ["NAVI"],
    "Gentle Mates":       ["M8"],
    "Apeks":              ["APK"],
    "Paper Rex":          ["PRX"],
    "DRX":                ["DRX"],
    "Gen.G":              ["GEN"],
    "T1":                 ["T1"],
    "Rex Regum Qeon":     ["RRQ"],
    "TALON":              ["TLN"],
    "Team Secret":        ["TS"],
    "ZETA DIVISION":      ["ZETA","FL"],
    "DetonatioN FocusMe": ["DFM","CR"],
    "Global Esports":     ["GE"],
    "EDward Gaming":      ["EDG"],
    "Bilibili Gaming":    ["BLG"],
    "FunPlus Phoenix":    ["FPX"],
    "Dragon Ranger Gaming":["DRG"],
    "Wolves Esports":     ["WOL"],
    "Trace Esports":      ["TE"],
    "Titan Esports Club": ["TEC"],
    "Nova Esports":       ["NOVA"],
    "All Gamers":         ["AG"],
    "TYLOO":              ["TYL"],
    "JDG Esports":        ["JDG"],
    "Rare Atom":          ["RA"],
}

# Every known team code, flattened into one set for fast membership tests
valid_codes = set().union(*team_abbreviations.values())

# True when at least one code in the row's team list is a known code
df['has_valid_team'] = df['all_player_teams'].apply(
    lambda teams: not valid_codes.isdisjoint(teams)
)

# List the matches in which no known code appears at all
print("Matches with no recognized team code:")
unrecognized = df.loc[~df['has_valid_team'], ['team_name', 'match_url']]
for team_name, match_url in zip(unrecognized['team_name'], unrecognized['match_url']):
    print(f"{team_name} — {match_url}")

   all_players  all_player_teams  all_player_ratings
0           10                10                  10
1           10                10                  10
2           10                10                  10
3           10                10                  10
4           10                10                  10
Matches with no recognized team code:


  lengths = df[['all_players','all_player_teams','all_player_ratings']].applymap(len).head()


In [68]:
### Per-match mean rating for the row's team and for its opponent
### Both line-ups share one ratings list, so every rating is routed by the team code paired with it

def split_mean_ratings(row):
    """Average the ratings of the row's own players and of everyone else.

    A rating counts as "mine" when the team code at the same position in
    `all_player_teams` is one of the codes registered for `row['team_name']`
    in `team_abbreviations`; every other rating counts as the opponent's.

    Returns a Series with `mean_my_rating` and `mean_opp_rating`; a side
    with no ratings gets NaN. Raises KeyError when the team name is not in
    `team_abbreviations`.
    """
    match_codes = row['all_player_teams']
    match_ratings = row['all_player_ratings']
    own_codes = team_abbreviations[row['team_name']]

    own, other = [], []
    for code, rating in zip(match_codes, match_ratings):
        bucket = own if code in own_codes else other
        bucket.append(rating)

    def _mean_or_nan(values):
        # an empty side has no defined mean
        if not values:
            return math.nan
        return sum(values) / len(values)

    return pd.Series({
        'mean_my_rating': _mean_or_nan(own),
        'mean_opp_rating': _mean_or_nan(other),
    })

# Attach both per-match means as new columns
team_means = df.apply(split_mean_ratings, axis=1)
df['mean_my_rating'] = team_means['mean_my_rating']
df['mean_opp_rating'] = team_means['mean_opp_rating']

In [69]:
### R2 COMBAT SCORE ROLLING AVERAGES FEATURE ###
# For every requested window length (in days), add the rolling mean of the team's own and of its
# opponents' per-match mean rating, taken over the team's earlier matches

def add_rolling_averages(df, rolling_periods=(15, 20, 21, 22, 23, 24, 25, 30, 45, 60, 80, 90, 100, 120, 150, 200), date_format='%Y/%m/%d', min_periods=2):
    """Return a copy of `df` with per-team rolling means of the match ratings.

    For each `period` in `rolling_periods`, two float64 columns are added:
    `rolling_{period}d_my` (from `mean_my_rating`) and `rolling_{period}d_opp`
    (from `mean_opp_rating`).

    The value on a row is the time-window mean evaluated at the team's
    *previous* match (a window of `period` days ending on that match, that
    match included) and then shifted down by one row, so a row never uses its
    own ratings. Note that the window is therefore anchored on the previous
    match date, not on the current one. A team's first match is always NaN.

    Parameters
    ----------
    df : DataFrame with `team_id`, `date`, `mean_my_rating`, `mean_opp_rating`.
        It is not modified.
    rolling_periods : iterable of int, window lengths in days.
    date_format : format passed to `pd.to_datetime` when parsing `date`.
    min_periods : minimum number of observations a window needs to produce
        a value; smaller windows give NaN.

    Returns
    -------
    DataFrame sorted by (`team_id`, `date`) with a fresh 0..n-1 index, `date`
    converted to datetime, and the rolling columns appended. Rows whose
    `team_id` is missing keep NaN in every rolling column.
    """
    out = df.copy()

    # Parse dates and put every team's matches in chronological order
    out['date'] = pd.to_datetime(out['date'], format=date_format)
    out = out.sort_values(['team_id', 'date']).reset_index(drop=True)

    periods = list(rolling_periods)
    sides = ('my', 'opp')

    # Create the result columns as float NaN up front (my/opp interleaved per
    # period), so no object-dtype placeholder or final cast is needed
    for period in periods:
        for side in sides:
            out[f'rolling_{period}d_{side}'] = float('nan')

    # Work from a narrow snapshot so the groups are unaffected by the writes below
    ratings = out[['team_id', 'date', 'mean_my_rating', 'mean_opp_rating']]
    for _, team_rows in ratings.groupby('team_id', sort=False):
        # rows are already date-ordered within the team thanks to the sort above
        by_date = team_rows.set_index('date')
        for period in periods:
            window = f'{period}D'
            for side in sides:
                prior_mean = (
                    by_date[f'mean_{side}_rating']
                    .rolling(window, min_periods=min_periods)
                    .mean()
                    .shift(1)  # exclude the current match from its own feature
                )
                out.loc[team_rows.index, f'rolling_{period}d_{side}'] = prior_mean.to_numpy()

    return out


# Rebind df to the sorted copy that carries the rolling-average columns
df = df.pipe(add_rolling_averages)


In [70]:
### TESTING ###
# Eyeball checks of the per-match means and the 15/30/60-day rolling columns

preview_cols = [
    'team_name', 'date', 'mean_my_rating', 'rolling_30d_my',
    'mean_opp_rating', 'rolling_30d_opp',
    'rolling_15d_my', 'rolling_15d_opp',
    'rolling_60d_my', 'rolling_60d_opp',
]
summary_cols = [
    'mean_my_rating', 'mean_opp_rating',
    'rolling_30d_my', 'rolling_30d_opp',
    'rolling_15d_my', 'rolling_15d_opp',
    'rolling_60d_my', 'rolling_60d_opp',
]
overall_lines = [
    ("Mean my rating", 'mean_my_rating'),
    ("Mean opponent rating", 'mean_opp_rating'),
    ("Mean 30-day rolling my rating", 'rolling_30d_my'),
    ("Mean 30-day rolling opponent rating", 'rolling_30d_opp'),
    ("Mean 15-day rolling my rating", 'rolling_15d_my'),
    ("Mean 15-day rolling opponent rating", 'rolling_15d_opp'),
    ("Mean 60-day rolling my rating", 'rolling_60d_my'),
    ("Mean 60-day rolling opponent rating", 'rolling_60d_opp'),
]

# Show the first 10 rows with the new columns
print("First 10 rows with rolling averages:")
print(df[preview_cols].head(10))

# Column-wide means
print("\nOverall averages:")
for label, column in overall_lines:
    print(f"{label}: {df[column].mean():.2f}")

# Per-team means (first 10 teams in group order)
print("\nAverages by team (top 10):")
team_avgs = df.groupby('team_name').agg(dict.fromkeys(summary_cols, 'mean')).round(2)
print(team_avgs.head(10))

# Distribution summary
print("\nDescriptive statistics:")
print(df[summary_cols].describe())

First 10 rows with rolling averages:
   team_name       date  mean_my_rating  rolling_30d_my  mean_opp_rating  \
0  Sentinels 2024-05-03           1.146             NaN            0.828   
1  Sentinels 2024-06-22           1.200             NaN            0.820   
2  Sentinels 2024-06-29           1.104             NaN            0.900   
3  Sentinels 2024-07-01           1.144        1.152000            0.826   
4  Sentinels 2024-07-14           0.826        1.149333            1.190   
5  Sentinels 2024-07-17           0.912        1.068500            1.082   
6  Sentinels 2024-08-01           0.856        1.037200            1.132   
7  Sentinels 2024-08-08           1.210        0.864667            0.794   
8  Sentinels 2024-08-10           1.210        0.951000            0.794   
9  Sentinels 2024-08-14           1.108        1.002800            0.920   

   rolling_30d_opp  rolling_15d_my  rolling_15d_opp  rolling_60d_my  \
0              NaN             NaN              NaN    

In [71]:
### TESTING ###
# Spot-check a single team (Sentinels) for duplicated matches and odd ratings

sen_data = df[df['team_name'] == 'Sentinels']

# Check for duplicates
print("Checking for duplicate matches:")
duplicate_check = sen_data[['date', 'mean_my_rating', 'mean_opp_rating', 'match_url']].head(10)
print(duplicate_check)

# Check the rating distribution
print("\nRating distribution for Sentinels:")
sen_ratings = sen_data['mean_my_rating']
print(f"Min rating: {sen_ratings.min():.3f}")
print(f"Max rating: {sen_ratings.max():.3f}")
print(f"Std deviation: {sen_ratings.std():.3f}")

# Look at the raw per-player lists of one match (the team's 8th row)
print("\nSample of individual player ratings from one match:")
sample_match = sen_data.iloc[7]
print(f"All player ratings: {sample_match['all_player_ratings']}")
print(f"All player teams: {sample_match['all_player_teams']}")

Checking for duplicate matches:
        date  mean_my_rating  mean_opp_rating  \
0 2024-05-03           1.146            0.828   
1 2024-06-22           1.200            0.820   
2 2024-06-29           1.104            0.900   
3 2024-07-01           1.144            0.826   
4 2024-07-14           0.826            1.190   
5 2024-07-17           0.912            1.082   
6 2024-08-01           0.856            1.132   
7 2024-08-08           1.210            0.794   
8 2024-08-10           1.210            0.794   
9 2024-08-14           1.108            0.920   

                                           match_url  
0  https://www.vlr.gg/314650/sentinels-vs-furia-c...  
1  https://www.vlr.gg/353178/sentinels-vs-nrg-esp...  
2  https://www.vlr.gg/353183/sentinels-vs-kr-espo...  
3  https://www.vlr.gg/353187/sentinels-vs-cloud9-...  
4  https://www.vlr.gg/353201/sentinels-vs-g2-espo...  
5  https://www.vlr.gg/371267/sentinels-vs-100-thi...  
6  https://www.vlr.gg/378662/gen-g-vs-senti

In [72]:
### MAPS WON RATIO AND ROUND DIFFERENTIAL ###
### RESULTS: ROUND DIFFERENTIAL IS MORE PREDICTIVE THAN MAPS WON RATIO ###


import ast

# The per-map scores arrive as the text of a list. Parse only cells that are
# still strings, so re-running this cell on already-parsed lists does not fail.
for score_col in ('our_scores', 'their_scores'):
    df[score_col] = df[score_col].apply(
        lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
    )

# Per-match features
df['maps_won_ratio'] = df['maps_won'] / df['total_maps']
df['round_differential'] = df['our_scores'].apply(sum) - df['their_scores'].apply(sum)

print("Sample round_differential values:")
print(df['round_differential'].head(5))

# The positional write-back below relies on each team's rows being in date order
df = df.sort_values(['team_id', 'date']).reset_index(drop=True)

for team_id in df['team_id'].unique():
    team_mask = df['team_id'] == team_id
    team_data = df[team_mask].copy().set_index('date').sort_index()

    # 20-day rolling means of past performance; shift(1) keeps the current
    # match out of its own feature
    rolling_maps_ratio = team_data['maps_won_ratio'].rolling('20D').mean().shift(1)
    rolling_round_diff = team_data['round_differential'].rolling('20D').mean().shift(1)

    df.loc[team_mask, 'rolling_maps_won_ratio'] = rolling_maps_ratio.values
    df.loc[team_mask, 'rolling_round_diff'] = rolling_round_diff.values




Sample round_differential values:
0    12.0
1     7.0
2     5.0
3    13.0
4   -11.0
Name: round_differential, dtype: float64


In [73]:
# Debug our_scores and their_scores
def _score_is_blank(scores):
    """Flag entries that are missing, an empty string, or the literal text '[]'."""
    return scores.isna() | (scores == '') | (scores == '[]')


def _print_score_sample(heading, column):
    """Print the first ten entries of a score column and the type of its first entry."""
    print(heading)
    print(df[column].head(10))
    print(f"{column} data type: {type(df[column].iloc[0])}")


def _print_problem_rows(column, rows):
    """Print how many rows were flagged for `column`, plus a few examples if any."""
    print(f"Matches with problematic {column}: {len(rows)}")
    if len(rows) > 0:
        print("Examples:")
        print(rows[['team_name', 'date', 'result', column]].head())


print("=== SCORE DATA DEBUGGING ===")

print("1. BASIC INFO:")
print(f"Total matches in df: {len(df)}")
for score_col in ('our_scores', 'their_scores'):
    print(f"{score_col} missing: {df[score_col].isna().sum()}")

_print_score_sample("\n2. SAMPLE our_scores:", 'our_scores')
_print_score_sample("\n3. SAMPLE their_scores:", 'their_scores')

print("\n4. PROBLEMATIC MATCHES:")
problem_our = df[_score_is_blank(df['our_scores'])]
problem_their = df[_score_is_blank(df['their_scores'])]
_print_problem_rows('our_scores', problem_our)
_print_problem_rows('their_scores', problem_their)

print("\n5. ROUND_DIFFERENTIAL CALCULATION CHECK:")
try:
    round_diff_test = df['our_scores'].apply(sum) - df['their_scores'].apply(sum)
    missing_diff = round_diff_test.isna()
    print(f"round_differential calculated successfully: {round_diff_test.notna().sum()}/{len(df)} matches")
    print(f"Missing round_differential: {missing_diff.sum()}")

    # Show which matches are failing
    failing_matches = df[missing_diff]
    print(f"Failing matches count: {len(failing_matches)}")
    if len(failing_matches) > 0:
        print("Sample failing matches:")
        print(failing_matches[['team_name', 'date', 'our_scores', 'their_scores']].head())

except Exception as e:
    # summing fails if a cell is not a list of numbers; report it and carry on
    print(f"Error calculating round_differential: {e}")

print("\n6. DATA TYPES AFTER PARSING:")
if 'our_scores' in df.columns:
    sample_parsed = df['our_scores'].iloc[0]
    print(f"Sample parsed our_scores: {sample_parsed}")
    print(f"Type after parsing: {type(sample_parsed)}")
    try:
        sample_total = sum(sample_parsed)
    except Exception as e:
        print(f"Sum failed: {e}")
    else:
        print(f"Sum test: {sample_total}")

=== SCORE DATA DEBUGGING ===
1. BASIC INFO:
Total matches in df: 1643
our_scores missing: 0
their_scores missing: 0

2. SAMPLE our_scores:
0    [13.0, 13.0]
1    [13.0, 14.0]
2    [13.0, 13.0]
3    [13.0, 13.0]
4      [6.0, 9.0]
5     [7.0, 10.0]
6      [8.0, 7.0]
7    [13.0, 13.0]
8    [13.0, 13.0]
9    [15.0, 13.0]
Name: our_scores, dtype: object
our_scores data type: <class 'list'>

3. SAMPLE their_scores:
0      [5.0, 9.0]
1     [8.0, 12.0]
2    [11.0, 10.0]
3      [5.0, 8.0]
4    [13.0, 13.0]
5    [13.0, 13.0]
6    [13.0, 13.0]
7      [8.0, 5.0]
8      [9.0, 6.0]
9     [13.0, 8.0]
Name: their_scores, dtype: object
their_scores data type: <class 'list'>

4. PROBLEMATIC MATCHES:
Matches with problematic our_scores: 0
Matches with problematic their_scores: 0

5. ROUND_DIFFERENTIAL CALCULATION CHECK:
round_differential calculated successfully: 1643/1643 matches
Missing round_differential: 0
Failing matches count: 0

6. DATA TYPES AFTER PARSING:
Sample parsed our_scores: [13.0, 13.0]
T

In [74]:
### FAILED FEATURES: STAR PLAYER ADVANTAGE, CONSISTENCY ADVANTAGE###
### DECREASED MODEL ACCURACY, INDIVIDUAL PLAYER DATA MAY BE HARMFUL###

# FAILED EXPERIMENTS - Both decreased accuracy:
# 1. Team consistency (rating std) - teams with more consistent players didn't perform better
# 2. Star player advantage (best player comparison) - individual stars don't predict wins well



import numpy as np


def _ratings_by_side(team_name, teams, ratings):
    """Split one match's ratings into (own team, everyone else) using team codes.

    Uses the same rule as `split_mean_ratings`: a rating belongs to the row's
    team when the code at the same position is registered for `team_name` in
    `team_abbreviations`. An unknown team name yields an empty own side.
    """
    codes = team_abbreviations.get(team_name, [])
    mine = [r for t, r in zip(teams, ratings) if t in codes]
    theirs = [r for t, r in zip(teams, ratings) if t not in codes]
    return mine, theirs


def _prior_rolling_mean(frame, column, window='20D', min_periods=2):
    """Per-team rolling mean of `column` over `window`, shifted so a row never sees itself.

    Returns a float64 Series aligned to `frame.index` (which must be unique).
    Each team's rows are ordered by date here, so the result does not depend
    on the row order of `frame`. Rows with a missing `team_id` stay NaN.
    """
    result = pd.Series(np.nan, index=frame.index, dtype='float64')
    for _, team_rows in frame[['team_id', 'date', column]].groupby('team_id', sort=False):
        ordered = team_rows.sort_values('date', kind='stable')
        prior = (
            ordered.set_index('date')[column]
            .rolling(window, min_periods=min_periods)
            .mean()
            .shift(1)
        )
        result.loc[ordered.index] = prior.to_numpy()
    return result


# Split every match by team code rather than by list position: nothing
# visible here guarantees the row's own team fills the first five slots
# (TODO confirm against the scraper), and the mean-rating features already
# split by code.
_sides = [
    _ratings_by_side(name, teams, ratings)
    for name, teams, ratings in zip(df['team_name'], df['all_player_teams'], df['all_player_ratings'])
]
my_ratings = [mine for mine, _ in _sides]
opp_ratings = [theirs for _, theirs in _sides]

# raw consistency (np.std with its default settings); NaN when a side has no ratings
df['my_team_rating_std'] = [np.std(r) if r else np.nan for r in my_ratings]
df['opp_team_rating_std'] = [np.std(r) if r else np.nan for r in opp_ratings]

# 20-day rolling averages for each team
df['rolling_my_consistency'] = _prior_rolling_mean(df, 'my_team_rating_std')
df['rolling_opp_consistency'] = _prior_rolling_mean(df, 'opp_team_rating_std')

# Create consistency advantage feature
df['consistency_advantage'] = df['rolling_opp_consistency'] - df['rolling_my_consistency']  # Lower std is better, so flip it


# Raw star player comparison for each match; NaN when a side has no ratings
df['my_best_player'] = [max(r) if r else np.nan for r in my_ratings]
df['opp_best_player'] = [max(r) if r else np.nan for r in opp_ratings]
df['star_player_diff'] = df['my_best_player'] - df['opp_best_player']

# 20-day rolling averages for each team
df['rolling_my_star'] = _prior_rolling_mean(df, 'my_best_player')
df['rolling_opp_star'] = _prior_rolling_mean(df, 'opp_best_player')

# Create star player advantage feature
df['star_player_advantage'] = df['rolling_my_star'] - df['rolling_opp_star']

In [75]:
### RECENT FORM FEATURE ###
### Win share of a team over its last n games played inside a limited look-back window ###

def get_recent_form(df, team_id, current_date, n_games, days_limit):
    """Share of wins among a team's most recent games before `current_date`.

    Only matches of `team_id` dated strictly before `current_date` and no
    earlier than `current_date - days_limit` days are considered; of those,
    the `n_games` latest by date are used. The matches are sorted by date
    here, so the result does not depend on the row order of `df`.

    Parameters
    ----------
    df : DataFrame with `team_id`, `date` (datetime) and `result` columns.
    team_id : value compared against `df['team_id']`.
    current_date : timestamp of the match being featurised (excluded).
    n_games : maximum number of most recent games to use.
    days_limit : length of the look-back window in days.

    Returns
    -------
    Fraction of the selected games whose `result` is 'W', or the neutral
    value 0.5 when no game qualifies.
    """
    cutoff_date = current_date - pd.Timedelta(days=days_limit)
    in_window = (
        (df['team_id'] == team_id)
        & (df['date'] < current_date)
        & (df['date'] >= cutoff_date)
    )

    # order by date so that tail() really picks the latest games
    recent = (
        df.loc[in_window, ['date', 'result']]
        .sort_values('date', kind='stable')
        .tail(n_games)
    )

    if len(recent) == 0:
        return 0.5
    return (recent['result'] == 'W').sum() / len(recent)

# Recent form from at most the last 5 games inside a 30-day look-back (sweet spot for recent form)
df['recent_form'] = [
    get_recent_form(df, team_id, match_date, 5, 30)
    for team_id, match_date in zip(df['team_id'], df['date'])
]

In [76]:
### HEAD-TO-HEAD ADVANTAGE FEATURE ###
## Win rate against one specific opponent, using only meetings before the current match date
## Much more reliable in test dataset compared to train dataset

def get_h2h_winrate(df, team1, opponent, current_date):
    """Win rate of `team1` against `opponent` in meetings dated before `current_date`.

    Rows are selected where `team_name` equals `team1`, `opponent` equals
    `opponent` and `date` is strictly earlier than `current_date`. With fewer
    than two such meetings the neutral value 0.5 is returned; otherwise the
    share of those rows whose `result` is 'W'.
    """
    earlier_meeting = (
        (df['team_name'] == team1)
        & (df['opponent'] == opponent)
        & (df['date'] < current_date)
    )
    past_results = df.loc[earlier_meeting, 'result']

    if len(past_results) >= 2:
        return (past_results == 'W').mean()
    # Need minimum history
    return 0.5

# Head-to-head win rate for every row, using only meetings before that row's date
df['h2h_advantage'] = [
    get_h2h_winrate(df, team_name, opponent, match_date)
    for team_name, opponent, match_date in zip(df['team_name'], df['opponent'], df['date'])
]

In [77]:
### Checking 21-22 ###
# Compare the majority-class baseline with and without matches dated before 2023

def _report_win_rate(heading, matches):
    """Print the win rate and the majority-class baseline (larger of W share and L share)."""
    win_share = (matches['result'] == 'W').mean()
    loss_share = (matches['result'] == 'L').mean()
    print(heading)
    print(f"Overall win rate: {win_share:.3f}")
    print(f"Overall baseline: {max(win_share, loss_share) * 100:.1f}%")


_report_win_rate("With 2021-22:", df)

# Exclude 2021-2022 matches
df['date'] = pd.to_datetime(df['date'])
df_filtered = df[df['date'] >= '2023-01-01']
_report_win_rate("\nWithout 2021-22:", df_filtered)
print(f"Matches remaining: {len(df_filtered)}")

With 2021-22:
Overall win rate: 0.523
Overall baseline: 52.3%

Without 2021-22:
Overall win rate: 0.518
Overall baseline: 51.8%
Matches remaining: 1587


In [78]:
# Write the feature-engineered table to disk (path is relative to the working directory)
output_csv = 'notebooks/processed_valorant_dataset.csv'
df.to_csv(output_csv, index=False)