# Sports Analytics - Import Data and Calculating Players Stats

In [82]:
# Import libraries for data cleaning and manipulation
import pandas as pd
import numpy as np

In [83]:
# Load the player data from a CSV file and show a preview
player_data = pd.read_csv('Data/player_data.csv')
print(f"\nShape: {player_data.shape}") 
print(f"Columns: {player_data.columns.tolist()}")


Shape: (144, 2)
Columns: ['player', 'rating']


In [84]:
# Load the game stint data from a CSV file and show a preview
stint_data = pd.read_csv('Data/stint_data.csv')
print(f"\nShape: {stint_data.shape}")
print(f"Columns: {stint_data.columns.tolist()}")


Shape: (7448, 14)
Columns: ['game_id', 'h_team', 'a_team', 'minutes', 'h_goals', 'a_goals', 'home1', 'home2', 'home3', 'home4', 'away1', 'away2', 'away3', 'away4']


In [85]:
# Get final score per game
final_scores = (
    stint_data
    .groupby('game_id', as_index=False)
    .agg({
        'h_team': 'first',
        'a_team': 'first',
        'h_goals': 'max',
        'a_goals': 'max'
    })
)

# Determine winner
final_scores['winner'] = np.where(
    final_scores['h_goals'] > final_scores['a_goals'],
    final_scores['h_team'],
    np.where(
        final_scores['a_goals'] > final_scores['h_goals'],
        final_scores['a_team'],
        'Draw'
    )
)

final_scores

Unnamed: 0,game_id,h_team,a_team,h_goals,a_goals,winner
0,1,USA,Japan,8,13,Japan
1,2,USA,Great.Britain,10,7,USA
2,3,USA,Canada,13,6,USA
3,4,USA,France,12,8,USA
4,5,USA,Denmark,11,15,Denmark
...,...,...,...,...,...,...
655,656,Chile,Sweden,17,9,Chile
656,657,Chile,Brazil,10,12,Brazil
657,658,Chile,Columbia,15,9,Chile
658,659,Chile,Poland,10,9,Chile


In [86]:
# Explode stints into player-level data

player_cols_home = ['home1','home2','home3','home4']
player_cols_away = ['away1','away2','away3','away4']

def explode_stints(stints):
    rows = []
    for _, r in stints.iterrows():
        for side, cols in [('home', player_cols_home), ('away', player_cols_away)]:
            for p in cols:
                rows.append({
                    'game_id': r.game_id,
                    'player': r[p],
                    'side': side,
                    'minutes': r.minutes,
                    'team_goals': r.h_goals if side == 'home' else r.a_goals,
                    'opp_goals': r.a_goals if side == 'home' else r.h_goals,
                    'home_win': int(
                        (r.h_goals > r.a_goals) if side == 'home' else (r.a_goals > r.h_goals)
                    )
                })
    return pd.DataFrame(rows)

player_stints = explode_stints(stint_data)
player_stints

Unnamed: 0,game_id,player,side,minutes,team_goals,opp_goals,home_win
0,1,USA_p4,home,4.252969,4,9,0
1,1,USA_p1,home,4.252969,4,9,0
2,1,USA_p3,home,4.252969,4,9,0
3,1,USA_p6,home,4.252969,4,9,0
4,1,Japan_p12,away,4.252969,9,4,1
...,...,...,...,...,...,...,...
59579,660,Chile_p4,home,2.200405,5,2,1
59580,660,Argentina_p6,away,2.200405,2,5,0
59581,660,Argentina_p5,away,2.200405,2,5,0
59582,660,Argentina_p12,away,2.200405,2,5,0


In [87]:
# Calculate per-minute goal differenatial and add to player_stints df
player_stints['goal_diff_per_min'] = (
    (player_stints.team_goals - player_stints.opp_goals)
    / player_stints.minutes
)

player_stints['team_goals_per_min'] = player_stints.team_goals / player_stints.minutes
player_stints['opp_goals_per_min']  = player_stints.opp_goals  / player_stints.minutes
player_stints

Unnamed: 0,game_id,player,side,minutes,team_goals,opp_goals,home_win,goal_diff_per_min,team_goals_per_min,opp_goals_per_min
0,1,USA_p4,home,4.252969,4,9,0,-1.175649,0.940519,2.116169
1,1,USA_p1,home,4.252969,4,9,0,-1.175649,0.940519,2.116169
2,1,USA_p3,home,4.252969,4,9,0,-1.175649,0.940519,2.116169
3,1,USA_p6,home,4.252969,4,9,0,-1.175649,0.940519,2.116169
4,1,Japan_p12,away,4.252969,9,4,1,1.175649,2.116169,0.940519
...,...,...,...,...,...,...,...,...,...,...
59579,660,Chile_p4,home,2.200405,5,2,1,1.363385,2.272309,0.908923
59580,660,Argentina_p6,away,2.200405,2,5,0,-1.363385,0.908923,2.272309
59581,660,Argentina_p5,away,2.200405,2,5,0,-1.363385,0.908923,2.272309
59582,660,Argentina_p12,away,2.200405,2,5,0,-1.363385,0.908923,2.272309


In [88]:
# Add ating features to player_stints df

ratings = player_data.set_index('player')['rating']

def add_teammate_features(df):
    df = df.copy()
    for gid in df.game_id.unique():
        mask = df.game_id == gid
        players = df.loc[mask, 'player'] #collect players in the game
        r = ratings.loc[players].values
        df.loc[mask, 'avg_teammate_rating'] = np.mean(r)
        df.loc[mask, 'sum_teammate_rating'] = np.sum(r)
        df.loc[mask, 'max_teammate_rating'] = np.max(r)
    return df

player_stints = add_teammate_features(player_stints)
player_stints

Unnamed: 0,game_id,player,side,minutes,team_goals,opp_goals,home_win,goal_diff_per_min,team_goals_per_min,opp_goals_per_min,avg_teammate_rating,sum_teammate_rating,max_teammate_rating
0,1,USA_p4,home,4.252969,4,9,0,-1.175649,0.940519,2.116169,1.875000,165.0,3.5
1,1,USA_p1,home,4.252969,4,9,0,-1.175649,0.940519,2.116169,1.875000,165.0,3.5
2,1,USA_p3,home,4.252969,4,9,0,-1.175649,0.940519,2.116169,1.875000,165.0,3.5
3,1,USA_p6,home,4.252969,4,9,0,-1.175649,0.940519,2.116169,1.875000,165.0,3.5
4,1,Japan_p12,away,4.252969,9,4,1,1.175649,2.116169,0.940519,1.875000,165.0,3.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
59579,660,Chile_p4,home,2.200405,5,2,1,1.363385,2.272309,0.908923,1.920455,169.0,3.5
59580,660,Argentina_p6,away,2.200405,2,5,0,-1.363385,0.908923,2.272309,1.920455,169.0,3.5
59581,660,Argentina_p5,away,2.200405,2,5,0,-1.363385,0.908923,2.272309,1.920455,169.0,3.5
59582,660,Argentina_p12,away,2.200405,2,5,0,-1.363385,0.908923,2.272309,1.920455,169.0,3.5


In [89]:
# Features to be used in ML
features = [
    'team_goals_per_min',
    'opp_goals_per_min',
    'avg_teammate_rating',
    'sum_teammate_rating',
    'max_teammate_rating',
    'home_win'
]

# Sports Analytics - Machine Learning

In [95]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GroupKFold
from sklearn.utils import shuffle

In [96]:
# Define feature matrix (X) and target vector (y)
X = player_stints[features]
y = player_stints['goal_diff_per_min']

In [97]:
# Random Forest model
rf = RandomForestRegressor(
    n_estimators=500,
    max_depth=9,
    min_samples_split=20,
    min_samples_leaf=5,
    max_features=0.5,
    random_state=42,
    n_jobs=-1
)

# Group K-Fold Cross-Validation
groups = player_stints["game_id"] 
X, y, groups = shuffle(X, y, groups, random_state=42)
gkf = GroupKFold(n_splits=5)

train_r2, test_r2 = [], []
train_mse, test_mse = [], []
train_mae, test_mae = [], []

# Cross-validation
for train_idx, test_idx in gkf.split(X, y, groups):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    rf.fit(X_train, y_train)

    y_train_pred = rf.predict(X_train)
    y_test_pred = rf.predict(X_test)

    train_r2.append(r2_score(y_train, y_train_pred))
    test_r2.append(r2_score(y_test, y_test_pred))

    train_mse.append(mean_squared_error(y_train, y_train_pred))
    test_mse.append(mean_squared_error(y_test, y_test_pred))

    train_mae.append(mean_absolute_error(y_train, y_train_pred))
    test_mae.append(mean_absolute_error(y_test, y_test_pred))

# Fit model
rf.fit(X, y)


In [98]:
# Output cross-validation results
print(f"Train R²: {np.mean(train_r2):.3f} ± {np.std(train_r2):.3f}")
print(f"Test  R²: {np.mean(test_r2):.3f} ± {np.std(test_r2):.3f}")

print(f"Train MSE: {np.mean(train_mse):.3f}")
print(f"Test  MSE: {np.mean(test_mse):.3f}")

print(f"Train MAE: {np.mean(train_mae):.3f}")
print(f"Test  MAE: {np.mean(test_mae):.3f}")


Train R²: 0.804 ± 0.090
Test  R²: 0.817 ± 0.312
Train MSE: 2.564
Test  MSE: 7.401
Train MAE: 0.144
Test  MAE: 0.169


In [99]:
# Predict player scores
player_stints['predicted_score'] = rf.predict(X)

player_scores = (
    player_stints
    .groupby('player')['predicted_score']
    .mean()
    .reset_index()  
)

# Output player scores to Excel - for optimization portion
player_scores.to_excel('player_scores.xlsx', index=False)