In [24]:
import os
import glob
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupShuffleSplit, train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor
import joblib
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [26]:
# Location of the processed player-season table. The default keeps the original
# machine-specific path; set the PROCESSED_DATA_CSV environment variable to run
# the notebook elsewhere without editing this cell.
DATA_PATH = os.environ.get(
    "PROCESSED_DATA_CSV",
    "/Users/lionlucky7/Desktop/Coding_Project/data/processed_whole/processed_data.csv",
)
df = pd.read_csv(DATA_PATH)

In [27]:
def make_samples(df, window, features_exclude, target_cols):
    """Build one supervised sample per (player, target row) from a sliding window.

    Rows are ordered by ``player`` and ``season``. For every player, each row
    that has at least ``window`` earlier rows becomes a target; the ``window``
    rows directly before it are flattened into feature columns named
    ``<column>_t-k``, where ``k`` counts back from the target (``_t-1`` is the
    row immediately before the target). The window counts rows, so gaps between
    seasons are not detected.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'player', 'season', 'age', 'team', 'league' and every
        column listed in ``target_cols``. The frame is not modified.
    window : int
        Number of preceding rows used as features; must be >= 1.
    features_exclude : collection of str
        Column names that are not turned into lagged features.
    target_cols : list of str
        Columns of the target row copied into the sample unchanged.

    Returns
    -------
    pandas.DataFrame
        One row per sample: the lagged features, then 'player',
        'season_target', 'age_at_target', 'team_last', 'league_last' and the
        target columns. Players with ``window`` rows or fewer contribute nothing.

    Raises
    ------
    ValueError
        If ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError(f"window must be >= 1, got {window}")

    # Sort into a new frame: an in-place sort would silently reorder the
    # caller's DataFrame as a side effect of building samples.
    ordered = df.sort_values(['player', 'season'])

    # Resolve the lagged feature columns once instead of per row and per cell.
    feature_cols = [c for c in ordered.columns if c not in features_exclude]

    rows = []

    for player, g in ordered.groupby('player'):
        g = g.reset_index(drop=True)
        # range() is empty when the player has `window` rows or fewer.
        for i in range(window, len(g)):
            past = g.loc[i - window:i - 1]  # label slice, inclusive on both ends
            target = g.loc[i, target_cols]

            flat = {}
            for j, (_, r) in enumerate(past.iterrows(), start=1):
                # Oldest row in the window gets the largest lag.
                suffix = f"_t-{window - j + 1}"
                for c in feature_cols:
                    flat[c + suffix] = r[c]

            flat['player'] = player
            flat['season_target'] = g.loc[i, 'season']
            # Age in the last window row plus one; no adjustment is made when
            # the target season does not directly follow that row.
            flat['age_at_target'] = g.loc[i - 1, 'age'] + 1

            flat['team_last'] = g.loc[i - 1, 'team']
            flat["league_last"] = g.loc[i - 1, 'league']

            for tcol in target_cols:
                flat[tcol] = target[tcol]

            rows.append(flat)
    return pd.DataFrame(rows)

In [None]:
def _season_start_year(code):
    """Return the four-digit start year for a four-character 'YYZZ' season code.

    The century is taken from the start-year digits ``YY`` (40 and above map to
    19YY, anything lower to 20YY). Deciding on the end-year digits instead would
    send the wrap-around season "9900" to 2099.
    """
    start = int(code[:2])
    return (1900 if start >= 40 else 2000) + start


# Derive the code from the raw `season` column only on the first run. On a
# re-run `season` already holds a calendar year, and decoding that as if it
# were a code would corrupt it (2017 -> "2017" -> 2020), so reuse the stored
# `season_code` instead.
season_source = df['season_code'] if 'season_code' in df.columns else df['season']
df['season_code'] = season_source.astype(str).str.zfill(4)

df['season'] = df['season_code'].map(_season_start_year).astype(int)

WINDOW = 3
TARGET_SEASONS = ['Standard_Sh/90', 'Standard_SoT/90', 'Standard_SoT%', 'Standard_G/Sh',
       'Standard_G/SoT', 'Per 90 Minutes_Gls', 'Per 90 Minutes_Ast',
       'Per 90 Minutes_G+A', 'Expected_G-xG', 'Expected_A-xAG',
       'Per 90 Minutes_xG', 'Per 90 Minutes_xAG', 'Per 90 Minutes_xG+xAG',
       'Progression_PrgC', 'Progression_PrgP', 'Progression_PrgR', 'PrgP',
       'Carries_PrgC', 'Short_Cmp%', 'Medium_Cmp%', 'Long_Cmp%', '1/3', 'PPA',
       'CrsPA', 'SCA_SCA90', 'GCA_GCA90', 'Tackles_TklW', 'Challenges_Tkl%',
       'Int', 'Blocks_Blocks', 'Performance_Recov', 'Take-Ons_Att',
       'Take-Ons_Succ%', 'Carries_Mis', 'Receiving_Rec', 'Receiving_PrgR',
       'Aerial Duels_Won%']

CATEGORICAL_COLS = ['league', 'pos']
NUMERIC_IGNORE = ['player', 'team', 'season']

all_cols = list(df.columns)
target_cols = TARGET_SEASONS
# `season_code` is the string form of `season`; exclude it as well so it does
# not end up as a lagged string feature while `season` itself is excluded.
exclude = ['player', 'team', 'season', 'season_code', 'league', 'pos']
dataset = make_samples(df, window=WINDOW, features_exclude=exclude, target_cols=target_cols)

In [19]:
def time_split(df, train_until=2018, val_from=2019, val_until=2021, test_from=2022):
    """Partition samples chronologically by their ``season_target`` value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``season_target`` column.
    train_until : last target season (inclusive) assigned to the training set.
    val_from, val_until : inclusive bounds of the validation seasons.
    test_from : first target season (inclusive) assigned to the test set.

    Returns
    -------
    tuple of (train, val, test) DataFrames. Rows whose season falls between
    the configured ranges belong to none of the three.
    """
    season = df['season_target']

    in_train = season <= train_until
    in_val = (season >= val_from) & (season <= val_until)
    in_test = season >= test_from

    return df[in_train], df[in_val], df[in_test]

# Chronological hold-out: target seasons up to 2018 train the model,
# 2019-2021 validate it, and 2022 onwards are kept for testing.
train_df, val_df, test_df = time_split(
    dataset,
    train_until=2018,
    val_from=2019,
    val_until=2021,
    test_from=2022,
)

In [None]:
def train_save_model(window, save_path):
    """Fit the multi-output XGBoost pipeline, save it, and write validation metrics.

    Reads the notebook-level ``train_df``, ``val_df`` and ``target_cols``.

    Parameters
    ----------
    window : int
        Only used to label the output file names.
    save_path : str
        Base directory. Both artefacts are written to ``<save_path>/metrics``,
        which is created if it does not exist:
        ``Model_Window{window}.pkl`` (the fitted pipeline) and
        ``model_window{window}_metrics.csv`` (MAE, RMSE and R2 per target
        column, computed on ``val_df``).
    """
    id_cols = ['player','season_target','team_last','league_last']
    y_cols = list(target_cols)
    non_feature_cols = id_cols + y_cols
    X_cols = [c for c in train_df.columns if c not in non_feature_cols]

    # Categorical features are the most recent lag ('_t-1') of 'pos'/'league_last'.
    # NOTE(review): with the samples built in this notebook ('league' and 'pos'
    # excluded from lagged features, 'league_last' listed in id_cols) this
    # selection appears to be empty, so every feature is scaled as numeric.
    last_cat_cols = [c for c in X_cols if c.endswith('_t-1') and ('pos' in c or 'league_last' in c)]
    num_cols = [c for c in X_cols if c not in last_cat_cols]

    preprocessor = ColumnTransformer([
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), last_cat_cols)
    ])

    model = Pipeline([
        ('pre', preprocessor),
        ('est', MultiOutputRegressor(XGBRegressor(objective='reg:squarederror', n_estimators=500, learning_rate=0.05, max_depth=6)))
    ])

    X_train = train_df[X_cols]
    y_train = train_df[y_cols]
    X_val = val_df[X_cols]
    y_val = val_df[y_cols]

    model.fit(X_train, y_train)

    # Create the output directory up front so the dump does not fail on a
    # fresh checkout where it is missing.
    out_dir = os.path.join(save_path, "metrics")
    os.makedirs(out_dir, exist_ok=True)

    model_path = os.path.join(out_dir, f"Model_Window{window}.pkl")
    joblib.dump(model, model_path)

    metrics_list = []
    y_pred = model.predict(X_val)
    for i, col in enumerate(y_cols):
        y_true_col = y_val.iloc[:, i]
        y_pred_col = y_pred[:, i]
        metrics_list.append({
            "column":col,
            "MAE":mean_absolute_error(y_true_col, y_pred_col),
            "RMSE":np.sqrt(mean_squared_error(y_true_col, y_pred_col)),
            "R2": r2_score(y_true_col, y_pred_col)
        })

    # Build the table once, after the loop, rather than on every iteration.
    metrics_df = pd.DataFrame(metrics_list)
    metrics_path = os.path.join(out_dir, f"model_window{window}_metrics.csv")
    metrics_df.to_csv(metrics_path, index=False)

In [23]:
# Show the column index of `df` (bare last expression, so the cell displays it).
df.columns

Index(['league', 'season', 'team', 'player', 'pos', 'age', 'Playing Time_90s',
       'Standard_Sh/90', 'Standard_SoT/90', 'Standard_SoT%', 'Standard_G/Sh',
       'Standard_G/SoT', 'Per 90 Minutes_Gls', 'Per 90 Minutes_Ast',
       'Per 90 Minutes_G+A', 'Expected_G-xG', 'Expected_A-xAG',
       'Per 90 Minutes_xG', 'Per 90 Minutes_xAG', 'Per 90 Minutes_xG+xAG',
       'Progression_PrgC', 'Progression_PrgP', 'Progression_PrgR', 'PrgP',
       'Carries_PrgC', 'Short_Cmp%', 'Medium_Cmp%', 'Long_Cmp%', '1/3', 'PPA',
       'CrsPA', 'SCA_SCA90', 'GCA_GCA90', 'Tackles_TklW', 'Challenges_Tkl%',
       'Int', 'Blocks_Blocks', 'Performance_Recov', 'Take-Ons_Att',
       'Take-Ons_Succ%', 'Carries_Mis', 'Receiving_Rec', 'Receiving_PrgR',
       'Aerial Duels_Won%', 'season_code'],
      dtype='object')