In [2]:
import pandas as pd

In [19]:
# Read the dataset from CSV; the path is relative to the notebook's working directory.
# NOTE(review): presumably one row per player-season observation — confirm against the file.
df = pd.read_csv("../data/final/final.csv")

In [20]:
# Candidate model inputs, organised by theme. The flattened list keeps the
# themes in the order written here.
FEATURE_GROUPS = {
    "identifiers / grouping": [
        "league", "general_position", "best_position", "season",
    ],
    "demographics & physical": [
        "age", "height(cm)", "weight(kg)", "foot",
        "skill_moves", "international_reputation",
    ],
    "playing time": [
        "Playing_Time_Min", "Playing_Time_90s", "Starts_Starts",
    ],
    "attacking output (per 90)": [
        "Per_90_Minutes_Gls", "Per_90_Minutes_Ast", "Per_90_Minutes_G+A",
        "Per_90_Minutes_xG", "Per_90_Minutes_xAG", "Per_90_Minutes_npxG",
    ],
    "shooting efficiency": [
        "Standard_SoT%", "Standard_G/Sh", "Expected_npxG/Sh", "Expected_G-xG",
    ],
    "passing & creativity": [
        "KP", "Ast", "Total_PrgDist", "Progression_PrgP",
        "SCA_SCA90", "GCA_GCA90",
    ],
    "carrying": [
        "Carries_PrgDist", "Carries_1/3", "Take-Ons_Succ",
    ],
    "defensive contribution": [
        "Tkl+Int", "Int", "Blocks_Blocks", "Aerial_Duels_Won",
    ],
    "discipline / reliability": [
        "Performance_CrdY", "Performance_CrdR", "Err",
    ],
    "team context": [
        "Team_Success_PPM", "Team_Success_+/-90", "Team_Success_(xG)_xG+/-90",
    ],
}
selected_features = [
    column for group in FEATURE_GROUPS.values() for column in group
]

# Metrics the model is asked to predict, organised the same way.
# Note: "Standard_Sh/90", "Progression_PrgC", "Touches_Touches" and
# "Receiving_Rec" appear here but not in selected_features.
TARGET_GROUPS = {
    "scoring threat (finishing quality & volume)": [
        "Per_90_Minutes_npxG", "Per_90_Minutes_xG", "Standard_Sh/90",
    ],
    "creativity & chance creation": [
        "Per_90_Minutes_xAG", "KP", "SCA_SCA90",
    ],
    "ball progression & buildup value": [
        "Progression_PrgP", "Progression_PrgC", "Carries_PrgDist",
    ],
    "defensive contribution": [
        "Tkl+Int", "Blocks_Blocks", "Aerial_Duels_Won",
    ],
    "involvement / usage": [
        "Touches_Touches", "Receiving_Rec",
    ],
}
TARGETS = [column for group in TARGET_GROUPS.values() for column in group]


In [21]:
def create_lagged_dataset(
    df,
    target_cols,
    lag_years=4,
    static_features=None,
    id_cols=None,
    predict_next_season=True,
    test_season=None,
    min_seasons=None,
    require_consecutive=False
):
    """
    Transform time-series player data into lagged features for ML training.

    Rows are sorted by ``id_cols`` and grouped by the ``'player'`` column.
    Lag ``k`` of a feature is the value found ``k`` rows earlier for the same
    player, so by default a lag is "k observations ago", which equals
    "k seasons ago" only when the player has exactly one row for every
    consecutive season. Pass ``require_consecutive=True`` to enforce that.

    Parameters:
    -----------
    df : pd.DataFrame
        Raw dataframe with player-season observations. Not modified.
    target_cols : list
        Target variables to predict (will NOT be lagged, so their history is
        not part of the feature set). Any sequence of column names, or a
        single column name, is accepted.
    lag_years : int, default=4
        Number of historical seasons to create lags for
    static_features : list, default=['age', 'foot', 'height(cm)', 'general_position', 'best_position', 'league']
        Features that don't get lagged (current season only)
    id_cols : list, default=['player', 'season']
        Identifier columns (not used as features). Also the sort order.
    predict_next_season : bool, default=True
        If True, targets are from season t+1 (forward prediction)
        If False, targets are from season t (current season estimation)
    test_season : int, optional
        If provided, creates train/test split with this season as test.
        The split uses each row's own ``season`` value (season t). With
        ``predict_next_season=True`` the test rows therefore carry targets
        observed in the following row of that player. Rows with
        ``season > test_season`` end up in neither split.
    min_seasons : int, optional
        Minimum number of rows required per player (filters before lagging).
        Rows are counted as they are, so duplicated player-season rows count
        more than once.
    require_consecutive : bool, default=False
        If True, a lagged value (or next-season target) is blanked whenever
        the row it was taken from is not exactly the expected number of
        seasons away, so those samples are removed by the final ``dropna``.
        Requires a numeric ``'season'`` column. The default keeps the purely
        row-based behaviour.

    Returns:
    --------
    If test_season is None:
        df_lagged : pd.DataFrame with lagged features + targets
    If test_season is provided:
        (df_train, df_test) : tuple of train and test DataFrames

    Raises:
    -------
    KeyError
        If any of ``id_cols``, ``static_features`` or ``target_cols`` is not
        a column of ``df``.
    """
    import pandas as pd

    def _as_list(cols):
        # A bare string is one column name, not a sequence of characters.
        return [cols] if isinstance(cols, str) else list(cols)

    # Defaults
    if static_features is None:
        static_features = ['age', 'foot', 'height(cm)', 'general_position', 'best_position', 'league']
    if id_cols is None:
        id_cols = ['player', 'season']

    # Normalise to lists so tuples / Index objects can be concatenated and
    # used for column selection below.
    target_cols = _as_list(target_cols)
    static_features = _as_list(static_features)
    id_cols = _as_list(id_cols)

    # Fail early, with every missing name listed at once.
    requested_cols = id_cols + static_features + target_cols
    missing_cols = [c for c in dict.fromkeys(requested_cols) if c not in df.columns]
    if missing_cols:
        raise KeyError(f"Columns not found in df: {missing_cols}")

    # Filter by minimum seasons if specified
    if min_seasons is not None:
        player_counts = df['player'].value_counts()
        players_to_keep = player_counts[player_counts >= min_seasons].index
        df = df[df['player'].isin(players_to_keep)].copy()

    # Sort by player and season
    df_base = df.sort_values(id_cols).reset_index(drop=True)

    # Identify dynamic features (exclude IDs, static, and targets)
    excluded_cols = set(requested_cols)
    lag_features = [c for c in df_base.columns if c not in excluded_cols]

    by_player = df_base.groupby('player')

    def _blank_season_gaps(frame, offset):
        # `frame` holds values shifted by `offset` rows within each player.
        # Blank the rows whose source row is not exactly `offset` seasons away.
        source_season = by_player['season'].shift(offset)
        gap = (
            source_season.notna()
            & ((df_base['season'] - source_season) != offset)
        ).to_numpy()
        if gap.any() and frame.shape[1] > 0:
            frame.loc[gap, :] = float('nan')
        return frame

    # Create lagged features for dynamic variables
    lagged_dfs = []
    for lag in range(1, lag_years + 1):
        shifted = by_player[lag_features].shift(lag).add_suffix(f'_lag{lag}')
        if require_consecutive:
            shifted = _blank_season_gaps(shifted, lag)
        lagged_dfs.append(shifted)

    # Handle targets based on prediction type
    if predict_next_season:
        # Shift targets backward (future values become current row's target)
        targets = by_player[target_cols].shift(-1)  # Get next season's values
        if require_consecutive:
            targets = _blank_season_gaps(targets, -1)
        print(f"‚ö†Ô∏è  Predicting NEXT season's performance (t+1)")
        print(f"    Features: seasons t, t-1, t-2, ... t-{lag_years}")
        print(f"    Targets: season t+1")
    else:
        # Keep targets as current season
        targets = df_base[target_cols].copy()
        print(f"‚ö†Ô∏è  Predicting CURRENT season's performance (t)")
        print(f"    Features: seasons t-1, t-2, t-3, ... t-{lag_years}")
        print(f"    Targets: season t")

    # Assemble final dataset
    df_lagged = pd.concat(
        [
            df_base[id_cols + static_features],  # IDs + current static features
            *lagged_dfs,  # Historical lagged features
            targets  # Target variables
        ],
        axis=1
    )

    # Drop rows with incomplete lag history or missing targets
    # (this also drops rows with a missing value in any other column)
    df_lagged = df_lagged.dropna().reset_index(drop=True)

    print(f"‚úÖ Final dataset: {len(df_lagged)} samples, {len(df_lagged.columns)} columns")

    # Create train/test split if test_season specified
    if test_season is not None:
        df_train = df_lagged[df_lagged['season'] < test_season].copy()
        df_test = df_lagged[df_lagged['season'] == test_season].copy()

        print(f"\nüìä Train: {len(df_train)} samples (seasons < {test_season})")
        print(f"üìä Test:  {len(df_test)} samples (season == {test_season})")

        return df_train, df_test

    return df_lagged

In [23]:
# Build the 3-lag dataset with next-season targets (the function's default mode).
df_lagged_3 = create_lagged_dataset(
    df,
    target_cols=TARGETS,
    lag_years=3,
    static_features=["age", "foot", "height(cm)", "general_position", "best_position", "league"],
    id_cols=["player", "season"],
)

‚ö†Ô∏è  Predicting NEXT season's performance (t+1)
    Features: seasons t, t-1, t-2, ... t-3
    Targets: season t+1
‚úÖ Final dataset: 2830 samples, 628 columns


In [24]:
# Preview the 3-lag dataset (the notebook display abbreviates long/wide frames).
df_lagged_3

Unnamed: 0,player,season,age,foot,height(cm),general_position,best_position,league,team_lag1,nation_lag1,...,KP,SCA_SCA90,Progression_PrgP,Progression_PrgC,Carries_PrgDist,Tkl+Int,Blocks_Blocks,Aerial_Duels_Won,Touches_Touches,Receiving_Rec
0,a. bamba,2020,29,2,182,DF,RB,FRA-Ligue 1,Angers,CIV,...,37.0,4.54,58.0,82.0,2564.0,35.0,8.0,10.0,945.0,705.0
1,a. bamba,2020,29,2,182,FW,RB,FRA-Ligue 1,Angers,CIV,...,3.0,0.97,15.0,9.0,258.0,14.0,4.0,3.0,321.0,177.0
2,a. bamba,2020,25,2,176,DF,RM,FRA-Ligue 1,Nantes,FRA,...,37.0,4.54,58.0,82.0,2564.0,35.0,8.0,10.0,945.0,705.0
3,a. bamba,2020,25,2,176,FW,RM,FRA-Ligue 1,Angers,CIV,...,8.0,1.40,53.0,19.0,818.0,46.0,7.0,10.0,993.0,553.0
4,a. bamba,2021,30,2,182,DF,RB,FRA-Ligue 1,Nantes,FRA,...,23.0,4.42,43.0,41.0,1171.0,25.0,10.0,4.0,619.0,450.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2825,√°. correa,2024,28,2,171,FW,RW,ESP-La Liga,Atl√©tico Madrid,ARG,...,14.0,3.92,25.0,30.0,768.0,13.0,10.0,1.0,419.0,303.0
2826,√ß. s√∂y√ºnc√º,2021,24,2,185,DF,CB,ENG-Premier League,Leicester City,TUR,...,7.0,0.54,90.0,27.0,3440.0,55.0,12.0,61.0,1609.0,1138.0
2827,√ß. s√∂y√ºnc√º,2021,24,2,185,DF,CB,ENG-Premier League,Leicester City,TUR,...,1.0,0.34,12.0,5.0,601.0,21.0,9.0,24.0,382.0,203.0
2828,√≥. duarte,2021,31,2,186,DF,CB,ESP-La Liga,Levante,CRC,...,4.0,0.57,52.0,6.0,1818.0,58.0,23.0,71.0,1446.0,920.0


In [18]:
# NOTE(review): this cell's execution count (18) is lower than that of the cell
# building df_lagged_3 (23), and the recorded row count differs from that run —
# the output below appears to come from an earlier version; re-run to refresh.
df_lagged_3

Unnamed: 0,player,season,age,foot,height(cm),general_position,best_position,league,team,nation,...,Team_Success_(xG)_xG+/-_lag3,Team_Success_(xG)_xG+/-90_lag3,Team_Success_(xG)_On-Off_lag3,Performance_Recov_lag3,Aerial_Duels_Won_lag3,Aerial_Duels_Lost_lag3,weight(kg)_lag3,value(‚Ç¨)_lag3,wage(‚Ç¨)_lag3,release_clause(‚Ç¨)_lag3
0,a. ayew,2025,34,1,176,FW,ST,FRA-Ligue 1,Le Havre,GHA,...,-7.2,-0.68,0.04,41.0,28.0,48.0,72.0,11000000.0,70000.0,20900000.0
1,a. bamba,2020,29,2,182,DF,RB,FRA-Ligue 1,Angers,CIV,...,-0.7,-0.04,-0.16,81.0,17.0,13.0,72.0,2000000.0,15000.0,3900000.0
2,a. bamba,2020,29,2,182,FW,RB,FRA-Ligue 1,Nantes,FRA,...,0.9,0.03,-0.17,139.0,22.0,28.0,72.0,3200000.0,20000.0,6300000.0
3,a. bamba,2020,25,2,176,DF,RM,FRA-Ligue 1,Angers,CIV,...,0.9,0.03,-0.17,139.0,22.0,28.0,68.0,400000.0,4000.0,790000.0
4,a. bamba,2020,25,2,176,FW,RM,FRA-Ligue 1,Nantes,FRA,...,0.5,0.08,0.06,32.0,3.0,8.0,72.0,3300000.0,18000.0,6600000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4191,√≥. duarte,2021,31,2,186,DF,CB,ESP-La Liga,Levante,CRC,...,-1.4,-0.13,0.23,33.0,32.0,14.0,81.0,6000000.0,30000.0,12600000.0
4192,√≥. duarte,2021,31,2,186,DF,CB,ESP-La Liga,Levante,CRC,...,-3.2,-0.44,-0.39,35.0,23.0,10.0,81.0,6000000.0,30000.0,12600000.0
4193,√≥. trejo,2025,36,2,180,MF,CAM,ESP-La Liga,Rayo Vallecano,ARG,...,-3.0,-0.16,0.29,121.0,28.0,33.0,79.0,7000000.0,20000.0,15400000.0
4194,√∂. toprak,2021,30,2,186,DF,CB,GER-Bundesliga,Werder Bremen,TUR,...,27.8,1.24,1.59,145.0,78.0,34.0,83.0,21000000.0,70000.0,37300000.0


### Usage Examples

```python
# Example 1: Predict NEXT season (most common for scouting)
df_lagged = create_lagged_dataset(
    df,
    target_cols=TARGETS,
    lag_years=3,
    predict_next_season=True,  # Use seasons t-1, t-2, t-3 to predict t+1
    min_seasons=7
)

# Example 2: Predict CURRENT season (retrodiction/estimation)
df_lagged = create_lagged_dataset(
    df,
    target_cols=TARGETS,
    lag_years=3,
    predict_next_season=False,  # Use seasons t-1, t-2, t-3 to predict t
    min_seasons=7
)

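# Example 3: With train/test split (rows whose own season is 2024 form the test set;
# with predict_next_season=True their targets are the following season's values)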
df_train, df_test = create_lagged_dataset(
    df,
    target_cols=TARGETS,
    lag_years=3,
    test_season=2024,
    predict_next_season=True,
    min_seasons=7
)

# After getting the dataset, separate features and targets:
X_train = df_train.drop(columns=['player', 'season'] + TARGETS)
y_train = df_train[TARGETS]
```

In [None]:
# Step 8: Save model and predictions
from pathlib import Path

import joblib

MODEL_PATH = '../models/main/artifacts/player_predictor.pkl'
PREPROCESSOR_PATH = '../models/preprocessing/artifacts/preprocessor.pkl'
PREDICTIONS_PATH = '../outputs/predictions/player_predictions_2024.csv'

# Writing fails if the target directory is missing, so create it first.
for output_path in (MODEL_PATH, PREPROCESSOR_PATH, PREDICTIONS_PATH):
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

# Save model and preprocessor
joblib.dump(model, MODEL_PATH)
joblib.dump(preprocessor, PREPROCESSOR_PATH)

# Save predictions with player names: one "<target>_pred" / "<target>_actual"
# column pair per target, in TARGETS order (column i of y_test_pred is TARGETS[i]).
predictions_df = df_test[['player', 'season']].copy()
for i, target in enumerate(TARGETS):
    predictions_df[f'{target}_pred'] = y_test_pred[:, i]
    predictions_df[f'{target}_actual'] = y_test[target].values

predictions_df.to_csv(PREDICTIONS_PATH, index=False)
print("‚úÖ Model and predictions saved!")

In [None]:
# Step 7: Feature importance (only for estimators exposing feature_importances_)
if hasattr(model, 'feature_importances_'):
    # Rebuild the processed column names in the transformer order used by the
    # preprocessor: numerical columns first, then the one-hot encoded columns.
    fitted_encoder = preprocessor.named_transformers_['cat']
    cat_features = fitted_encoder.get_feature_names_out(categorical_features)
    all_feature_names = [*numerical_features, *cat_features]

    # Rank features from most to least important
    importance_table = {
        'feature': all_feature_names,
        'importance': model.feature_importances_,
    }
    importance_df = pd.DataFrame(importance_table).sort_values(
        by='importance', ascending=False
    )

    print("\nTop 20 Most Important Features:")
    print(importance_df.head(20))

In [None]:
# Step 6: Evaluate
y_train_pred = model.predict(X_train_processed)
y_test_pred = model.predict(X_test_processed)

heavy_rule = "=" * 80
light_rule = "-" * 80

# Per-target metrics
print("\n" + heavy_rule)
print("EVALUATION RESULTS")
print(heavy_rule)
print(f"\n{'Target':<30} {'Train MAE':<12} {'Test MAE':<12} {'Train R¬≤':<12} {'Test R¬≤':<12}")
print(light_rule)

# Column i of the prediction arrays corresponds to TARGETS[i]; the MAEs are
# kept so the overall average below reuses them.
train_maes = []
test_maes = []
for i, target in enumerate(TARGETS):
    train_actual, train_fitted = y_train.iloc[:, i], y_train_pred[:, i]
    test_actual, test_fitted = y_test.iloc[:, i], y_test_pred[:, i]

    train_mae = mean_absolute_error(train_actual, train_fitted)
    test_mae = mean_absolute_error(test_actual, test_fitted)
    train_r2 = r2_score(train_actual, train_fitted)
    test_r2 = r2_score(test_actual, test_fitted)

    train_maes.append(train_mae)
    test_maes.append(test_mae)

    print(f"{target:<30} {train_mae:<12.4f} {test_mae:<12.4f} {train_r2:<12.4f} {test_r2:<12.4f}")

# Overall metrics: unweighted mean of the per-target MAEs
overall_train_mae = np.mean(train_maes)
overall_test_mae = np.mean(test_maes)

print(light_rule)
print(f"{'OVERALL AVERAGE':<30} {overall_train_mae:<12.4f} {overall_test_mae:<12.4f}")
print(heavy_rule)

In [None]:
# Step 5: Train the model
# Fits the estimator chosen in Step 4 on the preprocessed training matrix;
# y_train holds one column per entry of TARGETS, so all targets are fit together.
print("Training model...")
model.fit(X_train_processed, y_train)
print("‚úÖ Training complete!")

In [None]:
# Step 4: Model selection - Multi-output regression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import numpy as np

# Option 1: Random Forest (handles multi-output natively)
rf_params = dict(
    n_estimators=200,
    max_depth=15,
    min_samples_split=10,
    min_samples_leaf=4,
    random_state=42,
    n_jobs=-1,
)
model_rf = RandomForestRegressor(**rf_params)

# Option 2: XGBoost (wrap in MultiOutputRegressor)
xgb_params = dict(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.05,
    random_state=42,
)
model_xgb = MultiOutputRegressor(XGBRegressor(**xgb_params))

# Choose model
model = model_rf  # or model_xgb

In [None]:
# Step 3: Preprocessing pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Identify categorical vs numerical features
base_categorical = ['foot', 'general_position', 'best_position', 'league']

# Any other non-numeric column (for example lagged text columns such as
# team_lag1 / nation_lag1) cannot be scaled, so it is one-hot encoded too
# instead of being sent to StandardScaler, which would fail on strings.
extra_categorical = [
    col for col in X_train.select_dtypes(exclude='number').columns
    if col not in base_categorical
]
categorical_features = base_categorical + extra_categorical
numerical_features = [col for col in X_train.columns if col not in categorical_features]

# Create preprocessing pipeline: scale numeric columns, one-hot encode the rest.
# Categories unseen during fit are encoded as all zeros at transform time.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ]
)

# Fit on the training split only, then apply the fitted transform to the test split
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

print(f"Processed features: {X_train_processed.shape[1]}")

In [None]:
# Step 1: Create train/test split
df_train, df_test = create_lagged_dataset(
    df,
    target_cols=TARGETS,
    lag_years=3,
    predict_next_season=True,
    min_seasons=7,
    test_season=2024,  # Use 2024 as test
)

# Step 2: Separate features and targets
# Identifier and target columns are never model inputs.
non_feature_cols = ['player', 'season'] + TARGETS

X_train, y_train = df_train.drop(columns=non_feature_cols), df_train[TARGETS]
X_test, y_test = df_test.drop(columns=non_feature_cols), df_test[TARGETS]

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"Targets: {y_train.shape[1]} metrics")

## Training Pipeline

In [5]:
# Keep player-seasons with at least five 90-minute equivalents played.
df = df[df["Playing_Time_90s"] >= 5]

# Drop duplicate (player, season) rows BEFORE counting seasons, so duplicates
# cannot inflate a player's season count. The first occurrence is kept.
# Reassigning (rather than inplace=True on a filtered slice) avoids pandas'
# SettingWithCopyWarning.
df = df.drop_duplicates(subset=['player', 'season'])

# Keep players with more than 6 remaining seasons.
# NOTE(review): presumably 6 mirrors the lag depth used by the lagging cell — confirm.
player_counts = df['player'].value_counts()
players_to_keep = player_counts[player_counts > 6].index
df = df[df['player'].isin(players_to_keep)]

In [6]:
# Keep only the player key plus the curated columns listed in selected_features.
# NOTE(review): "Standard_Sh/90", "Progression_PrgC", "Touches_Touches" and
# "Receiving_Rec" are in TARGETS but not in selected_features, so they are
# dropped here — confirm this is intended before using TARGETS on this frame.
df = df[['player'] + selected_features]

In [7]:
# Configuration
lag_years = 6
ID_COLS = ['player', 'season']
STATIC_FEATURES = ['age', 'foot', 'height(cm)', 'general_position', 'best_position', 'league']  # age kept as current-season only

# Sort once so that "k rows earlier within a player" means "k observations ago"
df_base = (
    df
    .sort_values(ID_COLS)
    .reset_index(drop=True)
)

# Dynamic features = everything except IDs + static
lag_features = [
    c for c in df_base.columns
    if c not in ID_COLS + STATIC_FEATURES
]

# Create lagged features (ONLY dynamic ones); the first `lag` rows of each
# player have no history and are left as NaN
players = df_base.groupby('player')
lagged = [
    players[lag_features].shift(lag).add_suffix(f'_lag{lag}')
    for lag in range(1, lag_years + 1)
]

# Assemble dataset
df_lagged = pd.concat(
    [
        df_base[ID_COLS + STATIC_FEATURES],  # current-season static
        *lagged
    ],
    axis=1
)

# Collapse to one row per player (most recent season).
# groupby(...).last() returns the last NON-NULL value of each column, which can
# stitch values from different seasons into one row; selecting each player's
# actual last row keeps the row intact. df_lagged is already ordered by player
# and season, so the result stays sorted by player.
df_player_wide = (
    df_lagged
    .drop_duplicates(subset='player', keep='last')
    .drop(columns='season')
    .reset_index(drop=True)
)


In [8]:
# Keep only rows with a complete lag history (no missing value in any column),
# renumbering the index from zero.
df_lagged = df_lagged.dropna(axis=0).reset_index(drop=True)

In [10]:
# List every column of the lagged frame to inspect the generated "<feature>_lag<k>" names.
df_lagged.columns.tolist()

['player',
 'season',
 'age',
 'foot',
 'height(cm)',
 'general_position',
 'best_position',
 'league',
 'weight(kg)_lag1',
 'skill_moves_lag1',
 'international_reputation_lag1',
 'Playing_Time_Min_lag1',
 'Playing_Time_90s_lag1',
 'Starts_Starts_lag1',
 'Per_90_Minutes_Gls_lag1',
 'Per_90_Minutes_Ast_lag1',
 'Per_90_Minutes_G+A_lag1',
 'Per_90_Minutes_xG_lag1',
 'Per_90_Minutes_xAG_lag1',
 'Per_90_Minutes_npxG_lag1',
 'Standard_SoT%_lag1',
 'Standard_G/Sh_lag1',
 'Expected_npxG/Sh_lag1',
 'Expected_G-xG_lag1',
 'KP_lag1',
 'Ast_lag1',
 'Total_PrgDist_lag1',
 'Progression_PrgP_lag1',
 'SCA_SCA90_lag1',
 'GCA_GCA90_lag1',
 'Carries_PrgDist_lag1',
 'Carries_1/3_lag1',
 'Take-Ons_Succ_lag1',
 'Tkl+Int_lag1',
 'Int_lag1',
 'Blocks_Blocks_lag1',
 'Aerial_Duels_Won_lag1',
 'Performance_CrdY_lag1',
 'Performance_CrdR_lag1',
 'Err_lag1',
 'Team_Success_PPM_lag1',
 'Team_Success_+/-90_lag1',
 'Team_Success_(xG)_xG+/-90_lag1',
 'weight(kg)_lag2',
 'skill_moves_lag2',
 'international_reputation_l