# Feature Engineering for Team and Player Analysis
Generate rolling features and performance metrics for the 2023/24 season.

In [1]:
import os
import pandas as pd
import numpy as np
from datetime import datetime

DATA_DIR = "./data"

print("üîß Feature Engineering Pipeline")
print("="*60)

üîß Feature Engineering Pipeline


## 1. Load 2023/24 Season Data

In [2]:
# Load team and player match data
print("\nüìÇ Loading data...")
team_matches = pd.read_csv(os.path.join(DATA_DIR, "team_matches_2023.csv"))
team_matches['date'] = pd.to_datetime(team_matches['date'])

player_matches = pd.read_csv(os.path.join(DATA_DIR, "player_matches_2023.csv"))
player_matches['date'] = pd.to_datetime(player_matches['date'])

print(f"‚úì Team matches: {len(team_matches)}")
print(f"‚úì Player matches: {len(player_matches)}")


üìÇ Loading data...
‚úì Team matches: 760
‚úì Player matches: 11384


## 2. Team Feature Engineering
Create rolling form features and performance metrics.

In [3]:
print("\n" + "="*60)
print("TEAM FEATURE ENGINEERING")
print("="*60)

# Sort by team and date
team_matches = team_matches.sort_values(['team_name', 'date']).reset_index(drop=True)

# Helper function for rolling features
def add_rolling_features(df, team_col='team_name', windows=[3, 5, 10]):
    """Add rolling features for each team."""
    result = df.copy()
    
    for window in windows:
        print(f"\n  Computing {window}-match rolling features...")
        
        # Create binary win column for rolling calculations
        result['_is_win'] = (result['result'] == 'W').astype(int)
        result['_is_draw'] = (result['result'] == 'D').astype(int)
        
        # Rolling features by team
        rolling_cols = {
            f'goals_for_L{window}': result.groupby(team_col)['goals_for'].transform(lambda x: x.rolling(window, min_periods=1).mean()),
            f'goals_against_L{window}': result.groupby(team_col)['goals_against'].transform(lambda x: x.rolling(window, min_periods=1).mean()),
            f'xG_L{window}': result.groupby(team_col)['xG'].transform(lambda x: x.rolling(window, min_periods=1).mean()),
            f'xGA_L{window}': result.groupby(team_col)['xGA'].transform(lambda x: x.rolling(window, min_periods=1).mean()),
            f'shots_L{window}': result.groupby(team_col)['shots'].transform(lambda x: x.rolling(window, min_periods=1).mean()),
            f'ppg_L{window}': result.groupby(team_col)['points'].transform(lambda x: x.rolling(window, min_periods=1).mean()),
            f'win_rate_L{window}': result.groupby(team_col)['_is_win'].transform(lambda x: x.rolling(window, min_periods=1).mean()),
        }
        
        for col_name, values in rolling_cols.items():
            result[col_name] = values
    
    # Drop temporary columns
    result = result.drop(columns=['_is_win', '_is_draw'])
    
    return result

# Add rolling features
team_features = add_rolling_features(team_matches)

# Add derived metrics
print("\n  Computing performance metrics...")
team_features['xG_diff'] = team_features['xG'] - team_features['xGA']
team_features['shot_quality'] = team_features['xG'] / team_features['shots'].replace(0, np.nan)
team_features['conversion_rate'] = team_features['goals_for'] / team_features['shots'].replace(0, np.nan)
team_features['pressing_intensity'] = 1 / team_features['ppda'].replace(0, np.nan)

# Add matchweek
team_features['matchweek'] = team_features.groupby('team_name').cumcount() + 1

print(f"\n‚úì Team features created: {len(team_features.columns)} columns")

# Save
team_features.to_csv(os.path.join(DATA_DIR, "team_features_2023.csv"), index=False)
print(f"üíæ Saved: team_features_2023.csv")


TEAM FEATURE ENGINEERING

  Computing 3-match rolling features...

  Computing 5-match rolling features...

  Computing 10-match rolling features...

  Computing performance metrics...

‚úì Team features created: 48 columns
üíæ Saved: team_features_2023.csv


## 3. Player Feature Engineering
Create per-90 metrics and cumulative season stats.

In [4]:
print("\n" + "="*60)
print("PLAYER FEATURE ENGINEERING")
print("="*60)

# Sort by player and date
player_matches = player_matches.sort_values(['player_name', 'date']).reset_index(drop=True)

# Calculate per-90 metrics
print("\n  Computing per-90 metrics...")
player_features = player_matches.copy()

per_90_cols = ['goals', 'assists', 'xG', 'xA', 'shots', 'key_passes', 'xGChain', 'xGBuildup']
minutes_played = player_features['minutes'].replace(0, np.nan)

for col in per_90_cols:
    player_features[f'{col}_per90'] = (player_features[col] / minutes_played) * 90

# Calculate cumulative season stats
print("\n  Computing cumulative season stats...")
cumulative_cols = ['minutes', 'goals', 'assists', 'xG', 'xA', 'shots', 'key_passes']

for col in cumulative_cols:
    player_features[f'{col}_cumsum'] = player_features.groupby('player_name')[col].cumsum()

# Add appearance count
player_features['appearances'] = player_features.groupby('player_name').cumcount() + 1

# Calculate season averages (rolling)
print("\n  Computing season averages...")
for col in per_90_cols:
    p90_col = f'{col}_per90'
    player_features[f'{col}_season_avg'] = player_features.groupby('player_name')[p90_col].transform(
        lambda x: x.expanding().mean()
    )

print(f"\n‚úì Player features created: {len(player_features.columns)} columns")

# Save
player_features.to_csv(os.path.join(DATA_DIR, "player_features_2023.csv"), index=False)
print(f"üíæ Saved: player_features_2023.csv")


PLAYER FEATURE ENGINEERING

  Computing per-90 metrics...

  Computing cumulative season stats...

  Computing season averages...

‚úì Player features created: 44 columns
üíæ Saved: player_features_2023.csv


## 4. Aggregate Player Season Stats
Create season-level player profiles for current form analysis.

In [5]:
print("\n" + "="*60)
print("AGGREGATING PLAYER SEASON STATS")
print("="*60)

# Aggregate to season level
player_season_stats = player_features.groupby(['player_id', 'player_name', 'team_name', 'position']).agg({
    'minutes': 'sum',
    'goals': 'sum',
    'assists': 'sum',
    'xG': 'sum',
    'xA': 'sum',
    'shots': 'sum',
    'key_passes': 'sum',
    'xGChain': 'sum',
    'xGBuildup': 'sum',
    'yellow_card': 'sum',
    'red_card': 'sum',
    'appearances': 'max',
}).reset_index()

# Calculate per-90 stats
print("\n  Computing season per-90 metrics...")
for col in ['goals', 'assists', 'xG', 'xA', 'shots', 'key_passes', 'xGChain', 'xGBuildup']:
    player_season_stats[f'{col}_per90'] = (player_season_stats[col] / player_season_stats['minutes']) * 90

# Filter to players with minimum minutes
MIN_MINUTES = 450  # ~5 full matches
qualified_players = player_season_stats[player_season_stats['minutes'] >= MIN_MINUTES].copy()

print(f"\n‚úì Player season stats created: {len(player_season_stats)} total players")
print(f"‚úì Qualified players (>={MIN_MINUTES} mins): {len(qualified_players)}")

# Save both
player_season_stats.to_csv(os.path.join(DATA_DIR, "player_season_stats_2023.csv"), index=False)
qualified_players.to_csv(os.path.join(DATA_DIR, "player_profiles_2023.csv"), index=False)

print(f"üíæ Saved: player_season_stats_2023.csv")
print(f"üíæ Saved: player_profiles_2023.csv")


AGGREGATING PLAYER SEASON STATS

  Computing season per-90 metrics...

‚úì Player season stats created: 1528 total players
‚úì Qualified players (>=450 mins): 435
üíæ Saved: player_season_stats_2023.csv
üíæ Saved: player_profiles_2023.csv


## 5. Feature Summary
Display summary of engineered features.

In [6]:
print("\n" + "="*60)
print("FEATURE ENGINEERING SUMMARY")
print("="*60)

print("\nüìä TEAM FEATURES:")
print(f"  Total records: {len(team_features):,}")
print(f"  Features: {len(team_features.columns)}")
print(f"  Rolling windows: 3, 5, 10 matches")
print(f"  Key metrics: form, xG trends, shot quality, pressing")

print("\nüë§ PLAYER FEATURES:")
print(f"  Match-level records: {len(player_features):,}")
print(f"  Season-level profiles: {len(qualified_players):,}")
print(f"  Features: {len(player_features.columns)}")
print(f"  Metrics: per-90 stats, cumulative totals, season averages")

print("\n" + "="*60)
print("‚úÖ FEATURE ENGINEERING COMPLETE!")
print("="*60)

# Display sample
print("\nüìã Sample Team Features (latest matchweek):")
latest_mw = team_features['matchweek'].max()
sample_teams = team_features[team_features['matchweek'] == latest_mw].nlargest(5, 'ppg_L5')
print(sample_teams[['team_name', 'matchweek', 'goals_for', 'xG', 'ppg_L5', 'win_rate_L5']].to_string(index=False))

print("\nüìã Top 5 Players by xG per90:")
top_players = qualified_players.nlargest(5, 'xG_per90')
print(top_players[['player_name', 'team_name', 'position', 'minutes', 'goals', 'xG_per90']].to_string(index=False))


FEATURE ENGINEERING SUMMARY

üìä TEAM FEATURES:
  Total records: 760
  Features: 48
  Rolling windows: 3, 5, 10 matches
  Key metrics: form, xG trends, shot quality, pressing

üë§ PLAYER FEATURES:
  Match-level records: 11,384
  Season-level profiles: 435
  Features: 44
  Metrics: per-90 stats, cumulative totals, season averages

‚úÖ FEATURE ENGINEERING COMPLETE!

üìã Sample Team Features (latest matchweek):
      team_name  matchweek  goals_for       xG  ppg_L5  win_rate_L5
        Arsenal         38          2 3.211700     3.0          1.0
        Chelsea         38          2 1.597710     3.0          1.0
Manchester City         38          3 2.315040     3.0          1.0
 Crystal Palace         38          5 2.231890     2.6          0.8
        Everton         38          1 0.664904     2.0          0.6

üìã Top 5 Players by xG per90:
   player_name        team_name position  minutes  goals  xG_per90
Erling Haaland  Manchester City       FW     2540     26  1.097844
 Callum W