## Download Data

In [1]:
%pip install soccerdata

Collecting soccerdata
  Downloading soccerdata-1.8.8-py3-none-any.whl.metadata (5.4 kB)
Collecting html5lib<2.0.0,>=1.1 (from soccerdata)
  Downloading html5lib-1.1-py2.py3-none-any.whl.metadata (16 kB)
Collecting lxml>=4.9.4 (from soccerdata)
  Downloading lxml-6.0.2-cp312-cp312-win_amd64.whl.metadata (3.7 kB)
Collecting seleniumbase<5.0.0,>=4.38.2 (from soccerdata)
  Downloading seleniumbase-4.46.4-py3-none-any.whl.metadata (88 kB)
Collecting unidecode<2.0.0,>=1.4.0 (from soccerdata)
  Downloading Unidecode-1.4.0-py3-none-any.whl.metadata (13 kB)
Collecting urllib3<2 (from soccerdata)
  Downloading urllib3-1.26.20-py2.py3-none-any.whl.metadata (50 kB)
Collecting wrapper-tls-requests<2.0.0,>=1.1.4 (from soccerdata)
  Downloading wrapper_tls_requests-1.2.4-py3-none-any.whl.metadata (8.6 kB)
Collecting webencodings (from html5lib<2.0.0,>=1.1->soccerdata)
  Downloading webencodings-0.5.1-py2.py3-none-any.whl.metadata (2.1 kB)
Collecting pip>=26.0 (from seleniumbase<5.0.0,>=4.38.2->soccer

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain-core 0.3.68 requires packaging<25,>=23.2, but you have packaging 26.0 which is incompatible.
streamlit 1.46.1 requires packaging<26,>=20, but you have packaging 26.0 which is incompatible.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.26.4 which is incompatible.

[notice] A new release of pip is available: 25.1.1 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
# Library imports
import os
import pandas as pd
import soccerdata as sd

In [None]:
# --- Configuration: every value a reader might want to tune ---

# Competition and season requested from FBref
LEAGUE = "ENG-Premier League"
SEASON = "2024"  # 2024/25 season in FBref notation

# Understat CSV exports, expected next to this notebook
UNDERSTAT_TEAM_CSV = "./understat_matches_team.csv"
UNDERSTAT_ROSTER_CSV = "./understat_matches_roster.csv"

# Output folder for every CSV written by this notebook; created if missing
DATA_DIR = "./data"
os.makedirs(DATA_DIR, exist_ok=True)

In [None]:
# Load FBref data via soccerdata
print("Loading FBref data from soccerdata...")

fbref = sd.FBref(LEAGUE, SEASON)

# soccerdata's FBref readers return frames whose *index* holds the identifying
# keys (league and season, plus game / team / player depending on the table).
# Exporting with index=False would drop those keys and leave stat rows that
# cannot be attributed to a team or player, so the index is written out.
# When reading the files back, pass index_col (and header for multi-level
# columns) to pd.read_csv to restore the structure.

# Schedule / results
schedule = fbref.read_schedule()
schedule.to_csv(os.path.join(DATA_DIR, "schedule.csv"))

# Team season stats (basic)
team_stats = fbref.read_team_season_stats(stat_type="standard")
team_stats.to_csv(os.path.join(DATA_DIR, "team_stats_standard.csv"))

# Player season stats (basic)
player_stats = fbref.read_player_season_stats(stat_type="standard")
player_stats.to_csv(os.path.join(DATA_DIR, "player_stats_standard.csv"))

print("FBref data saved: schedule.csv, team_stats_standard.csv, player_stats_standard.csv")

In [None]:
# Load Understat Match Team Dataset
print("Loading Understat match team dataset...")
team_df = pd.read_csv(UNDERSTAT_TEAM_CSV)

# Basic cleaning: parse dates, then order chronologically.
# A stable sort keeps same-day matches in file order, so re-running the
# notebook always produces the same row order; no in-place mutation.
team_df['date'] = pd.to_datetime(team_df['date'])
team_df = team_df.sort_values('date', kind='stable')

# Derived column: goal difference per match
team_df['goal_diff'] = team_df['goals_for'] - team_df['goals_against']

# Derived column: league points per match.
# The result code is normalised (trimmed, upper-cased) before mapping so that
# variants such as 'w' or ' W ' still score. Anything else still becomes NaN,
# but is reported below instead of passing silently.
RESULT_POINTS = {'W': 3, 'D': 1, 'L': 0}
result_code = team_df['result'].astype(str).str.strip().str.upper()
team_df['points'] = result_code.map(RESULT_POINTS)

unmapped_results = int(team_df['points'].isna().sum())
if unmapped_results:
    print(f"Warning: {unmapped_results} row(s) have an unrecognised 'result' value; points set to NaN")

# Save cleaned team dataset
team_df.to_csv(os.path.join(DATA_DIR, "team_df_clean.csv"), index=False)

print("Understat team data saved: team_df_clean.csv")

In [None]:
# Load Understat Match Roster Dataset (Player stats)
print("Loading Understat match roster dataset...")

# Read the CSV, parse the match date, order rows chronologically and add the
# derived column in a single chain.
player_df = (
    pd.read_csv(UNDERSTAT_ROSTER_CSV)
    .assign(date=lambda roster: pd.to_datetime(roster['date']))
    .sort_values(['date'])
    # minutes_pct is minutes divided by 90, i.e. a fraction of a 90-minute
    # match (1.0 == 90 minutes), not a 0-100 percentage.
    .assign(minutes_pct=lambda roster: roster['minutes'] / 90)
)

# Save cleaned player dataset
clean_player_path = os.path.join(DATA_DIR, "player_df_clean.csv")
player_df.to_csv(clean_player_path, index=False)

print("Understat player data saved: player_df_clean.csv")

In [None]:
# Compute Rolling Features (last N matches)
def compute_rolling_features(df, group_col, target_cols, window=5,
                             date_col='date', min_periods=1):
    """
    Compute rolling-mean features over the last N rows per team/player.

    Rows are ordered by ``date_col``. For every column in ``target_cols`` a new
    column named ``<col>_rolling<window>`` is added, holding the mean of that
    column over the current row and up to ``window - 1`` preceding rows of the
    same ``group_col`` value.

    NOTE: the window includes the current row. If the features are used to
    predict the outcome of that same match, shift them by one row per group
    first to avoid leaking the target into the inputs.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    group_col : str
        Column identifying the entity (team, player, ...) to roll within.
    target_cols : iterable of str
        Columns to average.
    window : int, default 5
        Maximum number of rows in each rolling window.
    date_col : str, default 'date'
        Column used to order the rows chronologically.
    min_periods : int, default 1
        Minimum number of observations required to emit a value; with the
        default, the first rows of each group use a shorter window.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by ``date_col`` (original index labels preserved)
        with one additional rolling column per entry of ``target_cols``.
    """
    # sort_values returns a new frame, so the caller's df stays untouched.
    # A stable sort keeps same-date rows in input order -> deterministic output.
    ordered = df.sort_values(date_col, kind='stable')

    # The grouped rolling result is aligned back onto the frame by index label.
    # That fails when labels repeat (e.g. after pd.concat of several files),
    # so compute on a unique positional index and restore the labels afterwards.
    original_index = ordered.index
    df_rolled = ordered.reset_index(drop=True)

    for col in target_cols:
        rolled = (
            df_rolled.groupby(group_col)[col]
            .rolling(window, min_periods=min_periods)
            .mean()
            # drop the group level so only the positional row label remains
            .reset_index(level=0, drop=True)
        )
        df_rolled[f'{col}_rolling{window}'] = rolled

    df_rolled.index = original_index
    return df_rolled

# Columns that get a 5-match rolling mean, per dataset
team_features = ['goals_for', 'goals_against', 'xG', 'xGA', 'shots', 'shots_on_target']
player_features = ['xG', 'xA', 'shots', 'key_passes', 'minutes']

# Team rolling features: one rolling series per team_name
team_df_rolled = compute_rolling_features(
    df=team_df,
    group_col='team_name',
    target_cols=team_features,
    window=5,
)
team_rolled_path = os.path.join(DATA_DIR, "team_df_rolled.csv")
team_df_rolled.to_csv(team_rolled_path, index=False)

# Player rolling features: one rolling series per player_name
# NOTE(review): grouping by name merges distinct players who share a name;
# confirm whether the roster CSV offers a unique player id to group on instead.
player_df_rolled = compute_rolling_features(
    df=player_df,
    group_col='player_name',
    target_cols=player_features,
    window=5,
)
player_rolled_path = os.path.join(DATA_DIR, "player_df_rolled.csv")
player_df_rolled.to_csv(player_rolled_path, index=False)

print("Rolling features computed and saved: team_df_rolled.csv, player_df_rolled.csv")

In [None]:
# Sanity check - print the first rows of every frame built in this notebook
previews = [
    ("\nPreview: schedule", schedule),
    ("\nPreview: team_stats", team_stats),
    ("\nPreview: player_stats", player_stats),
    ("\nPreview: team_df_clean", team_df),
    ("\nPreview: player_df_clean", player_df),
    ("\nPreview: team_df_rolled", team_df_rolled),
    ("\nPreview: player_df_rolled", player_df_rolled),
]

# Heading first, then the head of the corresponding frame, in the listed order
for heading, frame in previews:
    print(heading)
    print(frame.head())

print("\nData loading complete. All cleaned and rolled datasets saved in ./data/")