In [1]:
import pandas as pd 
import numpy as np

# Import Data

In [2]:
# Raw per-player, per-gameweek rows for the 2022-23 season.
merged_gw_path = "../../data/raw_data/2022-23/merged_gw.csv"
df = pd.read_csv(filepath_or_buffer=merged_gw_path)
df.head(n=5)

Unnamed: 0,name,position,team,xP,assists,bonus,bps,clean_sheets,creativity,element,...,team_h_score,threat,total_points,transfers_balance,transfers_in,transfers_out,value,was_home,yellow_cards,GW
0,Nathan Redmond,MID,Southampton,1.5,0,0,3,0,0.0,403,...,4,0.0,1,0,0,0,55,False,0,1
1,Junior Stanislas,MID,Bournemouth,1.1,0,0,3,0,0.0,58,...,2,0.0,1,0,0,0,50,True,0,1
2,Armando Broja,FWD,Chelsea,2.0,0,0,3,0,0.3,150,...,0,19.0,1,0,0,0,55,False,0,1
3,Fabian Schär,DEF,Newcastle,2.4,0,3,43,1,14.6,366,...,2,25.0,15,0,0,0,45,True,0,1
4,Jonny Evans,DEF,Leicester,1.9,0,0,15,0,1.3,249,...,2,0.0,1,0,0,0,45,True,0,1


In [3]:
# Show every column available in the raw gameweek data before a subset is selected below.
df.columns

Index(['name', 'position', 'team', 'xP', 'assists', 'bonus', 'bps',
       'clean_sheets', 'creativity', 'element', 'expected_assists',
       'expected_goal_involvements', 'expected_goals',
       'expected_goals_conceded', 'fixture', 'goals_conceded', 'goals_scored',
       'ict_index', 'influence', 'kickoff_time', 'minutes', 'opponent_team',
       'own_goals', 'penalties_missed', 'penalties_saved', 'red_cards',
       'round', 'saves', 'selected', 'starts', 'team_a_score', 'team_h_score',
       'threat', 'total_points', 'transfers_balance', 'transfers_in',
       'transfers_out', 'value', 'was_home', 'yellow_cards', 'GW'],
      dtype='object')

# Get ID columns and basic stats 

In [4]:
# Identifier columns first, then the per-game stat columns that get averaged later on.
id_columns = ["GW", "name", "position", "team"]
stat_columns = [
    "minutes",
    "goals_scored",
    "assists",
    "clean_sheets",
    "saves",
    "penalties_saved",
    "penalties_missed",
    "bonus",
    "goals_conceded",
    "yellow_cards",
    "red_cards",
    "own_goals",
    "expected_goals",
    "expected_assists",
    "expected_goal_involvements",
    "expected_goals_conceded",
]

# Column order is id columns followed by stat columns.
df_cleaned = df[id_columns + stat_columns]

df_cleaned.head(n=5)

Unnamed: 0,GW,name,position,team,minutes,goals_scored,assists,clean_sheets,saves,penalties_saved,penalties_missed,bonus,goals_conceded,yellow_cards,red_cards,own_goals,expected_goals,expected_assists,expected_goal_involvements,expected_goals_conceded
0,1,Nathan Redmond,MID,Southampton,1,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0
1,1,Junior Stanislas,MID,Bournemouth,1,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0
2,1,Armando Broja,FWD,Chelsea,15,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0
3,1,Fabian Schär,DEF,Newcastle,90,1,0,1,0,0,0,3,0,0,0,0,0.0,0.0,0.0,0.0
4,1,Jonny Evans,DEF,Leicester,90,0,0,0,0,0,0,0,2,0,0,0,0.0,0.0,0.0,0.0


# Group by rolling 5 games

In [5]:
# One range per rolling window: gameweeks [start, start + 4] for start = 1..34,
# i.e. the last window covers gameweeks 34-38.
boundary_list = [range(start, start + 5) for start in range(1, 35)]

Get rolling average player stats for each 5-gameweek period

In [6]:
# Per-game stats that are averaged over each rolling window.
stat_columns = [
    "minutes",
    "goals_scored",
    "assists",
    "clean_sheets",
    "saves",
    "penalties_saved",
    "penalties_missed",
    "bonus",
    "goals_conceded",
    "yellow_cards",
    "red_cards",
    "own_goals",
    "expected_goals",
    "expected_assists",
    "expected_goal_involvements",
    "expected_goals_conceded",
]
# "last" (after sorting by GW) keeps the club from the player's most recent row in
# the window; the previous "max" picked the alphabetically greatest club name.
agg_spec = {"team": "last", **{col: "mean" for col in stat_columns}}

df_list = []
for boundary in boundary_list:
    # The window is labelled by its final gameweek. Taking max(GW) from the data
    # mislabels players who have no row in the final gameweek of the window
    # (they would share a GW label with an earlier window and be misaligned with
    # the standing / schedule / label look-ups that key on this GW).
    window_end = boundary[-1]

    # Stable sort by GW so that "last" refers to the latest gameweek in the window.
    df_period = df_cleaned[df_cleaned["GW"].isin(boundary)].sort_values("GW", kind="stable")

    df_agg = df_period.groupby(["name", "position"]).agg(agg_spec).reset_index()
    # Keep the original column order: name, position, team, GW, <stats...>.
    df_agg.insert(3, "GW", window_end)

    df_list.append(df_agg)

# ignore_index avoids a repeated 0..n index for every window in the stacked frame.
df_grouped = pd.concat(df_list, ignore_index=True)

In [7]:
# Preview the first rows of the stacked rolling-window aggregates.
df_grouped.head(n=5)

Unnamed: 0,name,position,team,GW,minutes,goals_scored,assists,clean_sheets,saves,penalties_saved,penalties_missed,bonus,goals_conceded,yellow_cards,red_cards,own_goals,expected_goals,expected_assists,expected_goal_involvements,expected_goals_conceded
0,Aaron Cresswell,DEF,West Ham,5,86.2,0.0,0.0,0.2,0.0,0.0,0.0,0.0,1.2,0.2,0.0,0.0,0.0,0.0,0.0,0.0
1,Aaron Hickey,DEF,Brentford,5,77.8,0.0,0.0,0.2,0.0,0.0,0.0,0.0,1.2,0.4,0.0,0.0,0.0,0.0,0.0,0.0
2,Aaron Ramsdale,GK,Arsenal,5,90.0,0.0,0.0,0.4,1.6,0.0,0.0,0.0,0.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Aaron Wan-Bissaka,DEF,Man Utd,5,0.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Abdoulaye Doucouré,MID,Everton,5,24.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Add Schedule Strength

## Pre-process fixtures data

In [8]:
# Load the season's fixture list and the team table in one pass.
df_fixtures, df_teams = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../../data/raw_data/2022-23/fixtures.csv",
        "../../data/raw_data/2022-23/teams.csv",
    )
)

In [9]:
# Resolve the numeric home/away team ids in the fixture list to team names.
team_lookup = df_teams[["name", "id"]]
fixture_ids = df_fixtures[["event", "team_h", "team_a"]]

# First join attaches the home team's name, second join the away team's name;
# the duplicated "name" column ends up suffixed as name_x (home) / name_y (away).
df_merged_home = fixture_ids.merge(team_lookup, how="left", left_on="team_h", right_on="id")
df_merged_away = df_merged_home.merge(team_lookup, how="left", left_on="team_a", right_on="id")

df_fixtures_cleaned = df_merged_away[["event", "name_x", "name_y"]].rename(
    columns={"event": "GW", "name_x": "home_team", "name_y": "away_team"}
)
df_fixtures_cleaned.head(n=5)

Unnamed: 0,GW,home_team,away_team
0,1,Crystal Palace,Arsenal
1,1,Fulham,Liverpool
2,1,Bournemouth,Aston Villa
3,1,Leeds,Wolves
4,1,Newcastle,Nott'm Forest


## Pre-process weekly standing data 

In [10]:
# Weekly league standings, one row per team per gameweek.
standing_path = "../../data/raw_data/2022-23/standing.csv"
df_standing = pd.read_csv(filepath_or_buffer=standing_path)
df_standing.head(n=5)

Unnamed: 0,GW,Team,Standing,id
0,1,Tottenham Hotspur,1,18
1,1,AFC Bournemouth,2,3
2,1,Arsenal FC,2,1
3,1,Manchester City,2,13
4,1,Newcastle United,2,15


In [11]:
# Join on the team id so standings carry the team names from the teams table
# (the "name" column) instead of the names used in the standings file.
df_merged = df_teams.merge(df_standing, how="inner", on="id")
standing_columns = ["GW", "name", "Standing"]
df_standing_cleaned = df_merged[standing_columns]
df_standing_cleaned.head(n=5)

Unnamed: 0,GW,name,Standing
0,1,Arsenal,2
1,2,Arsenal,2
2,3,Arsenal,1
3,4,Arsenal,1
4,5,Arsenal,1


## Add team standing 

In [12]:
def get_team_standing(team, current_gw, df_standing):
    """Return a team's mean standing over the five gameweeks ending at ``current_gw``.

    Parameters
    ----------
    team : value compared against the ``name`` column of ``df_standing``.
    current_gw : int, last gameweek of the window (window is ``current_gw - 4`` .. ``current_gw``).
    df_standing : DataFrame with ``GW``, ``name`` and ``Standing`` columns.

    Returns
    -------
    Mean of ``Standing`` over the matching rows; NaN when no row matches.
    """
    window = range(current_gw - 4, current_gw + 1)
    is_team = df_standing["name"] == team
    in_window = df_standing["GW"].isin(window)
    return df_standing.loc[is_team & in_window, "Standing"].mean()

In [13]:
# For every aggregated row, look up the team's average standing over the
# five-gameweek window that ends at the row's GW.
df_grouped["team_standing"] = [
    get_team_standing(team, gw, df_standing_cleaned)
    for team, gw in zip(df_grouped["team"], df_grouped["GW"])
]
df_grouped.head(n=5)

Unnamed: 0,name,position,team,GW,minutes,goals_scored,assists,clean_sheets,saves,penalties_saved,...,bonus,goals_conceded,yellow_cards,red_cards,own_goals,expected_goals,expected_assists,expected_goal_involvements,expected_goals_conceded,team_standing
0,Aaron Cresswell,DEF,West Ham,5,86.2,0.0,0.0,0.2,0.0,0.0,...,0.0,1.2,0.2,0.0,0.0,0.0,0.0,0.0,0.0,17.0
1,Aaron Hickey,DEF,Brentford,5,77.8,0.0,0.0,0.2,0.0,0.0,...,0.0,1.2,0.4,0.0,0.0,0.0,0.0,0.0,0.0,8.2
2,Aaron Ramsdale,GK,Arsenal,5,90.0,0.0,0.0,0.4,1.6,0.0,...,0.0,0.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.4
3,Aaron Wan-Bissaka,DEF,Man Utd,5,0.8,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0
4,Abdoulaye Doucouré,MID,Everton,5,24.8,0.0,0.0,0.0,0.0,0.0,...,0.0,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0


## Calculate future 5-game schedule strength

In [14]:
def get_sched_strength(team, current_gw, df_fixtures, df_standing):
    """Return the average standing of a team's opponents over its next five gameweeks.

    The average is taken per fixture: an opponent faced twice in the look-ahead
    window contributes twice. (Filtering the standings with ``isin`` counted such
    an opponent only once.)

    Parameters
    ----------
    team : value compared against ``home_team`` / ``away_team`` in ``df_fixtures``.
    current_gw : int, the look-ahead window is ``current_gw + 1`` .. ``current_gw + 5``.
    df_fixtures : DataFrame with ``GW``, ``home_team`` and ``away_team`` columns.
    df_standing : DataFrame with ``GW``, ``name`` and ``Standing`` columns.

    Returns
    -------
    Mean opponent ``Standing`` as of ``current_gw``; NaN when the team has no
    fixture in the window or no opponent has a standing row for ``current_gw``.
    """
    # determine next 5 gw
    next_5 = range(current_gw + 1, current_gw + 6)
    df_fixtures_next_5 = df_fixtures[df_fixtures["GW"].isin(next_5)]

    # determine opponents (one entry per fixture, so repeats are kept)
    is_home = df_fixtures_next_5["home_team"] == team
    is_away = df_fixtures_next_5["away_team"] == team
    opp_list = df_fixtures_next_5.loc[is_home, "away_team"].tolist()
    opp_list += df_fixtures_next_5.loc[is_away, "home_team"].tolist()

    # use standing from current gw (avoid data leakage)
    df_standing_current = df_standing[df_standing["GW"] == current_gw]
    standing_by_team = df_standing_current.groupby("name")["Standing"].mean()

    # reindex yields one value per fixture; opponents without a standing row
    # become NaN and are skipped by mean().
    return standing_by_team.reindex(opp_list).mean()

In [15]:
# For every aggregated row, compute the average standing of the opponents the
# team meets in the five gameweeks after the row's GW.
df_grouped["sched_strength"] = [
    get_sched_strength(team, gw, df_fixtures_cleaned, df_standing_cleaned)
    for team, gw in zip(df_grouped["team"], df_grouped["GW"])
]
df_grouped.head(n=5)

Unnamed: 0,name,position,team,GW,minutes,goals_scored,assists,clean_sheets,saves,penalties_saved,...,goals_conceded,yellow_cards,red_cards,own_goals,expected_goals,expected_assists,expected_goal_involvements,expected_goals_conceded,team_standing,sched_strength
0,Aaron Cresswell,DEF,West Ham,5,86.2,0.0,0.0,0.2,0.0,0.0,...,1.2,0.2,0.0,0.0,0.0,0.0,0.0,0.0,17.0,13.25
1,Aaron Hickey,DEF,Brentford,5,77.8,0.0,0.0,0.2,0.0,0.0,...,1.2,0.4,0.0,0.0,0.0,0.0,0.0,0.0,8.2,9.0
2,Aaron Ramsdale,GK,Arsenal,5,90.0,0.0,0.0,0.4,1.6,0.0,...,0.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.4,6.25
3,Aaron Wan-Bissaka,DEF,Man Utd,5,0.8,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,6.666667
4,Abdoulaye Doucouré,MID,Everton,5,24.8,0.0,0.0,0.0,0.0,0.0,...,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0,8.5


# Append future 5-game FPL points average as labels 

In [16]:
def get_future_fpl_pts(player, current_gw, df):
    """Return a player's mean ``total_points`` over the five gameweeks after ``current_gw``.

    Parameters
    ----------
    player : value compared against the ``name`` column of ``df``.
    current_gw : int, the look-ahead window is ``current_gw + 1`` .. ``current_gw + 5``.
    df : DataFrame with ``name``, ``GW`` and ``total_points`` columns.

    Returns
    -------
    Mean of ``total_points`` over the matching rows; NaN when no row matches.
    """
    upcoming_gws = range(current_gw + 1, current_gw + 6)
    is_player = df["name"] == player
    is_upcoming = df["GW"].isin(upcoming_gws)
    return df.loc[is_player & is_upcoming, "total_points"].mean()

In [17]:
# Label each aggregated row with the player's average FPL points over the five
# gameweeks after the row's GW, read from the raw gameweek frame `df`.
df_grouped["5_gw_fpl_pts"] = [
    get_future_fpl_pts(player, gw, df)
    for player, gw in zip(df_grouped["name"], df_grouped["GW"])
]
df_grouped.head(n=5)

Unnamed: 0,name,position,team,GW,minutes,goals_scored,assists,clean_sheets,saves,penalties_saved,...,yellow_cards,red_cards,own_goals,expected_goals,expected_assists,expected_goal_involvements,expected_goals_conceded,team_standing,sched_strength,5_gw_fpl_pts
0,Aaron Cresswell,DEF,West Ham,5,86.2,0.0,0.0,0.2,0.0,0.0,...,0.2,0.0,0.0,0.0,0.0,0.0,0.0,17.0,13.25,2.25
1,Aaron Hickey,DEF,Brentford,5,77.8,0.0,0.0,0.2,0.0,0.0,...,0.4,0.0,0.0,0.0,0.0,0.0,0.0,8.2,9.0,1.0
2,Aaron Ramsdale,GK,Arsenal,5,90.0,0.0,0.0,0.4,1.6,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.4,6.25,2.75
3,Aaron Wan-Bissaka,DEF,Man Utd,5,0.8,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,6.666667,0.0
4,Abdoulaye Doucouré,MID,Everton,5,24.8,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0,8.5,0.5


# Export cleaned data

In [18]:
# Write the feature/label table without the DataFrame index column.
cleaned_output_path = "../../data/cleaned_data/2023_cleaned_ML_data.csv"
df_grouped.to_csv(path_or_buf=cleaned_output_path, index=False)