In [118]:
import pandas as pd 
import numpy as np

# Import Data

In [119]:
df = pd.read_csv("../../data/raw_data/2022-23/merged_gw.csv")
df.head()

Unnamed: 0,name,position,team,xP,assists,bonus,bps,clean_sheets,creativity,element,...,team_h_score,threat,total_points,transfers_balance,transfers_in,transfers_out,value,was_home,yellow_cards,GW
0,Nathan Redmond,MID,Southampton,1.5,0,0,3,0,0.0,403,...,4,0.0,1,0,0,0,55,False,0,1
1,Junior Stanislas,MID,Bournemouth,1.1,0,0,3,0,0.0,58,...,2,0.0,1,0,0,0,50,True,0,1
2,Armando Broja,FWD,Chelsea,2.0,0,0,3,0,0.3,150,...,0,19.0,1,0,0,0,55,False,0,1
3,Fabian Schär,DEF,Newcastle,2.4,0,3,43,1,14.6,366,...,2,25.0,15,0,0,0,45,True,0,1
4,Jonny Evans,DEF,Leicester,1.9,0,0,15,0,1.3,249,...,2,0.0,1,0,0,0,45,True,0,1


In [120]:
df.columns

Index(['name', 'position', 'team', 'xP', 'assists', 'bonus', 'bps',
       'clean_sheets', 'creativity', 'element', 'expected_assists',
       'expected_goal_involvements', 'expected_goals',
       'expected_goals_conceded', 'fixture', 'goals_conceded', 'goals_scored',
       'ict_index', 'influence', 'kickoff_time', 'minutes', 'opponent_team',
       'own_goals', 'penalties_missed', 'penalties_saved', 'red_cards',
       'round', 'saves', 'selected', 'starts', 'team_a_score', 'team_h_score',
       'threat', 'total_points', 'transfers_balance', 'transfers_in',
       'transfers_out', 'value', 'was_home', 'yellow_cards', 'GW'],
      dtype='object')

# Get ID columns and basic stats 

In [121]:
df_cleaned = df[[
    "GW", 
    "name", 
    "position", 
    "team", 
    "value", 
    "minutes", 
    "goals_scored", 
    "assists", 
    "clean_sheets", 
    "saves", 
    "penalties_saved", 
    "penalties_missed", 
    "bonus", 
    "goals_conceded", 
    "yellow_cards", 
    "red_cards", 
    "own_goals", 
    "expected_goals", 
    "expected_assists", 
    "expected_goal_involvements", 
    "expected_goals_conceded", 
    "was_home" 
]]

df_cleaned.head()

Unnamed: 0,GW,name,position,team,value,minutes,goals_scored,assists,clean_sheets,saves,...,bonus,goals_conceded,yellow_cards,red_cards,own_goals,expected_goals,expected_assists,expected_goal_involvements,expected_goals_conceded,was_home
0,1,Nathan Redmond,MID,Southampton,55,1,0,0,0,0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,False
1,1,Junior Stanislas,MID,Bournemouth,50,1,0,0,0,0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,True
2,1,Armando Broja,FWD,Chelsea,55,15,0,0,0,0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,False
3,1,Fabian Schär,DEF,Newcastle,45,90,1,0,1,0,...,3,0,0,0,0,0.0,0.0,0.0,0.0,True
4,1,Jonny Evans,DEF,Leicester,45,90,0,0,0,0,...,0,2,0,0,0,0.0,0.0,0.0,0.0,True


# Pre-process fixtures data

In [122]:
df_fixtures = pd.read_csv("../../data/raw_data/2022-23/fixtures.csv")
df_teams = pd.read_csv("../../data/raw_data/2022-23/teams.csv")

In [123]:
df_merged_home = pd.merge(df_fixtures[["event", "team_h", "team_a"]], df_teams[["name", "id"]], how="left", left_on="team_h", right_on="id")
df_merged_away = pd.merge(df_merged_home, df_teams[["name", "id"]], how="left", left_on="team_a", right_on="id")
df_fixtures_cleaned = df_merged_away[["event", "name_x", "name_y"]]
df_fixtures_cleaned.columns = ["GW", "home_team", "away_team"]
df_fixtures_cleaned.head()

Unnamed: 0,GW,home_team,away_team
0,1,Crystal Palace,Arsenal
1,1,Fulham,Liverpool
2,1,Bournemouth,Aston Villa
3,1,Leeds,Wolves
4,1,Newcastle,Nott'm Forest


# Pre-process weekly standing data 

In [124]:
df_standing = pd.read_csv("../../data/raw_data/2022-23/standing.csv")
df_standing.head()

Unnamed: 0,GW,Team,Standing,id
0,1,Tottenham Hotspur,1,18
1,1,AFC Bournemouth,2,3
2,1,Arsenal FC,2,1
3,1,Manchester City,2,13
4,1,Newcastle United,2,15


In [125]:
df_merged = pd.merge(df_teams, df_standing, on="id", how="inner")
df_standing_cleaned = df_merged[["GW", "name", "Standing"]]
df_standing_cleaned.head()

Unnamed: 0,GW,name,Standing
0,1,Arsenal,2
1,2,Arsenal,2
2,3,Arsenal,1
3,4,Arsenal,1
4,5,Arsenal,1


# Calculate future 5-game schedule strength

In [126]:
def get_sched_strength(team, current_gw, df_fixtures, df_standing): 
    # determine next 5 gw 
    next_5 = range(current_gw+1, current_gw+6) 
    df_fixtures_next_5 = df_fixtures[df_fixtures["GW"].isin(next_5)]

    # determine opponents 
    opp_list = df_fixtures_next_5[df_fixtures_next_5["home_team"] == team].away_team.tolist()
    opp_list += df_fixtures_next_5[df_fixtures_next_5["away_team"] == team].home_team.tolist()

    # calculate avg opponent standing 
    df_standing_current = df_standing[df_standing["GW"] == current_gw] # use standing from current gw (avoid data leakage)
    df_standing_opp = df_standing_current[df_standing_current["name"].isin(opp_list)] 
    avg_standing = df_standing_opp["Standing"].mean() 

    return avg_standing 

In [127]:
df_cleaned["sched_strength"] = df_cleaned.apply(lambda row: get_sched_strength(row["team"], row["GW"], df_fixtures_cleaned, df_standing_cleaned), axis=1)
df_cleaned.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned["sched_strength"] = df_cleaned.apply(lambda row: get_sched_strength(row["team"], row["GW"], df_fixtures_cleaned, df_standing_cleaned), axis=1)


Unnamed: 0,GW,name,position,team,value,minutes,goals_scored,assists,clean_sheets,saves,...,goals_conceded,yellow_cards,red_cards,own_goals,expected_goals,expected_assists,expected_goal_involvements,expected_goals_conceded,was_home,sched_strength
0,1,Nathan Redmond,MID,Southampton,55,1,0,0,0,0,...,0,0,0,0,0.0,0.0,0.0,0.0,False,9.8
1,1,Junior Stanislas,MID,Bournemouth,50,1,0,0,0,0,...,0,0,0,0,0.0,0.0,0.0,0.0,True,8.4
2,1,Armando Broja,FWD,Chelsea,55,15,0,0,0,0,...,0,0,0,0,0.0,0.0,0.0,0.0,False,10.4
3,1,Fabian Schär,DEF,Newcastle,45,90,1,0,1,0,...,0,0,0,0,0.0,0.0,0.0,0.0,True,9.2
4,1,Jonny Evans,DEF,Leicester,45,90,0,0,0,0,...,2,0,0,0,0.0,0.0,0.0,0.0,True,9.8


# Append future 5-game FPL points average as labels 

In [128]:
def get_future_fpl_pts(player, current_gw, df): 
    # determine next 5 gw 
    next_5 = range(current_gw+1, current_gw+6) 
    df_next_5 = df[(df["name"] == player) & (df["GW"].isin(next_5))]
    
    avg_pts = df_next_5["total_points"].mean()
    return avg_pts

In [129]:
df_cleaned["5_gw_fpl_pts"] = df_cleaned.apply(lambda row: get_future_fpl_pts(row["name"], row["GW"], df), axis=1)
df_cleaned.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned["5_gw_fpl_pts"] = df_cleaned.apply(lambda row: get_future_fpl_pts(row["name"], row["GW"], df), axis=1)


Unnamed: 0,GW,name,position,team,value,minutes,goals_scored,assists,clean_sheets,saves,...,yellow_cards,red_cards,own_goals,expected_goals,expected_assists,expected_goal_involvements,expected_goals_conceded,was_home,sched_strength,5_gw_fpl_pts
0,1,Nathan Redmond,MID,Southampton,55,1,0,0,0,0,...,0,0,0,0.0,0.0,0.0,0.0,False,9.8,0.0
1,1,Junior Stanislas,MID,Bournemouth,50,1,0,0,0,0,...,0,0,0,0.0,0.0,0.0,0.0,True,8.4,0.2
2,1,Armando Broja,FWD,Chelsea,55,15,0,0,0,0,...,0,0,0,0.0,0.0,0.0,0.0,False,10.4,0.6
3,1,Fabian Schär,DEF,Newcastle,45,90,1,0,1,0,...,0,0,0,0.0,0.0,0.0,0.0,True,9.2,2.4
4,1,Jonny Evans,DEF,Leicester,45,90,0,0,0,0,...,0,0,0,0.0,0.0,0.0,0.0,True,9.8,0.8


Export cleaned data

In [131]:
df_cleaned.to_csv("../../data/cleaned_data/2023_cleaned_ML_data.csv")