In [1]:
import pandas as pd 
import numpy as np
import xgboost as xgb 
from prediction_utils import scrape_standing, get_team_standing, get_sched_strength

  from pandas import MultiIndex, Int64Index


# Import models and data

In [2]:
df = pd.read_csv("../../data/cleaned_data/2023_cleaned_ML_data.csv")
df = df[df["5_gw_fpl_pts"].isna()]
df = df[df["minutes"] >= 60.0]
df.drop("5_gw_fpl_pts", axis=1, inplace=True)
df.head()

Unnamed: 0,name,position,team,GW,minutes,goals_scored,assists,clean_sheets,saves,penalties_saved,...,goals_conceded,yellow_cards,red_cards,own_goals,expected_goals,expected_assists,expected_goal_involvements,expected_goals_conceded,team_standing,sched_strength
23180,Aaron Hickey,DEF,Brentford,38,83.6,0.0,0.2,0.4,0.0,0.0,...,0.6,0.2,0.0,0.0,0.018,0.038,0.056,0.968,9.0,
23181,Aaron Ramsdale,GK,Arsenal,38,90.0,0.0,0.0,0.4,2.4,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.968,2.0,
23182,Aaron Wan-Bissaka,DEF,Man Utd,38,63.714286,0.0,0.0,0.285714,0.0,0.0,...,0.428571,0.142857,0.0,0.0,0.034286,0.022857,0.057143,1.012857,3.6,
23183,Abdoulaye Doucouré,MID,Everton,38,90.0,0.6,0.0,0.2,0.0,0.0,...,1.4,0.2,0.0,0.0,0.176,0.104,0.28,1.904,17.4,
23187,Adam Smith,DEF,Bournemouth,38,71.4,0.0,0.0,0.0,0.0,0.0,...,1.2,0.4,0.0,0.0,0.0,0.092,0.092,1.048,14.2,


In [3]:
fwd_model = xgb.XGBRegressor()
fwd_model.load_model("../../data/models/fwd_model.json")
mid_model = xgb.XGBRegressor()
mid_model.load_model("../../data/models/mid_model.json")
def_model = xgb.XGBRegressor()
def_model.load_model("../../data/models/def_model.json")
gk_model = xgb.XGBRegressor()
gk_model.load_model("../../data/models/gk_model.json")

# Attach cost to players 

Update `data/2023-24/cleaned_players.csv` by running the following in terminal: 
1. `git fetch upstream master`
2. `git checkout upstream/master -- data/2023-24/cleaned_players.csv`

In [4]:
df_players = pd.read_csv("../../data/2023-24/cleaned_players.csv")
df_players["name"] = df_players.apply(lambda row: row["first_name"] + " " + row["second_name"], axis=1)
df_players = df_players[["name", "now_cost"]]
df_players.head()

Unnamed: 0,name,now_cost
0,Folarin Balogun,45
1,Cédric Alves Soares,40
2,Mohamed Elneny,45
3,Fábio Ferreira Vieira,55
4,Gabriel dos Santos Magalhães,50


In [5]:
df = pd.merge(df, df_players, how="left", on="name")
df = df[df["now_cost"].notna()]
df.head()

Unnamed: 0,name,position,team,GW,minutes,goals_scored,assists,clean_sheets,saves,penalties_saved,...,yellow_cards,red_cards,own_goals,expected_goals,expected_assists,expected_goal_involvements,expected_goals_conceded,team_standing,sched_strength,now_cost
0,Aaron Hickey,DEF,Brentford,38,83.6,0.0,0.2,0.4,0.0,0.0,...,0.2,0.0,0.0,0.018,0.038,0.056,0.968,9.0,,45.0
1,Aaron Ramsdale,GK,Arsenal,38,90.0,0.0,0.0,0.4,2.4,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.968,2.0,,50.0
2,Aaron Wan-Bissaka,DEF,Man Utd,38,63.714286,0.0,0.0,0.285714,0.0,0.0,...,0.142857,0.0,0.0,0.034286,0.022857,0.057143,1.012857,3.6,,45.0
3,Abdoulaye Doucouré,MID,Everton,38,90.0,0.6,0.0,0.2,0.0,0.0,...,0.2,0.0,0.0,0.176,0.104,0.28,1.904,17.4,,55.0
4,Adam Smith,DEF,Bournemouth,38,71.4,0.0,0.0,0.0,0.0,0.0,...,0.4,0.0,0.0,0.0,0.092,0.092,1.048,14.2,,45.0


# Add 5-game schedule strength 

## Get up-to-date game schedule 

Update `data/2023-24/fixtures.csv` by running the following in terminal: 
1. `git fetch upstream master`
2. `git checkout upstream/master -- data/2023-24/fixtures.csv`

In [6]:
df_fixtures = pd.read_csv("../../data/2023-24/fixtures.csv")
df_teams = pd.read_csv("../../data/2023-24/teams.csv")

In [7]:
df_fixtures = df_fixtures[df_fixtures["event"].notna()]
df_fixtures["event"] = df_fixtures["event"].astype("int")
df_merged_home = pd.merge(df_fixtures[["event", "team_h", "team_a"]], df_teams[["name", "id"]], how="left", left_on="team_h", right_on="id")
df_merged_away = pd.merge(df_merged_home, df_teams[["name", "id"]], how="left", left_on="team_a", right_on="id")
df_fixtures_cleaned = df_merged_away[["event", "name_x", "name_y"]]
df_fixtures_cleaned.columns = ["GW", "home_team", "away_team"]
df_fixtures_cleaned.head()

Unnamed: 0,GW,home_team,away_team
0,1,Burnley,Man City
1,1,Arsenal,Nott'm Forest
2,1,Bournemouth,West Ham
3,1,Brighton,Luton
4,1,Everton,Fulham


## Get team standing

In [8]:
season = "2022-2023"
gw = 38
id_mapping = [3, 1, 2, 4, 5, 7, 8, 9, 10, np.nan, np.nan, 11, 13, 14, 15, 16, np.nan, 18, 19, 20]
df_standing = get_team_standing(season, gw, id_mapping, df_teams)

Clean up df_standing, discard once fully moved onto 23-24 data

In [9]:
df_standing["GW"] = df_standing["GW"].fillna(38.0).astype("int")
df_standing["Standing"] = df_standing["Standing"].fillna(20).astype("int")
df_standing.sort_values("Standing")

Unnamed: 0,GW,name,Standing
12,38,Man City,1
0,38,Arsenal,2
13,38,Man Utd,3
14,38,Newcastle,4
10,38,Liverpool,5
4,38,Brighton,6
1,38,Aston Villa,7
17,38,Spurs,8
3,38,Brentford,9
9,38,Fulham,10


## Calculate 5-game schedule strength

In [10]:
gw = 0

df["sched_strength"] = df.apply(lambda row: get_sched_strength(row["team"], gw, df_fixtures_cleaned, df_standing), axis=1)
df.head()

Unnamed: 0,name,position,team,GW,minutes,goals_scored,assists,clean_sheets,saves,penalties_saved,...,yellow_cards,red_cards,own_goals,expected_goals,expected_assists,expected_goal_involvements,expected_goals_conceded,team_standing,sched_strength,now_cost
0,Aaron Hickey,DEF,Brentford,38,83.6,0.0,0.2,0.4,0.0,0.0,...,0.2,0.0,0.0,0.018,0.038,0.056,0.968,9.0,9.6,45.0
1,Aaron Ramsdale,GK,Arsenal,38,90.0,0.0,0.0,0.4,2.4,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.968,2.0,11.4,50.0
2,Aaron Wan-Bissaka,DEF,Man Utd,38,63.714286,0.0,0.0,0.285714,0.0,0.0,...,0.142857,0.0,0.0,0.034286,0.022857,0.057143,1.012857,3.6,9.0,45.0
3,Abdoulaye Doucouré,MID,Everton,38,90.0,0.6,0.0,0.2,0.0,0.0,...,0.2,0.0,0.0,0.176,0.104,0.28,1.904,17.4,10.4,55.0
4,Adam Smith,DEF,Bournemouth,38,71.4,0.0,0.0,0.0,0.0,0.0,...,0.4,0.0,0.0,0.0,0.092,0.092,1.048,14.2,9.6,45.0


# Predict FPL 5-game avg points using models

## Running Model

In [11]:
df_fwd = df[df["position"] == "FWD"]
df_mid = df[df["position"] == "MID"]
df_def = df[df["position"] == "DEF"] 
df_gk = df[df["position"] == "GK"] 

In [12]:
df_fwd_input = df_fwd.drop(["name", "position", "team", "now_cost"], axis=1) 
df_mid_input = df_mid.drop(["name", "position", "team", "now_cost"], axis=1) 
df_def_input = df_def.drop(["name", "position", "team", "now_cost"], axis=1) 
df_gk_input = df_gk.drop(["name", "position", "team", "now_cost"], axis=1) 

In [13]:
df_fwd["xP"] = fwd_model.predict(df_fwd_input)
df_mid["xP"] = mid_model.predict(df_mid_input)
df_def["xP"] = def_model.predict(df_def_input)
df_gk["xP"] = gk_model.predict(df_gk_input)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_fwd["xP"] = fwd_model.predict(df_fwd_input)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_mid["xP"] = mid_model.predict(df_mid_input)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_def["xP"] = def_model.predict(df_def_input)
A value is trying to be set on a copy of a slice from a DataFram

In [14]:
df_fwd_out = df_fwd[["name", "position", "team", "now_cost", "xP"]].sort_values("xP", ascending=False).reset_index(drop=True)
df_mid_out = df_mid[["name", "position", "team", "now_cost", "xP"]].sort_values("xP", ascending=False).reset_index(drop=True)
df_def_out = df_def[["name", "position", "team", "now_cost", "xP"]].sort_values("xP", ascending=False).reset_index(drop=True)
df_gk_out = df_gk[["name", "position", "team", "now_cost", "xP"]].sort_values("xP", ascending=False).reset_index(drop=True)

## Player Rankings 

In [15]:
df_fwd_out.head(10)

Unnamed: 0,name,position,team,now_cost,xP
0,Harry Kane,FWD,Spurs,125.0,10.891407
1,Ollie Watkins,FWD,Aston Villa,80.0,6.330798
2,Bryan Mbeumo,FWD,Brentford,65.0,5.500046
3,Gabriel Fernando de Jesus,FWD,Arsenal,80.0,5.310563
4,Alexander Isak,FWD,Newcastle,75.0,4.725514
5,Taiwo Awoniyi,FWD,Nott'm Forest,65.0,4.391552
6,Julián Álvarez,FWD,Man City,65.0,3.342514
7,Erling Haaland,FWD,Man City,140.0,2.487875
8,Callum Wilson,FWD,Newcastle,80.0,1.967064
9,Dominic Solanke,FWD,Bournemouth,65.0,1.602225


In [16]:
df_mid_out.head(10)

Unnamed: 0,name,position,team,now_cost,xP
0,Douglas Luiz Soares de Paulo,MID,Aston Villa,55.0,5.674575
1,Mohamed Salah,MID,Liverpool,125.0,4.746064
2,Jacob Ramsey,MID,Aston Villa,60.0,4.59285
3,Bruno Borges Fernandes,MID,Man Utd,85.0,4.244045
4,Pascal Groß,MID,Brighton,65.0,4.109529
5,Martin Ødegaard,MID,Arsenal,85.0,4.098105
6,Son Heung-min,MID,Spurs,90.0,4.034036
7,Dwight McNeil,MID,Everton,55.0,3.813678
8,Alex Iwobi,MID,Everton,55.0,3.786579
9,Carlos Henrique Casimiro,MID,Man Utd,55.0,3.756252


In [17]:
df_def_out.head(10)

Unnamed: 0,name,position,team,now_cost,xP
0,Jakub Kiwior,DEF,Arsenal,45.0,8.249063
1,Pedro Porro,DEF,Spurs,50.0,6.349626
2,Benjamin White,DEF,Arsenal,55.0,4.805025
3,Ethan Pinnock,DEF,Brentford,45.0,4.39737
4,Emerson Leite de Souza Junior,DEF,Spurs,45.0,4.255577
5,Aaron Hickey,DEF,Brentford,45.0,3.786247
6,Ben Mee,DEF,Brentford,50.0,3.682943
7,Trent Alexander-Arnold,DEF,Liverpool,80.0,3.475378
8,Kenny Tete,DEF,Fulham,45.0,3.443619
9,Kyle Walker,DEF,Man City,50.0,2.953459


In [18]:
df_gk_out.head()

Unnamed: 0,name,position,team,now_cost,xP
0,David Raya Martin,GK,Brentford,50.0,4.892256
1,Jason Steele,GK,Brighton,45.0,4.411284
2,Emiliano Martínez Romero,GK,Aston Villa,50.0,3.919473
3,Fraser Forster,GK,Spurs,40.0,3.82115
4,Bernd Leno,GK,Fulham,45.0,3.739291


# Generate recommended starting XI 

In [19]:
from mip import Model, xsum, maximize, BINARY 

def knapsack_fpl_xi(df_players, budget, fwd_count, mid_count, def_count, gk_count): 
    num_players = len(df_players)
    
    # create model 
    model = Model(sense=maximize) 
    
    # define decision variables 
    selected = [model.add_var(var_type=BINARY) for _ in df_players]

    # add budget constraint 
    model += sum(selected[i] * df_players.iloc[i]["now_cost"] for i in range()) <= budget 

    # add position constraints 
    model += xsum(selected[i] for i in range(num_players) if df_players.iloc[i]["position"] == "FWD") == fwd_count
    model += xsum(selected[i] for i in range(num_players) if df_players.iloc[i]["position"] == "MID") == mid_count
    model += xsum(selected[i] for i in range(num_players) if df_players.iloc[i]["position"] == "DEF") == def_count
    model += xsum(selected[i] for i in range(num_players) if df_players.iloc[i]["position"] == "GK") == gk_count 

    # set objective function 
    model.objective = xsum(selected[i] * df_players.iloc[i]["xP"] for i in range(num_players)) 

    model.optimize() 

    selected_players = [df_players.iloc[i] for i, var in enumerate(selected) if var.x >= 0.99]

    return selected_players

In [20]:
budget = 835
fwd_count = 2
mid_count = 5
def_count = 3
gk_count = 1

df_players = pd.concat([df_fwd_out, df_mid_out, df_def_out, df_gk_out]) 
df_players["now_cost"] = df_players["now_cost"].astype("int")

In [22]:
selected_players = knapsack_fpl_xi(df_players, budget, fwd_count, mid_count, def_count, gk_count)
selected_players

An error occurred while loading the CBC library:	 cannot load library '/Users/terryzhou/opt/anaconda3/lib/python3.9/site-packages/mip/libraries/cbc-c-darwin-x86-64.dylib': dlopen(/Users/terryzhou/opt/anaconda3/lib/python3.9/site-packages/mip/libraries/cbc-c-darwin-x86-64.dylib, 0x0002): tried: '/Users/terryzhou/opt/anaconda3/lib/python3.9/site-packages/mip/libraries/cbc-c-darwin-x86-64.dylib' (mach-o file, but is an incompatible architecture (have 'x86_64', need 'arm64')), '/System/Volumes/Preboot/Cryptexes/OS/Users/terryzhou/opt/anaconda3/lib/python3.9/site-packages/mip/libraries/cbc-c-darwin-x86-64.dylib' (no such file), '/Users/terryzhou/opt/anaconda3/lib/python3.9/site-packages/mip/libraries/cbc-c-darwin-x86-64.dylib' (mach-o file, but is an incompatible architecture (have 'x86_64', need 'arm64')).  Additionally, ctypes.util.find_library() did not manage to locate a library called '/Users/terryzhou/opt/anaconda3/lib/python3.9/site-packages/mip/libraries/cbc-c-darwin-x86-64.dylib'



NameError: name 'cbclib' is not defined