# Contents
1. Import Libraries, Data and Model <br>

2. Building Prediction DataFrame <br>

3. Applying Model to Prediction DataFrame <br>

4. Next Steps

# 1. Import Libraries, Data and Model

In [15]:
# Libraries
import pandas as pd
import pickle
import soccerdata as sd
pd.set_option('display.max_columns', None)
from datetime import date
import numpy as np

# Data
df = pd.read_csv('df.csv', encoding='utf-8')

# Model (Optimized XGB Regressor and Linear Regression ensemble trained on past 5 seasons of data)
with open('Final_model', 'rb') as f:
    model = pickle.load(f)
    
df.head()

Unnamed: 0,Location_Away,Location_Home,CrdR_0.0,CrdR_1.0,CrdY_0.0,CrdY_1.0,CrdY_2.0,Pos_Attacking Midfield,Pos_Center Back,Pos_Central Midfield,Pos_Defensive Midfield,Pos_Forward,Pos_Fullbacks,Pos_Goalkeeper,Pos_Wide Attacker,Pos_Wide Midfield,Continent_Africa,Continent_Asia,Continent_Europe,Continent_North America,Continent_Oceania,Continent_South America,Age,Min,Gls,Ast,PK,PKatt,Sh,SoT,Touches,Tkl,Int,Blocks,xG,npxG,xAG,SCA,GCA,Cmp,Att,Cmp%,PrgP,Carries,PrgC,Att.1,Succ,player,team
0,1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,26.0,81.0,0.0,0.0,0.0,0.0,0.0,0.0,25.0,4.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,11.0,16.0,68.8,2.0,7.0,0.0,0.0,0.0,Matty James,Leicester City
1,1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,20.0,90.0,0.0,0.0,0.0,0.0,0.0,0.0,35.0,3.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,13.0,22.0,59.1,2.0,9.0,0.0,1.0,0.0,Wilfred Ndidi,Leicester City
2,1,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,33.0,90.0,0.0,0.0,0.0,0.0,0.0,0.0,26.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,12.0,16.0,75.0,3.0,3.0,0.0,1.0,1.0,Wes Morgan,Leicester City
3,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,31.0,71.0,1.0,0.0,0.0,0.0,2.0,1.0,28.0,1.0,0.0,0.0,0.5,0.5,0.0,0.0,0.0,10.0,22.0,45.5,1.0,13.0,0.0,1.0,0.0,Shinji Okazaki,Leicester City
4,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,26.0,90.0,0.0,1.0,0.0,0.0,2.0,0.0,53.0,2.0,2.0,3.0,0.1,0.1,0.2,2.0,2.0,24.0,34.0,70.6,0.0,28.0,1.0,6.0,2.0,Riyad Mahrez,Leicester City


# 2. Building Prediction DataFrame

The predictors will comprise of: <br>
* The location of their next game (Home/Away). <br>
* An average of various numerical indicators over the last 10 games. <br>
* Their current age, continent and position.  

### Last 10 Games

In [16]:
# Create an empty dataframe to store the concatenated results
df_last_10 = pd.DataFrame()

# Loop through each player in the unique list of players
for player in df['player'].unique():
    # Filter the original dataframe to only include rows for the current player
    filtered_df = df[df['player'] == player].tail(10)
    # Concatenate the filtered dataframe with the combined dataframe
    df_last_10 = pd.concat([df_last_10, filtered_df], axis=0)

### Location

In [17]:
fbref = sd.FBref(leagues="ENG-Premier League", seasons=2022)
epl_schedule = fbref.read_schedule()

team = epl_schedule.reset_index()
team = team[team['date'] >= str(date.today())]

# Group by home_team and get earliest date
home_team_df = team.groupby('home_team')['date'].min().reset_index()
home_team_df.rename(columns={'date': 'earliest_date_home'}, inplace=True)

# Group by away_team and get earliest date
away_team_df = team.groupby('away_team')['date'].min().reset_index()
away_team_df.rename(columns={'date': 'earliest_date_away'}, inplace=True)

# Merge the two DataFrames using home_team and away_team columns as keys
merged_df = pd.merge(home_team_df, away_team_df, left_on='home_team', right_on='away_team')

# Get the earliest date between earliest_date_home and earliest_date_away
merged_df['earliest_date'] = merged_df['earliest_date_home'].combine(merged_df['earliest_date_away'], min)

# Determine if the team played at home or away based on earliest dates
merged_df['home_away'] = np.where(merged_df['earliest_date_home'] < merged_df['earliest_date_away'], 'Home', 'Away')

# Rename home_team column to team
merged_df = merged_df.rename(columns={'home_team': 'team'})

# Merge with original DataFrame on team column
df_location = df.merge(merged_df, on='team')

# Select relevant columns and rename home_away column to Location
df_location = df_location[['Gls', 'team', 'player', 'home_away']]
df_location = df_location.rename(columns={'home_away': 'Location'})

# One hot encode Location column
encoded_location = pd.get_dummies(df_location['Location'], prefix='Location')
df_location = pd.concat([df_location, encoded_location], axis=1).drop(['Gls', 'team', 'Location'], axis=1)

df_location.drop_duplicates(inplace=True)

### Position, Continent

In [18]:
# Columns to take mode of
mode_cols = ['Pos_Attacking Midfield', 'Pos_Center Back', 'Pos_Central Midfield', 'Pos_Defensive Midfield', 'Pos_Forward', 'Pos_Fullbacks', 'Pos_Goalkeeper', 'Pos_Wide Attacker', 'Pos_Wide Midfield', 'Continent_Africa', 'Continent_Asia', 'Continent_Europe', 'Continent_North America', 'Continent_Oceania', 'Continent_South America', 'player']
df_mode = df_last_10[mode_cols]

# Group by player and average the columns
df_mode = df_mode.groupby('player')[mode_cols].apply(lambda x: x.mode().iloc[0]).reset_index(drop=True)

### Age

In [19]:
df[['Age']] = df[['Age']].astype('int64')
df_age = pd.DataFrame(df.groupby('player')['Age'].nlargest(1)).reset_index().drop(['level_1'], axis = 1)

### Averaged Numerical Indicators

In [20]:
# Columns to be averaged 
df[['CrdR_0.0', 'CrdR_1.0', 'CrdY_0.0', 'CrdY_1.0', 'CrdY_2.0']] = df[['CrdR_0.0', 'CrdR_1.0', 'CrdY_0.0', 'CrdY_1.0', 'CrdY_2.0']].astype(float)
avg_cols = df.select_dtypes(include=['float']).columns.to_list()

# Group by player and average the columns
df_average = df_last_10.groupby('player')[avg_cols].mean().reset_index().round(2)

### Merging

In [21]:
# merge df_mode and df_average
merged1 = pd.merge(df_mode, df_average, on='player')

# merge df_location with the previous merge
merged2 = pd.merge(merged1, df_location, on='player')

# merge df_age with the previous merge
df_predict = pd.merge(merged2, df_age, on='player').reindex(columns=df.columns)


players = df_predict['player']
df_predict.drop(['player', 'team', 'Gls'], axis = 1, inplace = True)

# 3. Applying Model to Prediction DataFrame

In [23]:
predictions = pd.DataFrame(model.predict(df_predict))
predictions['Player'] = players
predictions = predictions.sort_values(by=predictions.columns[0], ascending=False).rename(columns={predictions.columns[0]: "Predicted Goals"}).drop_duplicates(subset=["Player"]).reset_index(drop=True)
predictions.head(25)

Unnamed: 0,Predicted Goals,Player
0,0.668365,Eddie Nketiah
1,0.589674,Darwin Núñez
2,0.574674,Gabriel Jesus
3,0.572148,Marcus Rashford
4,0.563399,Aleksandar Mitrović
5,0.556781,Erling Haaland
6,0.556781,Ivan Toney
7,0.503397,Álvaro Morata
8,0.483397,Cody Gakpo
9,0.475923,João Félix


# 4. Next Steps/Considerations

### General
- Highly unprofessional at the moment, need to put above steps into a proper pre-processing pipeline w/ clear functions for each stage. 
- Add team and FPL position to final results. 
- Create a simple web interface for final results.
- Includes results from previous seasons. Reference players from the current season to prevent this. 

### Class Imbalance in Target Variable (Goals) in Training Data
- One potential issue with the regressor model could be that it is underpredicting the number of goals due to class imbalance in the training data, where a significant proportion of the observations have zero goals. To address this issue, we can combine the regressor model with a classifier to first predict if the number of goals is zero or greater than zero. When the predicted number of goals is greater than zero, we can train the regressor model on the resultant subset of data.

### Multicollinearity in Training Data 
- One issue with my current linear regressor model is that it may be affected by multicollinearity, as evident from the high correlation among the predictor variables found during the exploratory data analysis (EDA). I can further confirm this by calculating the variance inflation factor (VIF). To mitigate this issue, I could consider using regularization techniques such as ridge regression or Lasso regression. 

### Feature Engineering
- Team Stats - capture more information about chances each team's creating
- Time of Year 
- Recent performance/form for each player/team in offense/defense
- Travel time for away games 
- Game congestion impact
- Recency bias (weighted rolling average in predicted_df)