# Contents
1. Import Libraries, Data and Model <br>

2. Building Prediction DataFrame <br>

3. Applying Model to Prediction DataFrame <br>

4. Next Steps/Considerations

# 1. Import Libraries, Data and Model

In [1]:
import pandas as pd
import pickle
import soccerdata as sd
pd.set_option('display.max_columns', None)
from datetime import date
import numpy as np


# Player data that every feature-building step below reads from.
df = pd.read_csv('df.csv', encoding='utf-8')

# Previously trained model, stored as a pickle file.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# load a 'Final_model' file that comes from a trusted source.
with open('Final_model', 'rb') as model_file:
    model = pickle.load(model_file)
    

# 2. Building Prediction DataFrame

The predictors comprise: <br>
* The location of their next game (Home/Away). <br>
* The average of each of more than 15 numerical indicators over the player's last 10 games. <br>
* Their current age, continent and position.  

### Last 10 Games

In [3]:
def get_last_n_rows_by_player(df, n=10):
    """Return the last ``n`` rows of ``df`` for every player.

    "Last" means last in the row order of ``df``; presumably ``df`` is sorted
    chronologically so these are the most recent games (TODO confirm).

    Args:
        df: DataFrame containing a 'player' column.
        n: Maximum number of trailing rows to keep per player.

    Returns:
        DataFrame with at most ``n`` rows per player. Players appear in order
        of first appearance in ``df`` and the original index labels and
        column dtypes are kept. Rows with a missing player are dropped. If
        there is nothing to return, an empty frame with the columns of ``df``
        is returned.
    """
    # Slice every player in a single groupby pass and concatenate once.
    # Growing a DataFrame with pd.concat inside a loop re-copies all rows
    # collected so far on every iteration, and seeding it with an empty
    # DataFrame makes the resulting dtypes depend on pandas' handling of
    # empty concat entries.
    last_rows = [rows.tail(n) for _, rows in df.groupby('player', sort=False)]

    if not last_rows:
        # pd.concat raises on an empty list; keep the schema for callers.
        return df.iloc[0:0]

    return pd.concat(last_rows, axis=0)


# Last 10 rows per player (in the row order of df); this frame feeds the
# per-player mode and average features computed further down.
df_last_10 = get_last_n_rows_by_player(df, 10)


### Location

In [4]:
def process_location_data(df, epl_schedule):
    """Build one-hot Home/Away features for each player's next fixture.

    For every team, the earliest upcoming home fixture and the earliest
    upcoming away fixture (dated today or later) are compared; whichever
    comes first decides whether the team's next game is 'Home' or 'Away'.
    That label is attached to every (team, player) pair found in ``df``.

    Args:
        df: DataFrame with at least 'team' and 'player' columns.
        epl_schedule: Schedule DataFrame that exposes 'date', 'home_team'
            and 'away_team' after ``reset_index()``.

    Returns:
        DataFrame with columns 'player', 'Location_Away', 'Location_Home',
        de-duplicated. Both indicator columns are always present, even when
        every remaining fixture is at the same kind of venue.
    """
    fixtures = epl_schedule.reset_index()
    # Compare real datetimes rather than a datetime column against a string.
    fixtures = fixtures.assign(date=pd.to_datetime(fixtures['date']))
    upcoming = fixtures[fixtures['date'] >= pd.Timestamp(date.today())]

    next_home = upcoming.groupby('home_team')['date'].min().rename('earliest_date_home')
    next_away = upcoming.groupby('away_team')['date'].min().rename('earliest_date_away')

    # Outer alignment on team name: a team that only has home (or only away)
    # fixtures left must not be dropped, which an inner merge would do.
    next_fixture = (
        pd.concat([next_home, next_away], axis=1)
        .rename_axis('team')
        .reset_index()
    )

    # Comparisons against NaT are False, so a missing away date has to be
    # treated explicitly as "the home game is next".
    home_is_next = next_fixture['earliest_date_away'].isna() | (
        next_fixture['earliest_date_home'] < next_fixture['earliest_date_away']
    )
    next_fixture['Location'] = np.where(home_is_next, 'Home', 'Away')

    # NOTE(review): a player listed under several teams in df gets one row
    # per distinct location; callers de-duplicate per player later.
    df_location = df[['team', 'player']].merge(
        next_fixture[['team', 'Location']], on='team'
    )

    # Fixed categories guarantee that both dummy columns exist, in a stable
    # order, regardless of which venues actually occur in the data.
    location = df_location['Location'].astype(pd.CategoricalDtype(['Away', 'Home']))
    encoded_location = pd.get_dummies(location, prefix='Location')

    df_location = pd.concat([df_location[['player']], encoded_location], axis=1)
    return df_location.drop_duplicates()

# Usage: read the Premier League schedule through soccerdata's FBref reader
# and derive the one-hot next-fixture location for the players in df.
# NOTE(review): seasons is hard-coded to 2022 while process_location_data only
# keeps fixtures dated today or later — confirm this season is still current,
# otherwise no fixtures remain and df_location comes back empty.
fbref = sd.FBref(leagues="ENG-Premier League", seasons=2022)
epl_schedule = fbref.read_schedule()
df_location = process_location_data(df, epl_schedule)

### Position, Continent

In [6]:
def get_mode_by_player(df, mode_cols):
    """Return, per player, the most frequent value of each column in ``mode_cols``.

    Args:
        df: DataFrame holding the columns listed in ``mode_cols``.
        mode_cols: Column names to summarise; must include 'player', which is
            both the grouping key and part of the output.

    Returns:
        DataFrame with one row per player and the columns of ``mode_cols``,
        on a fresh 0..n-1 index. When a column has several modes, the first
        row of ``DataFrame.mode()`` is used.
    """
    def first_mode(player_rows):
        # mode() can return several rows on ties; keep only the first one.
        return player_rows.mode().iloc[0]

    subset = df[mode_cols]
    per_player = subset.groupby('player')[mode_cols].apply(first_mode)

    # The player name is already a regular column, so the group index is dropped.
    return per_player.reset_index(drop=True)


# One-hot position and continent columns, plus the grouping key 'player'.
mode_cols = [
    'Pos_Attacking Midfield',
    'Pos_Center Back',
    'Pos_Central Midfield',
    'Pos_Defensive Midfield',
    'Pos_Forward',
    'Pos_Fullbacks',
    'Pos_Goalkeeper',
    'Pos_Wide Attacker',
    'Pos_Wide Midfield',
    'Continent_Africa',
    'Continent_Asia',
    'Continent_Europe',
    'Continent_North America',
    'Continent_Oceania',
    'Continent_South America',
    'player',
]
df_mode = get_mode_by_player(df=df_last_10, mode_cols=mode_cols)


### Age

In [7]:
def get_age_by_player(df):
    """Return the highest recorded age for each player.

    Side effect: the 'Age' column of the passed-in ``df`` is cast to int64
    in place.

    Args:
        df: DataFrame with 'player' and 'Age' columns and an unnamed index
            (the helper column produced by reset_index is expected to be
            called 'level_1').

    Returns:
        DataFrame with columns 'player' and 'Age', one row per player.
    """
    df[['Age']] = df[['Age']].astype('int64')

    # nlargest(1) yields a (player, original row label) MultiIndex.
    oldest = df.groupby('player')['Age'].nlargest(1)
    df_age = oldest.to_frame().reset_index()

    # The original row label is of no use downstream.
    return df_age.drop(columns=['level_1'])


# Highest recorded Age per player. This call also casts df['Age'] to int64 in
# place, before the float columns of df are selected for averaging below.
df_age = get_age_by_player(df)


### Averaged Numerical Indicators

In [8]:
def get_average_statistics(df, df_last_10):
    """Average every float indicator per player over ``df_last_10``.

    The card indicator columns of ``df`` are first cast to float (in place)
    so that they are picked up together with the other float columns.

    Args:
        df: Full DataFrame; its float columns define which indicators are
            averaged. Its card columns are modified in place.
        df_last_10: DataFrame restricted to the rows to average, containing
            'player' and the same indicator columns.

    Returns:
        DataFrame with 'player' plus one mean per indicator, rounded to two
        decimals.
    """
    card_cols = ['CrdR_0.0', 'CrdR_1.0', 'CrdY_0.0', 'CrdY_1.0', 'CrdY_2.0']
    df[card_cols] = df[card_cols].astype(float)

    # Column selection is driven by df's dtypes, the averaging by df_last_10.
    avg_cols = list(df.select_dtypes(include=['float']).columns)
    per_player_mean = df_last_10.groupby('player')[avg_cols].mean()

    return per_player_mean.reset_index().round(2)


# Per-player means of the float indicator columns over df_last_10, rounded to
# two decimals.
df_average = get_average_statistics(df, df_last_10)


### Merging

In [13]:
# Combine the per-player feature frames one at a time on the player name.
merged1 = df_mode.merge(df_average, on='player')
merged2 = merged1.merge(df_location, on='player')

# Put the columns in the same order as df so the layout matches the original data.
df_predict = merged2.merge(df_age, on='player').reindex(columns=df.columns)

# Keep the names for labelling predictions, then strip the non-feature columns.
players = df_predict['player']
df_predict.drop(columns=['player', 'team', 'Gls'], inplace=True)


# 3. Applying Model to Prediction DataFrame

In [23]:
# Score every row of the prediction frame and attach the player names.
predictions = pd.DataFrame(model.predict(df_predict))
predictions['Player'] = players

# Rank by predicted value and keep the highest-scoring row for each player.
predictions = (
    predictions
    .sort_values(by=predictions.columns[0], ascending=False)
    .rename(columns={predictions.columns[0]: "Predicted Goals"})
    .drop_duplicates(subset=["Player"])
    .reset_index(drop=True)
)
predictions.head(25)

Unnamed: 0,Predicted Goals,Player
0,0.668365,Eddie Nketiah
1,0.589674,Darwin Núñez
2,0.574674,Gabriel Jesus
3,0.572148,Marcus Rashford
4,0.563399,Aleksandar Mitrović
5,0.556781,Erling Haaland
6,0.556781,Ivan Toney
7,0.503397,Álvaro Morata
8,0.483397,Cody Gakpo
9,0.475923,João Félix


# 4. Next Steps/Considerations

### General
- Add team and FPL position to final results. 
- Create a simple web interface for final results.
- The final results currently include players from previous seasons. Restrict the output to players from the current season to prevent this. 

### Class Imbalance in Target Variable (Goals) in Training Data
- One potential issue with the regressor model is that it may underpredict the number of goals because of class imbalance in the training data, where a significant proportion of the observations have zero goals. To address this, we can combine the regressor with a classifier that first predicts whether the number of goals is zero or greater than zero. The regressor is then trained only on the subset of observations with more than zero goals, and applied only when the classifier predicts more than zero goals.

### Multicollinearity in Training Data 
- One issue with my current linear regressor model is that it may be affected by multicollinearity, as evident from the high correlation among the predictor variables found during the exploratory data analysis (EDA). I can further confirm this by calculating the variance inflation factor (VIF). To mitigate this issue, I could consider using regularization techniques such as ridge regression or Lasso regression. 

### Feature Engineering
- Team Stats - capture more information about the chances each team is creating
- Time of Year 
- Recent performance/form for each player/team in offense/defense
- Travel time for away games 
- Game congestion impact
- Recency bias (weighted rolling average in df_predict)