In [None]:
from sklearn.metrics import root_mean_squared_error, accuracy_score, precision_recall_fscore_support
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.feature_selection import SelectKBest, chi2, RFE, mutual_info_regression
from tensorflow.keras.layers import Dense, LSTM, Dropout
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras import regularizers
from sklearn.svm import SVC, LinearSVC
from keras.utils import to_categorical
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import tensorflow as tf
import seaborn as sns
import pandas as pd
import numpy as np
import joblib

In [None]:
# Relative path to the EPL training CSV.
# NOTE(review): despite its name, `data_dir` holds a file path, not a directory.
data_dir = "football-prediction/epl-training.csv"
# Load the raw match table; a later cell renames its columns by position.
df = pd.read_csv(data_dir)

In [None]:
def compute_difference_using_home_team_as_reference(frame=None):
    """Add home-minus-away difference columns to a match DataFrame, in place.

    For every (home, away) statistic pair listed below, a new column holding
    ``home - away`` is written to the frame, so a positive value means the
    home side recorded more of that statistic than the away side.

    Args:
        frame: DataFrame to modify. Defaults to ``None``, in which case the
            notebook-level ``df`` is used, which keeps existing zero-argument
            calls working unchanged.

    Returns:
        None. The frame is mutated in place.

    Raises:
        KeyError: If any required home/away column is missing. The check runs
            before any column is written, so a failure leaves the frame
            untouched instead of partially updated.
    """
    # Fall back to the notebook-level frame when no explicit frame is given.
    target = df if frame is None else frame

    # (home column, away column, name of the difference column to create)
    features_to_differentiate = [
        ('Full Time Home Goals', 'Full Time Away Goals', 'Full Time Goal Difference'),
        ('Half Time Home Goals', 'Half Time Away Goals', 'Half Time Goal Difference'),
        ('Home Shots', 'Away Shots', 'Shot Difference'),
        ('Home Shots on Target', 'Away Shots on Target', 'Shots on Target Difference'),
        ('Home Corners', 'Away Corners', 'Corner Difference'),
        ('Home Fouls', 'Away Fouls', 'Foul Difference'),
        ('Home Yellow Cards', 'Away Yellow Cards', 'Yellow Card Difference'),
        ('Home Red Cards', 'Away Red Cards', 'Red Card Difference')
    ]

    # Validate everything first so a missing column cannot leave the frame
    # with only some of the difference columns added.
    missing = [
        column
        for home_feature, away_feature, _ in features_to_differentiate
        for column in (home_feature, away_feature)
        if column not in target.columns
    ]
    if missing:
        raise KeyError(f"Cannot compute differences; missing columns: {missing}")

    for home_feature, away_feature, new_feature in features_to_differentiate:
        target[new_feature] = target[home_feature] - target[away_feature]
    
def get_season(date, season_start_month=8):
    """Return the starting year of the season that ``date`` falls in.

    Dates in or after ``season_start_month`` belong to the season that starts
    in that calendar year; earlier dates belong to the season that started the
    previous year (for example, with the default of August, May 2024 maps to
    2023).

    Args:
        date: Object exposing ``month`` and ``year`` attributes, such as a
            ``datetime`` or pandas ``Timestamp``.
        season_start_month: Calendar month (1-12) in which a season begins.
            Defaults to 8 (August), the value previously hard-coded here.

    Returns:
        The season's starting year. For a missing date (pandas ``NaT``) the
        month comparison is false and ``year - 1`` evaluates to NaN, so NaN is
        returned rather than an error being raised.
    """
    if date.month >= season_start_month:
        return date.year
    return date.year - 1

In [None]:
# Readable names for the 22 raw CSV columns, applied by position.
raw_column_names = [
    'Date',
    'Home Team',
    'Away Team',
    'Full Time Home Goals',
    'Full Time Away Goals',
    'Full Time Result',
    'Half Time Home Goals',
    'Half Time Away Goals',
    'Half Time Result',
    'Referee',
    'Home Shots',
    'Away Shots',
    'Home Shots on Target',
    'Away Shots on Target',
    'Home Corners',
    'Away Corners',
    'Home Fouls',
    'Away Fouls',
    'Home Yellow Cards',
    'Away Yellow Cards',
    'Home Red Cards',
    'Away Red Cards'
]

# Rename only while the frame still carries its raw headers. Once the derived
# columns below exist, the frame has more than 22 columns and an unconditional
# positional rename would raise a length-mismatch ValueError on re-run.
if list(df.columns[:len(raw_column_names)]) != raw_column_names:
    if len(df.columns) != len(raw_column_names):
        raise ValueError(
            f"Expected {len(raw_column_names)} raw columns, found {len(df.columns)}"
        )
    df.columns = raw_column_names

# Goals per shot on target; a zero denominator is replaced by 1 to avoid
# division by zero, so the rate then equals the goal count itself.
df['Home Goal Conversion Rate'] = df['Full Time Home Goals'] / df['Home Shots on Target'].replace(0, 1)
df['Away Goal Conversion Rate'] = df['Full Time Away Goals'] / df['Away Shots on Target'].replace(0, 1)

# Weighted attacking output: shots on target count double, off-target shots
# count once, corners count half.
df['Home Attacking Intensity'] = 2 * df['Home Shots on Target'] + 1 * (df['Home Shots'] - df['Home Shots on Target']) + 0.5 * df['Home Corners']
df['Away Attacking Intensity'] = 2 * df['Away Shots on Target'] + 1 * (df['Away Shots'] - df['Away Shots on Target']) + 0.5 * df['Away Corners']
df['Attacking Intensity Difference'] = df['Home Attacking Intensity'] - df['Away Attacking Intensity']

# Unweighted sum of fouls and cards per side.
df['Home Disciplinary Pressure'] = df['Home Fouls'] + df['Home Yellow Cards'] + df['Home Red Cards']
df['Away Disciplinary Pressure'] = df['Away Fouls'] + df['Away Yellow Cards'] + df['Away Red Cards']
df['Disciplinary Pressure Difference'] = df['Home Disciplinary Pressure'] - df['Away Disciplinary Pressure']

# NOTE(review): these "xG" columns are conversion rate times attacking
# intensity, built from the match's own final score -- confirm they are not
# fed to a model as pre-match features.
df['xG Home'] = df['Home Goal Conversion Rate'] * df['Home Attacking Intensity']
df['xG Away'] = df['Away Goal Conversion Rate'] * df['Away Attacking Intensity']

# Numeric target from the home side's perspective: win 1, draw 0, loss -1.
df['Match Outcome'] = df['Full Time Result'].map({'H': 1, 'D': 0, 'A': -1})

df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')
# Stable sort: matches played on the same date keep their original file order
# instead of being shuffled by the default (unstable) quicksort.
df = df.sort_values(by='Date', kind='mergesort')
df['Season'] = df['Date'].apply(get_season)

In [9]:
# Preview the engineered frame; the rendered table is truncated to the first and last rows.
df

Unnamed: 0,Date,Home Team,Away Team,Full Time Home Goals,Full Time Away Goals,Full Time Result,Half Time Home Goals,Half Time Away Goals,Half Time Result,Referee,...,Home Attacking Intensity,Away Attacking Intensity,Attacking Intensity Difference,Home Disciplinary Pressure,Away Disciplinary Pressure,Disciplinary Pressure Difference,xG Home,xG Away,Match Outcome,Season
0,2000-08-19,Charlton,Man City,4.0,0.0,H,2.0,0.0,H,Rob Harris,...,34.0,15.0,19.0,14.0,14.0,0.0,9.714286,0.000000,1.0,2000.0
1,2000-08-19,Chelsea,West Ham,4.0,2.0,H,1.0,0.0,H,Graham Barber,...,30.5,20.5,10.0,20.0,16.0,4.0,12.200000,8.200000,1.0,2000.0
2,2000-08-19,Coventry,Middlesbrough,1.0,3.0,A,1.0,1.0,D,Barry Knight,...,13.0,27.0,-14.0,21.0,24.0,-3.0,4.333333,9.000000,-1.0,2000.0
3,2000-08-19,Derby,Southampton,2.0,2.0,D,1.0,2.0,A,Andy D'Urso,...,12.5,23.0,-10.5,12.0,14.0,-2.0,6.250000,7.666667,0.0,2000.0
4,2000-08-19,Leeds,Everton,2.0,0.0,H,2.0,0.0,H,Dermot Gallagher,...,28.0,20.0,8.0,22.0,23.0,-1.0,7.000000,0.000000,1.0,2000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9212,2024-05-19,Brentford,Newcastle,2.0,4.0,A,0.0,3.0,A,S Hooper,...,22.5,24.5,-2.0,7.0,4.0,3.0,9.000000,14.000000,-1.0,2023.0
9211,2024-05-19,Arsenal,Everton,2.0,1.0,H,1.0,1.0,D,M Oliver,...,35.0,12.5,22.5,12.0,4.0,8.0,14.000000,6.250000,1.0,2023.0
9214,2024-05-19,Burnley,Nott'm Forest,1.0,2.0,A,0.0,2.0,A,G Scott,...,28.5,20.5,8.0,5.0,3.0,2.0,9.500000,6.833333,-1.0,2023.0
9220,2024-05-19,Sheffield United,Tottenham,0.0,3.0,A,0.0,1.0,A,A Madley,...,12.5,31.5,-19.0,4.0,6.0,-2.0,0.000000,10.500000,-1.0,2023.0
