In [58]:
import pandas as pd
import math
import numpy as np


# Path of the season-level match data, relative to this notebook.
DATA_PATH = '../data/full_data_season.csv'
# Read the CSV, taking its first column as the DataFrame index.
data = pd.read_csv(filepath_or_buffer=DATA_PATH, index_col=0)

# Create column for result
def convert_result(df, goals_home, goals_away):
    """Classify a match outcome by comparing two goal counts.

    Parameters
    ----------
    df : mapping-like
        Object indexable by ``goals_home`` and ``goals_away`` (for example the
        row Series handed over by ``DataFrame.apply(..., axis=1)``).
    goals_home : hashable
        Key of the first goal count inside ``df``.
    goals_away : hashable
        Key of the second goal count inside ``df``.

    Returns
    -------
    str or float
        ``'w'`` when ``df[goals_home] > df[goals_away]``, ``'d'`` when they are
        equal, ``'l'`` otherwise; ``np.nan`` when either value is missing.
    """
    home, away = df[goals_home], df[goals_away]
    # pd.isna treats None / pd.NA like NaN; math.isnan would raise TypeError
    # on those, which happens with object or nullable-integer columns.
    if pd.isna(home) or pd.isna(away):
        return np.nan
    if home > away:
        return 'w'
    if home == away:
        return 'd'
    return 'l'

        
# Add a result label for the current match and for each historical match listed in
# match_list, and recode the historical home_bool columns as 1/0 (NaN stays NaN).
def _flag_to_int(flag):
    """Return NaN for a NaN flag, otherwise 1 if the flag equals True and 0 if not."""
    if math.isnan(flag):
        return np.nan
    return int(flag == True)


data['result'] = data.apply(convert_result, axis=1, args=('FTHG', 'FTAG'))

match_list = [-3, -2, -1]  # column-name suffixes of the historical matches
for match in match_list:
    home_col = 'home_bool%d'%(match)
    goal_cols = ('FTHG%d'%(match), 'FTAG%d'%(match))
    data[home_col] = data[home_col].apply(_flag_to_int)
    data['result%d'%(match)] = data.apply(convert_result, axis=1, args=goal_cols)
    
# Dimension tests: how many rows would dropna() discard for (1) the full frame,
# (2) the frame without the "-3" history columns, (3) without the "-3" and "-2" ones.
def _rows_lost_to_dropna(frame):
    """Return the number of rows of ``frame`` that ``dropna()`` removes."""
    return len(frame) - len(frame.dropna())


history_minus3 = ['FTHG-3', 'FTAG-3', 'home_bool-3', 'result-3']
history_minus2 = ['FTHG-2', 'FTAG-2', 'home_bool-2', 'result-2']

d1 = data.drop(columns=history_minus3)
d2 = data.drop(columns=history_minus3 + history_minus2)
for frame in (data, d1, d2):
    print(_rows_lost_to_dropna(frame))

data.tail()

5100
3711
1859


Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,home_xG,home_xGA,home_npxG,home_npxGA,home_deep,...,FTHG-2,FTAG-2,home_bool-2,FTHG-1,FTAG-1,home_bool-1,result,result-3,result-2,result-1
13205,2021-12-22,Troyes,Brest,1.0,1.0,16.707601,26.101496,16.707601,24.581299,74.0,...,,,,,,,d,,,
13206,2022-01-07,Bordeaux,Marseille,0.0,1.0,23.760819,35.10839,23.000729,31.307914,89.0,...,0.0,0.0,1.0,2.0,2.0,0.0,l,w,d,d
13207,2022-01-08,Lens,Rennes,1.0,0.0,32.300567,23.155727,30.020267,21.635537,96.0,...,0.0,0.0,1.0,1.0,1.0,0.0,w,l,d,d
13208,2022-01-09,Brest,Nice,0.0,3.0,24.123663,26.411888,20.323186,25.651793,96.0,...,3.0,2.0,0.0,2.0,1.0,0.0,l,w,w,w
13209,2022-01-09,Clermont Foot,Reims,0.0,0.0,20.299663,24.37454,19.539569,22.85436,73.0,...,,,,1.0,0.0,0.0,d,,,w


In [59]:
# TODO: decide whether to drop or fill NaNs before applying one-hot encoding.
# NOTE: get_dummies is called with its default dummy_na=False, so a NaN result
# is encoded as a row of zeros across that result's dummy columns.


def _encode_result(frame, result_col, goal_cols):
    """Append one-hot columns for ``result_col`` and drop it together with ``goal_cols``."""
    one_hot = pd.get_dummies(frame[result_col], prefix=result_col)
    return pd.concat([frame, one_hot], axis=1).drop(columns=[result_col] + goal_cols)


# One-hot encode the result of the current match and of the previous ones,
# removing the label and goal columns they were derived from.
data = _encode_result(data, 'result', ['FTHG', 'FTAG'])
for match in match_list:
    data = _encode_result(data, 'result%d'%(match), ['FTHG%d'%(match), 'FTAG%d'%(match)])

data.tail()

13210


Unnamed: 0,Date,HomeTeam,AwayTeam,home_xG,home_xGA,home_npxG,home_npxGA,home_deep,home_deep_allowed,home_xpts,...,result_w,result-3_d,result-3_l,result-3_w,result-2_d,result-2_l,result-2_w,result-1_d,result-1_l,result-1_w
13205,2021-12-22,Troyes,Brest,16.707601,26.101496,16.707601,24.581299,74.0,117.0,18.2942,...,0,0,0,0,0,0,0,0,0,0
13206,2022-01-07,Bordeaux,Marseille,23.760819,35.10839,23.000729,31.307914,89.0,117.0,18.4353,...,0,0,0,1,1,0,0,1,0,0
13207,2022-01-08,Lens,Rennes,32.300567,23.155727,30.020267,21.635537,96.0,104.0,31.874,...,1,0,1,0,1,0,0,1,0,0
13208,2022-01-09,Brest,Nice,24.123663,26.411888,20.323186,25.651793,96.0,116.0,25.8753,...,0,0,0,1,0,0,1,0,0,1
13209,2022-01-09,Clermont Foot,Reims,20.299663,24.37454,19.539569,22.85436,73.0,96.0,21.4382,...,0,0,0,0,0,0,0,0,0,1


In [60]:
#Create train, test sets
from sklearn.model_selection import train_test_split #for random sampling

# Random sampling: hold out 20% of the rows as test data; the fixed seed (42)
# makes the split repeatable.
TEST_SIZE = 0.2
RANDOM_STATE = 42
train_set, test_set = train_test_split(data, test_size=TEST_SIZE, random_state=RANDOM_STATE)

In [61]:
import matplotlib.pyplot as plt

# Compute the standard correlation coefficient (Pearson's r) between columns.
# train_set still carries text columns (Date, HomeTeam, AwayTeam). Older pandas
# dropped them silently inside corr(), but pandas >= 2.0 raises on them, so the
# numeric and boolean columns are selected explicitly (works on both).
# corr() does not modify its input, so no defensive copy is needed.
corr_matrix = train_set.select_dtypes(include=['number', 'bool']).corr()

# Correlation of every column with result_w, largest first.
corr_matrix['result_w'].sort_values(ascending=False)
# corr_matrix['result_l'].sort_values(ascending=False)
# corr_matrix['result_d'].sort_values(ascending=False)

result_w             1.000000
home_npxGD           0.225196
home_oppda_ratio     0.189613
home_wins            0.152522
home_deep            0.137672
home_pts             0.134701
away_loses           0.131216
home_npxG            0.126259
home_xG              0.122665
home_xpts            0.120570
away_ppda_ratio      0.113544
away_npxGA           0.089562
away_xGA             0.087827
away_deep_allowed    0.085239
result-1_l           0.056839
away_draws           0.036739
home_bool-2          0.014816
result-1_d           0.014455
result-3_w           0.010673
result-2_l           0.006278
result-2_w           0.002576
home_bool-3          0.002057
result-3_l          -0.000048
result-2_d          -0.006196
result-3_d          -0.008125
home_draws          -0.015931
home_bool-1         -0.017983
result-1_w          -0.053021
home_xGA            -0.055581
home_npxGA          -0.055951
home_deep_allowed   -0.060249
away_xG             -0.086693
away_xpts           -0.087356
away_npxG 

In [62]:
# Display the correlation matrix with a 'coolwarm' background gradient.
corr_styler = corr_matrix.style
corr_styler.background_gradient(cmap='coolwarm')

Unnamed: 0,home_xG,home_xGA,home_npxG,home_npxGA,home_deep,home_deep_allowed,home_xpts,home_wins,home_draws,home_loses,home_pts,home_npxGD,home_ppda_ratio,home_oppda_ratio,away_xG,away_xGA,away_npxG,away_npxGA,away_deep,away_deep_allowed,away_xpts,away_wins,away_draws,away_loses,away_pts,away_npxGD,away_ppda_ratio,away_oppda_ratio,home_bool-3,home_bool-2,home_bool-1,result_d,result_l,result_w,result-3_d,result-3_l,result-3_w,result-2_d,result-2_l,result-2_w,result-1_d,result-1_l,result-1_w
home_xG,1.0,0.676985,0.995882,0.673836,0.940458,0.653595,0.975739,0.905675,0.618359,0.43384,0.935554,0.450299,-0.225289,0.246523,0.754322,0.796938,0.750048,0.79528,0.682746,0.780063,0.759665,0.642512,0.687801,0.659166,0.710904,-0.00099,-0.020114,-0.04231,0.005294,-0.016191,-0.424964,-0.053171,-0.082081,0.122665,0.055724,0.087568,0.112102,-0.010819,0.03244,0.036373,0.067313,0.177245,0.009578
home_xGA,0.676985,1.0,0.668832,0.996089,0.585521,0.955566,0.638516,0.482079,0.732342,0.891778,0.574725,-0.343875,0.073609,-0.188936,0.799353,0.840784,0.794858,0.839106,0.7252,0.825915,0.801644,0.676699,0.720605,0.694698,0.748013,0.003317,-0.01734,-0.044616,-0.003939,-0.01131,-0.458032,-0.008754,0.068282,-0.055581,0.057209,0.062376,0.039677,-0.01252,-0.022824,-0.003289,0.072531,0.022461,0.12958
home_npxG,0.995882,0.668832,1.0,0.666917,0.943151,0.64777,0.971976,0.904716,0.611612,0.42622,0.933403,0.463701,-0.225778,0.252711,0.75018,0.792258,0.746877,0.791797,0.680293,0.777264,0.754475,0.638139,0.683025,0.653936,0.706048,-0.000844,-0.019685,-0.041624,0.00424,-0.015453,-0.422393,-0.05395,-0.085223,0.126259,0.053105,0.086214,0.111068,-0.011424,0.029736,0.036513,0.066888,0.177553,0.007344
home_npxGA,0.673836,0.996089,0.666917,1.0,0.584496,0.956926,0.635982,0.47864,0.731149,0.889259,0.571393,-0.350931,0.075528,-0.188388,0.797763,0.839108,0.794459,0.838493,0.725628,0.825813,0.798799,0.67376,0.718411,0.692299,0.744941,0.003543,-0.017302,-0.044425,-0.003415,-0.012713,-0.456597,-0.00874,0.068668,-0.055951,0.055247,0.062365,0.038296,-0.013808,-0.023284,-0.003463,0.071721,0.022697,0.128787
home_deep,0.940458,0.585521,0.943151,0.584496,1.0,0.570056,0.91977,0.869134,0.542304,0.352962,0.88808,0.490261,-0.244515,0.32749,0.682869,0.723653,0.680533,0.723649,0.631255,0.7281,0.686889,0.578862,0.623119,0.599628,0.641134,-0.003358,-0.00779,-0.032895,0.013348,-0.021859,-0.378803,-0.054585,-0.096942,0.137672,0.047946,0.083573,0.114788,-0.008401,0.038332,0.039861,0.052963,0.18437,-0.003033
home_deep_allowed,0.653595,0.955566,0.64777,0.956926,0.570056,1.0,0.631046,0.480025,0.735197,0.850853,0.573414,-0.323776,0.131195,-0.197994,0.781569,0.826233,0.779119,0.826467,0.730517,0.83436,0.782266,0.659036,0.706347,0.681403,0.72935,-0.001459,-0.000416,-0.033948,-0.00427,-0.013835,-0.44415,-0.006048,0.07076,-0.060249,0.055564,0.064805,0.039126,-0.010133,-0.020496,-0.000957,0.073695,0.029477,0.118995
home_xpts,0.975739,0.638516,0.971976,0.635982,0.91977,0.631046,1.0,0.924808,0.658971,0.421212,0.960561,0.465266,-0.243188,0.224977,0.755499,0.800377,0.750148,0.797656,0.683889,0.781096,0.772036,0.650975,0.704957,0.672556,0.721802,-0.00368,-0.0397,-0.057864,0.004406,-0.015572,-0.434461,-0.044997,-0.087556,0.12057,0.053522,0.076197,0.104489,-0.014028,0.022724,0.029218,0.073626,0.179009,0.001545
home_wins,0.905675,0.482079,0.904716,0.47864,0.869134,0.480025,0.924808,1.0,0.431396,0.206323,0.985147,0.567827,-0.258141,0.309729,0.634853,0.674399,0.63006,0.672602,0.573594,0.655843,0.649309,0.546387,0.589692,0.571472,0.605456,-0.006218,-0.033711,-0.045178,0.012875,-0.018584,-0.359646,-0.048879,-0.118372,0.152522,0.045761,0.073869,0.106614,-0.008918,0.032976,0.036692,0.047175,0.223219,-0.043727
home_draws,0.618359,0.732342,0.611612,0.731149,0.542304,0.735197,0.658971,0.431396,1.0,0.54776,0.579901,-0.100757,-0.035761,-0.13433,0.690745,0.723151,0.686162,0.720329,0.62878,0.705226,0.70732,0.590256,0.663499,0.599472,0.65908,0.007638,-0.047532,-0.059908,-0.010079,-0.005616,-0.398404,-0.003969,0.020954,-0.015931,0.049608,0.048041,0.03955,-0.015267,-0.024076,-0.000162,0.14244,0.028602,0.038379
home_loses,0.43384,0.891778,0.42622,0.889259,0.352962,0.850853,0.421212,0.206323,0.54776,1.0,0.290576,-0.521737,0.132371,-0.305367,0.661048,0.698133,0.656266,0.695573,0.599126,0.687163,0.671778,0.569984,0.599803,0.586357,0.628694,-0.000547,-0.024604,-0.051151,-0.008156,-0.007103,-0.383087,0.009368,0.112545,-0.112478,0.039784,0.029381,0.001334,-0.018986,-0.041712,-0.026883,0.033053,-0.055795,0.185849
