In [1]:
# (Re)load IPython's autoreload extension and switch it to mode 2, so edited modules
# are re-imported automatically before each cell runs.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline, in the notebook output.
%matplotlib inline

In [2]:
# Two-digit start year of the season to fetch, kept as a string. It is used below to
# build the download URL and to name the folder the archive is unzipped into.
SEASON = '22'

In [3]:
# Download latest results in current season
!wget -q https://www.football-data.co.uk/mmz4281/{SEASON}{int(SEASON)+1}/data.zip -O raw_data/data.zip

# Unzip to folder
!unzip -q -o raw_data/data.zip -d raw_data/{SEASON}

In [4]:
from fastai.tabular.all import * 
from utils import *

In [5]:
# Folder locations, relative to the notebook's working directory:
#   path_raw  - downloaded archives and the per-season CSV folders
#   path_data - where the processed feature table is written
path_raw, path_data = Path('raw_data'), Path('data')

In [6]:
# Long listing of the raw data folder (hidden entries included, human-readable sizes),
# sorted by modification time with the newest first.
!ls -lah -t {str(path_raw)}

total 15488
drwxr-xr-x  24 twtang  staff   768B Aug 26 22:32 22
-rw-r--r--   1 twtang  staff   104K Aug 23 06:20 data.zip
-rw-r--r--@  1 twtang  staff   6.0K Aug  5 20:56 .DS_Store
drwxr-xr-x  22 twtang  staff   704B Aug  5 20:55 ..
drwxr-xr-x  27 twtang  staff   864B Aug  2 21:30 .
drwx------@ 24 twtang  staff   768B Aug  2 21:29 21
-rw-r--r--@  1 twtang  staff   1.0M Aug  2 21:28 21.zip
drwx------@ 24 twtang  staff   768B Nov 20  2021 13
drwx------@ 24 twtang  staff   768B Nov 20  2021 12
drwx------@ 24 twtang  staff   768B Nov 20  2021 11
drwx------@ 24 twtang  staff   768B Nov 20  2021 14
drwx------@ 24 twtang  staff   768B Oct 30  2021 20
drwx------@ 24 twtang  staff   768B Oct 30  2021 19
drwx------@ 24 twtang  staff   768B Oct 30  2021 18
drwx------@ 24 twtang  staff   768B Oct 30  2021 17
drwx------@ 

In [7]:
# Every entry of the raw data folder, in sorted order
sorted(path_raw.iterdir())

[Path('raw_data/.DS_Store'),
 Path('raw_data/11'),
 Path('raw_data/11.zip'),
 Path('raw_data/12'),
 Path('raw_data/12.zip'),
 Path('raw_data/13'),
 Path('raw_data/13.zip'),
 Path('raw_data/14'),
 Path('raw_data/14.zip'),
 Path('raw_data/15'),
 Path('raw_data/15.zip'),
 Path('raw_data/16'),
 Path('raw_data/16.zip'),
 Path('raw_data/17'),
 Path('raw_data/17.zip'),
 Path('raw_data/18'),
 Path('raw_data/18.zip'),
 Path('raw_data/19'),
 Path('raw_data/19.zip'),
 Path('raw_data/20'),
 Path('raw_data/20.zip'),
 Path('raw_data/21'),
 Path('raw_data/21.zip'),
 Path('raw_data/22'),
 Path('raw_data/data.zip')]

In [8]:
# Columns loaded from every league CSV; a file missing any of them cannot be read with
# this selection and is skipped by the loading loop.
usecols = ['Div', 'Date', 'HomeTeam', 'AwayTeam', 'HC', 'AC', 'FTHG', 'FTAG', 'HS', 'AS', 'HST', 'AST']
# Read the HC / AC columns as floats
dtype = {'HC':'float', 'AC':'float'}
# 'Date' is parsed into datetimes (the loader passes dayfirst=True)
parse_dates = ['Date']

# Season folders to load: '11' up to and including the current SEASON. Deriving the
# upper bound from SEASON keeps a newly downloaded season from being silently ignored
# when SEASON is bumped.
seasons = [str(s).zfill(2) for s in range(11, int(SEASON) + 1)]

In [9]:
# Read every CSV of every selected season folder into a list of frames, tagging each
# frame with the season (folder name) it came from.
dfs = []
skipped = []  # (file, error) pairs for CSVs that could not be loaded

for folder in sorted(path_raw.iterdir()):
    if not (folder.is_dir() and folder.name in seasons):
        continue
    for file in sorted(folder.glob('*.csv')):
        try:
            season_df = pd.read_csv(file, usecols=usecols, dtype=dtype, parse_dates=parse_dates, dayfirst=True)
        except ValueError as err:
            # Best-effort load: pandas reports missing columns, unparsable values, empty
            # files, malformed rows and decoding problems as ValueError (or a subclass).
            # Anything else (e.g. KeyboardInterrupt, OSError) is deliberately not swallowed.
            skipped.append((file, err))
            continue
        season_df['Season'] = folder.name
        dfs.append(season_df)

# Make skipped files visible instead of dropping them silently
if skipped:
    print(f'Skipped {len(skipped)} file(s) that could not be loaded:')
    for file, err in skipped:
        print(f'  {file}: {err}')

In [10]:
# Stack all loaded frames, drop rows with any missing value, then order the matches by
# division, date and home team with a fresh 0..n-1 index.
df = (
    pd.concat(dfs)
      .dropna()
      .sort_values(['Div', 'Date', 'HomeTeam'])
      .reset_index(drop=True)
)

In [11]:
# Make features on historical stats (Home and Away)
# NOTE(review): joinLastGamesStatsHomeAway is not defined in this notebook (presumably it
# comes from the local `utils` module). Judging by the merge keys and the resulting
# column names used further down, it appears to return one frame keyed by HomeTeam/Date
# and one keyed by AwayTeam/Date, holding `<stat>Last5Avg` columns - confirm against utils.
stats = ['FTHG', 'HS', 'HST', 'HC', 'FTAG', 'AS', 'AST', 'AC']
df_home, df_away = joinLastGamesStatsHomeAway(df, stats)

In [12]:
# Make features on historical stats (For and Against)
# Each tuple is (home column, away column, combined stat name).
# NOTE(review): joinLastGamesStatsForAgainst is not defined in this notebook (presumably
# it comes from the local `utils` module). The later merges rely on both returned frames
# having 'Team' and 'Date' columns plus columns whose names contain 'Avg' - confirm
# against utils.
stats = [('FTHG', 'FTAG', 'FTG'), ('HS', 'AS', 'S'), ('HST', 'AST', 'ST'), ('HC', 'AC', 'C')]
df_for, df_against = joinLastGamesStatsForAgainst(df, stats)

In [13]:
# Attach the home-side features by (HomeTeam, Date) and the away-side features by
# (AwayTeam, Date); left joins keep every match row.
df = (
    df.merge(df_home, how='left', on=['HomeTeam', 'Date'])
      .merge(df_away, how='left', on=['AwayTeam', 'Date'])
)

In [14]:
# Attach the "For" features twice: once for the home team and once for the away team.
# Every df_for column whose name contains 'Avg' gets a 'Home' / 'Away' prefix, and
# 'Team' is renamed to the matching join key.
avg_cols = df_for.columns[df_for.columns.str.contains('Avg')]

cols_home = dict(zip(avg_cols, 'Home'+avg_cols))
cols_home.update({'Team':'HomeTeam'})

# Built from the column index itself. The earlier version zipped over the cols_home
# dict, which only worked because dict keys keep insertion order and zip() dropped the
# trailing 'Team' key.
cols_away = dict(zip(avg_cols, 'Away'+avg_cols))
cols_away.update({'Team':'AwayTeam'})

df = (
    df.merge(df_for.rename(columns=cols_home), 'left', ['HomeTeam', 'Date'])
      .merge(df_for.rename(columns=cols_away), 'left', ['AwayTeam', 'Date'])
)

In [15]:
# Attach the "Against" features twice: once for the home team and once for the away
# team. Every df_against column whose name contains 'Avg' gets a 'Home' / 'Away' prefix,
# and 'Team' is renamed to the matching join key.
avg_cols = df_against.columns[df_against.columns.str.contains('Avg')]

cols_home = dict(zip(avg_cols, 'Home'+avg_cols))
cols_home.update({'Team':'HomeTeam'})

# Built from the column index itself. The earlier version zipped over the cols_home
# dict, which only worked because dict keys keep insertion order and zip() dropped the
# trailing 'Team' key.
cols_away = dict(zip(avg_cols, 'Away'+avg_cols))
cols_away.update({'Team':'AwayTeam'})

df = (
    df.merge(df_against.rename(columns=cols_home), 'left', ['HomeTeam', 'Date'])
      .merge(df_against.rename(columns=cols_away), 'left', ['AwayTeam', 'Date'])
)

In [16]:
# Put the merged table back into division / date / home-team order with a clean index
df = (
    df.sort_values(by=['Div', 'Date', 'HomeTeam'])
      .reset_index(drop=True)
)

In [17]:
# Persist the feature table. The output folder is created first so that a fresh
# checkout without a data/ directory does not fail on the write.
path_data.mkdir(parents=True, exist_ok=True)
df.to_csv(path_data/'data.csv', index=False)

In [18]:
# Show the final column set: the columns read from the CSVs, Season, and the merged-in feature columns
df.columns

Index(['Div', 'Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'HS', 'AS',
       'HST', 'AST', 'HC', 'AC', 'Season', 'FTHGLast5Avg', 'HSLast5Avg',
       'HSTLast5Avg', 'HCLast5Avg', 'FTAGLast5Avg', 'ASLast5Avg',
       'ASTLast5Avg', 'ACLast5Avg', 'HomeFTGForLast5Avg', 'HomeSForLast5Avg',
       'HomeSTForLast5Avg', 'HomeCForLast5Avg', 'AwayFTGForLast5Avg',
       'AwaySForLast5Avg', 'AwaySTForLast5Avg', 'AwayCForLast5Avg',
       'HomeFTGAgainstLast5Avg', 'HomeSAgainstLast5Avg',
       'HomeSTAgainstLast5Avg', 'HomeCAgainstLast5Avg',
       'AwayFTGAgainstLast5Avg', 'AwaySAgainstLast5Avg',
       'AwaySTAgainstLast5Avg', 'AwayCAgainstLast5Avg'],
      dtype='object')

In [19]:
# Correlation of AC and every engineered average feature with HC.
# The feature columns are selected by name (same 'Avg' match used when merging them in)
# rather than listed by hand, so this cell stays in sync with the feature-engineering
# cells above.
avg_cols = df.columns[df.columns.str.contains('Avg')]
df[['HC', 'AC', *avg_cols]].corr()['HC']

HC                        1.000000
AC                       -0.212092
FTHGLast5Avg              0.115524
HSLast5Avg                0.128947
HSTLast5Avg               0.141257
HCLast5Avg                0.143956
FTAGLast5Avg             -0.082864
ASLast5Avg               -0.087861
ASTLast5Avg              -0.068250
ACLast5Avg               -0.060898
HomeFTGForLast5Avg        0.111412
HomeSForLast5Avg          0.132487
HomeSTForLast5Avg         0.142485
HomeCForLast5Avg          0.147292
AwayFTGForLast5Avg       -0.088057
AwaySForLast5Avg         -0.084867
AwaySTForLast5Avg        -0.071270
AwayCForLast5Avg         -0.061582
HomeFTGAgainstLast5Avg   -0.073826
HomeSAgainstLast5Avg     -0.088265
HomeSTAgainstLast5Avg    -0.056608
HomeCAgainstLast5Avg     -0.065793
AwayFTGAgainstLast5Avg    0.095167
AwaySAgainstLast5Avg      0.135627
AwaySTAgainstLast5Avg     0.128859
AwayCAgainstLast5Avg      0.137831
Name: HC, dtype: float64

In [20]:
# Correlation of HC and every engineered average feature with AC.
# The feature columns are selected by name (same 'Avg' match used when merging them in)
# rather than listed by hand, so this cell stays in sync with the feature-engineering
# cells above.
avg_cols = df.columns[df.columns.str.contains('Avg')]
df[['HC', 'AC', *avg_cols]].corr()['AC']

HC                       -0.212092
AC                        1.000000
FTHGLast5Avg             -0.080898
HSLast5Avg               -0.082692
HSTLast5Avg              -0.072142
HCLast5Avg               -0.055892
FTAGLast5Avg              0.100206
ASLast5Avg                0.114347
ASTLast5Avg               0.122438
ACLast5Avg                0.131701
HomeFTGForLast5Avg       -0.074419
HomeSForLast5Avg         -0.086877
HomeSTForLast5Avg        -0.074066
HomeCForLast5Avg         -0.056471
AwayFTGForLast5Avg        0.102721
AwaySForLast5Avg          0.114743
AwaySTForLast5Avg         0.126632
AwayCForLast5Avg          0.141172
HomeFTGAgainstLast5Avg    0.088237
HomeSAgainstLast5Avg      0.118271
HomeSTAgainstLast5Avg     0.115207
HomeCAgainstLast5Avg      0.140014
AwayFTGAgainstLast5Avg   -0.073325
AwaySAgainstLast5Avg     -0.094724
AwaySTAgainstLast5Avg    -0.064278
AwayCAgainstLast5Avg     -0.058662
Name: AC, dtype: float64

## END