In [1]:
# Reload imported modules automatically before each cell runs, so edits to
# local modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [2]:
# Two-digit code of the current season. It is used below to build the download
# URL (SEASON followed by SEASON+1), to name the raw-data folder, and as the
# upper bound of the list of seasons that get loaded.
SEASON = '23'

In [3]:
# Download the latest results for the current season.
# The URL segment is SEASON followed by SEASON+1 (e.g. '23' -> '2324').
# `-q` silences wget; `-O` writes the archive to raw_data/data.zip.
!wget -q https://www.football-data.co.uk/mmz4281/{SEASON}{int(SEASON)+1}/data.zip -O raw_data/data.zip

# Unzip into raw_data/<SEASON>: `-q` quiet, `-o` overwrite existing files
# without prompting, `-d` destination folder.
!unzip -q -o raw_data/data.zip -d raw_data/{SEASON}

In [4]:
from fastai.tabular.all import * 
from utils import *

In [5]:
# Folder holding the downloaded season archives/folders, and the folder the
# processed dataset is written to (both relative to the notebook).
path_raw, path_data = map(Path, ('raw_data', 'data'))

In [6]:
!ls -lah -t {str(path_raw)}

total 17768
drwxr-xr-x  24 twtang  staff   768B Oct 21 00:39 [1m[36m23[m[m
-rw-r--r--   1 twtang  staff   258K Oct 17 18:00 data.zip
-rw-r--r--@  1 twtang  staff   6.0K Sep 30 11:50 .DS_Store
drwxr-xr-x  29 twtang  staff   928B Aug 11 21:18 [1m[36m.[m[m
drwxr-xr-x  24 twtang  staff   768B Aug 11 21:18 [1m[36m22[m[m
-rw-r--r--   1 twtang  staff   1.0M Jun 22 18:08 22.zip
drwxr-xr-x  22 twtang  staff   704B Sep 12  2022 [1m[36m..[m[m
drwx------@ 24 twtang  staff   768B Aug  2  2022 [1m[36m21[m[m
-rw-r--r--@  1 twtang  staff   1.0M Aug  2  2022 21.zip
drwx------@ 24 twtang  staff   768B Nov 20  2021 [1m[36m13[m[m
drwx------@ 24 twtang  staff   768B Nov 20  2021 [1m[36m12[m[m
drwx------@ 24 twtang  staff   768B Nov 20  2021 [1m[36m11[m[m
drwx------@ 24 twtang  staff   768B Nov 20  2021 [1m[36m14[m[m
drwx------@ 24 twtang  staff   768B Oct 30  2021 [1m[36m20[m[m
drwx------@ 24 twtang  staff   768B Oct 30  2021 [1m[36m19[m[m
drwx------@ 24 twtang  

In [7]:
# Every entry of the raw-data folder, in sorted path order.
sorted(path_raw.iterdir())

[Path('raw_data/.DS_Store'),
 Path('raw_data/11'),
 Path('raw_data/11.zip'),
 Path('raw_data/12'),
 Path('raw_data/12.zip'),
 Path('raw_data/13'),
 Path('raw_data/13.zip'),
 Path('raw_data/14'),
 Path('raw_data/14.zip'),
 Path('raw_data/15'),
 Path('raw_data/15.zip'),
 Path('raw_data/16'),
 Path('raw_data/16.zip'),
 Path('raw_data/17'),
 Path('raw_data/17.zip'),
 Path('raw_data/18'),
 Path('raw_data/18.zip'),
 Path('raw_data/19'),
 Path('raw_data/19.zip'),
 Path('raw_data/20'),
 Path('raw_data/20.zip'),
 Path('raw_data/21'),
 Path('raw_data/21.zip'),
 Path('raw_data/22'),
 Path('raw_data/22.zip'),
 Path('raw_data/23'),
 Path('raw_data/data.zip')]

In [8]:
# Columns kept from every raw CSV; all other columns are ignored on read.
usecols = [
    'Div', 'Date', 'HomeTeam', 'AwayTeam',
    'HC', 'AC',
    'FTHG', 'FTAG',
    'HS', 'AS',
    'HST', 'AST',
]
# Force these two columns to float on read.
dtype = dict(HC='float', AC='float')
parse_dates = ['Date']

# Two-digit season codes from '11' up to and including the current SEASON.
seasons = [f'{year:02d}' for year in range(11, int(SEASON) + 1)]

In [9]:
# Read every CSV of every requested season into its own DataFrame and collect
# them in `dfs`; each frame is tagged with the season folder it came from.
dfs = []

for folder in sorted(path_raw.iterdir()):
    # Only folders whose name is one of the requested season codes; this skips
    # the zip archives, .DS_Store and any other stray entries.
    if not (folder.is_dir() and folder.name in seasons):
        continue
    for file in sorted(folder.glob('*.csv')):
        try:
            season_df = pd.read_csv(
                file,
                usecols=usecols,
                dtype=dtype,
                parse_dates=parse_dates,
                dayfirst=True,
            )
        except (ValueError, OSError) as err:
            # Best effort: a file that cannot be read with this schema (e.g. a
            # requested column is missing, or it is empty/malformed/undecodable;
            # pandas reports these as ValueError subclasses) is skipped, but it
            # is reported instead of being dropped silently. Unrelated errors
            # (typos, KeyboardInterrupt, ...) are no longer swallowed.
            print(f'Skipping {file}: {type(err).__name__}: {err}')
            continue
        season_df['Season'] = folder.name
        dfs.append(season_df)

In [10]:
# Stack all per-file frames, drop rows with any missing value, and put the
# matches in (division, date, home team) order with a fresh 0..n-1 index.
df = (
    pd.concat(dfs)
    .dropna()
    .sort_values(['Div', 'Date', 'HomeTeam'])
    .reset_index(drop=True)
)

In [11]:
# Make features on historical stats (Home and Away)
stats = ['FTHG', 'HS', 'HST', 'HC', 'FTAG', 'AS', 'AST', 'AC']
df_home, df_away = joinLastGamesStatsHomeAway(df, stats)

In [12]:
# Make features on historical stats (For and Against)
stats = [('FTHG', 'FTAG', 'FTG'), ('HS', 'AS', 'S'), ('HST', 'AST', 'ST'), ('HC', 'AC', 'C')]
df_for, df_against = joinLastGamesStatsForAgainst(df, stats)

In [13]:
# Left-join the venue-specific features onto the matches: home features by
# (HomeTeam, Date), away features by (AwayTeam, Date).
df = (
    df
    .merge(df_home, how='left', on=['HomeTeam', 'Date'])
    .merge(df_away, how='left', on=['AwayTeam', 'Date'])
)

In [14]:
# Attach each team's "For" averages to the match rows twice: once for the home
# side (columns prefixed 'Home', joined on HomeTeam) and once for the away
# side (columns prefixed 'Away', joined on AwayTeam).
avg_cols = df_for.columns[df_for.columns.str.contains('Avg')]

cols_home = dict(zip(avg_cols, 'Home' + avg_cols))
cols_home.update({'Team': 'HomeTeam'})

# Built from the Avg columns directly. The previous version zipped over the
# already-built `cols_home` dict, which only worked through dict ordering and
# zip truncation.
cols_away = dict(zip(avg_cols, 'Away' + avg_cols))
cols_away.update({'Team': 'AwayTeam'})

df = (
    df
    .merge(df_for.rename(columns=cols_home), 'left', ['HomeTeam', 'Date'])
    .merge(df_for.rename(columns=cols_away), 'left', ['AwayTeam', 'Date'])
)

In [15]:
# Attach each team's "Against" averages to the match rows twice: once for the
# home side (columns prefixed 'Home', joined on HomeTeam) and once for the away
# side (columns prefixed 'Away', joined on AwayTeam).
avg_cols = df_against.columns[df_against.columns.str.contains('Avg')]

cols_home = dict(zip(avg_cols, 'Home' + avg_cols))
cols_home.update({'Team': 'HomeTeam'})

# Built from the Avg columns directly. The previous version zipped over the
# already-built `cols_home` dict, which only worked through dict ordering and
# zip truncation.
cols_away = dict(zip(avg_cols, 'Away' + avg_cols))
cols_away.update({'Team': 'AwayTeam'})

df = (
    df
    .merge(df_against.rename(columns=cols_home), 'left', ['HomeTeam', 'Date'])
    .merge(df_against.rename(columns=cols_away), 'left', ['AwayTeam', 'Date'])
)

In [16]:
# Restore the (division, date, home team) ordering after the merges and
# renumber the rows from zero.
sort_keys = ['Div', 'Date', 'HomeTeam']
df = df.sort_values(by=sort_keys).reset_index(drop=True)

In [17]:
# Persist the processed dataset (without the index column) as data/data.csv.
df.to_csv(path_data.joinpath('data.csv'), index=False)

In [18]:
# Inspect the columns of the final dataset: the raw match stats, the Season
# tag, and the merged '...Last5Avg' feature columns.
df.columns

Index(['Div', 'Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'HS', 'AS',
       'HST', 'AST', 'HC', 'AC', 'Season', 'FTHGLast5Avg', 'HSLast5Avg',
       'HSTLast5Avg', 'HCLast5Avg', 'FTAGLast5Avg', 'ASLast5Avg',
       'ASTLast5Avg', 'ACLast5Avg', 'HomeFTGForLast5Avg', 'HomeSForLast5Avg',
       'HomeSTForLast5Avg', 'HomeCForLast5Avg', 'AwayFTGForLast5Avg',
       'AwaySForLast5Avg', 'AwaySTForLast5Avg', 'AwayCForLast5Avg',
       'HomeFTGAgainstLast5Avg', 'HomeSAgainstLast5Avg',
       'HomeSTAgainstLast5Avg', 'HomeCAgainstLast5Avg',
       'AwayFTGAgainstLast5Avg', 'AwaySAgainstLast5Avg',
       'AwaySTAgainstLast5Avg', 'AwayCAgainstLast5Avg'],
      dtype='object')

In [19]:
# Correlation of both corner columns and every '...Last5Avg' feature with HC.
corr_cols = [
    'HC', 'AC',
    # venue-specific averages
    'FTHGLast5Avg', 'HSLast5Avg', 'HSTLast5Avg', 'HCLast5Avg',
    'FTAGLast5Avg', 'ASLast5Avg', 'ASTLast5Avg', 'ACLast5Avg',
    # "For" averages of the home and away team
    'HomeFTGForLast5Avg', 'HomeSForLast5Avg', 'HomeSTForLast5Avg', 'HomeCForLast5Avg',
    'AwayFTGForLast5Avg', 'AwaySForLast5Avg', 'AwaySTForLast5Avg', 'AwayCForLast5Avg',
    # "Against" averages of the home and away team
    'HomeFTGAgainstLast5Avg', 'HomeSAgainstLast5Avg', 'HomeSTAgainstLast5Avg', 'HomeCAgainstLast5Avg',
    'AwayFTGAgainstLast5Avg', 'AwaySAgainstLast5Avg', 'AwaySTAgainstLast5Avg', 'AwayCAgainstLast5Avg',
]
corr_matrix = df[corr_cols].corr()
corr_matrix['HC']

HC                        1.000000
AC                       -0.215549
FTHGLast5Avg              0.117667
HSLast5Avg                0.130985
HSTLast5Avg               0.141579
HCLast5Avg                0.147214
FTAGLast5Avg             -0.082327
ASLast5Avg               -0.088679
ASTLast5Avg              -0.068605
ACLast5Avg               -0.062130
HomeFTGForLast5Avg        0.113322
HomeSForLast5Avg          0.135148
HomeSTForLast5Avg         0.142587
HomeCForLast5Avg          0.149133
AwayFTGForLast5Avg       -0.087431
AwaySForLast5Avg         -0.086726
AwaySTForLast5Avg        -0.072691
AwayCForLast5Avg         -0.064305
HomeFTGAgainstLast5Avg   -0.075209
HomeSAgainstLast5Avg     -0.090700
HomeSTAgainstLast5Avg    -0.058871
HomeCAgainstLast5Avg     -0.067748
AwayFTGAgainstLast5Avg    0.097125
AwaySAgainstLast5Avg      0.135916
AwaySTAgainstLast5Avg     0.129868
AwayCAgainstLast5Avg      0.138072
Name: HC, dtype: float64

In [20]:
# Correlation of both corner columns and every '...Last5Avg' feature with AC.
corr_cols = [
    'HC', 'AC',
    # venue-specific averages
    'FTHGLast5Avg', 'HSLast5Avg', 'HSTLast5Avg', 'HCLast5Avg',
    'FTAGLast5Avg', 'ASLast5Avg', 'ASTLast5Avg', 'ACLast5Avg',
    # "For" averages of the home and away team
    'HomeFTGForLast5Avg', 'HomeSForLast5Avg', 'HomeSTForLast5Avg', 'HomeCForLast5Avg',
    'AwayFTGForLast5Avg', 'AwaySForLast5Avg', 'AwaySTForLast5Avg', 'AwayCForLast5Avg',
    # "Against" averages of the home and away team
    'HomeFTGAgainstLast5Avg', 'HomeSAgainstLast5Avg', 'HomeSTAgainstLast5Avg', 'HomeCAgainstLast5Avg',
    'AwayFTGAgainstLast5Avg', 'AwaySAgainstLast5Avg', 'AwaySTAgainstLast5Avg', 'AwayCAgainstLast5Avg',
]
corr_matrix = df[corr_cols].corr()
corr_matrix['AC']

HC                       -0.215549
AC                        1.000000
FTHGLast5Avg             -0.085666
HSLast5Avg               -0.089362
HSTLast5Avg              -0.075328
HCLast5Avg               -0.059826
FTAGLast5Avg              0.101157
ASLast5Avg                0.112730
ASTLast5Avg               0.121108
ACLast5Avg                0.130170
HomeFTGForLast5Avg       -0.079458
HomeSForLast5Avg         -0.092689
HomeSTForLast5Avg        -0.076866
HomeCForLast5Avg         -0.060471
AwayFTGForLast5Avg        0.104194
AwaySForLast5Avg          0.113593
AwaySTForLast5Avg         0.125156
AwayCForLast5Avg          0.141241
HomeFTGAgainstLast5Avg    0.093009
HomeSAgainstLast5Avg      0.122797
HomeSTAgainstLast5Avg     0.119967
HomeCAgainstLast5Avg      0.142030
AwayFTGAgainstLast5Avg   -0.074139
AwaySAgainstLast5Avg     -0.095225
AwaySTAgainstLast5Avg    -0.065749
AwayCAgainstLast5Avg     -0.059024
Name: AC, dtype: float64

## END