In [1]:
# Load (or reload) the autoreload extension so edited local modules are picked up.
%reload_ext autoreload
# Mode 2: reload all imported modules before each cell execution.
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
# Two-digit code of the current season; it is interpolated into the download
# URL and used as the name of the sub-folder the archive is extracted into.
SEASON = "21"

In [47]:
# Download latest results in current season
!wget -q https://www.football-data.co.uk/mmz4281/{SEASON}{int(SEASON)+1}/data.zip -O raw_data/data.zip

# Unzip to folder
!unzip -q -o raw_data/data.zip -d raw_data/{SEASON}

In [4]:
from fastai.tabular.all import * 
from utils import *

In [5]:
# Folder holding the downloaded season archives / extracted CSVs, and the
# folder the processed dataset is written to.
path_raw, path_data = Path('raw_data'), Path('data')

In [6]:
# List the raw-data folder (long format, hidden entries, human-readable sizes),
# sorted by modification time with the newest entries first (-t).
!ls -lah -t {str(path_raw)}

total 11M
drwxrwxr-x  2 twtang twtang 4.0K Oct 29 22:29 21
drwxrwxr-x 10 twtang twtang 4.0K Oct 29 22:29 ..
-rw-rw-r--  1 twtang twtang 314K Oct 29 06:22 data.zip
drwxrwxr-x 24 twtang twtang 4.0K Oct 26 21:23 .
drwxrwxr-x  2 twtang twtang 4.0K Sep 16 16:22 20
drwxrwxr-x  2 twtang twtang 4.0K Sep  9 16:23 00
drwxrwxr-x  2 twtang twtang 4.0K Sep  9 16:22 07
drwxrwxr-x  2 twtang twtang 4.0K Sep  9 16:22 08
drwxrwxr-x  2 twtang twtang 4.0K Sep  9 16:22 09
drwxrwxr-x  2 twtang twtang 4.0K Sep  9 16:22 06
drwxrwxr-x  2 twtang twtang 4.0K Sep  9 16:22 05
drwxrwxr-x  2 twtang twtang 4.0K Sep  9 16:22 04
drwxrwxr-x  2 twtang twtang 4.0K Sep  9 16:22 03
drwxrwxr-x  2 twtang twtang 4.0K Sep  9 16:22 02
drwxrwxr-x  2 twtang twtang 4.0K Sep  9 16:22 01
drwxrwxr-x  2 twtang twtang 4.0K Sep  9 16:22 10
drwxrwxr-x  2 twtang twtang 4.0K Sep  9 16:22 11
drwxrwxr-x  2 twtang twtang 4.0K Sep  9 16:22 12
drwxrwxr-x  2 twtang twtang 4.0K Sep  9 16:21 13
drwxrwxr-x  2 twtang twtang 4.0K Sep  9 16:21 14
drwxr

In [7]:
# Show every entry directly under the raw-data folder in sorted order.
# `sorted` already materialises the iterator into a list.
sorted(path_raw.iterdir())

[Path('raw_data/00'),
 Path('raw_data/00.zip'),
 Path('raw_data/01'),
 Path('raw_data/01.zip'),
 Path('raw_data/02'),
 Path('raw_data/02.zip'),
 Path('raw_data/03'),
 Path('raw_data/03.zip'),
 Path('raw_data/04'),
 Path('raw_data/04.zip'),
 Path('raw_data/05'),
 Path('raw_data/05.zip'),
 Path('raw_data/06'),
 Path('raw_data/06.zip'),
 Path('raw_data/07'),
 Path('raw_data/07.zip'),
 Path('raw_data/08'),
 Path('raw_data/08.zip'),
 Path('raw_data/09'),
 Path('raw_data/09.zip'),
 Path('raw_data/10'),
 Path('raw_data/10.zip'),
 Path('raw_data/11'),
 Path('raw_data/11.zip'),
 Path('raw_data/12'),
 Path('raw_data/12.zip'),
 Path('raw_data/13'),
 Path('raw_data/13.zip'),
 Path('raw_data/14'),
 Path('raw_data/14.zip'),
 Path('raw_data/15'),
 Path('raw_data/16'),
 Path('raw_data/16.zip'),
 Path('raw_data/17'),
 Path('raw_data/17.zip'),
 Path('raw_data/18'),
 Path('raw_data/18.zip'),
 Path('raw_data/19'),
 Path('raw_data/19.zip'),
 Path('raw_data/20'),
 Path('raw_data/20.zip'),
 Path('raw_data/21

In [60]:
# Columns read from every season CSV; all other columns are ignored.
usecols = [
    'Div', 'Date', 'HomeTeam', 'AwayTeam',
    'HC', 'AC',
    'FTHG', 'FTAG',
    'HS', 'AS',
    'HST', 'AST',
]
# Force the two corner columns to float when parsing.
dtype = dict(HC='float', AC='float')
parse_dates = ['Date']

# Two-digit season folder names '10' .. '21' (the upper bound of range is exclusive).
# NOTE(review): this range is hard-coded independently of SEASON - confirm it
# should be bumped together with SEASON when a new season starts.
seasons = [f'{year:02d}' for year in range(10, 22)]

In [61]:
# Read every CSV of the selected seasons into a list of frames, tagging each
# frame with the season folder it came from.
dfs = []
# (file, error) pairs for CSVs that could not be read with the requested
# columns, kept so dropped files are visible instead of silently ignored.
skipped = []

for folder in sorted(path_raw.iterdir()):
    # Only season folders listed in `seasons` are loaded; zip files and other
    # seasons are ignored.
    if not (folder.is_dir() and folder.name in seasons):
        continue
    for file in sorted(folder.glob('*.csv')):
        try:
            df = pd.read_csv(file, usecols=usecols, dtype=dtype, parse_dates=parse_dates, dayfirst=True)
        except Exception as err:
            # Best-effort load: a file that cannot be parsed (e.g. it lacks one
            # of `usecols`) is skipped, but reported. Catching Exception rather
            # than using a bare `except` lets KeyboardInterrupt / SystemExit
            # propagate.
            skipped.append((file, err))
            print(f'Skipping {file}: {type(err).__name__}: {err}')
            continue
        df['Season'] = folder.name
        dfs.append(df)

In [62]:
# Stack all season frames, drop rows with any missing value, and order the
# matches by division, date and home team with a fresh 0..n-1 index.
df = (
    pd.concat(dfs)
    .dropna()
    .sort_values(['Div', 'Date', 'HomeTeam'])
    .reset_index(drop=True)
)

In [63]:
# Make features on historical stats (Home and Away).
# `joinLastGamesStatsHomeAway` comes from `utils`; presumably it returns one
# frame keyed by HomeTeam/Date and one keyed by AwayTeam/Date (that is how they
# are merged back later) - confirm against utils.
stats = [
    'FTHG', 'HS', 'HST', 'HC',
    'FTAG', 'AS', 'AST', 'AC',
]
df_home, df_away = joinLastGamesStatsHomeAway(df, stats)

In [64]:
# Make features on historical stats (For and Against).
# Each tuple pairs two source columns with a third label; that label shows up
# in the generated column names (e.g. 'FTG' -> 'FTGForLast5Avg').
stats = [
    ('FTHG', 'FTAG', 'FTG'),
    ('HS', 'AS', 'S'),
    ('HST', 'AST', 'ST'),
    ('HC', 'AC', 'C'),
]
df_for, df_against = joinLastGamesStatsForAgainst(df, stats)

In [65]:
# Attach the home-side features by (HomeTeam, Date) and the away-side features
# by (AwayTeam, Date); left joins keep every match row.
df = (
    df.merge(df_home, how='left', on=['HomeTeam', 'Date'])
      .merge(df_away, how='left', on=['AwayTeam', 'Date'])
)

In [66]:
# Join the "for" averages twice: once for the home team and once for the away
# team, prefixing the feature columns with 'Home' / 'Away' respectively.
avg_cols = df_for.columns[df_for.columns.str.contains('Avg')]

# Rename maps: every Avg column gets the side prefix, and the generic 'Team'
# key column becomes the matching team column of the match frame.
cols_home = dict(zip(avg_cols, 'Home' + avg_cols))
cols_home.update({'Team': 'HomeTeam'})

# Zip over the Avg columns themselves. The previous code zipped over the
# `cols_home` dict, which only produced the right mapping because dict keys
# iterate in insertion order and zip stops at the shorter input.
cols_away = dict(zip(avg_cols, 'Away' + avg_cols))
cols_away.update({'Team': 'AwayTeam'})

df = (
    df.merge(df_for.rename(columns=cols_home), 'left', ['HomeTeam', 'Date'])
      .merge(df_for.rename(columns=cols_away), 'left', ['AwayTeam', 'Date'])
)

In [67]:
# Join the "against" averages twice: once for the home team and once for the
# away team, prefixing the feature columns with 'Home' / 'Away' respectively.
avg_cols = df_against.columns[df_against.columns.str.contains('Avg')]

# Rename maps: every Avg column gets the side prefix, and the generic 'Team'
# key column becomes the matching team column of the match frame.
cols_home = dict(zip(avg_cols, 'Home' + avg_cols))
cols_home.update({'Team': 'HomeTeam'})

# Zip over the Avg columns themselves. The previous code zipped over the
# `cols_home` dict, which only produced the right mapping because dict keys
# iterate in insertion order and zip stops at the shorter input.
cols_away = dict(zip(avg_cols, 'Away' + avg_cols))
cols_away.update({'Team': 'AwayTeam'})

df = (
    df.merge(df_against.rename(columns=cols_home), 'left', ['HomeTeam', 'Date'])
      .merge(df_against.rename(columns=cols_away), 'left', ['AwayTeam', 'Date'])
)

In [68]:
# Restore the division / date / home-team ordering after the merges and
# renumber the index from zero.
sort_keys = ['Div', 'Date', 'HomeTeam']
df = df.sort_values(by=sort_keys).reset_index(drop=True)

In [69]:
# Persist the feature frame as data.csv in the processed-data folder,
# without writing the index column.
out_file = path_data / 'data.csv'
df.to_csv(out_file, index=False)

In [70]:
# Display the columns of the final frame to check that the merged feature
# columns are present.
df.columns

Index(['Div', 'Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'HS', 'AS',
       'HST', 'AST', 'HC', 'AC', 'Season', 'FTHGLast5Avg', 'HSLast5Avg',
       'HSTLast5Avg', 'HCLast5Avg', 'FTAGLast5Avg', 'ASLast5Avg',
       'ASTLast5Avg', 'ACLast5Avg', 'HomeFTGForLast5Avg', 'HomeSForLast5Avg',
       'HomeSTForLast5Avg', 'HomeCForLast5Avg', 'AwayFTGForLast5Avg',
       'AwaySForLast5Avg', 'AwaySTForLast5Avg', 'AwayCForLast5Avg',
       'HomeFTGAgainstLast5Avg', 'HomeSAgainstLast5Avg',
       'HomeSTAgainstLast5Avg', 'HomeCAgainstLast5Avg',
       'AwayFTGAgainstLast5Avg', 'AwaySAgainstLast5Avg',
       'AwaySTAgainstLast5Avg', 'AwayCAgainstLast5Avg'],
      dtype='object')

In [71]:
# Pairwise correlation of the two target columns and all rolling-average
# features, showing only the column of correlations with 'HC'.
corr_cols = [
    'HC', 'AC',
    # per-venue averages
    'FTHGLast5Avg', 'HSLast5Avg', 'HSTLast5Avg', 'HCLast5Avg',
    'FTAGLast5Avg', 'ASLast5Avg', 'ASTLast5Avg', 'ACLast5Avg',
    # "for" averages
    'HomeFTGForLast5Avg', 'HomeSForLast5Avg', 'HomeSTForLast5Avg', 'HomeCForLast5Avg',
    'AwayFTGForLast5Avg', 'AwaySForLast5Avg', 'AwaySTForLast5Avg', 'AwayCForLast5Avg',
    # "against" averages
    'HomeFTGAgainstLast5Avg', 'HomeSAgainstLast5Avg', 'HomeSTAgainstLast5Avg', 'HomeCAgainstLast5Avg',
    'AwayFTGAgainstLast5Avg', 'AwaySAgainstLast5Avg', 'AwaySTAgainstLast5Avg', 'AwayCAgainstLast5Avg',
]
df[corr_cols].corr()['HC']

HC                        1.000000
AC                       -0.201772
FTHGLast5Avg              0.109885
HSLast5Avg                0.124224
HSTLast5Avg               0.138204
HCLast5Avg                0.145553
FTAGLast5Avg             -0.078255
ASLast5Avg               -0.085692
ASTLast5Avg              -0.061220
ACLast5Avg               -0.056391
HomeFTGForLast5Avg        0.107286
HomeSForLast5Avg          0.126753
HomeSTForLast5Avg         0.139080
HomeCForLast5Avg          0.147651
AwayFTGForLast5Avg       -0.083235
AwaySForLast5Avg         -0.080685
AwaySTForLast5Avg        -0.060688
AwayCForLast5Avg         -0.056640
HomeFTGAgainstLast5Avg   -0.070043
HomeSAgainstLast5Avg     -0.080986
HomeSTAgainstLast5Avg    -0.046370
HomeCAgainstLast5Avg     -0.058521
AwayFTGAgainstLast5Avg    0.095180
AwaySAgainstLast5Avg      0.131915
AwaySTAgainstLast5Avg     0.131754
AwayCAgainstLast5Avg      0.139409
Name: HC, dtype: float64

In [72]:
# Pairwise correlation of the two target columns and all rolling-average
# features, showing only the column of correlations with 'AC'.
corr_cols = [
    'HC', 'AC',
    # per-venue averages
    'FTHGLast5Avg', 'HSLast5Avg', 'HSTLast5Avg', 'HCLast5Avg',
    'FTAGLast5Avg', 'ASLast5Avg', 'ASTLast5Avg', 'ACLast5Avg',
    # "for" averages
    'HomeFTGForLast5Avg', 'HomeSForLast5Avg', 'HomeSTForLast5Avg', 'HomeCForLast5Avg',
    'AwayFTGForLast5Avg', 'AwaySForLast5Avg', 'AwaySTForLast5Avg', 'AwayCForLast5Avg',
    # "against" averages
    'HomeFTGAgainstLast5Avg', 'HomeSAgainstLast5Avg', 'HomeSTAgainstLast5Avg', 'HomeCAgainstLast5Avg',
    'AwayFTGAgainstLast5Avg', 'AwaySAgainstLast5Avg', 'AwaySTAgainstLast5Avg', 'AwayCAgainstLast5Avg',
]
df[corr_cols].corr()['AC']

HC                       -0.201772
AC                        1.000000
FTHGLast5Avg             -0.073731
HSLast5Avg               -0.081401
HSTLast5Avg              -0.063083
HCLast5Avg               -0.051128
FTAGLast5Avg              0.096929
ASLast5Avg                0.105602
ASTLast5Avg               0.122032
ACLast5Avg                0.129273
HomeFTGForLast5Avg       -0.069478
HomeSForLast5Avg         -0.084400
HomeSTForLast5Avg        -0.064282
HomeCForLast5Avg         -0.050652
AwayFTGForLast5Avg        0.097883
AwaySForLast5Avg          0.107194
AwaySTForLast5Avg         0.124387
AwayCForLast5Avg          0.138827
HomeFTGAgainstLast5Avg    0.087523
HomeSAgainstLast5Avg      0.112796
HomeSTAgainstLast5Avg     0.115713
HomeCAgainstLast5Avg      0.138411
AwayFTGAgainstLast5Avg   -0.070855
AwaySAgainstLast5Avg     -0.093404
AwaySTAgainstLast5Avg    -0.057423
AwayCAgainstLast5Avg     -0.054753
Name: AC, dtype: float64