Extract the data from http://www.football-data.co.uk/, Transform them and Load them in a Database.

In [1]:
# Import library
import json
import sqlite3
import requests
import io
import pandas as pd # data processing, CSV file I/O
from IPython.display import display # Manage multiple output per cell

In [2]:
# DB Sqlite connection
db = "/Users/thibaultclement/Project/ligue1-predict/src/notebook/data/db/soccer_predict.sqlite"
conn = sqlite3.connect(db)
cur = conn.cursor()

In [3]:
# Clean Database
cur.execute('DROP TABLE IF EXISTS macths_raw')
cur.execute('DROP TABLE IF EXISTS pre_matchs')
conn.commit()

In [4]:
# Configuration
leagues = ['D1', 'E0', 'E1', 'E2', 'F1', 'I1', 'SP1', 'SC0']
seasons = ['1314', '1415', '1516', '1617', '0506', '0607', '0708', '0809','0910', '1011', '1112', '1213']
website = "http://www.football-data.co.uk/mmz4281"

In [5]:
# Crawl Data from internet to get result on a league for a specific season
def crawlLeagueBySeason( season, league ):
    url = website+"/"+season+"/"+league+".csv"
    req = requests.get(url).content
    df = pd.read_csv(io.StringIO(req.decode('utf-8')))
    # Remove not use columns only for premier league
    if 'Attendance' in df.columns:
        df = df[df.columns.drop(['Attendance'])]
    if 'Referee' in df.columns:
        df = df[df.columns.drop(['Referee'])]
    if 'SBH' in df.columns:
        df = df[df.columns.drop(['SBH'])]
    if 'SBD' in df.columns:
        df = df[df.columns.drop(['SBD'])]
    if 'SBA' in df.columns:
        df = df[df.columns.drop(['SBA'])]
    if 'GBH' in df.columns:
        df = df[df.columns.drop(['GBH'])]
    if 'GBA' in df.columns:
        df = df[df.columns.drop(['GBA'])]
    if 'GBD' in df.columns:
        df = df[df.columns.drop(['GBD'])]
    if 'BSH' in df.columns:
        df = df[df.columns.drop(['BSH'])]
    if 'BSD' in df.columns:
        df = df[df.columns.drop(['BSD'])]
    if 'BSA' in df.columns:
        df = df[df.columns.drop(['BSA'])]
    if 'Unnamed: 70' in df.columns:
        df = df[df.columns.drop(['Unnamed: 70'])]
    if 'Unnamed: 71' in df.columns:
        df = df[df.columns.drop(['Unnamed: 71'])]
    if 'Unnamed: 72' in df.columns:
        df = df[df.columns.drop(['Unnamed: 72'])]
    # Transform Date column to Date Type
    if len(df['Date'][0]) == 8:
        df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%y')
    else:
        df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')
    # Insert to DB
    df.to_sql("macths_raw", conn, if_exists="append")
    display(league, season,100. * df.FTR.value_counts() / len(df.FTR))

In [6]:
# Crawl all seasons for all leagues
for league in leagues:
    for season in seasons:
        crawlLeagueBySeason(season, league)

'D1'

'1314'

H    47.385621
A    31.699346
D    20.915033
Name: FTR, dtype: float64

'D1'

'1415'

H    47.385621
D    26.797386
A    25.816993
Name: FTR, dtype: float64

'D1'

'1516'

H    44.117647
A    32.679739
D    23.202614
Name: FTR, dtype: float64

'D1'

'1617'

H    49.019608
A    26.797386
D    24.183007
Name: FTR, dtype: float64

'D1'

'0506'

H    42.810458
D    31.372549
A    25.816993
Name: FTR, dtype: float64

'D1'

'0607'

H    43.790850
A    30.392157
D    25.816993
Name: FTR, dtype: float64

'D1'

'0708'

H    46.732026
A    27.777778
D    25.490196
Name: FTR, dtype: float64

'D1'

'0809'

H    48.039216
A    27.777778
D    24.183007
Name: FTR, dtype: float64

'D1'

'0910'

H    40.849673
A    31.045752
D    28.104575
Name: FTR, dtype: float64

'D1'

'1011'

H    46.078431
A    33.333333
D    20.588235
Name: FTR, dtype: float64

'D1'

'1112'

H    45.424837
A    28.758170
D    25.816993
Name: FTR, dtype: float64

'D1'

'1213'

H    42.483660
A    32.026144
D    25.490196
Name: FTR, dtype: float64

'E0'

'1314'

H    47.105263
A    32.368421
D    20.526316
Name: FTR, dtype: float64

'E0'

'1415'

H    45.144357
A    30.183727
D    24.409449
Name: FTR, dtype: float64

'E0'

'1516'

H    41.315789
A    30.526316
D    28.157895
Name: FTR, dtype: float64

'E0'

'1617'

H    49.210526
A    28.684211
D    22.105263
Name: FTR, dtype: float64

'E0'

'0506'

H    50.526316
A    29.210526
D    20.263158
Name: FTR, dtype: float64

'E0'

'0607'

H    47.894737
A    26.315789
D    25.789474
Name: FTR, dtype: float64

'E0'

'0708'

H    46.315789
A    27.368421
D    26.315789
Name: FTR, dtype: float64

'E0'

'0809'

H    45.526316
A    28.947368
D    25.526316
Name: FTR, dtype: float64

'E0'

'0910'

H    50.789474
D    25.263158
A    23.947368
Name: FTR, dtype: float64

'E0'

'1011'

H    47.105263
D    29.210526
A    23.684211
Name: FTR, dtype: float64

'E0'

'1112'

H    45.000000
A    30.526316
D    24.473684
Name: FTR, dtype: float64

'E0'

'1213'

H    43.684211
D    28.421053
A    27.894737
Name: FTR, dtype: float64

'E1'

'1314'

H    41.304348
A    30.434783
D    28.260870
Name: FTR, dtype: float64

'E1'

'1415'

H    41.229656
A    30.018083
D    28.571429
Name: FTR, dtype: float64

'E1'

'1516'

H    41.123188
D    31.159420
A    27.717391
Name: FTR, dtype: float64

'E1'

'1617'

H    47.463768
A    28.985507
D    23.550725
Name: FTR, dtype: float64

'E1'

'0506'

H    42.391304
D    31.340580
A    26.268116
Name: FTR, dtype: float64

'E1'

'0607'

H    48.188406
A    29.528986
D    22.282609
Name: FTR, dtype: float64

'E1'

'0708'

H    42.391304
D    30.978261
A    26.630435
Name: FTR, dtype: float64

'E1'

'0809'

H    43.297101
D    29.347826
A    27.355072
Name: FTR, dtype: float64

'E1'

'0910'

H    45.289855
D    29.347826
A    25.362319
Name: FTR, dtype: float64

'E1'

'1011'

H    44.565217
A    28.623188
D    26.811594
Name: FTR, dtype: float64

'E1'

'1112'

H    42.753623
A    30.253623
D    26.992754
Name: FTR, dtype: float64

'E1'

'1213'

H    44.565217
A    29.166667
D    26.268116
Name: FTR, dtype: float64

'E2'

'1314'

H    44.202899
A    30.072464
D    25.724638
Name: FTR, dtype: float64

'E2'

'1415'

H    40.398551
A    33.333333
D    26.268116
Name: FTR, dtype: float64

'E2'

'1516'

H    42.857143
A    32.007233
D    24.954792
Name: FTR, dtype: float64

'E2'

'1617'

H    44.927536
D    27.898551
A    27.173913
Name: FTR, dtype: float64

'E2'

'0506'

H    42.391304
D    31.340580
A    26.268116
Name: FTR, dtype: float64

'E2'

'0607'

H    45.108696
A    29.710145
D    25.181159
Name: FTR, dtype: float64

'E2'

'0708'

H    46.557971
A    28.079710
D    25.362319
Name: FTR, dtype: float64

'E2'

'0809'

H    42.753623
A    32.427536
D    24.818841
Name: FTR, dtype: float64

'E2'

'0910'

H    47.282609
D    27.536232
A    25.181159
Name: FTR, dtype: float64

'E2'

'1011'

H    45.108696
A    30.072464
D    24.818841
Name: FTR, dtype: float64

'E2'

'1112'

H    41.847826
D    29.891304
A    28.260870
Name: FTR, dtype: float64

'E2'

'1213'

H    40.144665
A    32.730561
D    26.943942
Name: FTR, dtype: float64

'F1'

'1314'

H    44.094488
D    28.346457
A    27.296588
Name: FTR, dtype: float64

'F1'

'1415'

H    47.631579
A    29.210526
D    23.157895
Name: FTR, dtype: float64

'F1'

'1516'

H    41.994751
A    29.396325
D    28.346457
Name: FTR, dtype: float64

'F1'

'1617'

H    48.947368
A    26.315789
D    24.736842
Name: FTR, dtype: float64

'F1'

'0506'

H    44.736842
D    31.052632
A    24.210526
Name: FTR, dtype: float64

'F1'

'0607'

H    47.631579
D    30.789474
A    21.578947
Name: FTR, dtype: float64

'F1'

'0708'

H    43.947368
D    30.526316
A    25.526316
Name: FTR, dtype: float64

'F1'

'0809'

H    43.421053
D    29.473684
A    27.105263
Name: FTR, dtype: float64

'F1'

'0910'

H    47.105263
A    27.368421
D    25.526316
Name: FTR, dtype: float64

'F1'

'1011'

H    41.315789
D    34.210526
A    24.473684
Name: FTR, dtype: float64

'F1'

'1112'

H    47.105263
D    28.421053
A    24.473684
Name: FTR, dtype: float64

'F1'

'1213'

H    44.736842
D    28.421053
A    26.842105
Name: FTR, dtype: float64

'I1'

'1314'

H    47.631579
A    28.684211
D    23.684211
Name: FTR, dtype: float64

'I1'

'1415'

H    39.895013
D    31.496063
A    28.346457
Name: FTR, dtype: float64

'I1'

'1516'

H    45.931759
A    28.871391
D    24.934383
Name: FTR, dtype: float64

'I1'

'1617'

H    48.421053
A    30.526316
D    21.052632
Name: FTR, dtype: float64

'I1'

'0506'

H    46.315789
D    28.421053
A    25.263158
Name: FTR, dtype: float64

'I1'

'0607'

H    45.526316
D    30.000000
A    24.473684
Name: FTR, dtype: float64

'I1'

'0708'

H    46.052632
D    29.473684
A    24.473684
Name: FTR, dtype: float64

'I1'

'0809'

H    50.526316
D    25.000000
A    24.473684
Name: FTR, dtype: float64

'I1'

'0910'

H    48.947368
D    26.842105
A    24.210526
Name: FTR, dtype: float64

'I1'

'1011'

H    47.105263
A    27.368421
D    25.526316
Name: FTR, dtype: float64

'I1'

'1112'

H    45.526316
D    29.210526
A    25.263158
Name: FTR, dtype: float64

'I1'

'1213'

H    46.214099
A    27.937337
D    25.065274
Name: FTR, dtype: float64

'SP1'

'1314'

H    47.105263
A    30.263158
D    22.631579
Name: FTR, dtype: float64

'SP1'

'1415'

H    45.000000
A    31.052632
D    23.947368
Name: FTR, dtype: float64

'SP1'

'1516'

H    48.157895
A    27.631579
D    24.210526
Name: FTR, dtype: float64

'SP1'

'1617'

H    47.631579
A    28.947368
D    23.421053
Name: FTR, dtype: float64

'SP1'

'0506'

H    42.631579
A    29.736842
D    27.631579
Name: FTR, dtype: float64

'SP1'

'0607'

H    45.263158
A    28.947368
D    25.789474
Name: FTR, dtype: float64

'SP1'

'0708'

H    47.894737
A    29.210526
D    22.894737
Name: FTR, dtype: float64

'SP1'

'0809'

H    48.421053
A    29.736842
D    21.842105
Name: FTR, dtype: float64

'SP1'

'0910'

H    51.052632
D    25.000000
A    23.947368
Name: FTR, dtype: float64

'SP1'

'1011'

H    51.842105
A    27.368421
D    20.789474
Name: FTR, dtype: float64

'SP1'

'1112'

H    49.473684
A    25.789474
D    24.736842
Name: FTR, dtype: float64

'SP1'

'1213'

H    49.736842
A    28.157895
D    22.105263
Name: FTR, dtype: float64

'SC0'

'1314'

H    44.736842
A    33.333333
D    21.929825
Name: FTR, dtype: float64

'SC0'

'1415'

H    44.736842
A    35.964912
D    19.298246
Name: FTR, dtype: float64

'SC0'

'1516'

H    40.789474
A    36.403509
D    22.807018
Name: FTR, dtype: float64

'SC0'

'1617'

H    41.228070
A    33.333333
D    25.438596
Name: FTR, dtype: float64

'SC0'

'0506'

H    42.54386
A    32.45614
D    25.00000
Name: FTR, dtype: float64

'SC0'

'0607'

H    43.859649
A    32.456140
D    23.684211
Name: FTR, dtype: float64

'SC0'

'0708'

H    50.000000
A    28.947368
D    21.052632
Name: FTR, dtype: float64

'SC0'

'0809'

H    42.105263
A    32.017544
D    25.877193
Name: FTR, dtype: float64

'SC0'

'0910'

H    41.228070
A    31.140351
D    27.631579
Name: FTR, dtype: float64

'SC0'

'1011'

H    41.228070
A    37.280702
D    21.491228
Name: FTR, dtype: float64

'SC0'

'1112'

H    39.473684
A    35.526316
D    25.000000
Name: FTR, dtype: float64

'SC0'

'1213'

H    39.035088
D    32.017544
A    28.947368
Name: FTR, dtype: float64

In [7]:
# Get all data for pre match on away team
def homeData( date, team, div, nb_matches, nb_matches_string ):
    # Dataframe to return with all info
    dic = {}
    # Home team query
    #TODO Recuperer aussi combien de buts ils se sont pris dans la tronche et tout et tout
    queryHome = '''
            SELECT Date, FTHG, FTR, HTHG, HTR, HS, HST, HF, HC, HY, HR, FTAG, HTAG, `AS`, AST, AF, AC, AY, AR
            FROM macths_raw
            WHERE Date < ? AND HomeTeam = ? AND Div = ? ORDER BY Date DESC LIMIT ?'''
    # Get the previous home game of the Home Team
    df_home = pd.read_sql(queryHome, conn, params=[date, team, div, nb_matches])
    # Hot-encode Category Full Time Result and Half Time Result
    df_home = pd.get_dummies(df_home, columns=['FTR', 'HTR'])
    # Calculate the mean of all columns
    #display(df_home.head())
    dic['H_MEANS_'+nb_matches_string+'_FTHG'] = round(df_home.FTHG.mean(), 2)
    dic['H_MEANS_'+nb_matches_string+'_FTR_H'] = 0 if 'FTR_H'not in df_home.columns else round(df_home.FTR_H.mean(), 2)
    dic['H_MEANS_'+nb_matches_string+'_FTR_D'] = 0 if 'FTR_D' not in df_home.columns else round(df_home.FTR_D.mean(), 2)
    dic['H_MEANS_'+nb_matches_string+'_FTR_A'] = 0 if 'FTR_A' not in df_home.columns else round(df_home.FTR_A.mean(), 2)
    dic['H_MEANS_'+nb_matches_string+'_HTHG'] = round(df_home.HTHG.mean(), 2)
    dic['H_MEANS_'+nb_matches_string+'_HTR_H'] = 0 if 'HTR_H' not in df_home.columns else round(df_home.HTR_H.mean(), 2)
    dic['H_MEANS_'+nb_matches_string+'_HTR_D'] = 0 if 'HTR_D' not in df_home.columns else round(df_home.HTR_D.mean(), 2)
    dic['H_MEANS_'+nb_matches_string+'_HTR_A'] = 0 if 'HTR_A' not in df_home.columns else round(df_home.HTR_A.mean(), 2)
    dic['H_MEANS_'+nb_matches_string+'_HS'] = round(df_home.HS.mean(), 2)
    dic['H_MEANS_'+nb_matches_string+'_HST'] = round(df_home.HST.mean(), 2)
    dic['H_MEANS_'+nb_matches_string+'_HF'] = round( df_home.HF.mean(), 2)
    dic['H_MEANS_'+nb_matches_string+'_HC'] = round(df_home.HC.mean(), 2)
    dic['H_MEANS_'+nb_matches_string+'_HY'] = round(df_home.HY.mean(), 2)
    dic['H_MEANS_'+nb_matches_string+'_HR'] = round(df_home.HR.mean(), 2)
    dic['H_MEANS_'+nb_matches_string+'_FTAG'] = round(df_home.FTAG.mean(), 2)
    dic['H_MEANS_'+nb_matches_string+'_HTAG'] = round(df_home.HTAG.mean(), 2)
    dic['H_MEANS_'+nb_matches_string+'_AS'] = round(df_home.AS.mean(), 2)
    dic['H_MEANS_'+nb_matches_string+'_AST'] = round(df_home.AST.mean(), 2)
    dic['H_MEANS_'+nb_matches_string+'_AF'] = round(df_home.AF.mean(), 2)
    dic['H_MEANS_'+nb_matches_string+'_AC'] = round(df_home.AC.mean(), 2)
    dic['H_MEANS_'+nb_matches_string+'_AY'] = round(df_home.AY.mean(), 2)
    dic['H_MEANS_'+nb_matches_string+'_AR'] = round(df_home.AR.mean(), 2)
    dic['H_STD_'+nb_matches_string+'_FTHG'] = round(df_home.FTHG.std(), 3)
    dic['H_STD_'+nb_matches_string+'_FTR_H'] = 0 if 'FTR_H'not in df_home.columns else round(df_home.FTR_H.std(), 3)
    dic['H_STD_'+nb_matches_string+'_FTR_D'] = 0 if 'FTR_D' not in df_home.columns else round(df_home.FTR_D.std(), 3)
    dic['H_STD_'+nb_matches_string+'_FTR_A'] = 0 if 'FTR_A' not in df_home.columns else round(df_home.FTR_A.std(), 3)
    dic['H_STD_'+nb_matches_string+'_HTHG'] = round(df_home.HTHG.std(), 3)
    dic['H_STD_'+nb_matches_string+'_HTR_H'] = 0 if 'HTR_H' not in df_home.columns else round(df_home.HTR_H.std(), 3)
    dic['H_STD_'+nb_matches_string+'_HTR_D'] = 0 if 'HTR_D' not in df_home.columns else round(df_home.HTR_D.std(), 3)
    dic['H_STD_'+nb_matches_string+'_HTR_A'] = 0 if 'HTR_A' not in df_home.columns else round(df_home.HTR_A.std(), 3)
    dic['H_STD_'+nb_matches_string+'_HS'] = round(df_home.HS.std(), 3)
    dic['H_STD_'+nb_matches_string+'_HST'] = round(df_home.HST.std(), 3)
    dic['H_STD_'+nb_matches_string+'_HF'] = round( df_home.HF.std(), 3)
    dic['H_STD_'+nb_matches_string+'_HC'] = round(df_home.HC.std(), 3)
    dic['H_STD_'+nb_matches_string+'_HY'] = round(df_home.HY.std(), 3)
    dic['H_STD_'+nb_matches_string+'_HR'] = round(df_home.HR.std(), 3)
    dic['H_STD_'+nb_matches_string+'_FTAG'] = round(df_home.FTAG.std(), 3)
    dic['H_STD_'+nb_matches_string+'_HTAG'] = round(df_home.HTAG.std(), 3)
    dic['H_STD_'+nb_matches_string+'_AS'] = round(df_home.AS.std(), 3)
    dic['H_STD_'+nb_matches_string+'_AST'] = round(df_home.AST.std(), 3)
    dic['H_STD_'+nb_matches_string+'_AF'] = round(df_home.AF.std(), 3)
    dic['H_STD_'+nb_matches_string+'_AC'] = round(df_home.AC.std(), 3)
    dic['H_STD_'+nb_matches_string+'_AY'] = round(df_home.AY.std(), 3)
    dic['H_STD_'+nb_matches_string+'_AR'] = round(df_home.AR.std(), 3)
    return dic

In [8]:
# Get all data for pre match on away team
def awayData( date, team, div, nb_matches, nb_matches_string ):
    # Dataframe to return with all info
    dic = {}
    # away team query
    #TODO Recuperer aussi combien de buts ils se sont pris dans la tronche et tout et tout
    queryAway = '''
            SELECT Date, FTAG, FTR, HTAG, HTR, `AS`, AST, AF, AC, AY, AR, FTHG, HTHG, HS, HST, HF, HC, HY, HR
            FROM macths_raw
            WHERE Date < ? AND AwayTeam = ? AND Div = ? ORDER BY Date DESC LIMIT ?'''
    # Get the previous away game of the away Team
    df_away = pd.read_sql(queryAway, conn, params=[date, team, div, nb_matches])
    # Hot-encode Category Full Time Result and Half Time Result
    df_away = pd.get_dummies(df_away, columns=['FTR', 'HTR'])
    # Calculate the mean of all columns
    #display(df_away.head())
    dic['A_MEANS_'+nb_matches_string+'_FTAG'] = round(df_away.FTAG.mean(), 2)
    dic['A_MEANS_'+nb_matches_string+'_FTR_H'] = 0 if 'FTR_H'not in df_away.columns else round(df_away.FTR_H.mean(), 2)
    dic['A_MEANS_'+nb_matches_string+'_FTR_D'] = 0 if 'FTR_D' not in df_away.columns else round(df_away.FTR_D.mean(), 2)
    dic['A_MEANS_'+nb_matches_string+'_FTR_A'] = 0 if 'FTR_A' not in df_away.columns else round(df_away.FTR_A.mean(), 2)
    dic['A_MEANS_'+nb_matches_string+'_HTAG'] = round(df_away.HTAG.mean(), 2)
    dic['A_MEANS_'+nb_matches_string+'_HTR_H'] = 0 if 'HTR_H' not in df_away.columns else round(df_away.HTR_H.mean(), 2)
    dic['A_MEANS_'+nb_matches_string+'_HTR_D'] = 0 if 'HTR_D' not in df_away.columns else round(df_away.HTR_D.mean(), 2)
    dic['A_MEANS_'+nb_matches_string+'_HTR_A'] = 0 if 'HTR_A' not in df_away.columns else round(df_away.HTR_A.mean(), 2)
    dic['A_MEANS_'+nb_matches_string+'_AS'] = round(df_away.AS.mean(), 2)
    dic['A_MEANS_'+nb_matches_string+'_AST'] = round(df_away.AST.mean(), 2)
    dic['A_MEANS_'+nb_matches_string+'_AF'] = round(df_away.AF.mean(), 2)
    dic['A_MEANS_'+nb_matches_string+'_AC'] = round(df_away.AC.mean(), 2)
    dic['A_MEANS_'+nb_matches_string+'_AY'] = round(df_away.AY.mean(), 2)
    dic['A_MEANS_'+nb_matches_string+'_AR'] = round(df_away.AR.mean(), 2)
    dic['A_MEANS_'+nb_matches_string+'_FTHG'] = round(df_away.FTHG.mean(), 2)
    dic['A_MEANS_'+nb_matches_string+'_HTHG'] = round(df_away.HTHG.mean(), 2)
    dic['A_MEANS_'+nb_matches_string+'_HS'] = round(df_away.HS.mean(), 2)
    dic['A_MEANS_'+nb_matches_string+'_HST'] = round(df_away.HST.mean(), 2)
    dic['A_MEANS_'+nb_matches_string+'_HF'] = round( df_away.HF.mean(), 2)
    dic['A_MEANS_'+nb_matches_string+'_HC'] = round(df_away.HC.mean(), 2)
    dic['A_MEANS_'+nb_matches_string+'_HY'] = round(df_away.HY.mean(), 2)
    dic['A_MEANS_'+nb_matches_string+'_HR'] = round(df_away.HR.mean(), 2)
    dic['A_STD_'+nb_matches_string+'_FTAG'] = round(df_away.FTAG.std(), 3)
    dic['A_STD_'+nb_matches_string+'_FTR_H'] = 0 if 'FTR_H'not in df_away.columns else round(df_away.FTR_H.std(), 3)
    dic['A_STD_'+nb_matches_string+'_FTR_D'] = 0 if 'FTR_D' not in df_away.columns else round(df_away.FTR_D.std(), 3)
    dic['A_STD_'+nb_matches_string+'_FTR_A'] = 0 if 'FTR_A' not in df_away.columns else round(df_away.FTR_A.std(), 3)
    dic['A_STD_'+nb_matches_string+'_HTAG'] = round(df_away.HTAG.std(), 3)
    dic['A_STD_'+nb_matches_string+'_HTR_H'] = 0 if 'HTR_H' not in df_away.columns else round(df_away.HTR_H.std(), 3)
    dic['A_STD_'+nb_matches_string+'_HTR_D'] = 0 if 'HTR_D' not in df_away.columns else round(df_away.HTR_D.std(), 3)
    dic['A_STD_'+nb_matches_string+'_HTR_A'] = 0 if 'HTR_A' not in df_away.columns else round(df_away.HTR_A.std(), 3)
    dic['A_STD_'+nb_matches_string+'_AS'] = round(df_away.AS.std(), 3)
    dic['A_STD_'+nb_matches_string+'_AST'] = round(df_away.AST.std(), 3)
    dic['A_STD_'+nb_matches_string+'_AF'] = round(df_away.AF.std(), 3)
    dic['A_STD_'+nb_matches_string+'_AC'] = round(df_away.AC.std(), 3)
    dic['A_STD_'+nb_matches_string+'_AY'] = round(df_away.AY.std(), 3)
    dic['A_STD_'+nb_matches_string+'_AR'] = round(df_away.AR.std(), 3)
    dic['A_STD_'+nb_matches_string+'_FTHG'] = round(df_away.FTHG.std(), 3)
    dic['A_STD_'+nb_matches_string+'_HTHG'] = round(df_away.HTHG.std(), 3)
    dic['A_STD_'+nb_matches_string+'_HS'] = round(df_away.HS.std(), 3)
    dic['A_STD_'+nb_matches_string+'_HST'] = round(df_away.HST.std(), 3)
    dic['A_STD_'+nb_matches_string+'_HF'] = round( df_away.HF.std(), 3)
    dic['A_STD_'+nb_matches_string+'_HC'] = round(df_away.HC.std(), 3)
    dic['A_STD_'+nb_matches_string+'_HY'] = round(df_away.HY.std(), 3)
    dic['A_STD_'+nb_matches_string+'_HR'] = round(df_away.HR.std(), 3)
    return dic

In [9]:
# Compute all pre matchs and insert them in pre_match table
df_macths_raw = pd.read_sql_query("SELECT Div, Date, HomeTeam, AwayTeam, FTHG, FTAG, FTR, HTHG, HTAG, HTR, HS, `AS`, HST, AST, HF, AF, HC, AC, HY, AY, HR, AR, PSH, PSD, PSA, BbAvH, BbAvD, BbAvA FROM macths_raw ORDER BY Date ASC;", conn)
for index, row in df_macths_raw.iterrows():
    #display(row)
    
    # HOME TEAM
    # Get the 5 previous home game of the Home Team
    dic_home_five = homeData(row.Date, row.HomeTeam, row.Div, 5, 'FIVE')
    #display(df_home_five)
    # Get the 3 previous home game of the Home Team
    dic_home_three = homeData(row.Date, row.HomeTeam, row.Div, 3, 'THREE')
    #display(df_home_three)
    
    # AWAY TEAM
    # Get the 5 previous away game of the Away Team
    dic_away_five = awayData(row.Date, row.AwayTeam, row.Div, 5, 'FIVE')
    #display(df_away_five)
    # Get the 3 previous away game of the Away Team
    dic_away_three = awayData(row.Date, row.AwayTeam, row.Div, 3, 'THREE')
    #display(df_away_three)
    
    # Merge different dataset together
    #dic_all = dic_home_five.copy().update(dic_home_three)
    dic_all = dict(dic_home_five.items() + dic_home_three.items() + dic_away_five.items() + dic_away_three.items())
    # Add division and date
    dic_all['INFO_Div'] = row['Div']
    dic_all['INFO_Date'] = row['Date']
    # Add info on result
    dic_all['INFO_HomeTeam'] = row['HomeTeam']
    dic_all['INFO_AwayTeam'] = row['AwayTeam']
    dic_all['INFO_FTR'] = row['FTR']
    dic_all['INFO_HTR'] = row['HTR']
    dic_all['INFO_FTHG'] = row['FTHG']
    dic_all['INFO_FTAG'] = row['FTAG']
    # Add all Bet
    dic_all['INFO_PSH'] = row['PSH']
    dic_all['INFO_PSD'] = row['PSD']
    dic_all['INFO_PSA'] = row['PSA']
    dic_all['INFO_BbAvH'] = row['BbAvH']
    dic_all['INFO_BbAvD'] = row['BbAvD']
    dic_all['INFO_BbAvA'] = row['BbAvA']
    df_pre_matches = pd.DataFrame.from_dict([dic_all], orient='columns')
    df_pre_matches.to_sql("pre_matchs", conn, if_exists="append")

df_macths_raw.shape

(37907, 28)