In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the base match dataset; the path is relative to the notebook's working directory.
sl_base_df = pd.read_csv("INPUTS/sl_base_dataset_20230621.csv")

In [3]:
# Sanity check: report the frame's (rows, columns) shape and preview the first four rows.
print('sl_base_df Size: ', sl_base_df.shape)
sl_base_df.head(4)

sl_base_df Size:  (8020, 22)


Unnamed: 0,season,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
0,1,2000-08-19,Charlton,Man City,4,0,H,2,0,H,...,14,4,13,12,6,6,1,2,0,0
1,1,2000-08-19,Chelsea,West Ham,4,2,H,1,0,H,...,10,5,19,14,7,7,1,2,0,0
2,1,2000-08-19,Coventry,Middlesbrough,1,3,A,1,1,D,...,3,9,15,21,8,4,5,3,1,0
3,1,2000-08-19,Derby,Southampton,2,2,D,1,2,A,...,4,6,11,13,5,8,1,1,0,0


In [4]:
# Distinct season identifiers. Sorted explicitly because iterating a set has no
# guaranteed order, so list(set(...)) is not a reliable way to get ascending seasons.
season_numbers = sorted(set(sl_base_df.season))
season_numbers

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]

In [5]:
# Note: the previous dataset (2009-2022) had 39 unique teams
# Alphabetical list of every club that appears as a home team.
teams_all = sorted(set(sl_base_df.HomeTeam))
len(teams_all)  # 45 unique teams in the dataset

45

# 1. List of Dicts for new dataframe

In [6]:
# 1. CREATING LIST OF MATCH DICTS
# Work on a copy so later changes to df cannot affect the loaded frame (sl_base_df).
df = sl_base_df.copy()

# One plain dict per dataset row, in the same order as the rows of df.
# FTR is the FULL TIME RESULT (A for away wins, H for home team wins, D for a draw result).
match_dicts = [
    {'date': match_date, 'season': season,
     'home_team': home, 'away_team': away, 'FTR': result}
    for match_date, season, home, away, result in zip(
        df.Date.tolist(),
        df.season.tolist(),
        df.HomeTeam.tolist(),
        df.AwayTeam.tolist(),
        df.FTR.tolist(),
    )
]

print('Total dicts: ', len(match_dicts))  # 8020. formerly had 4600 dicts using 09-22 dataset

Total dicts:  8020


In [7]:
# Example: inspect a single match record (list position 3489).
match_dicts[3489]

{'date': '2009-10-03',
 'season': 10,
 'home_team': 'Man United',
 'away_team': 'Sunderland',
 'FTR': 'D'}

# 2. Obtain all the home and away teams' match results leading up to their match 

In [8]:
def filtered_df_up_to_match(dataset_df, match_dict):
    """Return both teams' same-season results played before the given match.

    params:
    dataset_df: DataFrame with at least the columns 'season', 'Date', 'HomeTeam', 'AwayTeam'
    match_dict: dict with the keys 'date', 'season', 'home_team', 'away_team'

    returns: the rows of dataset_df (original index and row order kept) that belong to
    the match's season, are dated strictly before the match, and involve either team.
    dataset_df itself is not modified.

    Note: 'Date' values are compared directly with match_dict['date'], so they must
    order chronologically (e.g. ISO 'YYYY-MM-DD' strings or real datetimes).
    """
    teams = [match_dict['home_team'], match_dict['away_team']]

    same_season = dataset_df['season'] == match_dict['season']
    # A strict "<" excludes the match itself without assuming that its row is present
    # and sorted last in the selection (which "<=" followed by dropping the final row did).
    played_before = dataset_df['Date'] < match_dict['date']
    involves_team = dataset_df['HomeTeam'].isin(teams) | dataset_df['AwayTeam'].isin(teams)

    # Boolean indexing builds a new frame, so the input never needs an up-front copy.
    return dataset_df[same_season & played_before & involves_team]

In [9]:
# SAMPLE usage of filtered_df_up_to_match()
#  This function filters both teams' results leading up to their match.
#  Note: only same-season results are considered, as team rosters can change between seasons, which
#        reduces the relevance of last season's matches in the supervised learning models.

test_dict = match_dicts[3489].copy()
test_df = filtered_df_up_to_match(df, test_dict)
test_df

Unnamed: 0,season,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
3422,10,2009-08-15,Bolton,Sunderland,0,1,A,0,1,A,...,3,13,16,10,4,7,2,1,0,0
3428,10,2009-08-16,Man United,Birmingham,1,0,H,1,0,H,...,17,4,13,7,13,2,1,1,0,0
3430,10,2009-08-18,Sunderland,Chelsea,1,3,A,1,0,H,...,3,9,14,10,1,14,2,2,0,0
3433,10,2009-08-19,Burnley,Man United,1,0,H,1,0,H,...,2,9,8,12,1,12,2,1,0,0
3440,10,2009-08-22,Sunderland,Blackburn,2,1,H,1,1,D,...,5,5,12,14,0,10,2,1,0,0
3441,10,2009-08-22,Wigan,Man United,0,5,A,0,0,D,...,7,13,11,8,3,5,2,2,0,0
3449,10,2009-08-29,Man United,Arsenal,2,1,H,0,1,A,...,4,3,21,15,6,5,3,6,0,0
3450,10,2009-08-29,Stoke,Sunderland,1,0,H,1,0,H,...,7,6,9,8,10,10,0,2,0,0
3461,10,2009-09-12,Sunderland,Hull,4,1,H,1,1,D,...,4,5,18,15,7,8,2,2,0,0
3462,10,2009-09-12,Tottenham,Man United,1,3,A,1,2,A,...,7,13,16,13,3,9,3,2,0,1


**{'date': '2009-10-03',
 'season': 10,
 'home_team': 'Man United',
 'away_team': 'Sunderland',
 'FTR': 'D'}**

_The df above lists every season-10 game played by either Man United or Sunderland before their match on 2009-10-03.
From this abbreviated in-season DF, we will derive season-to-date (s2d) statistics for both teams to boost the model's predictive power._

# 3.  Helper Functions to derive insights from original dataset

#### Number of games Played By Team (home games, away games, total)

In [10]:
def games_played_s2d(df, team):
    """Count how many games a team has played in the given season-to-date frame.

    params:
    df: DataFrame with 'HomeTeam' and 'AwayTeam' columns, e.g. the output of filtered_df_up_to_match()
    team: single team name (str)

    returns: list with 3 counts: [home_count, away_count, total_count].
    A team that is missing from one side (or from df entirely) gets a count of 0 there.
    """
    # Summing a boolean mask gives 0 for an absent team, whereas
    # value_counts()[team] raises KeyError when the team has no rows on that side.
    homegames_s2d_count = (df['HomeTeam'] == team).sum()
    awaygames_s2d_count = (df['AwayTeam'] == team).sum()
    total_games_played = homegames_s2d_count + awaygames_s2d_count

    return [homegames_s2d_count, awaygames_s2d_count, total_games_played]

#### Extract season to date (s2d) insights

In [11]:
def team_var_stats(df, team, extract_home_var, extract_away_var):
    """Returns a given team's stats incurred as home team, away team, and total
    params:
    df: abbreviated DF containing all s2d records for the two teams involved in a match
    team: single team name (str)
    extract_home_var: column holding the home side's value of the statistic (e.g. 'FTHG')
    extract_away_var: column holding the away side's value of the statistic (e.g. 'FTAG')

    returns: (var_made_stats, var_against_stats). Each is a list laid out as
        [sum in home games, sum in away games, overall sum,
         average per home game, average per away game, average per game]
    var_made_stats holds the values in favor of the team (e.g. goals scored),
    var_against_stats the values against it (e.g. goals conceded).
    An average is NaN when the team has no games of that kind in df.
    """
    home_rows = df['HomeTeam'] == team
    away_rows = df['AwayTeam'] == team

    # Games are counted by summing the masks, so a team with no home (or no away)
    # games simply gets a count of 0 instead of failing on a missing lookup key.
    home_games = home_rows.sum()
    away_games = away_rows.sum()
    games_played = (home_games, away_games, home_games + away_games)

    def _sums_and_averages(home_total, away_total):
        # Lay out [home, away, total] followed by the matching per-game averages.
        totals = (home_total, away_total, home_total + away_total)
        averages = [
            total / played if played else float('nan')
            for total, played in zip(totals, games_played)
        ]
        return list(totals) + averages

    # Variable in favor of team: own column on each side
    # example: goals scored as a home team; goals scored as away team; total goals scored
    var_made_stats = _sums_and_averages(
        df.loc[home_rows, extract_home_var].sum(),
        df.loc[away_rows, extract_away_var].sum(),
    )

    # Variable AGAINST team: the opponent's column on each side
    # example: goals conceded as a home team; goals conceded as away team; total goals conceded
    var_against_stats = _sums_and_averages(
        df.loc[home_rows, extract_away_var].sum(),
        df.loc[away_rows, extract_home_var].sum(),
    )

    return var_made_stats, var_against_stats

# 4. The two functions below update each match dictionary created in the first step

In [12]:
def update_match_dict(match_dict, var_key, list_var_for, list_var_against, home=1):
    """Return a copy of match_dict extended with twelve s2d entries for one statistic.

    params:
    match_dict: dict describing a match; it is copied, never modified
    var_key: statistic name embedded in every generated key (e.g. 'goals')
    list_var_for: values read by position 0-5 and stored under the keys
        home_, away_, total_, avg_home_, avg_away_, avg_ + var_key + '_s2d'
    list_var_against: same positions, stored under the '..._against_s2d' keys
    home: 1 prefixes every key with 'H_'; any other value prefixes it with 'A_'
    """
    side = 'H_' if home == 1 else 'A_'
    # Position in the value lists -> key fragment; this order also fixes the key order.
    slots = ('home_', 'away_', 'total_', 'avg_home_', 'avg_away_', 'avg_')

    enriched = match_dict.copy()
    for suffix, values in (('_s2d', list_var_for), ('_against_s2d', list_var_against)):
        enriched.update(
            {side + slot + var_key + suffix: values[pos] for pos, slot in enumerate(slots)}
        )

    return enriched

In [13]:
def update_team_dict_vals(df, vars_dict, match_dict):
    """Return a copy of match_dict enriched with s2d features for both teams.

    params:
    df: season-to-date frame for the two teams, e.g. the output of filtered_df_up_to_match()
    vars_dict: maps a feature name to a pair (home column, away column), e.g. {'goals': ('FTHG', 'FTAG')}
    match_dict: dict with at least the keys 'home_team' and 'away_team'; it is not modified

    returns: new dict holding match_dict's entries, then the home team's features for
    every entry of vars_dict ('H_' keys), then the away team's features ('A_' keys).
    With an empty vars_dict the result is a plain copy of match_dict.
    """
    home_team = match_dict['home_team']
    away_team = match_dict['away_team']

    # Seed the result with a copy so the loops need no first-iteration special case
    # and an empty vars_dict still returns a dict instead of hitting an unbound name.
    d2 = match_dict.copy()

    # Home team first, then away team, to keep the original key (column) order.
    for team, home_flag in ((home_team, 1), (away_team, 0)):
        for key, value in vars_dict.items():
            extract_home_var = value[0]
            extract_away_var = value[1]
            var_for, var_against = team_var_stats(df, team, extract_home_var, extract_away_var)
            d2 = update_match_dict(d2, key, var_for, var_against, home=home_flag)

    return d2

# 5. Update all match dicts with derived features

#### vars_dict: the features we want to extract from the original dataset as recalculated **season-to-date** insights

In [14]:
# Feature name -> (home-side column, away-side column) in the original dataset.
vars_dict = {
    'goals': ('FTHG', 'FTAG'),
    'shots': ('HS', 'AS'),
    'shots_on_target': ('HST', 'AST'),
    'corners': ('HC', 'AC'),
}

In [15]:
# test_dict = match_dicts[3489]
# d4 = update_team_dict_vals(test_df, vars_dict, test_dict)
# d4

In [16]:
%%time

# df: dataframe with the original dataset (8020 match rows covering all seasons)
# match_dicts: one dict per row of df, created in step 1

for i, match in enumerate(match_dicts):
    # Filter dataset: on season and all games played by either team leading up to this match
    df2 = filtered_df_up_to_match(df, match)

    # Only derive features when the two teams have at least 8 earlier rows between them;
    # matches with less history keep just their base keys.
    if len(df2) >= 8:
        match_dicts[i] = update_team_dict_vals(df2, vars_dict, match)

CPU times: total: 2min 10s
Wall time: 2min 10s


# 6. Use updated dicts to create and save New DF with derived features

In [17]:
# One row per match dict. Dicts that were skipped for lack of history have no derived
# keys, so pandas fills those columns with NaN for them.
final_df = pd.DataFrame(match_dicts)
final_df.shape  # note 2009-2022 df had 4600 records

(8020, 101)

In [18]:
# Early-season games sometimes don't produce meaningful insights because there is little data to go by.
# It's common that the first 3-4 weeks of the season are dropped by the dropna() step.
# In these seasons that run August - May, we have very few records in August.
# Conclusion: We can start making match predictions for FTR (full-time result) beginning in September.
# NOTE: the dropna() call below is currently commented out, so the shape printed here
#       still includes the early-season rows.

# final_df.dropna(inplace=True)
print(final_df.shape)
final_df.sample(10)

(8020, 101)


Unnamed: 0,date,season,home_team,away_team,FTR,H_home_goals_s2d,H_away_goals_s2d,H_total_goals_s2d,H_avg_home_goals_s2d,H_avg_away_goals_s2d,...,A_total_corners_s2d,A_avg_home_corners_s2d,A_avg_away_corners_s2d,A_avg_corners_s2d,A_home_corners_against_s2d,A_away_corners_against_s2d,A_total_corners_against_s2d,A_avg_home_corners_against_s2d,A_avg_away_corners_against_s2d,A_avg_corners_against_s2d
1847,2005-04-20,5,Aston Villa,Charlton,D,24.0,17.0,41.0,1.5,1.0,...,148.0,5.058824,3.875,4.484848,104.0,114.0,218.0,6.117647,7.125,6.606061
362,2001-05-05,1,Man United,Derby,A,49.0,28.0,77.0,2.722222,1.647059,...,155.0,4.277778,4.333333,4.305556,109.0,131.0,240.0,6.055556,7.277778,6.666667
3947,2010-11-27,11,Wolves,Sunderland,H,9.0,5.0,14.0,1.285714,0.714286,...,66.0,5.857143,3.571429,4.714286,44.0,48.0,92.0,6.285714,6.857143,6.571429
7605,2021-05-09,21,Arsenal,West Brom,H,19.0,27.0,46.0,1.117647,1.588235,...,121.0,5.0,2.117647,3.558824,105.0,127.0,232.0,6.176471,7.470588,6.823529
52,2000-09-16,1,Everton,Man United,A,5.0,4.0,9.0,2.5,1.333333,...,27.0,5.333333,5.5,5.4,6.0,11.0,17.0,2.0,5.5,3.4
5058,2013-11-24,14,Man City,Tottenham,H,20.0,8.0,28.0,4.0,1.333333,...,79.0,7.166667,7.2,7.181818,24.0,19.0,43.0,4.0,3.8,3.909091
1313,2003-12-26,4,Leeds,Aston Villa,D,9.0,8.0,17.0,1.0,1.0,...,104.0,8.125,4.333333,6.117647,32.0,55.0,87.0,4.0,6.111111,5.117647
609,2002-01-23,2,Leicester,Arsenal,A,5.0,9.0,14.0,0.5,0.75,...,147.0,7.727273,5.636364,6.681818,28.0,58.0,86.0,2.545455,5.272727,3.909091
2436,2006-12-06,7,West Ham,Wigan,A,8.0,2.0,10.0,1.142857,0.25,...,64.0,4.142857,5.0,4.571429,34.0,32.0,66.0,4.857143,4.571429,4.714286
1402,2004-02-28,4,Leicester,Wolves,D,14.0,23.0,37.0,1.076923,1.769231,...,116.0,4.692308,4.230769,4.461538,75.0,92.0,167.0,5.769231,7.076923,6.423077


In [19]:
# Save the derived dataset as CSV; index=False leaves the row index out of the file.
final_df.to_csv('INPUTS/sl_derived_dataset_20230621.csv', index=False)

# with pd.ExcelWriter('INPUTS/derived_dataset_SL_TEST_v1.xlsx') as writer:  
#     final_df.to_excel(writer, sheet_name='all_seasons', index=False)