# Wrangling 2
### In this notebook, we will further wrangle the data to create our features (player rating, team, team form) and our labels (outcome) for our machine learning models.

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import time

In [6]:
# Now that we have a dict with id pairs for Premier League players and Sofifa players, we can create vectors of player ratings for each match.
matches = pd.read_pickle("all_seasons.pkl")
fifa = pd.read_pickle("all_games.pkl")

## FIFA IDs column

In [7]:
# Import id dictionary
id_dict = pd.read_pickle("id_df.pkl")
id_dict = id_dict.set_index('keys')['values'].to_dict()

In [8]:
# Nested list of all player fifa ids for all matches
all_player_fifa_ids = []

# Iterate over every match (row)
for _, row in matches.iterrows():
    # Make blank list for players ids in a new match
    match_player_fifa_ids = []
    # Iterate over all players in a match, get their PL ids
    for player in row['players']:
        pl_player_id = player[-1]
        # Get corresponding FIFA id in dict
        fifa_player_id = id_dict[pl_player_id]
        # Append all ids for a match
        match_player_fifa_ids.append(fifa_player_id)
    # Append all matches    
    all_player_fifa_ids.append(match_player_fifa_ids)

# Create dataframe column
matches['fifa_ids'] = all_player_fifa_ids

In [9]:
matches

Unnamed: 0,match,teams,score,ht_score,date,formations,players,fifa_ids
0,38308,"[Arsenal, Man City]","[0, 2]","[0, 1]",Sun 12 Aug 2018,"[4-2-3-1, 4-2-3-1]","[[Petr Cech, Goalkeeper, 2651], [Shkodran Must...","[48940, 192227, 203747, 225782, 172879, 186561..."
1,38309,"[Bournemouth, Cardiff]","[2, 0]","[1, 0]",Sat 11 Aug 2018,"[4-4-2, 4-1-4-1]","[[Asmir Begovic, Goalkeeper, 2537], [Steve Coo...","[172723, 193011, 190885, 208920, 169638, 20780..."
2,38310,"[Fulham, Crystal Palace]","[0, 2]","[0, 1]",Sat 11 Aug 2018,"[4-3-3, 4-4-2]","[[Fabri, Goalkeeper, 19771], [Calum Chambers, ...","[177723, 205989, 200778, 203505, 192725, 17531..."
3,38311,"[Huddersfield, Chelsea]","[0, 3]","[0, 2]",Sat 11 Aug 2018,"[3-5-1-1, 4-3-3]","[[Ben Hamer, Goalkeeper, 3183], [Christopher S...","[170008, 200607, 183491, 203485, 183546, 19495..."
4,38312,"[Liverpool, West Ham]","[4, 0]","[2, 0]",Sun 12 Aug 2018,"[4-3-3, 4-2-3-1]","[[Alisson Becker, Goalkeeper, 20559], [Virgil ...","[212831, 203376, 225100, 231281, 259677, 18129..."
...,...,...,...,...,...,...,...,...
2205,93633,"[Crystal Palace, Man City]","[2, 4]","[1, 1]",Sat 6 Apr 2024,"[3-4-3, ]","[[Dean Henderson, Goalkeeper, 13988], [Joel Wa...","[233306, 186392, 213991, 224221, 259240, 23764..."
2206,93634,"[Everton, Burnley]","[1, 0]","[1, 0]",Sat 6 Apr 2024,"[4-4-1-1, 4-4-2]","[[Jordan Pickford, Goalkeeper, 4640], [Séamus ...","[204935, 180216, 247649, 202695, 244380, 24365..."
2207,93635,"[Fulham, Newcastle]","[0, 1]","[0, 0]",Sat 6 Apr 2024,"[4-2-3-1, 4-3-3]","[[Bernd Leno, Goalkeeper, 4985], [Calvin Basse...","[192563, 241436, 222104, 222501, 229348, 20845..."
2208,93636,"[Luton, Bournemouth]","[2, 1]","[0, 0]",Sat 6 Apr 2024,"[3-4-2-1, 4-2-3-1]","[[Thomas Kaminski, Goalkeeper, 5844], [Teden M...","[188400, 253465, 221456, 253052, 202464, 25248..."


## Update dates columns

In [10]:
# Change update_date column in fifa to datetime format instead of string.
fifa['update_date'] = fifa['update_date'].apply(lambda x: datetime.strptime(x, "%b %d, %Y").date())

# Change date column in matches to datetime format instead of string.
matches['date'] = matches['date'].apply(lambda x: datetime.strptime(x, "%a %d %b %Y").date())

In [11]:
# Sort dataframes by date
matches = matches.sort_values(by='date')
fifa = fifa.sort_values(by='update_date')

In [12]:
# Find closest FIFA update in the past for every match in matches
# We prioritize the closest update in the past, then the 2nd closest update in the past, then the closest future update.
# Get unique update dates from fifa
update_dates = fifa['update_date'].unique()

# Sort the dataframes by date first to speed up process
matches = matches.sort_values(by='date')
update_dates = sorted(update_dates)

closest_updates = []
second_closest_updates = []
closest_future_updates = []

# Iterate over match dates and find the closest update within the dates that are before match_date
for match_date in matches['date']:
    closest_update = max(update_date for update_date in update_dates if update_date <= match_date)
    # Find closest future update, incase a player joins a team and plays a match between 2 updates
    closest_future_update = min(update_date for update_date in update_dates if update_date > match_date)
    # Append dates
    closest_updates.append(closest_update)
    closest_future_updates.append(closest_future_update)

# Do the same for second closest update.
for closest_update in closest_updates:
    # If there is no update before closest_update, we set second_closest_update = closest_update
    second_closest_update = max((update_date for update_date in update_dates if update_date < closest_update), default=closest_update) 
    second_closest_updates.append(second_closest_update)

# Create closest_update and second_closest_update columns
matches['closest_update'] = closest_updates
matches['second_closest_update'] = second_closest_updates
matches['closest_future_update'] = closest_future_updates

In [13]:
matches

Unnamed: 0,match,teams,score,ht_score,date,formations,players,fifa_ids,closest_update,second_closest_update,closest_future_update
5,38313,"[Man Utd, Leicester]","[2, 1]","[1, 0]",2018-08-10,"[4-3-3, 4-2-3-1]","[[David de Gea, Goalkeeper, 4330], [Luke Shaw,...","[193080, 205988, 184392, 221660, 225508, 20929...",2018-07-19,2018-07-19,2018-08-21
1,38309,"[Bournemouth, Cardiff]","[2, 0]","[1, 0]",2018-08-11,"[4-4-2, 4-1-4-1]","[[Asmir Begovic, Goalkeeper, 2537], [Steve Coo...","[172723, 193011, 190885, 208920, 169638, 20780...",2018-07-19,2018-07-19,2018-08-21
2,38310,"[Fulham, Crystal Palace]","[0, 2]","[0, 1]",2018-08-11,"[4-3-3, 4-4-2]","[[Fabri, Goalkeeper, 19771], [Calum Chambers, ...","[177723, 205989, 200778, 203505, 192725, 17531...",2018-07-19,2018-07-19,2018-08-21
3,38311,"[Huddersfield, Chelsea]","[0, 3]","[0, 2]",2018-08-11,"[3-5-1-1, 4-3-3]","[[Ben Hamer, Goalkeeper, 3183], [Christopher S...","[170008, 200607, 183491, 203485, 183546, 19495...",2018-07-19,2018-07-19,2018-08-21
6,38314,"[Newcastle, Spurs]","[1, 2]","[1, 2]",2018-08-11,"[4-4-1-1, 4-3-2-1]","[[Martin Dúbravka, Goalkeeper, 6451], [Ciaran ...","[220407, 183129, 203487, 204355, 212722, 18916...",2018-07-19,2018-07-19,2018-08-21
...,...,...,...,...,...,...,...,...,...,...,...
2207,93635,"[Fulham, Newcastle]","[0, 1]","[0, 0]",2024-04-06,"[4-2-3-1, 4-3-3]","[[Bernd Leno, Goalkeeper, 4985], [Calvin Basse...","[192563, 241436, 222104, 222501, 229348, 20845...",2024-03-27,2024-03-20,2024-04-09
2203,93631,"[Aston Villa, Brentford]","[3, 3]","[1, 0]",2024-04-06,"[4-4-1-1, 3-5-2]","[[Emiliano Martínez, Goalkeeper, 4245], [Lucas...","[202811, 200458, 227678, 241464, 219693, 22990...",2024-03-27,2024-03-20,2024-04-09
2208,93636,"[Luton, Bournemouth]","[2, 1]","[0, 0]",2024-04-06,"[3-4-2-1, 4-2-3-1]","[[Thomas Kaminski, Goalkeeper, 5844], [Teden M...","[188400, 253465, 221456, 253052, 202464, 25248...",2024-03-27,2024-03-20,2024-04-09
2204,93632,"[Brighton, Arsenal]","[0, 3]","[0, 1]",2024-04-06,"[4-2-3-1, 4-3-3]","[[Bart Verbruggen, Goalkeeper, 75709], [Pervis...","[258498, 237942, 258908, 199915, 242418, 23977...",2024-03-27,2024-03-20,2024-04-09


## Player ratings column

In [14]:
start_time = time.time()
none_count = 0

# Create dict of update dates as keys and subset fifa dataframes for that date as values
fifa_dict = {update_date: fifa[fifa['update_date'] == update_date] for update_date in matches['closest_update'].unique()}
fifa_dict_2 = {update_date: fifa[fifa['update_date'] == update_date] for update_date in matches['second_closest_update'].unique()}
fifa_dict_3 = {update_date: fifa[fifa['update_date'] == update_date] for update_date in matches['closest_future_update'].unique()}

mid_time = time.time()
# Initialize list to store player ratings for each match
player_ratings = []

# Iterate over each row in matches DataFrame
for _, row in matches.iterrows():
    # Get corresponding FIFA DataFrame for the closest update date
    fifa_subset = fifa_dict.get(row['closest_update'])
    fifa_subset_2 = fifa_dict_2.get(row['second_closest_update'])
    fifa_subset_3 = fifa_dict_3.get(row['closest_future_update'])
    if all(subset is not None for subset in [fifa_subset, fifa_subset_2, fifa_subset_3]):
        # Filter player ratings for each FIFA ID in the match
        match_ratings = []
        for id in row['fifa_ids']:
            player_rating = fifa_subset.loc[fifa_subset['id'] == id, 'rating']
            # Check if rating is in subset 1
            if not player_rating.empty:
                match_ratings.append(player_rating.iloc[0])
            else:
                # Check if rating is in subset 2
                player_rating = fifa_subset_2.loc[fifa_subset_2['id'] == id, 'rating']
                if not player_rating.empty:
                    match_ratings.append(player_rating.iloc[0])
                else:
                    # Check if rating is in subset 3
                    player_rating = fifa_subset_3.loc[fifa_subset_3['id'] == id, 'rating']
                    if not player_rating.empty:
                        match_ratings.append(player_rating.iloc[0])
                    else:
                        none_count +=1
                        match_ratings.append(None)
        player_ratings.append(match_ratings)
        
end_time = time.time()

print("Took {} seconds to create subset dataframes.".format(round(mid_time - start_time, 2)))
print("Took {} seconds to find all ratings.".format(round(end_time - mid_time, 2)))
print("Failed to find rating {} times.".format(none_count))

Took 8.08 seconds to create subset dataframes.
Took 12.6 seconds to find all ratings.
Failed to find rating 685 times.


In [15]:
# There are still many players which ratings do not appear in the 2 last updates or the 1st future update. We know that we have at least 1 rating for each player, so we will complete the data with a slower but sure method.

match_counter = 0
none_counter = 0

start_time = time.time()
# Iterate through all matches
for match in player_ratings:
    player_counter = 0
    # Iterate through all ratings in a match, and check if there is a None
    for rating in match:
        if rating is None:
            # Get corresponding player id
            fifa_id = matches['fifa_ids'][match_counter][player_counter]
            # Create subset of dataframe for that player, before match date. Get rating on last row, which is the most recent update in the past.
            fifa_subset = fifa[(fifa['id'] == fifa_id) & (fifa['update_date'] <= match_date)]
            if not fifa_subset.empty:
                player_rating = fifa_subset.iloc[-1]['rating']
            else:
                # If dataframe is empty, get closest rating in the future.
                fifa_subset = fifa[(fifa['id'] == id) & (fifa['update_date'] > match_date)]
                player_rating = fifa_subset.iloc[0]['rating']
            player_ratings[match_counter][player_counter] = player_rating
            none_counter +=1
    # Add 1 to counters.
        player_counter +=1
    match_counter +=1

end_time = time.time()
print("{} Nones replaced in {} seconds.".format(none_counter, round(end_time - start_time, 2)))

685 Nones replaced in 18.61 seconds.


In [16]:
# Create player_ratings column in dataframe
matches['player_ratings'] = player_ratings

In [17]:
matches

Unnamed: 0,match,teams,score,ht_score,date,formations,players,fifa_ids,closest_update,second_closest_update,closest_future_update,player_ratings
5,38313,"[Man Utd, Leicester]","[2, 1]","[1, 0]",2018-08-10,"[4-3-3, 4-2-3-1]","[[David de Gea, Goalkeeper, 4330], [Luke Shaw,...","[193080, 205988, 184392, 221660, 225508, 20929...",2018-07-19,2018-07-19,2018-08-21,"[91, 78, 77, 79, 83, 82, 78, 88, 83, 87, 81, 8..."
1,38309,"[Bournemouth, Cardiff]","[2, 0]","[1, 0]",2018-08-11,"[4-4-2, 4-1-4-1]","[[Asmir Begovic, Goalkeeper, 2537], [Steve Coo...","[172723, 193011, 190885, 208920, 169638, 20780...",2018-07-19,2018-07-19,2018-08-21,"[79, 77, 75, 79, 75, 75, 73, 76, 69, 77, 78, 7..."
2,38310,"[Fulham, Crystal Palace]","[0, 2]","[0, 1]",2018-08-11,"[4-3-3, 4-4-2]","[[Fabri, Goalkeeper, 19771], [Calum Chambers, ...","[177723, 205989, 200778, 203505, 192725, 17531...",2018-07-19,2018-07-19,2018-08-21,"[80, 75, 72, 75, 76, 75, 79, 82, 76, 75, 78, 7..."
3,38311,"[Huddersfield, Chelsea]","[0, 3]","[0, 2]",2018-08-11,"[3-5-1-1, 4-3-3]","[[Ben Hamer, Goalkeeper, 3183], [Christopher S...","[170008, 200607, 183491, 203485, 183546, 19495...",2018-07-19,2018-07-19,2018-08-21,"[68, 78, 76, 76, 76, 79, 72, 73, 68, 73, 76, 8..."
6,38314,"[Newcastle, Spurs]","[1, 2]","[1, 2]",2018-08-11,"[4-4-1-1, 4-3-2-1]","[[Martin Dúbravka, Goalkeeper, 6451], [Ciaran ...","[220407, 183129, 203487, 204355, 212722, 18916...",2018-07-19,2018-07-19,2018-08-21,"[77, 76, 78, 74, 76, 78, 77, 76, 75, 74, 76, 8..."
...,...,...,...,...,...,...,...,...,...,...,...,...
2207,93635,"[Fulham, Newcastle]","[0, 1]","[0, 0]",2024-04-06,"[4-2-3-1, 4-3-3]","[[Bernd Leno, Goalkeeper, 4985], [Calvin Basse...","[192563, 241436, 222104, 222501, 229348, 20845...",2024-03-27,2024-03-20,2024-04-09,"[82, 75, 76, 76, 78, 77, 84, 77, 76, 77, 71, 7..."
2203,93631,"[Aston Villa, Brentford]","[3, 3]","[1, 0]",2024-04-06,"[4-4-1-1, 3-5-2]","[[Emiliano Martínez, Goalkeeper, 4245], [Lucas...","[202811, 200458, 227678, 241464, 219693, 22990...",2024-03-27,2024-03-20,2024-04-09,"[86, 79, 80, 82, 81, 82, 82, 81, 67, 82, 80, 8..."
2208,93636,"[Luton, Bournemouth]","[2, 1]","[0, 0]",2024-04-06,"[3-4-2-1, 4-2-3-1]","[[Thomas Kaminski, Goalkeeper, 5844], [Teden M...","[188400, 253465, 221456, 253052, 202464, 25248...",2024-03-27,2024-03-20,2024-04-09,"[75, 72, 70, 73, 71, 73, 70, 76, 72, 71, 74, 7..."
2204,93632,"[Brighton, Arsenal]","[0, 3]","[0, 1]",2024-04-06,"[4-2-3-1, 4-3-3]","[[Bart Verbruggen, Goalkeeper, 75709], [Pervis...","[258498, 237942, 258908, 199915, 242418, 23977...",2024-03-27,2024-03-20,2024-04-09,"[75, 81, 74, 81, 75, 73, 69, 74, 71, 80, 75, 8..."


## Teams boolean column

In [18]:
# We will create a vector that will represent the home and away teams

# Create list of all teams in 'teams' column, and find unique teams
all_teams = []
for teams in matches['teams']:
    all_teams.extend(teams)

unique_teams = sorted(list(set(all_teams)))
len(unique_teams)

# Create vectors of zeros of length 2 * len(unique_teams) (for home and away team) for all rows in matches.
matches['teams_bool'] = [np.zeros(2*len(unique_teams)) for row in range(len(matches))]

# Replace playing teams with 1
for _, row in matches.iterrows():
    # Find home and away teams
    home_team = row['teams'][0]
    away_team = row['teams'][1]
    # Find team indexes
    home_index = unique_teams.index(home_team)
    away_index = unique_teams.index(away_team)
    # Set indexes of team_bool to 
    row['teams_bool'][home_index] = 1
    row['teams_bool'][away_index + len(unique_teams)] = 1

In [19]:
matches

Unnamed: 0,match,teams,score,ht_score,date,formations,players,fifa_ids,closest_update,second_closest_update,closest_future_update,player_ratings,teams_bool
5,38313,"[Man Utd, Leicester]","[2, 1]","[1, 0]",2018-08-10,"[4-3-3, 4-2-3-1]","[[David de Gea, Goalkeeper, 4330], [Luke Shaw,...","[193080, 205988, 184392, 221660, 225508, 20929...",2018-07-19,2018-07-19,2018-08-21,"[91, 78, 77, 79, 83, 82, 78, 88, 83, 87, 81, 8...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,38309,"[Bournemouth, Cardiff]","[2, 0]","[1, 0]",2018-08-11,"[4-4-2, 4-1-4-1]","[[Asmir Begovic, Goalkeeper, 2537], [Steve Coo...","[172723, 193011, 190885, 208920, 169638, 20780...",2018-07-19,2018-07-19,2018-08-21,"[79, 77, 75, 79, 75, 75, 73, 76, 69, 77, 78, 7...","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,38310,"[Fulham, Crystal Palace]","[0, 2]","[0, 1]",2018-08-11,"[4-3-3, 4-4-2]","[[Fabri, Goalkeeper, 19771], [Calum Chambers, ...","[177723, 205989, 200778, 203505, 192725, 17531...",2018-07-19,2018-07-19,2018-08-21,"[80, 75, 72, 75, 76, 75, 79, 82, 76, 75, 78, 7...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,38311,"[Huddersfield, Chelsea]","[0, 3]","[0, 2]",2018-08-11,"[3-5-1-1, 4-3-3]","[[Ben Hamer, Goalkeeper, 3183], [Christopher S...","[170008, 200607, 183491, 203485, 183546, 19495...",2018-07-19,2018-07-19,2018-08-21,"[68, 78, 76, 76, 76, 79, 72, 73, 68, 73, 76, 8...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
6,38314,"[Newcastle, Spurs]","[1, 2]","[1, 2]",2018-08-11,"[4-4-1-1, 4-3-2-1]","[[Martin Dúbravka, Goalkeeper, 6451], [Ciaran ...","[220407, 183129, 203487, 204355, 212722, 18916...",2018-07-19,2018-07-19,2018-08-21,"[77, 76, 78, 74, 76, 78, 77, 76, 75, 74, 76, 8...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2207,93635,"[Fulham, Newcastle]","[0, 1]","[0, 0]",2024-04-06,"[4-2-3-1, 4-3-3]","[[Bernd Leno, Goalkeeper, 4985], [Calvin Basse...","[192563, 241436, 222104, 222501, 229348, 20845...",2024-03-27,2024-03-20,2024-04-09,"[82, 75, 76, 76, 78, 77, 84, 77, 76, 77, 71, 7...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2203,93631,"[Aston Villa, Brentford]","[3, 3]","[1, 0]",2024-04-06,"[4-4-1-1, 3-5-2]","[[Emiliano Martínez, Goalkeeper, 4245], [Lucas...","[202811, 200458, 227678, 241464, 219693, 22990...",2024-03-27,2024-03-20,2024-04-09,"[86, 79, 80, 82, 81, 82, 82, 81, 67, 82, 80, 8...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2208,93636,"[Luton, Bournemouth]","[2, 1]","[0, 0]",2024-04-06,"[3-4-2-1, 4-2-3-1]","[[Thomas Kaminski, Goalkeeper, 5844], [Teden M...","[188400, 253465, 221456, 253052, 202464, 25248...",2024-03-27,2024-03-20,2024-04-09,"[75, 72, 70, 73, 71, 73, 70, 76, 72, 71, 74, 7...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2204,93632,"[Brighton, Arsenal]","[0, 3]","[0, 1]",2024-04-06,"[4-2-3-1, 4-3-3]","[[Bart Verbruggen, Goalkeeper, 75709], [Pervis...","[258498, 237942, 258908, 199915, 242418, 23977...",2024-03-27,2024-03-20,2024-04-09,"[75, 81, 74, 81, 75, 73, 69, 74, 71, 80, 75, 8...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."


## Outcome and form columns

In [20]:
# We will create a score system for a team's form, which is how well they have performed in their last 5 matches.
# Win = 3 pts, draw = 1 pt, loss = 0 pts. Maximum will be 15 pts and minimum 0.

# Convert scores to integers
matches['score'] = matches['score'].apply(lambda x: [int(score) for score in x])

# Create outcome of 1X2 format (win, draw, loss for home team) for every match.
# Create 0 vectors
matches['outcome'] = [np.zeros(3) for row in range(len(matches))]
# Set appropriate index in 'outcome' to 1 depending on score
for _, row in matches.iterrows():
    if row['score'][0] > row['score'][1]:
        row['outcome'][0] = 1
    elif row['score'][0] == row['score'][1]:
        row['outcome'][1] = 1
    else:
        row['outcome'][2] = 1

In [21]:
# Create dictionary of subsets of matches dataframe, for every team in the Premier League.
team_subsets = {}
for team in unique_teams:
    team_subsets[team] = matches[matches['teams'].apply(lambda x: team in x)]

# Create zero vectors for form
matches['form'] = [np.zeros(2) for row in range(len(matches))]

all_forms = []
# Iterate over every match
for _, row in matches.iterrows():
    # Get home and away teams
    home_team = row['teams'][0]
    away_team = row['teams'][1]
    # Reset form to 0
    home_team_form = 0
    away_team_form = 0
    # Get match date
    match_date = row['date']
    # Using our dict team_subsets, find all matches for a specific team, before a specific date. Take the last 5 matches with .tail(5).
    last_5_home = team_subsets[home_team][team_subsets[home_team]['date'] < match_date].tail(5)
    last_5_away = team_subsets[away_team][team_subsets[away_team]['date'] < match_date].tail(5)
    # Calculate how many matches are missing. The first matches will obviously be missing historical data.
    missing_home_matches = 5 - len(last_5_home)
    missing_away_matches = 5 - len(last_5_away)
    # We set the missing matches to the average score between win, draw and loss = 4/3 
    home_team_form += (4/3)*missing_home_matches
    away_team_form += (4/3)*missing_away_matches

    # Add scores to teams' forms, depending on the outcome column.
    for _, row in last_5_home.iterrows():
        if home_team == row['teams'][0]:
            home_team_form = home_team_form + 3*row['outcome'][0] + row['outcome'][1]
        elif home_team == row['teams'][1]:
            home_team_form = home_team_form + 3*row['outcome'][2] + row['outcome'][1]
    
    for _, row in last_5_away.iterrows():
        if away_team == row['teams'][0]:
            away_team_form = away_team_form + 3*row['outcome'][0] + row['outcome'][1]
        elif away_team == row['teams'][1]:
            away_team_form = away_team_form + 3*row['outcome'][2] + row['outcome'][1]

    # Append forms to a list
    match_form = [home_team_form, away_team_form]
    all_forms.append(match_form)
    
matches['form'] = all_forms

In [22]:
matches

Unnamed: 0,match,teams,score,ht_score,date,formations,players,fifa_ids,closest_update,second_closest_update,closest_future_update,player_ratings,teams_bool,outcome,form
5,38313,"[Man Utd, Leicester]","[2, 1]","[1, 0]",2018-08-10,"[4-3-3, 4-2-3-1]","[[David de Gea, Goalkeeper, 4330], [Luke Shaw,...","[193080, 205988, 184392, 221660, 225508, 20929...",2018-07-19,2018-07-19,2018-08-21,"[91, 78, 77, 79, 83, 82, 78, 88, 83, 87, 81, 8...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.0, 0.0, 0.0]","[6.666666666666666, 6.666666666666666]"
1,38309,"[Bournemouth, Cardiff]","[2, 0]","[1, 0]",2018-08-11,"[4-4-2, 4-1-4-1]","[[Asmir Begovic, Goalkeeper, 2537], [Steve Coo...","[172723, 193011, 190885, 208920, 169638, 20780...",2018-07-19,2018-07-19,2018-08-21,"[79, 77, 75, 79, 75, 75, 73, 76, 69, 77, 78, 7...","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.0, 0.0, 0.0]","[6.666666666666666, 6.666666666666666]"
2,38310,"[Fulham, Crystal Palace]","[0, 2]","[0, 1]",2018-08-11,"[4-3-3, 4-4-2]","[[Fabri, Goalkeeper, 19771], [Calum Chambers, ...","[177723, 205989, 200778, 203505, 192725, 17531...",2018-07-19,2018-07-19,2018-08-21,"[80, 75, 72, 75, 76, 75, 79, 82, 76, 75, 78, 7...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 1.0]","[6.666666666666666, 6.666666666666666]"
3,38311,"[Huddersfield, Chelsea]","[0, 3]","[0, 2]",2018-08-11,"[3-5-1-1, 4-3-3]","[[Ben Hamer, Goalkeeper, 3183], [Christopher S...","[170008, 200607, 183491, 203485, 183546, 19495...",2018-07-19,2018-07-19,2018-08-21,"[68, 78, 76, 76, 76, 79, 72, 73, 68, 73, 76, 8...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 1.0]","[6.666666666666666, 6.666666666666666]"
6,38314,"[Newcastle, Spurs]","[1, 2]","[1, 2]",2018-08-11,"[4-4-1-1, 4-3-2-1]","[[Martin Dúbravka, Goalkeeper, 6451], [Ciaran ...","[220407, 183129, 203487, 204355, 212722, 18916...",2018-07-19,2018-07-19,2018-08-21,"[77, 76, 78, 74, 76, 78, 77, 76, 75, 74, 76, 8...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 1.0]","[6.666666666666666, 6.666666666666666]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2207,93635,"[Fulham, Newcastle]","[0, 1]","[0, 0]",2024-04-06,"[4-2-3-1, 4-3-3]","[[Bernd Leno, Goalkeeper, 4985], [Calvin Basse...","[192563, 241436, 222104, 222501, 229348, 20845...",2024-03-27,2024-03-20,2024-04-09,"[82, 75, 76, 76, 78, 77, 84, 77, 76, 77, 71, 7...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 1.0]","[7.0, 7.0]"
2203,93631,"[Aston Villa, Brentford]","[3, 3]","[1, 0]",2024-04-06,"[4-4-1-1, 3-5-2]","[[Emiliano Martínez, Goalkeeper, 4245], [Lucas...","[202811, 200458, 227678, 241464, 219693, 22990...",2024-03-27,2024-03-20,2024-04-09,"[86, 79, 80, 82, 81, 82, 82, 81, 67, 82, 80, 8...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 1.0, 0.0]","[7.0, 3.0]"
2208,93636,"[Luton, Bournemouth]","[2, 1]","[0, 0]",2024-04-06,"[3-4-2-1, 4-2-3-1]","[[Thomas Kaminski, Goalkeeper, 5844], [Teden M...","[188400, 253465, 221456, 253052, 202464, 25248...",2024-03-27,2024-03-20,2024-04-09,"[75, 72, 70, 73, 71, 73, 70, 76, 72, 71, 74, 7...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.0, 0.0, 0.0]","[2.0, 13.0]"
2204,93632,"[Brighton, Arsenal]","[0, 3]","[0, 1]",2024-04-06,"[4-2-3-1, 4-3-3]","[[Bart Verbruggen, Goalkeeper, 75709], [Pervis...","[258498, 237942, 258908, 199915, 242418, 23977...",2024-03-27,2024-03-20,2024-04-09,"[75, 81, 74, 81, 75, 73, 69, 74, 71, 80, 75, 8...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 1.0]","[5.0, 13.0]"


## Normalize columns

In [23]:
# We need to normalize the values in our columns that will be used as features and labels for our models.

# Start by normalizing player ratings
# Find min and max of ratings
all_ratings = []
for _, row in matches.iterrows():
    all_ratings.extend(row['player_ratings'])

min(all_ratings) # 53
max(all_ratings) # 91

# We transform values so the score 50 is equivalent to 0, and 100 is equivalent to 1.
def normalize_ratings(list):
    normalized_list = []
    for rating in list:
        normalized_rating = (rating - 50)/50
        normalized_list.append(normalized_rating)
    return normalized_list

# Set normalized_ratings column
matches['normalized_ratings'] = matches['player_ratings'].apply(normalize_ratings)

# Normalize form, we simply divide by 15
matches['normalized_form'] = matches['form'].apply(lambda x: [form / 15 for form in x])
# Round normalized column to 2 digits after decimal point.
matches['normalized_form'] = matches['normalized_form'].apply(lambda x: [round(form, 2) for form in x])

In [24]:
matches

Unnamed: 0,match,teams,score,ht_score,date,formations,players,fifa_ids,closest_update,second_closest_update,closest_future_update,player_ratings,teams_bool,outcome,form,normalized_ratings,normalized_form
5,38313,"[Man Utd, Leicester]","[2, 1]","[1, 0]",2018-08-10,"[4-3-3, 4-2-3-1]","[[David de Gea, Goalkeeper, 4330], [Luke Shaw,...","[193080, 205988, 184392, 221660, 225508, 20929...",2018-07-19,2018-07-19,2018-08-21,"[91, 78, 77, 79, 83, 82, 78, 88, 83, 87, 81, 8...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.0, 0.0, 0.0]","[6.666666666666666, 6.666666666666666]","[0.82, 0.56, 0.54, 0.58, 0.66, 0.64, 0.56, 0.7...","[0.44, 0.44]"
1,38309,"[Bournemouth, Cardiff]","[2, 0]","[1, 0]",2018-08-11,"[4-4-2, 4-1-4-1]","[[Asmir Begovic, Goalkeeper, 2537], [Steve Coo...","[172723, 193011, 190885, 208920, 169638, 20780...",2018-07-19,2018-07-19,2018-08-21,"[79, 77, 75, 79, 75, 75, 73, 76, 69, 77, 78, 7...","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.0, 0.0, 0.0]","[6.666666666666666, 6.666666666666666]","[0.58, 0.54, 0.5, 0.58, 0.5, 0.5, 0.46, 0.52, ...","[0.44, 0.44]"
2,38310,"[Fulham, Crystal Palace]","[0, 2]","[0, 1]",2018-08-11,"[4-3-3, 4-4-2]","[[Fabri, Goalkeeper, 19771], [Calum Chambers, ...","[177723, 205989, 200778, 203505, 192725, 17531...",2018-07-19,2018-07-19,2018-08-21,"[80, 75, 72, 75, 76, 75, 79, 82, 76, 75, 78, 7...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 1.0]","[6.666666666666666, 6.666666666666666]","[0.6, 0.5, 0.44, 0.5, 0.52, 0.5, 0.58, 0.64, 0...","[0.44, 0.44]"
3,38311,"[Huddersfield, Chelsea]","[0, 3]","[0, 2]",2018-08-11,"[3-5-1-1, 4-3-3]","[[Ben Hamer, Goalkeeper, 3183], [Christopher S...","[170008, 200607, 183491, 203485, 183546, 19495...",2018-07-19,2018-07-19,2018-08-21,"[68, 78, 76, 76, 76, 79, 72, 73, 68, 73, 76, 8...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 1.0]","[6.666666666666666, 6.666666666666666]","[0.36, 0.56, 0.52, 0.52, 0.52, 0.58, 0.44, 0.4...","[0.44, 0.44]"
6,38314,"[Newcastle, Spurs]","[1, 2]","[1, 2]",2018-08-11,"[4-4-1-1, 4-3-2-1]","[[Martin Dúbravka, Goalkeeper, 6451], [Ciaran ...","[220407, 183129, 203487, 204355, 212722, 18916...",2018-07-19,2018-07-19,2018-08-21,"[77, 76, 78, 74, 76, 78, 77, 76, 75, 74, 76, 8...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 1.0]","[6.666666666666666, 6.666666666666666]","[0.54, 0.52, 0.56, 0.48, 0.52, 0.56, 0.54, 0.5...","[0.44, 0.44]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2207,93635,"[Fulham, Newcastle]","[0, 1]","[0, 0]",2024-04-06,"[4-2-3-1, 4-3-3]","[[Bernd Leno, Goalkeeper, 4985], [Calvin Basse...","[192563, 241436, 222104, 222501, 229348, 20845...",2024-03-27,2024-03-20,2024-04-09,"[82, 75, 76, 76, 78, 77, 84, 77, 76, 77, 71, 7...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 1.0]","[7.0, 7.0]","[0.64, 0.5, 0.52, 0.52, 0.56, 0.54, 0.68, 0.54...","[0.47, 0.47]"
2203,93631,"[Aston Villa, Brentford]","[3, 3]","[1, 0]",2024-04-06,"[4-4-1-1, 3-5-2]","[[Emiliano Martínez, Goalkeeper, 4245], [Lucas...","[202811, 200458, 227678, 241464, 219693, 22990...",2024-03-27,2024-03-20,2024-04-09,"[86, 79, 80, 82, 81, 82, 82, 81, 67, 82, 80, 8...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 1.0, 0.0]","[7.0, 3.0]","[0.72, 0.58, 0.6, 0.64, 0.62, 0.64, 0.64, 0.62...","[0.47, 0.2]"
2208,93636,"[Luton, Bournemouth]","[2, 1]","[0, 0]",2024-04-06,"[3-4-2-1, 4-2-3-1]","[[Thomas Kaminski, Goalkeeper, 5844], [Teden M...","[188400, 253465, 221456, 253052, 202464, 25248...",2024-03-27,2024-03-20,2024-04-09,"[75, 72, 70, 73, 71, 73, 70, 76, 72, 71, 74, 7...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.0, 0.0, 0.0]","[2.0, 13.0]","[0.5, 0.44, 0.4, 0.46, 0.42, 0.46, 0.4, 0.52, ...","[0.13, 0.87]"
2204,93632,"[Brighton, Arsenal]","[0, 3]","[0, 1]",2024-04-06,"[4-2-3-1, 4-3-3]","[[Bart Verbruggen, Goalkeeper, 75709], [Pervis...","[258498, 237942, 258908, 199915, 242418, 23977...",2024-03-27,2024-03-20,2024-04-09,"[75, 81, 74, 81, 75, 73, 69, 74, 71, 80, 75, 8...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 1.0]","[5.0, 13.0]","[0.5, 0.62, 0.48, 0.62, 0.5, 0.46, 0.38, 0.48,...","[0.33, 0.87]"


In [25]:
# Reset dataframe indexes
matches.reset_index(drop=True, inplace=True)

# Export dataframe
matches.to_pickle("matches_clean.pkl")
matches.to_csv("matches_clean.csv", index=False)

## Bookmaker odds columns

In [3]:
# Import matches pickle
matches = pd.read_pickle("matches_clean.pkl")

In [47]:
bookies_23_24 = pd.read_csv("E0 (1).csv")
bookies_22_23 = pd.read_csv("E0 (2).csv")
bookies_21_22 = pd.read_csv("E0 (3).csv")
bookies_20_21 = pd.read_csv("E0 (4).csv")
bookies_19_20 = pd.read_csv("E0 (5).csv")
bookies_18_19 = pd.read_csv("E0 (6).csv")

# Concatenate dataframes, sort by dates and keep only rows where we have data in matches dataframe.
bookies = pd.concat([bookies_18_19, bookies_19_20, bookies_20_21, bookies_21_22, bookies_22_23, bookies_23_24], ignore_index=True)
bookies['Date'] = bookies['Date'].apply(lambda x: datetime.strptime(x, '%d/%m/%Y').date())

bookies.loc[:len(matches)-1, 'Div':'PSA']

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,B365A,BWH,BWD,BWA,IWH,IWD,IWA,PSH,PSD,PSA
0,E0,2018-08-10,Man United,Leicester,2,1,H,1,0,H,...,7.50,1.53,4.00,7.50,1.55,3.80,7.00,1.58,3.93,7.50
1,E0,2018-08-11,Bournemouth,Cardiff,2,0,H,1,0,H,...,4.50,1.90,3.40,4.40,1.90,3.50,4.10,1.89,3.63,4.58
2,E0,2018-08-11,Fulham,Crystal Palace,0,2,A,0,1,A,...,3.00,2.45,3.30,2.95,2.40,3.30,2.95,2.50,3.46,3.00
3,E0,2018-08-11,Huddersfield,Chelsea,0,3,A,0,2,A,...,1.61,6.25,3.90,1.57,6.20,4.00,1.55,6.41,4.02,1.62
4,E0,2018-08-11,Newcastle,Tottenham,1,2,A,1,2,A,...,2.04,3.80,3.50,2.00,3.70,3.35,2.05,3.83,3.57,2.08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2205,E0,2024-04-06,Everton,Burnley,1,0,H,1,0,H,...,5.25,1.65,4.00,5.00,,,,1.68,4.18,4.99
2206,E0,2024-04-06,Fulham,Newcastle,0,1,A,0,0,D,...,2.80,2.37,3.75,2.70,,,,2.35,3.87,2.86
2207,E0,2024-04-06,Luton,Bournemouth,2,1,H,0,0,D,...,1.91,3.70,4.00,1.85,,,,3.65,4.11,1.93
2208,E0,2024-04-06,Wolves,West Ham,1,2,A,1,0,H,...,2.70,2.45,3.50,2.70,,,,2.53,3.61,2.77


In [51]:
# Load CSV files as dataframes
bookies_23_24 = pd.read_csv("E0 (1).csv")
bookies_22_23 = pd.read_csv("E0 (2).csv")
bookies_21_22 = pd.read_csv("E0 (3).csv")
bookies_20_21 = pd.read_csv("E0 (4).csv")
bookies_19_20 = pd.read_csv("E0 (5).csv")
bookies_18_19 = pd.read_csv("E0 (6).csv")

# Concatenate dataframes, sort by dates and keep only rows where we have data in matches dataframe.
bookies = pd.concat([bookies_18_19, bookies_19_20, bookies_20_21, bookies_21_22, bookies_22_23, bookies_23_24], ignore_index=True)
bookies['Date'] = bookies['Date'].apply(lambda x: datetime.strptime(x, '%d/%m/%Y').date())
bookies = bookies.sort_values(['Date', 'HomeTeam'])
bookies = bookies[bookies['Date'] <= matches.iloc[-1]['date']]
bookies.reset_index(drop=True, inplace=True)

In [52]:
bookies

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,AvgC<2.5,AHCh,B365CAHH,B365CAHA,PCAHH,PCAHA,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA
0,E0,2018-08-10,Man United,Leicester,2,1,H,1,0,H,...,,,,,,,,,,
1,E0,2018-08-11,Bournemouth,Cardiff,2,0,H,1,0,H,...,,,,,,,,,,
2,E0,2018-08-11,Fulham,Crystal Palace,0,2,A,0,1,A,...,,,,,,,,,,
3,E0,2018-08-11,Huddersfield,Chelsea,0,3,A,0,2,A,...,,,,,,,,,,
4,E0,2018-08-11,Newcastle,Tottenham,1,2,A,1,2,A,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2205,E0,2024-04-06,Crystal Palace,Man City,2,4,A,1,1,D,...,2.23,1.50,1.75,2.05,1.78,2.16,1.93,2.17,1.80,2.07
2206,E0,2024-04-06,Everton,Burnley,1,0,H,1,0,H,...,2.05,-0.75,1.99,1.91,1.99,1.93,2.04,1.95,1.96,1.90
2207,E0,2024-04-06,Fulham,Newcastle,0,1,A,0,0,D,...,2.38,-0.25,2.08,1.82,2.15,1.79,2.17,1.82,2.12,1.76
2208,E0,2024-04-06,Luton,Bournemouth,2,1,H,0,0,D,...,2.55,0.50,1.97,1.93,2.00,1.93,2.00,1.97,1.95,1.92


In [53]:
# Calculate the inverse of odds for bookmakers, this is equivalent to the probability
odds_columns = bookies.loc[:, 'B365H':'VCA'].columns.tolist()
for col in odds_columns:
    new_col_name = col + '_inv'
    bookies[new_col_name] = 1 / bookies[col]

# Calculate the margin for each bookmaker
inv_odds_columns = bookies.loc[:, 'B365H_inv':'VCA_inv'].columns.tolist()

bookies['B365_margin'] = np.nan
bookies['BW_margin'] = np.nan
bookies['IW_margin'] = np.nan
bookies['PS_margin'] = np.nan
bookies['WH_margin'] = np.nan
bookies['VC_margin'] = np.nan

# Calculate sums for every 3 columns (probabilities of home + draw + away)
prob_sums = []
for i in range(0, len(inv_odds_columns), 3):
    prob_sum = sum(bookies[col_name]for col_name in inv_odds_columns[i:i+3])
    prob_sums.append(prob_sum)

for i, col in enumerate(bookies.loc[:, 'B365_margin':'VC_margin'].columns):
    bookies[col] = prob_sums[i]

# Looking at margins, we see that Pinnacle (PS) has by far the lowest margins. We will use their odds to train the model.
print("Bookmaker margins:")
print(bookies['B365_margin'].mean(skipna=True))
print(bookies['BW_margin'].mean(skipna=True))
print(bookies['IW_margin'].mean(skipna=True))
print(bookies['PS_margin'].mean(skipna=True))
print(bookies['WH_margin'].mean(skipna=True))
print(bookies['VC_margin'].mean(skipna=True))

# Create Pinnacle probabilities labels
all_labels = []
for index, row in bookies.iterrows():
    labels = []
    for column in ['PSH_inv', 'PSD_inv', 'PSA_inv']:
        labels.append(round(row[column], 4))

    all_labels.append(labels)


bookies['PS_probs'] = all_labels

Bookmaker margins:
1.0493861856344195
1.0535019251926496
1.0519664470901586
1.0272269861148056
1.0586938427655022
1.0526171448242363


  bookies['PS_margin'] = np.nan
  bookies['WH_margin'] = np.nan
  bookies['VC_margin'] = np.nan
  bookies['PS_probs'] = all_labels


In [54]:
bookies

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,VCH_inv,VCD_inv,VCA_inv,B365_margin,BW_margin,IW_margin,PS_margin,WH_margin,VC_margin,PS_probs
0,E0,2018-08-10,Man United,Leicester,2,1,H,1,0,H,...,0.636943,0.250000,0.142857,1.026686,1.036928,1.051176,1.020698,1.066767,1.029800,"[0.6329, 0.2545, 0.1333]"
1,E0,2018-08-11,Bournemouth,Cardiff,2,0,H,1,0,H,...,0.534759,0.277778,0.210526,1.026316,1.047706,1.055933,1.022923,1.059274,1.023063,"[0.5291, 0.2755, 0.2183]"
2,E0,2018-08-11,Fulham,Crystal Palace,0,2,A,0,1,A,...,0.400000,0.294118,0.333333,1.027451,1.050177,1.058680,1.022351,1.068336,1.027451,"[0.4, 0.289, 0.3333]"
3,E0,2018-08-11,Huddersfield,Chelsea,0,3,A,0,2,A,...,0.153846,0.250000,0.617284,1.024964,1.053353,1.056452,1.022046,1.065767,1.021130,"[0.156, 0.2488, 0.6173]"
4,E0,2018-08-11,Newcastle,Tottenham,1,2,A,1,2,A,...,0.256410,0.294118,0.476190,1.032321,1.048872,1.056583,1.021978,1.063463,1.026718,"[0.2611, 0.2801, 0.4808]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2205,E0,2024-04-06,Crystal Palace,Man City,2,4,A,1,1,D,...,0.090909,0.181818,0.775194,1.047619,1.050505,,1.036235,1.057012,1.047921,"[0.094, 0.1789, 0.7634]"
2206,E0,2024-04-06,Everton,Burnley,1,0,H,1,0,H,...,0.617284,0.263158,0.190476,1.046537,1.056061,,1.034873,1.064394,1.070918,"[0.5952, 0.2392, 0.2004]"
2207,E0,2024-04-06,Fulham,Newcastle,0,1,A,0,0,D,...,0.434783,0.270270,0.370370,1.040469,1.058978,,1.033580,1.061563,1.075423,"[0.4255, 0.2584, 0.3497]"
2208,E0,2024-04-06,Luton,Bournemouth,2,1,H,0,0,D,...,0.277778,0.263158,0.534759,1.053385,1.060811,,1.035416,1.056414,1.075695,"[0.274, 0.2433, 0.5181]"


In [56]:
# We need to sort our dataframes by date, then by home team name, in order to match matches up with odds properly.
# Create home team column in matches for sorting
matches['home_team'] = matches['teams'].apply(lambda x: x[0])

# Sort dataframes
matches = matches.sort_values(['date', 'home_team'])
bookies = bookies.sort_values(['Date', 'HomeTeam'])

# Add Pinnacle probabilities to matches dataframe
matches['PS_probs'] = bookies['PS_probs']

In [37]:
# Add other bookmakers data to matches, these will be used to simulate betting.
# Create lists from odds
bookies['B365_odds'] = [[row[column] for column in ['B365H', 'B365D', 'B365A']] for _, row in bookies.iterrows()]
bookies['BW_odds'] = [[row[column] for column in ['BWH', 'BWD', 'BWA']] for _, row in bookies.iterrows()]
bookies['IW_odds'] = [[row[column] for column in ['IWH', 'IWD', 'IWA']] for _, row in bookies.iterrows()]
bookies['PS_odds'] = [[row[column] for column in ['PSH', 'PSD', 'PSA']] for _, row in bookies.iterrows()]
bookies['WH_odds'] = [[row[column] for column in ['WHH', 'WHD', 'WHA']] for _, row in bookies.iterrows()]
bookies['VC_odds'] = [[row[column] for column in ['VCH', 'VCD', 'VCA']] for _, row in bookies.iterrows()]

# Add odds to matches dataframe
matches['B365_odds'] = bookies['B365_odds']
matches['BW_odds'] = bookies['BW_odds']
matches['IW_odds'] = bookies['IW_odds']
matches['PS_odds'] = bookies['PS_odds']
matches['WH_odds'] = bookies['WH_odds']
matches['VC_odds'] = bookies['VC_odds']

In [64]:
max_odds = []
for _, row in matches.iterrows():
    home_max = max(row['B365_odds'][0], row['BW_odds'][0], row['IW_odds'][0], row['PS_odds'][0], row['WH_odds'][0], row['VC_odds'][0])
    draw_max = max(row['B365_odds'][1], row['BW_odds'][1], row['IW_odds'][1], row['PS_odds'][1], row['WH_odds'][1], row['VC_odds'][1])
    away_max = max(row['B365_odds'][2], row['BW_odds'][2], row['IW_odds'][2], row['PS_odds'][2], row['WH_odds'][2], row['VC_odds'][2])
    match_max = [home_max, draw_max, away_max]
    max_odds.append(match_max)

matches['max_odds'] = max_odds

# Find average margin of max odds
reciprocal_max_odds = 1 / matches['max_odds'].apply(pd.Series)
reciprocal_max_odds.sum(axis=1).sum()/len(matches) # 1.019372315488447

In [67]:
# Save data
matches.reset_index(drop=True, inplace=True)
matches.to_csv("matches_clean.csv", index=False)
matches.to_pickle("matches_clean.pkl")

bookies.reset_index(drop=True, inplace=True)
bookies.to_csv("bookies.csv", index=False)
bookies.to_pickle("bookies.pkl")

In [66]:
matches

Unnamed: 0,match,teams,score,ht_score,date,formations,players,fifa_ids,closest_update,second_closest_update,...,normalized_form,home_team,PS_probs,B365_odds,BW_odds,IW_odds,PS_odds,WH_odds,VC_odds,max_odds
0,38313,"[Man Utd, Leicester]","[2, 1]","[1, 0]",2018-08-10,"[4-3-3, 4-2-3-1]","[[David de Gea, Goalkeeper, 4330], [Luke Shaw,...","[193080, 205988, 184392, 221660, 225508, 20929...",2018-07-19,2018-07-19,...,"[0.44, 0.44]",Man Utd,"[0.6329, 0.2545, 0.1333]","[1.57, 3.9, 7.5]","[1.53, 4.0, 7.5]","[1.55, 3.8, 7.0]","[1.58, 3.93, 7.5]","[1.57, 3.8, 6.0]","[1.57, 4.0, 7.0]","[1.58, 4.0, 7.5]"
1,38309,"[Bournemouth, Cardiff]","[2, 0]","[1, 0]",2018-08-11,"[4-4-2, 4-1-4-1]","[[Asmir Begovic, Goalkeeper, 2537], [Steve Coo...","[172723, 193011, 190885, 208920, 169638, 20780...",2018-07-19,2018-07-19,...,"[0.44, 0.44]",Bournemouth,"[0.5291, 0.2755, 0.2183]","[1.9, 3.6, 4.5]","[1.9, 3.4, 4.4]","[1.9, 3.5, 4.1]","[1.89, 3.63, 4.58]","[1.91, 3.5, 4.0]","[1.87, 3.6, 4.75]","[1.91, 3.63, 4.75]"
2,38310,"[Fulham, Crystal Palace]","[0, 2]","[0, 1]",2018-08-11,"[4-3-3, 4-4-2]","[[Fabri, Goalkeeper, 19771], [Calum Chambers, ...","[177723, 205989, 200778, 203505, 192725, 17531...",2018-07-19,2018-07-19,...,"[0.44, 0.44]",Fulham,"[0.4, 0.289, 0.3333]","[2.5, 3.4, 3.0]","[2.45, 3.3, 2.95]","[2.4, 3.3, 2.95]","[2.5, 3.46, 3.0]","[2.45, 3.3, 2.8]","[2.5, 3.4, 3.0]","[2.5, 3.46, 3.0]"
3,38311,"[Huddersfield, Chelsea]","[0, 3]","[0, 2]",2018-08-11,"[3-5-1-1, 4-3-3]","[[Ben Hamer, Goalkeeper, 3183], [Christopher S...","[170008, 200607, 183491, 203485, 183546, 19495...",2018-07-19,2018-07-19,...,"[0.44, 0.44]",Huddersfield,"[0.156, 0.2488, 0.6173]","[6.5, 4.0, 1.61]","[6.25, 3.9, 1.57]","[6.2, 4.0, 1.55]","[6.41, 4.02, 1.62]","[5.8, 3.9, 1.57]","[6.5, 4.0, 1.62]","[6.5, 4.02, 1.62]"
4,38314,"[Newcastle, Spurs]","[1, 2]","[1, 2]",2018-08-11,"[4-4-1-1, 4-3-2-1]","[[Martin Dúbravka, Goalkeeper, 6451], [Ciaran ...","[220407, 183129, 203487, 204355, 212722, 18916...",2018-07-19,2018-07-19,...,"[0.44, 0.44]",Newcastle,"[0.2611, 0.2801, 0.4808]","[3.9, 3.5, 2.04]","[3.8, 3.5, 2.0]","[3.7, 3.35, 2.05]","[3.83, 3.57, 2.08]","[3.8, 3.2, 2.05]","[3.9, 3.4, 2.1]","[3.9, 3.57, 2.1]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2205,93633,"[Crystal Palace, Man City]","[2, 4]","[1, 1]",2024-04-06,"[3-4-3, ]","[[Dean Henderson, Goalkeeper, 13988], [Joel Wa...","[233306, 186392, 213991, 224221, 259240, 23764...",2024-03-27,2024-03-20,...,"[0.33, 0.73]",Crystal Palace,"[0.094, 0.1789, 0.7634]","[9.5, 5.25, 1.33]","[9.0, 5.5, 1.32]","[nan, nan, nan]","[10.64, 5.59, 1.31]","[10.0, 5.5, 1.29]","[11.0, 5.5, 1.29]","[11.0, 5.59, 1.33]"
2206,93634,"[Everton, Burnley]","[1, 0]","[1, 0]",2024-04-06,"[4-4-1-1, 4-4-2]","[[Jordan Pickford, Goalkeeper, 4640], [Séamus ...","[204935, 180216, 247649, 202695, 244380, 24365...",2024-03-27,2024-03-20,...,"[0.13, 0.4]",Everton,"[0.5952, 0.2392, 0.2004]","[1.65, 4.0, 5.25]","[1.65, 4.0, 5.0]","[nan, nan, nan]","[1.68, 4.18, 4.99]","[1.65, 4.0, 4.8]","[1.62, 3.8, 5.25]","[1.68, 4.18, 5.25]"
2207,93635,"[Fulham, Newcastle]","[0, 1]","[0, 0]",2024-04-06,"[4-2-3-1, 4-3-3]","[[Bernd Leno, Goalkeeper, 4985], [Calvin Basse...","[192563, 241436, 222104, 222501, 229348, 20845...",2024-03-27,2024-03-20,...,"[0.47, 0.47]",Fulham,"[0.4255, 0.2584, 0.3497]","[2.38, 3.8, 2.8]","[2.37, 3.75, 2.7]","[nan, nan, nan]","[2.35, 3.87, 2.86]","[2.3, 3.9, 2.7]","[2.3, 3.7, 2.7]","[2.38, 3.9, 2.86]"
2208,93636,"[Luton, Bournemouth]","[2, 1]","[0, 0]",2024-04-06,"[3-4-2-1, 4-2-3-1]","[[Thomas Kaminski, Goalkeeper, 5844], [Teden M...","[188400, 253465, 221456, 253052, 202464, 25248...",2024-03-27,2024-03-20,...,"[0.13, 0.87]",Luton,"[0.274, 0.2433, 0.5181]","[3.75, 3.8, 1.91]","[3.7, 4.0, 1.85]","[nan, nan, nan]","[3.65, 4.11, 1.93]","[3.6, 4.2, 1.85]","[3.6, 3.8, 1.87]","[3.75, 4.2, 1.93]"
