In [60]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Introduction

Predict over 2.5 goals at French League Division 1. Data from [football-data.co.uk](http://www.football-data.co.uk/).

## Read Data


In [61]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [62]:
french2223 = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/ML/Models-Sports/base french league/F1 2223.csv')
french2324 = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/ML/Models-Sports/base french league/F12324.csv')
french2425 = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/ML/Models-Sports/base french league/F2425.csv')

In [63]:
french = pd.concat([french2223, french2324, french2425], axis=0)

In [64]:
french.info()

<class 'pandas.core.frame.DataFrame'>
Index: 821 entries, 0 to 134
Columns: 131 entries, Div to BFECAHA
dtypes: float64(108), int64(16), object(7)
memory usage: 846.7+ KB


In [65]:
#for i in french.columns:
    #print(i)

## Data Cleaning

In [66]:
# create a match id
def match_id(df):
  df['match_id'] = df['Date'] + df['HomeTeam'] + df['AwayTeam']
  return df

french = match_id(french)

### Adjusting column names

In [67]:
french

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,1XBCD,1XBCA,BFECH,BFECD,BFECA,BFEC>2.5,BFEC<2.5,BFECAHH,BFECAHA,match_id
0,F1,05/08/2022,20:00,Lyon,Ajaccio,2,1,H,2,1,...,,,,,,,,,,05/08/2022LyonAjaccio
1,F1,06/08/2022,16:00,Strasbourg,Monaco,1,2,A,0,1,...,,,,,,,,,,06/08/2022StrasbourgMonaco
2,F1,06/08/2022,20:00,Clermont,Paris SG,0,5,A,0,3,...,,,,,,,,,,06/08/2022ClermontParis SG
3,F1,07/08/2022,12:00,Toulouse,Nice,1,1,D,1,0,...,,,,,,,,,,07/08/2022ToulouseNice
4,F1,07/08/2022,14:00,Angers,Nantes,0,0,D,0,0,...,,,,,,,,,,07/08/2022AngersNantes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130,F1,15/12/2024,14:00,Montpellier,Nice,2,2,D,1,2,...,3.85,2.37,3.10,3.85,2.36,1.63,2.52,1.92,2.06,15/12/2024MontpellierNice
131,F1,15/12/2024,16:00,Brest,Nantes,4,1,H,2,0,...,3.26,3.59,2.38,3.25,3.60,2.62,1.60,2.01,1.97,15/12/2024BrestNantes
132,F1,15/12/2024,16:00,Le Havre,Strasbourg,0,3,A,0,2,...,3.30,2.26,3.60,3.30,2.36,2.18,1.83,1.99,2.00,15/12/2024Le HavreStrasbourg
133,F1,15/12/2024,16:00,Rennes,Angers,2,0,H,1,0,...,3.95,5.77,1.64,4.00,7.00,2.12,1.87,1.81,2.19,15/12/2024RennesAngers


In [68]:
match_stats_info = [
    "Div",  # League Division
    "match_id",  # Match ID (to be used with the results dataset)
    "Date",  # Match Date (dd/mm/yy)
    "Time",  # Time of match kick off
    "HomeTeam",  # Home Team
    "AwayTeam",  # Away Team
    "FTHG",  # Full Time Home Team Goals
    #"HG",  # Full Time Home Team Goals (alternate name)
    "FTAG",  # Full Time Away Team Goals
    #"AG",  # Full Time Away Team Goals (alternate name)
    "FTR",  # Full Time Result (H=Home Win, D=Draw, A=Away Win)
    #"Res",  # Full Time Result (alternate name)
    "HTHG",  # Half Time Home Team Goals
    "HTAG",  # Half Time Away Team Goals
    "HTR",  # Half Time Result (H=Home Win, D=Draw, A=Away Win)
    # Match Statistics
    #"Attendance",  # Crowd Attendance
    #"Referee",  # Match Referee
    "HS",  # Home Team Shots
    "AS",  # Away Team Shots
    "HST",  # Home Team Shots on Target
    "AST",  # Away Team Shots on Target
    #"HHW",  # Home Team Hit Woodwork
    #"AHW",  # Away Team Hit Woodwork
    "HC",  # Home Team Corners
    "AC",  # Away Team Corners
    "HF",  # Home Team Fouls Committed
    "AF",  # Away Team Fouls Committed
    #"HFKC",  # Home Team Free Kicks Conceded
    #"AFKC",  # Away Team Free Kicks Conceded
    #"HO",  # Home Team Offsides
    #"AO",  # Away Team Offsides
    "HY",  # Home Team Yellow Cards
    "AY",  # Away Team Yellow Cards
    "HR",  # Home Team Red Cards
    "AR"  # Away Team Red Cards
    #"HBP",  # Home Team Bookings Points (10 = yellow, 25 = red)
    #"ABP"  # Away Team Bookings Points (10 = yellow, 25 = red)
]

match_odds_info = ["Date",  # Match Date (dd/mm/yy),
              "match_id",  # Match ID (to be used with the results dataset),
              'AvgH', # Average home win odds market
              'AvgD', # Average draw odds market
              'AvgA', # Average away win odds market
              'Avg>2.5', # Average over 2.5 goals odds market
              'Avg<2.5', # Average under 2.5 goals odds market
              'AvgAHH', # Average Asian handicap home win odds market
              'AvgAHA'] # Average Asian handicap away win odds market

# making column names clear

match_stats_column_names = {
    "div": "league_division",
    'match_id': 'match_id',
    "date": "date",
    "time": "time_kick_off",
    "home_team": "h_team",
    "away_team": "a_team",
    "ft_home_goals": "ft_h_goals",
    # "hg": "full_time_home_team_goals_alternate_name",
    "ft_away_goals": "ft_a_goals",
    # "ag": "full_time_away_team_goals_alternate_name",
    "ft_result": "ft_result",
    # "res": "full_time_result_alternate_name",
    "ht_home_goals": "ht_h_team_goals",
    "ht_away_goals": "ht_a_team_goals",
    "ht_result": "ht_result",
    # Match Statistics
    # "attendance": "crowd_attendance",
    # "referee": "match_referee",
    "home_shots": "h_team_shots",
    "away_shots": "a_team_shots",
    "home_shots_on_target": "h_team_shots_on_target",
    "away_shots_on_target": "a_team_shots_on_target",
    # "hhw": "home_team_hit_woodwork",
    # "ahw": "away_team_hit_woodwork",
    "home_corners": "h_team_corners",
    "away_corners": "a_team_corners",
    "home_fouls": "h_team_fouls_committed",
    "away_fouls": "a_team_fouls_committed",
    # "hfkc": "home_team_free_kicks_conceded",
    # "afkc": "away_team_free_kicks_conceded",
    # "ho": "home_team_offsides",
    # "ao": "away_team_offsides",
    "home_yellow_cards": "h_team_yellow_cards",
    "away_yellow_cards": "a_team_yellow_cards",
    "home_red_cards": "h_team_red_cards",
    "away_red_cards": "a_team_red_cards",
    # "hbp": "home_team_bookings_points_10_yellow_25_red",
    # "abp": "away_team_bookings_points_10_yellow_25_red"
}

# making column names clear

match_odds_column_names = {"Date": 'date',
    "match_id": "match_id",
    "avg_h": "avg_h_win_odds_market",
    "avg_d": "avg_d_odds_market",
    "avg_a": "avg_a_win_odds_market",
    "avg_over_2_5": "avg_o_2_5_goals_odds",
    "avg_under_2_5": "avg_u_2_5_goals_odds",
    "avg_ahh": "avg_asian_handicap_h_win_odds",
    "avg_aha": "avg_asian_handicap_a_win_odds"
}

In [69]:
french_m_s = french[match_stats_info] # Match Stats df
french_m_o = french[match_odds_info] #Match Odds df - later I merge

french_m_s.columns = match_stats_column_names.values()
french_m_o.columns = match_odds_column_names.values()

# Check for null values
print(french_m_s.isnull().sum())
print('\n')
print(french_m_o.isnull().sum())

league_division           0
match_id                  0
date                      0
time_kick_off             0
h_team                    0
a_team                    0
ft_h_goals                0
ft_a_goals                0
ft_result                 0
ht_h_team_goals           0
ht_a_team_goals           0
ht_result                 0
h_team_shots              0
a_team_shots              0
h_team_shots_on_target    0
a_team_shots_on_target    0
h_team_corners            0
a_team_corners            0
h_team_fouls_committed    0
a_team_fouls_committed    0
h_team_yellow_cards       0
a_team_yellow_cards       0
h_team_red_cards          0
a_team_red_cards          0
dtype: int64


date                             0
match_id                         0
avg_h_win_odds_market            0
avg_d_odds_market                0
avg_a_win_odds_market            0
avg_o_2_5_goals_odds             0
avg_u_2_5_goals_odds             0
avg_asian_handicap_h_win_odds    0
avg_asian_handicap_a_win_odds    

### Adjusting Variables

In [70]:
print(list(french_m_s.select_dtypes(['object']).columns))

['league_division', 'match_id', 'date', 'time_kick_off', 'h_team', 'a_team', 'ft_result', 'ht_result']


In [71]:
# Transform date to datetime
french_m_s.loc[:,'date'] = pd.to_datetime(french_m_s['date'], dayfirst=True)
french_m_o.loc[:,'date'] = pd.to_datetime(french_m_s['date'], dayfirst=True)

In [72]:
french_m_s['ft_result']

Unnamed: 0,ft_result
0,H
1,A
2,A
3,D
4,D
...,...
130,D
131,H
132,A
133,H


In [73]:
french_m_s = pd.get_dummies(french_m_s, columns=['ft_result', 'ht_result'])
french_m_s_1 = french_m_s.drop(['league_division', 'time_kick_off'], axis=1) # changed the variable name to maybe use this variables later

#### Create over goals variable

In [74]:
def over_goals(df, goals_thresholds):
  for threshold in goals_thresholds:
    df[f'over_{threshold}'] = np.where((df['ft_h_goals'] + df['ft_a_goals']) > threshold, 1, 0)
  return df

In [75]:

# Applying the function for multiple thresholds at once
french_m_s_1 = over_goals(french_m_s_1, [1.5, 2.5, 3.5, 4.5])

## Create a DF to make an analysis for each team
* Split the raw_match_stats to two datasets (home_team_stats and away_team_stats).
* Stack these two datasets so that each row is the stats for a team for one match (team_stats_per_match).
* At each row of this dataset, get the team name, find the stats for that team during the last 10 matches, and average these stats (avg_stats_per_team).
* Add these stats to the team_stats_per_match dataset.

- Then, often teams can play better at home than away so it's important to consider whether they are playing at home or away, so , calculate last 5 games average for home teams and away teams

### Reshape average pre-match stats
Now that we have the average stats for each team going into every match, we can create a dataset similar to the raw_match_stats, where each row represents both teams from one match.

Re-segment the home and away teams (name Team 1 and Team 2 rather than home and away). Combine at each match to get a dataset with a row representing each match.

In [76]:
def reshape_avg_prematch_stats(df):
  def each_team_averages(df):
    # Split the raw_match_stats to two datasets (home_team_stats and away_team_stats)

    df = df.sort_values(by=['date'], ascending = False) # sort by date

    home_team_stats = df[[
    'date',
    'match_id',
    'h_team',
    'ft_h_goals',
    'ht_h_team_goals',
    'h_team_corners',
    'h_team_shots',
    'h_team_shots_on_target',
    'h_team_fouls_committed',
    'h_team_yellow_cards',
    'h_team_red_cards',
    'ft_result_H',
    'ft_a_goals',
    'ht_a_team_goals',
    'a_team_corners',
    'a_team_shots',
    'a_team_shots_on_target',
    'a_team_fouls_committed',
    'a_team_yellow_cards',
    'a_team_red_cards']]

    home_team_stats = home_team_stats.rename(columns={
    'h_team': 'name',
    'ft_h_goals': 'goalsScored',
    'ht_h_team_goals': 'halfTimeGoalsScored',
    'h_team_corners': 'cornerCount',
    'h_team_shots': 'shots',
    'h_team_shots_on_target': 'shotsOnTarget',
    'h_team_fouls_committed': 'foulsConceded',
    'h_team_yellow_cards': 'yellowConceded',
    'h_team_red_cards': 'redConceded',
    'ft_result_H': 'result',
    'ft_a_goals': 'goalsConceded',
    'ht_a_team_goals': 'halfTimeGoalsConceded',
    'a_team_corners': 'cornersConceded',
    'a_team_shots': 'shotsConceded',
    'a_team_shots_on_target': 'shotsOnTargetConceded',
    'a_team_fouls_committed': 'foulsReceived',
    'a_team_yellow_cards': 'yellowOpponent',
    'a_team_red_cards': 'redOpponent'
})

    away_team_stats = df[[
    'date',
    'match_id',
    'a_team',
    'ft_a_goals',
    'ht_a_team_goals',
    'a_team_corners',
    'a_team_shots',
    'a_team_shots_on_target',
    'a_team_fouls_committed',
    'a_team_yellow_cards',
    'a_team_red_cards',
    'ft_result_A',
    'ft_h_goals',
    'ht_h_team_goals',
    'h_team_corners',
    'h_team_shots',
    'h_team_shots_on_target',
    'h_team_fouls_committed',
    'h_team_yellow_cards',
    'h_team_red_cards']]

    away_team_stats = away_team_stats.rename(columns={
    'a_team': 'name',
    'ft_a_goals': 'goalsScored',
    'ht_a_team_goals': 'halfTimeGoalsScored',
    'a_team_corners': 'cornerCount',
    'a_team_shots': 'shots',
    'a_team_shots_on_target': 'shotsOnTarget',
    'a_team_fouls_committed': 'foulsConceded',
    'a_team_yellow_cards': 'yellowConceded',
    'a_team_red_cards': 'redConceded',
    'ft_result_A': 'result',
    'ft_h_goals': 'goalsConceded',
    'ht_h_team_goals': 'halfTimeGoalsConceded',
    'h_team_corners': 'cornersConceded',
    'h_team_shots': 'shotsConceded',
    'h_team_shots_on_target': 'shotsOnTargetConceded',
    'h_team_fouls_committed': 'foulsReceived',
    'h_team_yellow_cards': 'yellowOpponent',
    'h_team_red_cards': 'redOpponent'
})

  # add an additional column to specify if the team is playing home or away
    home_team_stats['home_or_away']='Home'
    away_team_stats['home_or_away']='Away'

  # stack these two datasets so that each row is the stats for a team for one match (team_stats_per_match)
    team_stats_per_match = pd.concat([home_team_stats,away_team_stats])

  # avg for last team matches
    avg_lastTen_stat_columns = [
                    'average_goalsScored_last_ten',
                    'average_halfTimeGoalsScored_last_ten',
                    'average_cornerCount_last_ten',
                    'average_shots_last_ten',
                    'average_shotsOnTarget_last_ten',
                    'average_foulsConceded_last_ten',
                    'average_yellowConceded_last_ten',
                    'average_redConceded_last_ten',
                    'average_result_last_ten',
                    'average_goalsConceded_last_ten',
                    'average_halfTimeGoalsConceded_last_ten',
                    'average_cornersConceded_last_ten',
                    'average_shotsConceded_last_ten',
                    'average_shotsOnTargetConceded_last_ten',
                    'average_foulsReceived_last_ten',
                    'average_yellowOpponent_last_ten',
                    'average_redOpponent_last_ten'
                    ]

    lastTen_stats_list = []
    for index, row in team_stats_per_match.iterrows():
      team_stats_last_ten_matches = team_stats_per_match.loc[(team_stats_per_match['name']==row['name']) & (team_stats_per_match['date']<row['date'])].sort_values(by=['date'], ascending=False)
      lastTen_stats_list.append(team_stats_last_ten_matches.iloc[0:10,3:-1].mean(axis=0).values[0:18])

    avg_lastTen_stats_per_team = pd.DataFrame(lastTen_stats_list, columns=avg_lastTen_stat_columns)

  # avg last 5 for home team
    avg_lastFiveHome_stat_columns=[
                    'average_goalsScored_last_five_home',
                    'average_halfTimeGoalsScored_last_five_home',
                    'average_cornerCount_last_five_home',
                    'average_shots_last_five_home',
                    'average_shotsOnTarget_last_five_home',
                    'average_foulsConceded_last_five_home',
                    'average_yellowConceded_last_five_home',
                    'average_redConceded_last_five_home',
                    'average_result_last_five_home',
                    'average_goalsConceded_last_five_home',
                    'average_halfTimeGoalsConceded_last_five_home',
                    'average_cornersConceded_last_five_home',
                    'average_shotsConceded_last_five_home',
                    'average_shotsOnTargetConceded_last_five_home',
                    'average_foulsReceived_last_five_home',
                    'average_yellowOpponent_last_five_home',
                    'average_redOpponent_last_five_home'
                    ]

    lastFive_Home_stats_list = []
    team_stats_L5_home_matches = team_stats_per_match[team_stats_per_match['home_or_away'] == 'Home']
    for index, row in team_stats_L5_home_matches.iterrows():
      team_stats_last_five_home_matches = team_stats_L5_home_matches.loc[(team_stats_L5_home_matches['name']==row['name']) & (team_stats_L5_home_matches['date']<row['date'])].sort_values(by=['date'], ascending=False)
      lastFive_Home_stats_list.append(team_stats_last_five_home_matches.iloc[0:5,3:-1].mean(axis=0).values[0:18])

    avg_lastFiveHome_stats_per_team = pd.DataFrame(lastFive_Home_stats_list, columns=avg_lastFiveHome_stat_columns)
  # concatenating original filtered by home matches with stats
    team_stats_L5_home_matches = pd.concat([team_stats_L5_home_matches.reset_index(drop=True), avg_lastFiveHome_stats_per_team], axis=1, ignore_index=False)

  # avg last 5 for away team
    avg_lastFiveAway_stat_columns=[

                    'average_goalsScored_last_five_away',
                    'average_halfTimeGoalsScored_last_five_away',
                    'average_cornerCount_last_five_away',
                    'average_shots_last_five_away',
                    'average_shotsOnTarget_last_five_away',
                    'average_foulsConceded_last_five_away',
                    'average_yellowConceded_last_five_away',
                    'average_redConceded_last_five_away',
                    'average_result_last_five_away',
                    'average_goalsConceded_last_five_away',
                    'average_halfTimeGoalsConceded_last_five_away',
                    'average_cornersConceded_last_five_away',
                    'average_shotsConceded_last_five_away',
                    'average_shotsOnTargetConceded_last_five_away',
                    'average_foulsReceived_last_five_away',
                    'average_yellowOpponent_last_five_away',
                    'average_redOpponent_last_five_away'
                    ]

    lastFive_away_stats_list = []
    team_stats_L5_away_matches = team_stats_per_match[team_stats_per_match['home_or_away'] == 'Away']
    for index, row in team_stats_L5_away_matches.iterrows():
        team_stats_last_five_away_matches = team_stats_L5_away_matches.loc[(team_stats_L5_away_matches['name']==row['name']) & (team_stats_L5_away_matches['date']<row['date'])].sort_values(by=['date'], ascending=False)
        lastFive_away_stats_list.append(team_stats_last_five_away_matches.iloc[0:5,3:-1].mean(axis=0).values[0:18])

    avg_lastFiveAway_stats_per_team = pd.DataFrame(lastFive_away_stats_list, columns=avg_lastFiveAway_stat_columns)
   # concatenating original filtered by away matches with stats
    team_stats_L5_away_matches = pd.concat([team_stats_L5_away_matches.reset_index(drop=True), avg_lastFiveAway_stats_per_team], axis=1, ignore_index=False)

  # Adjust column names
    team_stats_L5_home_matches.columns = team_stats_L5_home_matches.columns[:2].tolist() + ['team_1_'+str(col) for col in team_stats_L5_home_matches.columns[2:]]
    team_stats_L5_away_matches.columns = team_stats_L5_away_matches.columns[:2].tolist() + ['team_2_'+str(col) for col in team_stats_L5_away_matches.columns[2:]]

    home_and_away_stats = pd.merge(team_stats_L5_home_matches,team_stats_L5_away_matches,how='left',on=['date','match_id'])

    return home_and_away_stats, team_stats_per_match, avg_lastTen_stats_per_team

   # Call each_team_averages and assign return values
  home_and_away_stats, team_stats_per_match, avg_lastTen_stats_per_team = each_team_averages(df)

  team_stats_per_match = pd.concat([team_stats_per_match.reset_index(drop=True), avg_lastTen_stats_per_team], axis=1, ignore_index=False)
# Re-segment the home and away teams.
  home_team_stats = team_stats_per_match.iloc[:int(team_stats_per_match.shape[0]/2),:]
  away_team_stats = team_stats_per_match.iloc[int(team_stats_per_match.shape[0]/2):,:]

  home_team_stats.columns = home_team_stats.columns[:2].tolist() + ['team_1_'+str(col) for col in home_team_stats.columns[2:]]
  away_team_stats.columns = away_team_stats.columns[:2].tolist() + ['team_2_'+str(col) for col in away_team_stats.columns[2:]]

# Combine at each match to get a dataset with a row representing each match.
# drop the NA rows (earliest match for each team, i.e no previous stats)
  away_team_stats = away_team_stats.iloc[:, 2:]
  match_stats = pd.concat([home_team_stats, away_team_stats.reset_index(drop=True)], axis=1, ignore_index=False)
  match_stats = match_stats.dropna().reset_index(drop=True)
  match_stats=pd.merge(match_stats,home_and_away_stats,how='left',on=['date',
                                                                    'match_id','team_1_name',
                                                                    'team_1_goalsScored',
                                                                    'team_1_halfTimeGoalsScored',
                                                                    'team_1_cornerCount',
                                                                    'team_1_shots',
                                                                    'team_1_shotsOnTarget',
                                                                    'team_1_foulsConceded',
                                                                    'team_1_yellowConceded',
                                                                    'team_1_redConceded',
                                                                    'team_1_result',
                                                                    'team_1_goalsConceded',
                                                                    'team_1_halfTimeGoalsConceded',
                                                                    'team_1_cornersConceded',
                                                                    'team_1_shotsConceded',
                                                                    'team_1_shotsOnTargetConceded',
                                                                    'team_1_foulsReceived',
                                                                    'team_1_yellowOpponent',
                                                                    'team_1_redOpponent',
                                                                    'team_1_home_or_away',
                                                                    'team_2_name',
                                                                    'team_2_goalsScored',
                                                                    'team_2_halfTimeGoalsScored',
                                                                    'team_2_cornerCount',
                                                                    'team_2_shots',
                                                                    'team_2_shotsOnTarget',
                                                                    'team_2_foulsConceded',
                                                                    'team_2_yellowConceded',
                                                                    'team_2_redConceded',
                                                                    'team_2_result',
                                                                    'team_2_goalsConceded',
                                                                    'team_2_halfTimeGoalsConceded',
                                                                    'team_2_cornersConceded',
                                                                    'team_2_shotsConceded',
                                                                    'team_2_shotsOnTargetConceded',
                                                                    'team_2_foulsReceived',
                                                                    'team_2_yellowOpponent',
                                                                    'team_2_redOpponent',
                                                                    'team_2_home_or_away'])

  match_stats = match_stats.dropna().reset_index(drop=True)
  match_stats['opp_code'] = match_stats['team_2_name'].astype('category').cat.codes # Identify each team as a number
  match_stats['team_code'] = match_stats['team_1_name'].astype('category').cat.codes

  match_stats = pd.get_dummies(match_stats, columns=['team_code', 'opp_code'])

  return match_stats

In [77]:
match_stats = reshape_avg_prematch_stats(french_m_s_1)

In [78]:
match_stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 794 entries, 0 to 793
Columns: 154 entries, date to opp_code_22
dtypes: bool(48), float64(68), int64(32), object(6)
memory usage: 694.9+ KB


##### Making target variable

In [79]:
match_stats['over_2.5'] = np.where((match_stats['team_1_goalsScored'] + match_stats['team_2_goalsScored']) > 2.5, 1, 0)
match_stats.value_counts('over_2.5')

Unnamed: 0_level_0,count
over_2.5,Unnamed: 1_level_1
1,434
0,360


## Goal Spread Analysis
Here, I will see how a linear regression explains the variance of the model based on analysis of goal spreads -> goals spreads is the difference between a number of goals a home team scored against its opponent.

In [80]:
sum(match_stats.isnull().sum())

0

In [81]:
def scaled_goals_shifted(df):
  # Calculation the goals spread for home team
  df['spread'] = df['team_1_goalsScored'] - df['team_2_goalsScored']
  # Total goals
  df['total_goals'] = df['team_1_goalsScored'] + df['team_2_goalsScored']
  # Avg number of goals for each team
  avg_goals_per_team = df['total_goals'].sum() / (2*len(df))
  # Scale score and away goals
  df['scaled_home_goals'] = df['team_1_goalsScored'] - avg_goals_per_team
  df['scaled_away_goals'] = df['team_2_goalsScored'] - avg_goals_per_team
  #  Shift scale goals to make sense for a predictive model
  df.loc[:,'scaled_home_goals_last'] = df.groupby('team_1_name')['scaled_home_goals'].shift(1)
  df.loc[:,'scaled_away_goals_last'] = df.groupby('team_2_name')['scaled_away_goals'].shift(1)

  return df.dropna().reset_index(drop=True)

In [82]:
match_stats = scaled_goals_shifted(match_stats)

Scaled goals is the difference between number of goals scored compared to the average of goals of the league.

In [83]:
spreads_df = match_stats[['date','match_id','team_1_name','team_2_name', 'spread', 'total_goals', 'scaled_home_goals',
                               'scaled_away_goals']]

In [84]:
spreads_df

Unnamed: 0,date,match_id,team_1_name,team_2_name,spread,total_goals,scaled_home_goals,scaled_away_goals
0,2024-12-01 00:00:00,01/12/2024MarseilleMonaco,Marseille,Monaco,1,3,0.615239,-0.384761
1,2024-12-01 00:00:00,01/12/2024Le HavreAngers,Le Havre,Angers,-1,1,-1.384761,-0.384761
2,2024-12-01 00:00:00,01/12/2024MontpellierLille,Montpellier,Lille,0,4,0.615239,0.615239
3,2024-11-30 00:00:00,30/11/2024RennesSt Etienne,Rennes,St Etienne,5,5,3.615239,-1.384761
4,2024-11-30 00:00:00,30/11/2024BrestStrasbourg,Brest,Strasbourg,2,4,1.615239,-0.384761
...,...,...,...,...,...,...,...,...
760,2022-08-21 00:00:00,21/08/2022AngersBrest,Angers,Brest,-2,4,-0.384761,1.615239
761,2022-08-21 00:00:00,21/08/2022StrasbourgReims,Strasbourg,Reims,0,2,-0.384761,-0.384761
762,2022-08-20 00:00:00,20/08/2022MonacoLens,Monaco,Lens,-3,5,-0.384761,2.615239
763,2022-08-20 00:00:00,20/08/2022MarseilleNantes,Marseille,Nantes,1,3,0.615239,-0.384761


In [85]:
# 1 for offensive and -1 for defensive is the way to calculate the betas to explain the variance in the target variable, since a goal for the away
# should count as a -1 to the defense of home team, that are scaled home goals or scaled away goals.
# HFA is the Home Field Advantage defined as 0.5

list_team1 = []
for i in range(len(spreads_df)):
    temp = {spreads_df.loc[i,'team_1_name']+'_Off':1, spreads_df.loc[i,'team_2_name']+'_Def':-1,
            'HFA':0.5, 'target':spreads_df.loc[i,'scaled_home_goals']}
    list_team1.append(temp)

list_team2 = []
for i in range(len(spreads_df)):
    temp = {spreads_df.loc[i,'team_2_name']+'_Off':1, spreads_df.loc[i,'team_1_name']+'_Def':-1,
            'HFA':-0.5, 'target':spreads_df.loc[i,'scaled_away_goals']}
    list_team2.append(temp)

df_reg = pd.DataFrame(list_team1+list_team2)
df_reg.fillna(0, inplace = True)

In [86]:
teams = sorted(list(set(spreads_df['team_1_name'])))
cols = [t+'_Off' for t in teams] + [t+'_Def' for t in teams] + ['HFA','target']
df_reg = df_reg[cols]

In [87]:
df_reg.head(2)

Unnamed: 0,Ajaccio_Off,Angers_Off,Auxerre_Off,Brest_Off,Clermont_Off,Le Havre_Off,Lens_Off,Lille_Off,Lorient_Off,Lyon_Off,...,Nice_Def,Paris SG_Def,Reims_Def,Rennes_Def,St Etienne_Def,Strasbourg_Def,Toulouse_Def,Troyes_Def,HFA,target
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.615239
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,-1.384761


In [88]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
reg = LinearRegression(fit_intercept=False)
reg.fit(df_reg.drop('target', axis=1), df_reg['target'])
print(r2_score(df_reg['target'], reg.predict(df_reg.drop('target', axis=1))))

0.16359081220645388


The goal spread explains around 16% of the scaled_goals.

In [89]:
pr_dict = dict(zip(df_reg.columns[:-1], reg.coef_.round(2)))
df_ranking = pd.DataFrame(index = teams, columns = ['Power Rating', 'Off', 'Def'])

for c in ['Off','Def']:
    df_ranking[c] = [pr_dict[t+'_'+c] for t in teams]

df_ranking['Power Rating'] = df_ranking['Off'] + df_ranking['Def']
df_ranking.sort_values('Power Rating', ascending = False)

Unnamed: 0,Power Rating,Off,Def
Paris SG,1.5,1.08,0.42
Marseille,0.75,0.43,0.32
Lens,0.75,0.18,0.57
Monaco,0.71,0.61,0.1
Lille,0.7,0.3,0.4
Rennes,0.56,0.33,0.23
Nice,0.55,0.0,0.55
Lyon,0.34,0.3,0.04
Brest,0.23,0.04,0.19
Reims,0.2,-0.05,0.25


## get GAP Ratings - Function from: [LINK](https://medium.com/@ML_Soccer_Betting)

The Generalised Attacking Performance (GAP) rating system, introduced by Wheatcroft 2020, is a dynamic ratings system that aims to assess the attacking and defensive strengths of football teams.

The parameter λ determines the inﬂuence of the last match on the ratings of each team whilst φ1 and φ2 determine the inﬂuence of a home match on a team’s away ratings and the inﬂuence of an away match on a team’s home ratings respectively. If φ1 = 0, the home team’s away ratings will not be aﬀected whilst the same is true of the away team’s home ratings if φ2 = 0..

Since we can't use goals from the current match to predict the current match... I use the scaled goals from the latest match by each team as a feature.

In [90]:
# Scaled goals were defined above

In [91]:
def get_gap_ratings(df, params, columns):
    # Sort matches by date to ensure chronological processing
    df = df.sort_values('date')

    def set_initial_ratings(df):
        home_attack_dict = {}
        away_attack_dict = {}
        home_defense_dict = {}
        away_defense_dict = {}

        home_teams = df['team_1_name'].unique()
        away_teams = df['team_2_name'].unique()

        for team in set(home_teams).union(set(away_teams)):
            home_attack_dict[f"Home Attack {team}"] = 0
            away_attack_dict[f"Away Attack {team}"] = 0
            home_defense_dict[f"Home Defense {team}"] = 0
            away_defense_dict[f"Away Defense {team}"] = 0

        return home_attack_dict, away_attack_dict, home_defense_dict, away_defense_dict

    def get_home_ratings_update(r1, r2, gamma, theta, stat):
        mu = (r1 + r2) / 2
        gt = gamma * theta
        s1 = r1 + (gt * (stat - mu))
        return max(s1, 0)  # Ensure rating is non-negative

    def get_away_ratings_update(r1, r2, r3, gamma, theta, stat):
        mu = (r2 + r3) / 2
        gt = (1 - gamma) * theta
        s1 = r1 + (gt * (stat - mu))
        return max(s1, 0)  # Ensure rating is non-negative

    # Initialize ratings
    home_attack_dict, away_attack_dict, home_defense_dict, away_defense_dict = set_initial_ratings(df)

    # Unpack parameters
    theta_h = params[0]
    theta_a = params[1]
    gamma = params[2]

    # Unpack column names
    home_index_column = columns[0]
    away_index_column = columns[1]
    home_stat_index_column = columns[2]
    away_stat_index_column = columns[3]

    # Convert DataFrame to list for efficient row-wise processing
    df_list = df.values.tolist()

    # Get column indices
    home_index = df.columns.get_loc(home_index_column)
    away_index = df.columns.get_loc(away_index_column)
    home_stat_index = df.columns.get_loc(home_stat_index_column)
    away_stat_index = df.columns.get_loc(away_stat_index_column)

    # Process each match
    for i in range(len(df_list)):
        row = df_list[i]

        # Extract match data
        ht = row[home_index]
        at = row[away_index]
        stat_home = row[home_stat_index]
        stat_away = row[away_stat_index]

        # Use current ratings before updating
        ha1 = home_attack_dict[f"Home Attack {ht}"]
        hd1 = home_defense_dict[f"Home Defense {ht}"]
        aa2 = away_attack_dict[f"Away Attack {at}"]
        ad2 = away_defense_dict[f"Away Defense {at}"]

        # Append current ratings to the row
        row.append(ha1)
        row.append(hd1)
        row.append(aa2)
        row.append(ad2)

        # Update ratings using the current game's statistics
        ha1 = get_home_ratings_update(ha1, ad2, gamma, theta_h, stat_home)
        hd1 = get_home_ratings_update(hd1, aa2, gamma, theta_h, stat_away)
        aa2 = get_home_ratings_update(aa2, hd1, gamma, theta_a, stat_away)
        ad2 = get_home_ratings_update(ad2, ha1, gamma, theta_a, stat_home)

        # Store updated ratings
        home_attack_dict[f"Home Attack {ht}"] = ha1
        home_defense_dict[f"Home Defense {ht}"] = hd1
        away_attack_dict[f"Away Attack {at}"] = aa2
        away_defense_dict[f"Away Defense {at}"] = ad2

    # Convert updated list back to DataFrame and add new columns
    df = pd.DataFrame(df_list, columns=df.columns.tolist() +
                      ['Home Attack', 'Home Defense', 'Away Attack', 'Away Defense'])

    return df

# Usage
ratings = get_gap_ratings(
    match_stats,
    [0.1, 0.1, 0.3], #tetha home, tetha away, gamma
    ['team_1_name', 'team_2_name', 'scaled_home_goals_last', 'scaled_away_goals_last'])

## Creating New Variables Weighted by the Odds
Considering a 5% juice from bookmakers, the ODD used to calculate the "real" probability will be increased by 5%. For example, an odds of 2.0 will be 2.0 * 1.05 = 2.1, so, by dividing 1/2.1, we will have a more reliable approximation of the probability estimated by the bookmakers.

#### Merge with odds df

In [92]:
ratings = pd.merge(ratings, french_m_o, how='left', on=['match_id'])

#### Probabilidade de eventos

In [93]:
def result_prob(df, odds_column):
  for column in odds_column:
    df[column+'_prob'] = 1/ (df[column]*1.05)
  return df

In [94]:
ratings = result_prob(ratings, ['avg_h_win_odds_market', 'avg_d_odds_market', 'avg_a_win_odds_market'])

In [95]:
#ratings['h_win_prob'] = 1/ (ratings['avg_h_win_odds_market']*1.05)
#ratings['d_prob'] = 1/ (ratings['avg_d_odds_market']*1.05)
#atings['a_win_prob'] = 1/ (ratings['avg_a_win_odds_market']*1.05)

In [96]:
#for i in ratings.columns:
  #print(i)

In [97]:
ratings['team_1_h_or_a'] = 'Home'
ratings['team_2_h_or_a'] = 'Away'

#### Média de saldo de gols ponderado pela probabilidade do time

In [98]:
def calc_weighted_features(df, n_per):
  # AVERAGE GOAL DIFFERENCE WEIGHTED BY TEAM PROBABILITY
  df['spread_weighted_h_win'] = df['spread'] / df['avg_h_win_odds_market_prob']
  df['spread_weighted_a_win'] = (-df['spread']) / df['avg_a_win_odds_market_prob']

  # Rolling mean
  df['mean_spread_weighted_h_win'] = df.groupby('team_1_h_or_a')['spread_weighted_h_win'].rolling(window = n_per, min_periods = n_per).mean().reset_index(0,drop=True)
  df['mean_spread_weighted_a_win'] = df.groupby('team_2_h_or_a')['spread_weighted_a_win'].rolling(window = n_per, min_periods = n_per).mean().reset_index(0,drop=True)
  df['mean_spread_weighted_h_win'] = df.groupby('team_1_h_or_a')['mean_spread_weighted_h_win'].shift(1)
  df['mean_spread_weighted_a_win'] = df.groupby('team_2_h_or_a')['mean_spread_weighted_a_win'].shift(1)

  # Rolling std
  df['std_spread_weighted_h_win'] = df.groupby('team_1_h_or_a')['spread_weighted_h_win'].rolling(window = n_per, min_periods = n_per).std().reset_index(0,drop=True)
  df['std_spread_weighted_a_win'] = df.groupby('team_2_h_or_a')['spread_weighted_a_win'].rolling(window = n_per, min_periods = n_per).std().reset_index(0,drop=True)
  df['std_spread_weighted_h_win'] = df.groupby('team_1_h_or_a')['std_spread_weighted_h_win'].shift(1)
  df['std_spread_weighted_a_win'] = df.groupby('team_2_h_or_a')['std_spread_weighted_a_win'].shift(1)

  # Variation Coefficient
  df['cv_spread_h'] = df['std_spread_weighted_h_win'] / df['mean_spread_weighted_h_win']
  df['cv_spread_a'] = df['std_spread_weighted_a_win'] / df['mean_spread_weighted_a_win']

  # AVERAGE GOAL DIFFERENCE WEIGHTED BY THE OPPONENT'S PROBABILITY
  df['spread_weighted_h_win_opp'] = df['spread'] / df['avg_a_win_odds_market_prob']
  df['spread_weighted_a_win_opp'] = (-df['spread']) / df['avg_h_win_odds_market_prob']

  # Rolling mean
  df['mean_spread_weighted_h_win_opp'] = df.groupby('team_1_h_or_a')['spread_weighted_h_win_opp'].rolling(window = n_per, min_periods = n_per).mean().reset_index(0,drop=True)
  df['mean_spread_weighted_a_win_opp'] = df.groupby('team_2_h_or_a')['spread_weighted_a_win_opp'].rolling(window = n_per, min_periods = n_per).mean().reset_index(0,drop=True)
  df['mean_spread_weighted_h_win_opp'] = df.groupby('team_1_h_or_a')['mean_spread_weighted_h_win_opp'].shift(1)
  df['mean_spread_weighted_a_win_opp'] = df.groupby('team_2_h_or_a')['mean_spread_weighted_a_win_opp'].shift(1)

  # Rolling std
  df['std_spread_weighted_h_win_opp'] = df.groupby('team_1_h_or_a')['spread_weighted_h_win_opp'].rolling(window = n_per, min_periods = n_per).std().reset_index(0,drop=True)
  df['std_spread_weighted_a_win_opp'] = df.groupby('team_2_h_or_a')['spread_weighted_a_win_opp'].rolling(window = n_per, min_periods = n_per).std().reset_index(0,drop=True)
  df['std_spread_weighted_h_win_opp'] = df.groupby('team_1_h_or_a')['std_spread_weighted_h_win_opp'].shift(1)
  df['std_spread_weighted_a_win_opp'] = df.groupby('team_2_h_or_a')['std_spread_weighted_a_win_opp'].shift(1)

  # Variation Coefficient
  df['cv_spread_h'] = df['std_spread_weighted_h_win_opp'] / df['mean_spread_weighted_h_win_opp']
  df['cv_spread_a'] = df['std_spread_weighted_a_win_opp'] / df['mean_spread_weighted_a_win_opp']


  # AVERAGE GOAL VALUE
  df['goal_value_h'] = df['team_1_goalsScored'] * df['avg_a_win_odds_market_prob']
  df['goal_value_a'] = df['team_2_goalsScored'] * df['avg_h_win_odds_market_prob']

  # Rolling mean
  df['mean_goal_value_h'] = df.groupby('team_1_h_or_a')['goal_value_h'].rolling(window = n_per, min_periods = n_per).mean().reset_index(0,drop=True)
  df['mean_goal_value_a'] = df.groupby('team_2_h_or_a')['goal_value_a'].rolling(window = n_per, min_periods = n_per).mean().reset_index(0,drop=True)
  df['mean_goal_value_h'] = df.groupby('team_1_h_or_a')['mean_goal_value_h'].shift(1)
  df['mean_goal_value_a'] = df.groupby('team_2_h_or_a')['mean_goal_value_a'].shift(1)

  # Rolling std
  df['std_goal_value_h'] = df.groupby('team_1_h_or_a')['goal_value_h'].rolling(window = n_per, min_periods = n_per).std().reset_index(0,drop=True)
  df['std_goal_value_a'] = df.groupby('team_2_h_or_a')['goal_value_a'].rolling(window = n_per, min_periods = n_per).std().reset_index(0,drop=True)
  df['std_goal_value_h'] = df.groupby('team_1_h_or_a')['std_goal_value_h'].shift(1)
  df['std_goal_value_a'] = df.groupby('team_2_h_or_a')['std_goal_value_a'].shift(1)

  # Variation Coefficient
  df['cv_goal_value_h'] = df['std_goal_value_h'] / df['mean_goal_value_h']
  df['cv_goal_value_a'] = df['std_goal_value_a'] / df['mean_goal_value_a']

  # GOAL COST
  #The greater the probability of victory, the lower the cost of that goal, therefore, it ends up being a "cheaper" goal
  df['goal_cost_h'] = df['team_1_goalsScored'] / df['avg_h_win_odds_market_prob']
  df['goal_cost_a'] = df['team_2_goalsScored'] / df['avg_a_win_odds_market_prob']

  # Rolling mean
  df['mean_goal_cost_h'] = df.groupby('team_1_h_or_a')['goal_cost_h'].rolling(window = n_per, min_periods = n_per).mean().reset_index(0,drop=True)
  df['mean_goal_cost_a'] = df.groupby('team_2_h_or_a')['goal_cost_a'].rolling(window = n_per, min_periods = n_per).mean().reset_index(0,drop=True)
  df['mean_goal_cost_h'] = df.groupby('team_1_h_or_a')['mean_goal_cost_h'].shift(1)
  df['mean_goal_cost_a'] = df.groupby('team_2_h_or_a')['mean_goal_cost_a'].shift(1)

  # Rolling std
  df['std_goal_cost_h'] = df.groupby('team_1_h_or_a')['goal_cost_h'].rolling(window = n_per, min_periods = n_per).std().reset_index(0,drop=True)
  df['std_goal_cost_a'] = df.groupby('team_2_h_or_a')['goal_cost_a'].rolling(window = n_per, min_periods = n_per).std().reset_index(0,drop=True)
  df['std_goal_cost_h'] = df.groupby('team_1_h_or_a')['std_goal_cost_h'].shift(1)
  df['std_goal_cost_a'] = df.groupby('team_2_h_or_a')['std_goal_cost_a'].shift(1)

  # Variation Coefficient
  df['cv_goal_cost_h'] = df['std_goal_cost_h'] / df['mean_goal_cost_h']
  df['cv_goal_cost_a'] = df['std_goal_cost_a'] / df['mean_goal_cost_a']

  return df

ratings = calc_weighted_features(ratings, 5)

## Training and Testing ML Model

In [99]:
features = ['team_2_average_goalsScored_last_ten',
'team_2_average_halfTimeGoalsScored_last_ten',
'team_2_average_cornerCount_last_ten',
'team_2_average_shots_last_ten',
'team_2_average_shotsOnTarget_last_ten',
'team_2_average_foulsConceded_last_ten',
'team_2_average_yellowConceded_last_ten',
'team_2_average_redConceded_last_ten',
'team_2_average_result_last_ten',
'team_2_average_goalsConceded_last_ten',
'team_2_average_halfTimeGoalsConceded_last_ten',
'team_2_average_cornersConceded_last_ten',
'team_2_average_shotsConceded_last_ten',
'team_2_average_shotsOnTargetConceded_last_ten',
'team_2_average_foulsReceived_last_ten',
'team_2_average_yellowOpponent_last_ten',
'team_2_average_redOpponent_last_ten',
'team_1_average_goalsScored_last_five_home',
'team_1_average_halfTimeGoalsScored_last_five_home',
'team_1_average_cornerCount_last_five_home',
'team_1_average_shots_last_five_home',
'team_1_average_shotsOnTarget_last_five_home',
'team_1_average_foulsConceded_last_five_home',
'team_1_average_yellowConceded_last_five_home',
'team_1_average_redConceded_last_five_home',
'team_1_average_result_last_five_home',
'team_1_average_goalsConceded_last_five_home',
'team_1_average_halfTimeGoalsConceded_last_five_home',
'team_1_average_cornersConceded_last_five_home',
'team_1_average_shotsConceded_last_five_home',
'team_1_average_shotsOnTargetConceded_last_five_home',
'team_1_average_foulsReceived_last_five_home',
'team_1_average_yellowOpponent_last_five_home',
'team_1_average_redOpponent_last_five_home',
'team_2_average_goalsScored_last_five_away',
'team_2_average_halfTimeGoalsScored_last_five_away',
'team_2_average_cornerCount_last_five_away',
'team_2_average_shots_last_five_away',
'team_2_average_shotsOnTarget_last_five_away',
'team_2_average_foulsConceded_last_five_away',
'team_2_average_yellowConceded_last_five_away',
'team_2_average_redConceded_last_five_away',
'team_2_average_result_last_five_away',
'team_2_average_goalsConceded_last_five_away',
'team_2_average_halfTimeGoalsConceded_last_five_away',
'team_2_average_cornersConceded_last_five_away',
'team_2_average_shotsConceded_last_five_away',
'team_2_average_shotsOnTargetConceded_last_five_away',
'team_2_average_foulsReceived_last_five_away',
'team_2_average_yellowOpponent_last_five_away',
'team_2_average_redOpponent_last_five_away',
'team_code_0',
'team_code_1',
'team_code_2',
'team_code_3',
'team_code_4',
'team_code_5',
'team_code_6',
'team_code_7',
'team_code_8',
'team_code_9',
'team_code_10',
'team_code_11',
'team_code_12',
'team_code_13',
'team_code_14',
'team_code_15',
'team_code_16',
'team_code_17',
'team_code_18',
'team_code_19',
'team_code_20',
'team_code_21',
'team_code_22',
'opp_code_0',
'opp_code_1',
'opp_code_2',
'opp_code_3',
'opp_code_4',
'opp_code_5',
'opp_code_6',
'opp_code_7',
'opp_code_8',
'opp_code_9',
'opp_code_10',
'opp_code_11',
'opp_code_12',
'opp_code_13',
'opp_code_14',
'opp_code_15',
'opp_code_16',
'opp_code_17',
'opp_code_18',
'opp_code_19',
'opp_code_20',
'opp_code_21',
'opp_code_22',
'scaled_home_goals_last',
'scaled_away_goals_last',
'Home Attack',
'Home Defense',
'Away Attack',
'Away Defense',
'avg_h_win_odds_market',
'avg_d_odds_market',
'avg_a_win_odds_market',
'avg_o_2_5_goals_odds',
'avg_u_2_5_goals_odds',
'avg_asian_handicap_h_win_odds',
'avg_asian_handicap_a_win_odds',
'avg_h_win_odds_market_prob',
'avg_d_odds_market_prob',
'avg_a_win_odds_market_prob',
'mean_spread_weighted_h_win',
'mean_spread_weighted_a_win',
'std_spread_weighted_h_win',
'std_spread_weighted_a_win',
'cv_spread_h',
'cv_spread_a',
'mean_spread_weighted_h_win_opp',
'mean_spread_weighted_a_win_opp',
'std_spread_weighted_h_win_opp',
'std_spread_weighted_a_win_opp',
'std_goal_value_h',
'std_goal_value_a',
'cv_goal_value_h',
'cv_goal_value_a',
'mean_goal_cost_h',
'mean_goal_cost_a',
'std_goal_cost_h',
'std_goal_cost_a',
'cv_goal_cost_h',
'cv_goal_cost_a'
]

In [100]:
ratings = ratings.drop(['date_y'], axis = 1)
ratings = ratings.rename(columns={'date_x': 'date'})

The 2024–25 Ligue 1, also known as Ligue 1 McDonald's for sponsorship reasons, is the 87th season of the Ligue 1, France's premier football competition. It began on 16 August 2024 and is set to conclude on 22 May 2025. All statistics correct as of 18 December 2024.

In [101]:
train_data = ratings[ratings['date'] < pd.to_datetime('2024-08-16')].dropna()
test_data = ratings[ratings['date'] >= pd.to_datetime('2024-08-16')].dropna()

In [102]:
X_train = train_data[features]
X_test = test_data[features]
Y_train = train_data['over_2.5']
Y_test = test_data['over_2.5']

In [103]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, mutual_info_classif

In [104]:
# With pipelines you can easily perform a grid-search over set of parameters for each step of this meta-estimator.
pipeline = Pipeline(
    [('scaler', StandardScaler()),
        ('selector', SelectKBest(score_func=mutual_info_classif, k=20)),  # Select top 10 features
        ('model', RandomForestClassifier(n_estimators=50,max_depth = 5,oob_score= True,ccp_alpha=0.1, random_state=42))  # Random Forest model # Tried 100 estimators, but the result was the same
    ]
)

In [105]:
from sklearn.model_selection import GridSearchCV

param_grid = {'selector__k': [3, 4, 5, 6, 7, 8, 9, 10]} # To select between 3 to 10 features

# GridSearch tries every combination of the provided hyper-parameter values in order to find the best model.
search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    n_jobs=-1,
    scoring="accuracy",
    cv=10,
    verbose=3
)

In [106]:
# Fit the GridSearchCV object to the training data
search.fit(X_train,Y_train)

# Access the best_params_ attribute
print("Best parameters:", search.best_params_)

# Get the feature selector from the best estimator
selector = search.best_estimator_.named_steps['selector']

# Get the selected feature indices
selected_feature_indices = selector.get_support(indices=True)

# Get the selected feature names
selected_feature_names = X_train.columns[selected_feature_indices]

# Print the selected feature names
print("Selected features:", selected_feature_names)

Fitting 10 folds for each of 8 candidates, totalling 80 fits
Best parameters: {'selector__k': 3}
Selected features: Index(['Home Attack', 'Away Attack', 'avg_a_win_odds_market'], dtype='object')


In [107]:
# Prediction on training data
y_pred_train = search.predict(X_train)
accuracy_train = accuracy_score(Y_train, y_pred_train)
print("Accuracy in training:", accuracy_train)


# prediction on test data
y_pred_rf = search.predict(X_test)
accuracy_rf = accuracy_score(Y_test, y_pred_rf)
print("Accuracy:", accuracy_rf)

Accuracy in training: 0.541795665634675
Accuracy: 0.5663716814159292


#### Testing with less parameters

In [108]:
new_features = ['Home Attack',
'Home Defense',
'Away Attack',
'Away Defense',
'avg_h_win_odds_market_prob',
'avg_d_odds_market_prob',
'avg_a_win_odds_market_prob',
'mean_spread_weighted_h_win',
'mean_spread_weighted_a_win',
'std_spread_weighted_h_win',
'std_spread_weighted_a_win',
'cv_spread_h',
'cv_spread_a',
'mean_spread_weighted_h_win_opp',
'mean_spread_weighted_a_win_opp',
'std_spread_weighted_h_win_opp',
'std_spread_weighted_a_win_opp',
'mean_goal_value_h',
'mean_goal_value_a',
'std_goal_value_h',
'std_goal_value_a',
'cv_goal_value_h',
'cv_goal_value_a',
'mean_goal_cost_h',
'mean_goal_cost_a',
'std_goal_cost_h',
'std_goal_cost_a',
'cv_goal_cost_h',
'cv_goal_cost_a']

In [109]:
new_X_train = train_data[new_features]
new_X_test = test_data[new_features]


In [110]:
search.fit(new_X_train,Y_train)

Fitting 10 folds for each of 8 candidates, totalling 80 fits


In [111]:
# Prediction on training data
y_pred_train = search.predict(new_X_train)
accuracy_train = accuracy_score(Y_train, y_pred_train)
print("Accuracy in training:", accuracy_train)


# prediction on test data
y_pred_rf = search.predict(new_X_test)
accuracy_rf = accuracy_score(Y_test, y_pred_rf)
print("Accuracy:", accuracy_rf)

Accuracy in training: 0.541795665634675
Accuracy: 0.5663716814159292


#### Merging upcoming features

In [125]:
fixtures = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/ML/Models-Sports/base french league/fixtures.csv')

In [129]:
france_fixtures = fixtures[fixtures['Div'] == 'F1']
france_fixtures = france_fixtures[['HomeTeam', 'AwayTeam']]

In [130]:
import warnings
warnings.filterwarnings('ignore')

# Step 1: Standardize team names
ratings['team_1_name'] = ratings['team_1_name'].str.strip().str.lower()
ratings['team_2_name'] = ratings['team_2_name'].str.strip().str.lower()
ratings['date'] = pd.to_datetime(ratings['date'], format='%d/%m/%Y')  # Ensure date column is datetime
teams = pd.concat([ratings['team_1_name'], ratings['team_2_name']]).unique()

# Fill missing values in stat columns with 0
ratings.fillna(0, inplace=True)

# Initialize lists to store results
home_averages = []
away_averages = []

# Step 2: Loop through each team and calculate weighted averages for home and away matches
for team in teams:
    # Home stats
    home_stats = ratings[ratings['team_1_name'] == team]
    home_stats = home_stats.sort_values(by='date', ascending=False)
    home_stats['weight'] = np.linspace(1, 0.5, len(home_stats)) if len(home_stats) > 0 else []

    if home_stats.empty:
        print(f"No home matches found for {team}")
        continue

    weighted_avg_home = {
        'Team': team,
        'Location': 'home'
    }
    for col in [
        'team_1_average_goalsScored_last_five_home',
        'team_1_average_halfTimeGoalsScored_last_five_home',
        'team_1_average_cornerCount_last_five_home',
        'team_1_average_shots_last_five_home',
        'team_1_average_shotsOnTarget_last_five_home',
        'team_1_average_foulsConceded_last_five_home',
        'team_1_average_yellowConceded_last_five_home',
        'team_1_average_redConceded_last_five_home',
        'team_1_average_result_last_five_home',
        'team_1_average_goalsConceded_last_five_home',
        'team_1_average_halfTimeGoalsConceded_last_five_home',
        'team_1_average_cornersConceded_last_five_home',
        'team_1_average_shotsConceded_last_five_home',
        'team_1_average_shotsOnTargetConceded_last_five_home',
        'team_1_average_foulsReceived_last_five_home',
        'team_1_average_yellowOpponent_last_five_home',
        'team_1_average_redOpponent_last_five_home',
        'scaled_home_goals_last',
        'Home Attack',
        'Home Defense',
        'mean_spread_weighted_h_win',
        'std_spread_weighted_h_win',
        'cv_spread_h',
        'mean_spread_weighted_h_win_opp',
        'std_spread_weighted_h_win_opp',
        'std_goal_value_h',
        'cv_goal_value_h',
        'mean_goal_cost_h',
        'std_goal_cost_h',
        'cv_goal_cost_h'
    ]:
        weighted_avg_home[col] = np.average(home_stats[col], weights=home_stats['weight']) if not home_stats.empty else np.nan

    home_averages.append(weighted_avg_home)

    # Away stats
    away_stats = ratings[ratings['team_2_name'] == team]
    away_stats = away_stats.sort_values(by='date', ascending=False)
    away_stats['weight'] = np.linspace(1, 0.5, len(away_stats)) if len(away_stats) > 0 else []

    if away_stats.empty:
        print(f"No away matches found for {team}")
        continue

    weighted_avg_away = {
        'Team': team,
        'Location': 'away'
    }
    for col in [
        'team_2_average_goalsScored_last_ten',
        'team_2_average_halfTimeGoalsScored_last_ten',
        'team_2_average_cornerCount_last_ten',
        'team_2_average_shots_last_ten',
        'team_2_average_shotsOnTarget_last_ten',
        'team_2_average_foulsConceded_last_ten',
        'team_2_average_yellowConceded_last_ten',
        'team_2_average_redConceded_last_ten',
        'team_2_average_result_last_ten',
        'team_2_average_goalsConceded_last_ten',
        'team_2_average_halfTimeGoalsConceded_last_ten',
        'team_2_average_cornersConceded_last_ten',
        'team_2_average_shotsConceded_last_ten',
        'team_2_average_shotsOnTargetConceded_last_ten',
        'team_2_average_foulsReceived_last_ten',
        'team_2_average_yellowOpponent_last_ten',
        'team_2_average_redOpponent_last_ten',
        'team_2_average_goalsScored_last_five_away',
        'team_2_average_halfTimeGoalsScored_last_five_away',
        'team_2_average_cornerCount_last_five_away',
        'team_2_average_shots_last_five_away',
        'team_2_average_shotsOnTarget_last_five_away',
        'team_2_average_foulsConceded_last_five_away',
        'team_2_average_yellowConceded_last_five_away',
        'team_2_average_redConceded_last_five_away',
        'team_2_average_result_last_five_away',
        'team_2_average_goalsConceded_last_five_away',
        'team_2_average_halfTimeGoalsConceded_last_five_away',
        'team_2_average_cornersConceded_last_five_away',
        'team_2_average_shotsConceded_last_five_away',
        'team_2_average_shotsOnTargetConceded_last_five_away',
        'team_2_average_foulsReceived_last_five_away',
        'team_2_average_yellowOpponent_last_five_away',
        'team_2_average_redOpponent_last_five_away',
        'scaled_away_goals_last',
        'Away Attack',
        'Away Defense',
        'mean_spread_weighted_a_win',
        'std_spread_weighted_a_win',
        'cv_spread_a',
        'mean_spread_weighted_a_win_opp',
        'std_spread_weighted_a_win_opp',
        'std_goal_value_a',
        'cv_goal_value_a',
        'mean_goal_cost_a',
        'std_goal_cost_a',
        'cv_goal_cost_a'
    ]:
        weighted_avg_away[col] = np.average(away_stats[col], weights=away_stats['weight']) if not away_stats.empty else np.nan

    away_averages.append(weighted_avg_away)

# Step 3: Create DataFrames for home and away averages
home_avg_df = pd.DataFrame(home_averages)
away_avg_df = pd.DataFrame(away_averages)

# Combine both into a single DataFrame
final_avg_df = pd.concat([home_avg_df, away_avg_df], ignore_index=True)



In [134]:
#final_avg_df

In [135]:
# First, let's create separate home and away DataFrames with proper column selections
home_columns = [
    'Team',
    'team_1_average_goalsScored_last_five_home',
    'team_1_average_halfTimeGoalsScored_last_five_home',
    'team_1_average_cornerCount_last_five_home',
    'team_1_average_shots_last_five_home',
    'team_1_average_shotsOnTarget_last_five_home',
    'team_1_average_foulsConceded_last_five_home',
    'team_1_average_yellowConceded_last_five_home',
    'team_1_average_redConceded_last_five_home',
    'team_1_average_result_last_five_home',
    'team_1_average_goalsConceded_last_five_home',
    'team_1_average_halfTimeGoalsConceded_last_five_home',
    'team_1_average_cornersConceded_last_five_home',
    'team_1_average_shotsConceded_last_five_home',
    'team_1_average_shotsOnTargetConceded_last_five_home',
    'team_1_average_foulsReceived_last_five_home',
    'team_1_average_yellowOpponent_last_five_home',
    'team_1_average_redOpponent_last_five_home',
    'scaled_home_goals_last',
    'Home Attack',
    'Home Defense',
    'mean_spread_weighted_h_win',
    'std_spread_weighted_h_win',
    'cv_spread_h',
    'mean_spread_weighted_h_win_opp',
    'std_spread_weighted_h_win_opp',
    'std_goal_value_h',
    'cv_goal_value_h',
    'mean_goal_cost_h',
    'std_goal_cost_h',
    'cv_goal_cost_h'
]

away_columns = [
    'Team',
    'team_2_average_goalsScored_last_ten',
    'team_2_average_halfTimeGoalsScored_last_ten',
    'team_2_average_cornerCount_last_ten',
    'team_2_average_shots_last_ten',
    'team_2_average_shotsOnTarget_last_ten',
    'team_2_average_foulsConceded_last_ten',
    'team_2_average_yellowConceded_last_ten',
    'team_2_average_redConceded_last_ten',
    'team_2_average_result_last_ten',
    'team_2_average_goalsConceded_last_ten',
    'team_2_average_halfTimeGoalsConceded_last_ten',
    'team_2_average_cornersConceded_last_ten',
    'team_2_average_shotsConceded_last_ten',
    'team_2_average_shotsOnTargetConceded_last_ten',
    'team_2_average_foulsReceived_last_ten',
    'team_2_average_yellowOpponent_last_ten',
    'team_2_average_redOpponent_last_ten',
    'team_2_average_goalsScored_last_five_away',
    'team_2_average_halfTimeGoalsScored_last_five_away',
    'team_2_average_cornerCount_last_five_away',
    'team_2_average_shots_last_five_away',
    'team_2_average_shotsOnTarget_last_five_away',
    'team_2_average_foulsConceded_last_five_away',
    'team_2_average_yellowConceded_last_five_away',
    'team_2_average_redConceded_last_five_away',
    'team_2_average_result_last_five_away',
    'team_2_average_goalsConceded_last_five_away',
    'team_2_average_halfTimeGoalsConceded_last_five_away',
    'team_2_average_cornersConceded_last_five_away',
    'team_2_average_shotsConceded_last_five_away',
    'team_2_average_shotsOnTargetConceded_last_five_away',
    'team_2_average_foulsReceived_last_five_away',
    'team_2_average_yellowOpponent_last_five_away',
    'team_2_average_redOpponent_last_five_away',
    'scaled_away_goals_last',
    'Away Attack',
    'Away Defense',
    'mean_spread_weighted_a_win',
    'std_spread_weighted_a_win',
    'cv_spread_a',
    'mean_spread_weighted_a_win_opp',
    'std_spread_weighted_a_win_opp',
    'std_goal_value_a',
    'cv_goal_value_a',
    'mean_goal_cost_a',
    'std_goal_cost_a',
    'cv_goal_cost_a'
]

# Standardize team names
france_fixtures['HomeTeam'] = france_fixtures['HomeTeam'].str.lower().str.strip()
france_fixtures['AwayTeam'] = france_fixtures['AwayTeam'].str.lower().str.strip()

# Create a mapping dictionary for any special cases
team_mapping = {
    'paris sg': 'paris sg',
    'psg': 'paris sg',
    'st etienne': 'st-etienne',
    'saint-etienne': 'st-etienne'
    # Add more mappings if needed
}

# Apply mappings
france_fixtures['HomeTeam'] = france_fixtures['HomeTeam'].map(team_mapping).fillna(france_fixtures['HomeTeam'])
france_fixtures['AwayTeam'] = france_fixtures['AwayTeam'].map(team_mapping).fillna(france_fixtures['AwayTeam'])

# Create home and away DataFrames with correct columns
home_stats = final_avg_df[final_avg_df['Location'] == 'home'][home_columns]
away_stats = final_avg_df[final_avg_df['Location'] == 'away'][away_columns]

# Standardize team names in stats DataFrames
home_stats['Team'] = home_stats['Team'].str.lower().str.strip()
away_stats['Team'] = away_stats['Team'].str.lower().str.strip()

# Apply mappings to stats DataFrames
home_stats['Team'] = home_stats['Team'].map(team_mapping).fillna(home_stats['Team'])
away_stats['Team'] = away_stats['Team'].map(team_mapping).fillna(away_stats['Team'])

# Perform the merges
merged_fixtures = france_fixtures.merge(
    home_stats,
    left_on='HomeTeam',
    right_on='Team',
    how='left'
).drop('Team', axis=1)

final_merged = merged_fixtures.merge(
    away_stats,
    left_on='AwayTeam',
    right_on='Team',
    how='left'
).drop('Team', axis=1)

# Print diagnostics
print("After standardization:")
print("Number of columns:", len(final_merged.columns))
print("Number of missing values:", final_merged.isna().sum().sum())
print("\nMissing values by column:")
missing_by_col = final_merged.isna().sum()
print(missing_by_col[missing_by_col > 0])

# Print unique teams after standardization
print("\nUnique HomeTeams after standardization:", sorted(france_fixtures['HomeTeam'].unique()))
print("\nUnique AwayTeams after standardization:", sorted(france_fixtures['AwayTeam'].unique()))
print("\nUnique Teams in stats:", sorted(final_avg_df['Team'].unique()))

After standardization:
Number of columns: 79
Number of missing values: 0

Missing values by column:
Series([], dtype: int64)

Unique HomeTeams after standardization: ['auxerre', 'brest', 'le havre', 'montpellier', 'nantes', 'paris sg', 'reims', 'rennes', 'toulouse']

Unique AwayTeams after standardization: ['angers', 'lens', 'lille', 'lyon', 'marseille', 'monaco', 'nice', 'st-etienne', 'strasbourg']

Unique Teams in stats: ['ajaccio', 'angers', 'auxerre', 'brest', 'clermont', 'le havre', 'lens', 'lille', 'lorient', 'lyon', 'marseille', 'metz', 'monaco', 'montpellier', 'nantes', 'nice', 'paris sg', 'reims', 'rennes', 'st etienne', 'strasbourg', 'toulouse', 'troyes']


In [117]:
features_for_pred = ['team_1_average_goalsScored_last_five_home',
    'team_1_average_halfTimeGoalsScored_last_five_home',
    'team_1_average_cornerCount_last_five_home',
    'team_1_average_shots_last_five_home',
    'team_1_average_shotsOnTarget_last_five_home',
    'team_1_average_foulsConceded_last_five_home',
    'team_1_average_yellowConceded_last_five_home',
    'team_1_average_redConceded_last_five_home',
    'team_1_average_result_last_five_home',
    'team_1_average_goalsConceded_last_five_home',
    'team_1_average_halfTimeGoalsConceded_last_five_home',
    'team_1_average_cornersConceded_last_five_home',
    'team_1_average_shotsConceded_last_five_home',
    'team_1_average_shotsOnTargetConceded_last_five_home',
    'team_1_average_foulsReceived_last_five_home',
    'team_1_average_yellowOpponent_last_five_home',
    'team_1_average_redOpponent_last_five_home',
    'scaled_home_goals_last',
    'Home Attack',
    'Home Defense',
    'mean_spread_weighted_h_win',
    'std_spread_weighted_h_win',
    'cv_spread_h',
    'mean_spread_weighted_h_win_opp',
    'std_spread_weighted_h_win_opp',
    'std_goal_value_h',
    'cv_goal_value_h',
    'mean_goal_cost_h',
    'std_goal_cost_h',
    'cv_goal_cost_h',
    'team_2_average_goalsScored_last_ten',
    'team_2_average_halfTimeGoalsScored_last_ten',
    'team_2_average_cornerCount_last_ten',
    'team_2_average_shots_last_ten',
    'team_2_average_shotsOnTarget_last_ten',
    'team_2_average_foulsConceded_last_ten',
    'team_2_average_yellowConceded_last_ten',
    'team_2_average_redConceded_last_ten',
    'team_2_average_result_last_ten',
    'team_2_average_goalsConceded_last_ten',
    'team_2_average_halfTimeGoalsConceded_last_ten',
    'team_2_average_cornersConceded_last_ten',
    'team_2_average_shotsConceded_last_ten',
    'team_2_average_shotsOnTargetConceded_last_ten',
    'team_2_average_foulsReceived_last_ten',
    'team_2_average_yellowOpponent_last_ten',
    'team_2_average_redOpponent_last_ten',
    'team_2_average_goalsScored_last_five_away',
    'team_2_average_halfTimeGoalsScored_last_five_away',
    'team_2_average_cornerCount_last_five_away',
    'team_2_average_shots_last_five_away',
    'team_2_average_shotsOnTarget_last_five_away',
    'team_2_average_foulsConceded_last_five_away',
    'team_2_average_yellowConceded_last_five_away',
    'team_2_average_redConceded_last_five_away',
    'team_2_average_result_last_five_away',
    'team_2_average_goalsConceded_last_five_away',
    'team_2_average_halfTimeGoalsConceded_last_five_away',
    'team_2_average_cornersConceded_last_five_away',
    'team_2_average_shotsConceded_last_five_away',
    'team_2_average_shotsOnTargetConceded_last_five_away',
    'team_2_average_foulsReceived_last_five_away',
    'team_2_average_yellowOpponent_last_five_away',
    'team_2_average_redOpponent_last_five_away',
    'scaled_away_goals_last',
    'Away Attack',
    'Away Defense',
    'mean_spread_weighted_a_win',
    'std_spread_weighted_a_win',
    'cv_spread_a',
    'mean_spread_weighted_a_win_opp',
    'std_spread_weighted_a_win_opp',
    'std_goal_value_a',
    'cv_goal_value_a',
    'mean_goal_cost_a',
    'std_goal_cost_a',
    'cv_goal_cost_a']

In [137]:
X_train = train_data[features_for_pred]
X_tested = final_merged.iloc[:,2:]

search.fit(X_train,Y_train)

Fitting 10 folds for each of 8 candidates, totalling 80 fits


In [139]:
y_pred_train = search.predict(X_tested)

In [140]:
y_pred_train

array([1, 1, 1, 1, 1, 1, 1, 1, 1])