In [1]:
import pandas as pd
import numpy as np

# FTHG = Full Time Home Team Goals
# FTAG = Full Time Away Team Goals
# FTR = Full Time Result (H=Home Win, D=Draw, A=Away Win)

fields = ["Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "FTR"]

df = pd.read_csv("data/Ligue1_game_stats.csv", usecols = fields)
df['Date'] = pd.to_datetime(df['Date'])

# Rename the columns for clarity: 
df = df.rename(columns={'FTHG': 'Home_goals', 'FTAG': 'Away_goals', 'FTR': 'Result'})
# Home_goals = Full Time Home Team Goals
# Away_goals = Full Time Away Team Goals
# Result = Full Time Result (H=Home Win, D=Draw, A=Away Win)


df.head()


  df['Date'] = pd.to_datetime(df['Date'])


Unnamed: 0,Date,HomeTeam,AwayTeam,Home_goals,Away_goals,Result
0,2025-08-15,Rennes,Marseille,1,0,H
1,2025-08-16,Lens,Lyon,0,1,A
2,2025-08-16,Monaco,Le Havre,3,1,H
3,2025-08-16,Nice,Toulouse,0,1,A
4,2025-08-17,Brest,Lille,3,3,D


In [19]:
# Dataset with only Marseille games

df_om_games = df[(df["HomeTeam"] == "Marseille") | (df["AwayTeam"] == "Marseille")].copy()


df_om_games.head()

df_om_games["OM_goals_scored"] = np.where(df_om_games['HomeTeam'] == 'Marseille', df_om_games["Home_goals"], df_om_games["Away_goals"])
df_om_games["OM_goals_conceded"] = np.where(df_om_games['HomeTeam'] == 'Marseille', df_om_games["Away_goals"], df_om_games["Home_goals"])
df_om_games["is_OM_win"] = np.where(df_om_games['HomeTeam'] == 'Marseille', df_om_games["Result"] == "H", df_om_games["Result"] == "A")
df_om_games["is_Draw"] = df_om_games["Result"] == "D"


df_om_games.head(10)

df_om_games = df_om_games.assign(goals_scored_last_3_games=lambda d: d["OM_goals_scored"].rolling(3).sum().shift())
df_om_games = df_om_games.assign(goals_conceded_last_3_games=lambda d: d["OM_goals_conceded"].rolling(3).sum().shift())

df_om_games

Unnamed: 0,Date,HomeTeam,AwayTeam,Home_goals,Away_goals,Result,OM_goals_scored,OM_goals_conceded,is_OM_win,is_Draw,goals_scored_last_3_games,goals_conceded_last_3_games
0,2025-08-15,Rennes,Marseille,1,0,H,0,1,False,False,,
10,2025-08-23,Marseille,Paris FC,5,2,H,5,2,True,False,,
26,2025-08-31,Lyon,Marseille,1,0,H,0,1,False,False,,
27,2025-09-12,Marseille,Lorient,4,0,H,4,0,True,False,5.0,4.0
44,2025-09-22,Marseille,Paris SG,1,0,H,1,0,True,False,9.0,3.0
45,2025-09-26,Strasbourg,Marseille,1,2,A,2,1,True,False,5.0,1.0
55,2025-10-04,Metz,Marseille,0,3,A,3,0,True,False,7.0,1.0
66,2025-10-18,Marseille,Le Havre,6,2,H,6,2,True,False,6.0,1.0
75,2025-10-25,Lens,Marseille,2,1,H,1,2,False,False,11.0,3.0
85,2025-10-29,Marseille,Angers,2,2,D,2,2,False,True,10.0,4.0


In [None]:
# TODO, add number of rest days before game - pour ça il va falloir aussi rajouter les matchs joués en champions league.
# add Forme récente : points pris sur les 5 derniers matchs, différence de buts, série de victoires/défaites
# A voir si on peut ajouter: les joueurs blessés. 
# Add current rank
# Add Elo score

## Construction of a model for the 2024-2025 season


In [3]:
# Loading the data of the previous year for training the model. 
df_2024 = pd.read_csv("data/Ligue1_2024-2025.csv", usecols = fields)
df['Date'] = pd.to_datetime(df['Date'])

# Rename the columns for clarity: 
df_2024 = df_2024.rename(columns={'FTHG': 'Home_goals', 'FTAG': 'Away_goals', 'FTR': 'Result'})

df_2024.columns

Index(['Date', 'HomeTeam', 'AwayTeam', 'Home_goals', 'Away_goals', 'Result'], dtype='object')

### Creating an ELO score, which compute a rank of the teams using a numerical score

In [4]:
# Defining the ELO function: 
import pandas as pd
import numpy as np

def expected_score(rating_A, rating_B):
    """Odds that A wins against B"""
    return 1 / (1 + 10 ** ((rating_B - rating_A) / 400))

def update_elo(rating_A, rating_B, score_A, K=30):
    """
    Update Elo ratings after a match
    score_A : 1 = A wins, 0.5 = draw, 0 = A loses
    We chose K=30 as of now based on common practice for this hyper_parameter in football 
    """
    
    expected_A = expected_score(rating_A, rating_B)
    expected_B = 1 - expected_A
    
    new_rating_A = rating_A + K * (score_A - expected_A)
    new_rating_B = rating_B + K * ((1 - score_A) - expected_B)
    
    return new_rating_A, new_rating_B

In [11]:
# Initialisation
teams = pd.concat([df_2024['HomeTeam']]).unique()
elo = {team: 1500 for team in teams}  # Initial score
elo_home, elo_away = [], []

# Iteration
for i, row in df_2024.iterrows():
    home, away = row['HomeTeam'], row['AwayTeam']
    res = row['Result']

    # Results
    if res == 'H': score_home, score_away = 1, 0
    elif res == 'A': score_home, score_away = 0, 1
    else: score_home, score_away = 0.5, 0.5
    
    # Save before update
    elo_home.append(elo[home])
    elo_away.append(elo[away])
    
    # Update
    new_home, new_away = update_elo(elo[home], elo[away], score_home)
    elo[home], elo[away] = new_home, new_away

# Add to the data_set
df_2024['Elo_Home'] = elo_home
df_2024['Elo_Away'] = elo_away
df_2024['Diff_Elo'] = df['Elo_Home'] - df['Elo_Away']




KeyError: 'Elo_Home'

In [None]:
elo
df_2024
elo[df_2024]
for i, row in df_2024.iterrows():

    rating_A, rating_b = update_elo(rating_A, rating_b)

Unnamed: 0,Date,HomeTeam,AwayTeam,Home_goals,Away_goals,Result
0,16/08/2024,Le Havre,Paris SG,1,4,A
1,17/08/2024,Brest,Marseille,1,5,A
2,17/08/2024,Reims,Lille,0,2,A
3,17/08/2024,Monaco,St Etienne,1,0,H
4,18/08/2024,Auxerre,Nice,2,1,H
...,...,...,...,...,...,...
301,17/05/2025,Nantes,Montpellier,3,0,H
302,17/05/2025,Nice,Brest,6,0,H
303,17/05/2025,Paris SG,Auxerre,3,1,H
304,17/05/2025,St Etienne,Toulouse,2,3,A


In [34]:
# TODO, add number of rest days before game - pour ça il va falloir aussi rajouter les matchs joués en champions league.
# add Forme récente : points pris sur les 5 derniers matchs, différence de buts, série de victoires/défaites
# A voir si on peut ajouter: les joueurs blessés. 
# Add Elo score: j'ai pas finis, mais je m'en charge

## TODO Adding numbers of rest days

In [16]:
# To do this i'm first going to need to scrape this data table which gives the calendar for OM including ligue1 and CL.
# URL : https://fbref.com/en/squads/5725cc7b/2025-2026/matchlogs/all_comps/misc/Marseille-Match-Logs-All-Competitions
# I found where the data was in the html code =>
# Inspect: data is tagged by <th scope="row" class="left " data-stat="date" csk="20250815"> ... </th>