Algorithme de la Régression Logistique

In [6]:
import pandas as pd
# Load the three raw tables, parsing their 'date' column as datetimes.
# Forward slashes are used as the path separator: they are accepted on Windows
# as well as on POSIX systems, whereas a backslash separator only works on Windows.
df_matchs = pd.read_csv("donnees_football/Match.csv", parse_dates=['date'])
df_player_att = pd.read_csv("donnees_football/Player_Attributes.csv", parse_dates=['date'])
df_team_attributes = pd.read_csv("donnees_football/Team_Attributes.csv", parse_dates=['date'])

Sélection et création des variables

In [None]:
# 1. Keep only the matches played between August 2011 and August 2015
#    (both bounds are inclusive).
start_date = pd.Timestamp('2011-08-01')
end_date = pd.Timestamp('2015-08-01')
in_study_window = df_matchs['date'].between(start_date, end_date)
# .copy() so that the columns added later do not write into a view of df_matchs.
df_matchs_filtered = df_matchs[in_study_window].copy()


# 2. Match outcome
def get_result(row):
    """Return the outcome label of a match from its final score.

    Parameters
    ----------
    row : mapping-like
        Must expose 'home_team_goal' and 'away_team_goal'.

    Returns
    -------
    str
        'home_win', 'away_win' or 'draw'. Any score pair for which neither
        strict comparison holds (equal scores, or a NaN score) yields 'draw'.
    """
    home_goals = row['home_team_goal']
    away_goals = row['away_team_goal']
    if home_goals > away_goals:
        return 'home_win'
    if home_goals < away_goals:
        return 'away_win'
    return 'draw'

# Label every retained match with its outcome; this becomes the 'result' column.
match_outcomes = df_matchs_filtered.apply(get_result, axis=1)
df_matchs_filtered['result'] = match_outcomes


# 3. Average goals scored/conceded over a team's most recent matches
def calc_avg_goals(team_id, date, home=True, n_matches=5):
    """Average goals scored and conceded by a team over its latest matches.

    Only matches of ``df_matchs`` played strictly before ``date`` are used, and
    only those where the team played on the requested side: home matches when
    ``home`` is True, away matches otherwise.

    Parameters
    ----------
    team_id
        Value compared against 'home_team_api_id' / 'away_team_api_id'.
    date
        Cut-off; matches dated on or after it are ignored.
    home : bool, default True
        Which side of the fixture to look at.
    n_matches : int, default 5
        Number of most recent matches to average over.

    Returns
    -------
    tuple
        ``(scored, conceded)`` means. ``(0, 0)`` is returned when the team has
        no earlier match on that side, so a 0 can mean "no history".

    Raises
    ------
    ValueError
        If ``n_matches`` is smaller than 1.
    """
    if n_matches < 1:
        raise ValueError("n_matches must be at least 1")

    # The two sides only differ by which columns hold the team id and the goals.
    if home:
        team_col, scored_col, conceded_col = 'home_team_api_id', 'home_team_goal', 'away_team_goal'
    else:
        team_col, scored_col, conceded_col = 'away_team_api_id', 'away_team_goal', 'home_team_goal'

    is_earlier_fixture = (df_matchs[team_col] == team_id) & (df_matchs['date'] < date)
    past_matches = (
        df_matchs[is_earlier_fixture]
        .sort_values(by='date', ascending=False)
        .head(n_matches)
    )
    if past_matches.empty:
        return 0, 0
    return past_matches[scored_col].mean(), past_matches[conceded_col].mean()

# Collect, for every retained match, the (scored, conceded) averages of the
# home side (home fixtures only) and of the away side (away fixtures only).
home_form, away_form = [], []

for _, match in df_matchs_filtered.iterrows():
    kickoff = match['date']
    home_form.append(calc_avg_goals(match['home_team_api_id'], kickoff, home=True))
    away_form.append(calc_avg_goals(match['away_team_api_id'], kickoff, home=False))

# Split the (scored, conceded) pairs into one list per feature.
home_scored = [scored for scored, _ in home_form]
home_conceded = [conceded for _, conceded in home_form]
away_scored = [scored for scored, _ in away_form]
away_conceded = [conceded for _, conceded in away_form]

# The lists are in row order, so they can be attached positionally.
df_matchs_filtered['home_avg_scored'] = home_scored
df_matchs_filtered['home_avg_conceded'] = home_conceded
df_matchs_filtered['away_avg_scored'] = away_scored
df_matchs_filtered['away_avg_conceded'] = away_conceded


# 4. Mean overall_rating (latest known value) for each team

# Helper returning the mean rating of a team's players
def get_team_rating(player_ids, match_date):
    """Average the latest known 'overall_rating' of the given players.

    For each player id, the rows of ``df_player_att`` dated on or before
    ``match_date`` are considered and the most recent one is used.

    Parameters
    ----------
    player_ids : iterable
        Player ids; null entries (None / NaN) are skipped.
    match_date
        Upper bound (inclusive) on the rating date.

    Returns
    -------
    float or None
        Mean of the ratings found, or None when no player has a usable
        (non-null) rating on or before ``match_date``.
    """
    known_ratings = []
    for player_id in player_ids:
        if pd.isnull(player_id):
            continue
        is_player = df_player_att['player_api_id'] == player_id
        is_not_later = df_player_att['date'] <= match_date
        history = df_player_att[is_player & is_not_later]
        if history.empty:
            continue
        latest_row = history.sort_values(by='date', ascending=False).iloc[0]
        latest_rating = latest_row['overall_rating']
        # The most recent row may itself carry a missing rating; ignore it then.
        if pd.isnull(latest_rating):
            continue
        known_ratings.append(latest_rating)
    if not known_ratings:
        return None
    return sum(known_ratings) / len(known_ratings)

# Rate both line-ups of every retained match from the 11 player columns
# 'home_player_1'..'home_player_11' and 'away_player_1'..'away_player_11'.
home_team_rating, away_team_rating = [], []

for _, match in df_matchs_filtered.iterrows():
    kickoff = match['date']
    home_lineup = [match[f'home_player_{slot}'] for slot in range(1, 12)]
    away_lineup = [match[f'away_player_{slot}'] for slot in range(1, 12)]
    home_team_rating.append(get_team_rating(home_lineup, kickoff))
    away_team_rating.append(get_team_rating(away_lineup, kickoff))

df_matchs_filtered['home_team_rating'] = home_team_rating
df_matchs_filtered['away_team_rating'] = away_team_rating
# Positive values mean the home line-up is rated higher than the away one.
df_matchs_filtered['rating_diff'] = (
    df_matchs_filtered['home_team_rating'] - df_matchs_filtered['away_team_rating']
)


# 5. Add the team attributes (latest record known at match time)

# Columns of df_team_attributes used as features. Kept as a list: it is used
# below to select several entries of a row at once.
team_attributes_cols = [
    'buildUpPlaySpeed',
    'chanceCreationPassing',
    'defencePressure',
    'defenceAggression',
    'defenceTeamWidth',
]

def get_team_attributes(team_id, match_date):
    """Return a team's latest attribute values known at ``match_date``.

    Rows of ``df_team_attributes`` for ``team_id`` dated on or before
    ``match_date`` are considered and the most recent one is used.

    Returns
    -------
    numpy.ndarray or list
        The values of ``team_attributes_cols`` (in that order) from the most
        recent row, or a list of ``None`` of the same length when the team has
        no record on or before ``match_date``.
    """
    same_team = df_team_attributes['team_api_id'] == team_id
    not_later = df_team_attributes['date'] <= match_date
    snapshots = df_team_attributes[same_team & not_later]
    if snapshots.empty:
        return [None] * len(team_attributes_cols)
    newest = snapshots.sort_values(by='date', ascending=False).iloc[0]
    return newest[team_attributes_cols].values

# Look up the attributes of both teams for every retained match.
home_attr_vals, away_attr_vals = [], []

for _, match in df_matchs_filtered.iterrows():
    match_day = match['date']
    home_attr_vals.append(get_team_attributes(match['home_team_api_id'], match_day))
    away_attr_vals.append(get_team_attributes(match['away_team_api_id'], match_day))

# Turn the per-match value lists into DataFrames, one column per attribute.
home_attr_columns = [f'home_{col}' for col in team_attributes_cols]
away_attr_columns = [f'away_{col}' for col in team_attributes_cols]
home_attr_df = pd.DataFrame(home_attr_vals, columns=home_attr_columns)
away_attr_df = pd.DataFrame(away_attr_vals, columns=away_attr_columns)

# Merge into the main frame. The index is reset first so that all three frames
# share the same 0..n-1 index and are aligned row by row.
matches_reindexed = df_matchs_filtered.reset_index(drop=True)
df_final = pd.concat([matches_reindexed, home_attr_df, away_attr_df], axis=1)

# Home-minus-away difference for each attribute.
for attr in team_attributes_cols:
    df_final[f'{attr}_diff'] = df_final[f'home_{attr}'] - df_final[f'away_{attr}']


# Keep only the final columns
attribute_feature_cols = (
    [f'home_{col}' for col in team_attributes_cols]
    + [f'away_{col}' for col in team_attributes_cols]
    + [f'{col}_diff' for col in team_attributes_cols]
)
cols_to_keep = [
    'date',
    'home_avg_scored', 'home_avg_conceded',
    'away_avg_scored', 'away_avg_conceded',
    'home_team_rating', 'away_team_rating',
    'rating_diff',
    'result',
    *attribute_feature_cols,
]

# Restrict to those columns, then drop every match with a missing value in any
# of them (e.g. no rating or no team-attribute record was found).
df_final = df_final[cols_to_keep].dropna()

        date  home_avg_scored  home_avg_conceded  away_avg_scored  \
0 2011-10-16             2.60               0.60              2.8   
1 2011-10-16             2.60               1.40              1.0   
2 2011-10-15             1.80               3.00              1.0   
3 2011-10-15             1.20               1.40              0.4   
4 2011-10-15             1.75               1.25              0.6   

   away_avg_conceded  home_team_rating  away_team_rating  rating_diff  \
0                2.2         70.090909         69.454545     0.636364   
1                1.6         72.363636         69.909091     2.454545   
2                2.0         63.636364         66.000000    -2.363636   
3                1.2         64.909091         64.727273     0.181818   
4                1.8         62.545455         64.636364    -2.090909   

     result  home_buildUpPlaySpeed  ...  away_buildUpPlaySpeed  \
0  home_win                   67.0  ...                   55.0   
1  home_win   

Séparation des features (X) et de la cible (y)

In [27]:
# Feature columns fed to the model, grouped by family.
# NOTE(review): every *_diff column is the difference of a home_/away_ pair that
# is also listed here, so the feature set is redundant by construction.
feature_columns = [
    # recent scoring form
    'home_avg_scored', 'home_avg_conceded',
    'away_avg_scored', 'away_avg_conceded',
    # line-up ratings
    'home_team_rating', 'away_team_rating', 'rating_diff',
    # home team attributes
    'home_buildUpPlaySpeed', 'home_chanceCreationPassing', 'home_defencePressure',
    'home_defenceAggression', 'home_defenceTeamWidth',
    # away team attributes
    'away_buildUpPlaySpeed', 'away_chanceCreationPassing', 'away_defencePressure',
    'away_defenceAggression', 'away_defenceTeamWidth',
    # home-minus-away attribute differences
    'buildUpPlaySpeed_diff', 'chanceCreationPassing_diff', 'defencePressure_diff',
    'defenceAggression_diff', 'defenceTeamWidth_diff',
]

X = df_final[feature_columns]

# Target: the match outcome label ('home_win', 'away_win' or 'draw').
y = df_final['result']


Division des données en jeux d'entraînement et de test

In [28]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the matches for testing. The split is stratified on the
# outcome label and seeded so that it is reproducible.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


Implémentation du pipeline

In [35]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Two-step pipeline: feature scaling followed by logistic regression.
# Keeping the scaler inside the pipeline means it is fitted only on the data
# passed to pipe.fit().
feature_scaler = StandardScaler()
classifier = LogisticRegression(solver='lbfgs', max_iter=1000)

pipe = Pipeline(steps=[
    ('scaler', feature_scaler),
    ('logreg', classifier),
])


Entraînement du modèle

In [36]:
# Fit the scaler and the classifier on the training split only.
pipe.fit(X=X_train, y=y_train)

Prédiction du modèle

In [37]:
# Predicted outcome label for every match of the held-out test split.
y_pred = pipe.predict(X=X_test)

Évaluation des performances

In [38]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Compute the three evaluation artefacts first, then print them.
acc = accuracy_score(y_test, y_pred)           # overall accuracy
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)  # precision, recall, f1-score

# Overall accuracy
print(f'Accuracy: {acc:.2f}')

# Confusion matrix
print('Confusion matrix:')
print(conf_matrix)

# Detailed report
print('Classification report:')
print(report)


Accuracy: 0.52
Confusion matrix:
[[322   1 367]
 [190   1 412]
 [182   0 903]]
Classification report:
              precision    recall  f1-score   support

    away_win       0.46      0.47      0.47       690
        draw       0.50      0.00      0.00       603
    home_win       0.54      0.83      0.65      1085

    accuracy                           0.52      2378
   macro avg       0.50      0.43      0.37      2378
weighted avg       0.51      0.52      0.43      2378

