Libraries needed:

In [1]:
import pickle
import sqlite3
import sys
import os
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
# Segur q es pot arreglar d"una altra forma
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
import settings


Load the data:

In [2]:
with sqlite3.connect(settings.DATABASE_PATH) as conn:
    df = pd.read_sql(
        f"""
            SELECT * FROM Matches
        """,
        conn,
    )

Let's see the form of it:

In [3]:
df

Unnamed: 0,season,division,matchday,date,time,home_team,away_team,score
0,1928-1929,1,1,2/10/29,,Arenas Club,Athletic Madrid,2:3
1,1928-1929,1,1,2/10/29,,Espanyol,Real Unión,3:2
2,1928-1929,1,1,2/10/29,,Real Madrid,Catalunya,5:0
3,1928-1929,1,1,2/10/29,,Donostia,Athletic,1:1
4,1928-1929,1,1,2/12/29,,Racing,Barcelona,0:2
...,...,...,...,...,...,...,...,...
48775,2021-2022,2,42,5/29/22,,Real Oviedo,UD Ibiza,
48776,2021-2022,2,42,5/29/22,,Real Sociedad B,Real Zaragoza,
48777,2021-2022,2,42,5/29/22,,Sporting Gijón,UD Las Palmas,
48778,2021-2022,2,42,5/29/22,,CD Tenerife,FC Cartagena,


Data treatment:

In [4]:
df = df.dropna(subset=["score"]).copy()
df["home_score"] = df["score"].str.split(":").str[0].astype(int)
df["away_score"] = df["score"].str.split(":").str[1].astype(int)
df["home_win"] = (df["home_score"] > df["away_score"]).astype(int)
df["away_win"] = (df["home_score"] < df["away_score"]).astype(int)
df["tie"] = (df["home_score"] == df["away_score"]).astype(int)



def determine_result(row):
    if row['home_win'] == 1:
        return 1
    elif row['tie'] == 1:
        return 2
    elif row['away_win'] == 1:
        return 3
    else:
        return None  

df.loc[:,'result'] = df.apply(determine_result, axis=1)
df.drop(["date", "time", "home_win", "tie", "away_win"], axis = 1, inplace= True)




In [5]:
df

Unnamed: 0,season,division,matchday,home_team,away_team,score,home_score,away_score,result
0,1928-1929,1,1,Arenas Club,Athletic Madrid,2:3,2,3,3
1,1928-1929,1,1,Espanyol,Real Unión,3:2,3,2,1
2,1928-1929,1,1,Real Madrid,Catalunya,5:0,5,0,1
3,1928-1929,1,1,Donostia,Athletic,1:1,1,1,2
4,1928-1929,1,1,Racing,Barcelona,0:2,0,2,3
...,...,...,...,...,...,...,...,...,...
48345,2021-2022,2,3,Ponferradina,Girona,2:1,2,1,1
48346,2021-2022,2,3,SD Amorebieta,UD Almería,2:1,2,1,1
48347,2021-2022,2,3,CD Lugo,Real Valladolid,0:2,0,2,3
48348,2021-2022,2,3,Real Sociedad B,CF Fuenlabrada,0:0,0,0,2


In [6]:

def calculate_team_stats(df):
    """
    Calcula estadísticas acumulativas para cada equipo por jornada.
    
    Parámetros:
    df (pandas.DataFrame): DataFrame con datos de partidos de fútbol
    
    Retorna:
    pandas.DataFrame: DataFrame con columnas adicionales de estadísticas
    """
    # Crear una copia del DataFrame original
    result_df = df.copy()
    
    # Convertir 'season' a string si no lo está ya
    result_df['season'] = result_df['season'].astype(str)
    
    # Ordenar el DataFrame por season y matchday para asegurar cálculo correcto
    result_df = result_df.sort_values(['season', 'matchday'])
    
    # Función para calcular estadísticas para un equipo (casa o fuera)
    def calculate_team_matchday_stats(df_group, team_type):
        if team_type == 'home':
            team_col = 'home_team'
            score_col = 'home_score'
            opp_score_col = 'away_score'
        else:
            team_col = 'away_team'
            score_col = 'away_score'
            opp_score_col = 'home_score'
        
        # Hacer una copia del grupo para evitar warnings de modificación
        group = df_group.copy()
        
        # Inicializar columnas
        group[f'GF_{team_type}'] = 0
        group[f'GA_{team_type}'] = 0
        group[f'GD_{team_type}'] = 0
        group[f'W_{team_type}'] = 0
        group[f'L_{team_type}'] = 0
        group[f'Pts_{team_type}'] = 0
        
        # Calcular estadísticas acumulativas
        gf_cumsum = 0
        ga_cumsum = 0
        w_cumsum = 0
        l_cumsum = 0
        pts_cumsum = 0
        
        for i, row in group.iterrows():
            gf_cumsum += row[score_col]
            ga_cumsum += row[opp_score_col]
            
            # Determinar resultado
            if team_type == 'home':
                result = row['result']
            else:
                result = 4 - row['result']  # Invertir resultado para equipo visitante
            
            if result == 1:  # Victoria
                w_cumsum += 1
                pts_cumsum += 3
            elif result == 2:  # Empate
                pts_cumsum += 1
            else:  # Derrota
                l_cumsum += 1
            
            group.at[i, f'GF_{team_type}'] = gf_cumsum
            group.at[i, f'GA_{team_type}'] = ga_cumsum
            group.at[i, f'GD_{team_type}'] = gf_cumsum - ga_cumsum
            group.at[i, f'W_{team_type}'] = w_cumsum
            group.at[i, f'L_{team_type}'] = l_cumsum
            group.at[i, f'Pts_{team_type}'] = pts_cumsum
        
        return group
    
    # Calcular estadísticas para equipos de casa y fuera
    result_df = (result_df.groupby('season', group_keys=False)
        .apply(lambda x: calculate_team_matchday_stats(x, 'home'))
        .groupby('season', group_keys=False)
        .apply(lambda x: calculate_team_matchday_stats(x, 'away'))
    )
    
    # Resetear índice y ordenar
    result_df = result_df.reset_index(drop=True)
    result_df = result_df.sort_values(['season', 'matchday'])
    
    return result_df


In [7]:
# Combinar todos los datos
f_teams = calculate_team_stats(df)



  .apply(lambda x: calculate_team_matchday_stats(x, 'home'))
  .apply(lambda x: calculate_team_matchday_stats(x, 'away'))


In [8]:
f_teams.head(94)
df = f_teams

We train the model now:

In [9]:
features = ['GF_home','GA_home', "GD_home", "Pts_home", "W_home" , "L_home",'GF_away','GA_away', "GD_away", "Pts_away", "W_away" , "L_away"]
target = ["result"]

df_train = df[(df['season'] > '2000-2001') & (df['season'] < '2020-2021')]
x_train = df_train[features]
y_train = df_train[target].values.ravel()

df_test = df[df['season']=='2020-2021']
x_test = df_test[features]
y_test = df_test[target]


clf = GradientBoostingClassifier()
clf.fit(x_train, y_train)
clf_y_pred = clf.predict(x_test)

results_df = x_test.copy()
results_df = df_test.copy()
results_df["match_result_prediction"] = clf_y_pred

model_is_correct = (results_df["match_result_prediction"] == results_df["result"]).sum()
total_tries = len(results_df)
sucess_rate = model_is_correct/total_tries*100

print(f"This model guesses right {sucess_rate:.2f} % of the time")

This model guesses right 39.67 % of the time
