In [89]:
import pandas as pd
import numpy as np
from pyswarms.single.global_best import GlobalBestPSO
import warnings
import pickle
import time
import glob

In [90]:
def get_data():
    file_pattern = '../raw_data/*.csv'
    files = glob.glob(file_pattern)

    full_df = pd.DataFrame()
    for file in files:
        df = pd.read_csv(file, encoding='iso-8859-1')
        full_df = pd.concat([full_df, df], ignore_index=True)

    full_df['Date'] = pd.to_datetime(full_df['Date'], format='%d/%m/%Y')
    # full_df.drop(['Referee'], axis=1, inplace=True)
    full_df.rename(columns={'HomeTeam': 'HT', 'AwayTeam': 'AT'}, inplace=True)
    full_df.reset_index().set_index('Date').sort_index(inplace=True)
    # pad the nan with 0
    full_df.fillna(0, inplace=True)

    return full_df

full_df = get_data()
full_df.to_csv('full.csv', index=False)  

In [91]:
# code from: https://github.com/calvinyeungck/Soccer-Prediction-Challenge-2023/blob/main/berrar_rating.py

def calculate_expected_goals(alpha, beta_H, gamma_H, beta_A, gamma_A, o_H, d_A, o_A, d_H):
    G_H_hat = alpha / (1 + np.exp(- beta_H * (o_H + d_A) - gamma_H))
    G_A_hat = alpha / (1 + np.exp(- beta_A * (o_A + d_H) - gamma_A))
    return G_H_hat, G_A_hat

def update_offensive_defensive_strengths(G_H, G_H_hat, G_A, G_A_hat, o_H, d_H, o_A, d_A, omega_o_H, omega_d_H, omega_o_A, omega_d_A):
    o_H += omega_o_H * (G_H - G_H_hat)
    d_H += omega_d_H * (G_A - G_A_hat)
    o_A += omega_o_A * (G_A - G_A_hat)
    d_A += omega_d_A * (G_H - G_H_hat)
    return o_H, d_H, o_A, d_A

def individual_goal_pred_error(G_H, G_H_hat, G_A, G_A_hat):
    f = 0.5*((G_H - G_H_hat)**2 + (G_A - G_A_hat)**2)
    return f

#ratings parameter
alpha = 5 # max_goals = max(df['FTHG'].max(), df['FTAG'].max())

beta_H = 1 # slope of the logistic function. bounds [0,5]
beta_A = 1 
gamma_H = 0.1 # bias, e.g., home advantage. bounds [-5,5]
gamma_A = 0.1
omega_o_H = 0.2 # used to control the learning rate of the update rule for the offensive and defensive strengths of each team. bounds [0, 1.5]
omega_d_H = 0.2
omega_o_A = 0.2
omega_d_A = 0.2

def total_goal_prediction_error(x=(beta_H, beta_A, gamma_H, gamma_A, omega_o_H, omega_d_H, omega_o_A, omega_d_A)):
    loss=[]

    beta_H = x[:,0] # slope of the logistic function. bounds [0,5]
    beta_A = x[:,1]
    gamma_H = x[:,2] # bias, e.g., home advantage. bounds [-5,5]
    gamma_A = x[:,3] 
    omega_o_H = x[:,4]  # used to control the learning rate of the update rule for the offensive and defensive strengths of each team. bounds [0, 1.5]
    omega_d_H = x[:,5] 
    omega_o_A = x[:,6] 
    omega_d_A = x[:,7]
    team_ratings={}
    #print(round(beta_H,2), round(gamma_H,2), round(beta_A,2), round(gamma_A,2))
    for i in np.unique(df1[['HT', 'AT']].values):
        team_ratings[i]=[0,0,0,0] # Home attack, Home defense, Away Attack, Away defense
    total_goal_pred_error = 0
    total_match=0
    for index, row in df1.iterrows():
        HT = row['HT']
        FTHG = row['FTHG']
        AT = row['AT']
        FTAG = row['FTAG']

        G_H_hat, G_A_hat = calculate_expected_goals(alpha, beta_H, gamma_H, beta_A, gamma_A,
                                                    team_ratings[HT][0], team_ratings[AT][3],
                                                    team_ratings[AT][2], team_ratings[HT][1])
        G_H=FTHG
        G_A=FTAG
        team_ratings[HT][0] += omega_o_H * (G_H - G_H_hat)
        team_ratings[HT][1] += omega_d_H * (G_A - G_A_hat)
        team_ratings[AT][2] += omega_o_A * (G_A - G_A_hat)
        team_ratings[AT][3] += omega_d_A * (G_H - G_H_hat)

        total_goal_pred_error += individual_goal_pred_error(FTHG, G_H_hat, FTAG, G_A_hat)
        total_match+=1
    loss=total_goal_pred_error/total_match
    #print("\n",len(loss),round(min(loss),2))
    #import pdb; pdb.set_trace()
    return loss

def berrar_rating(dataframe,x=(beta_H, beta_A, gamma_H, gamma_A, omega_o_H, omega_d_H, omega_o_A, omega_d_A)):
    df1=dataframe
    beta_H = x[0] # slope of the logistic function. bounds [0,5]
    beta_A = x[1] 
    gamma_H = x[2]  # bias, e.g., home advantage. bounds [-5,5]
    gamma_A = x[3] 
    omega_o_H = x[4]  # used to control the learning rate of the update rule for the offensive and defensive strengths of each team. bounds [0, 1.5]
    omega_d_H = x[5] 
    omega_o_A = x[6] 
    omega_d_A = x[7] 
    team_ratings={}
    for i in np.unique(df1[['HT', 'AT']].values):
        team_ratings[i]=[0,0,0,0] # Home attack, Home defense, Away Attack, Away defense
    for index, row in df1.iterrows():
        HT = row['HT']
        FTHG = row['FTHG']
        AT = row['AT']
        FTAG = row['FTAG']

        #import pdb; pdb.set_trace()
        df1.loc[index, 'HT_H_Off_Rating'] = team_ratings[HT][0]
        #import pdb; pdb.set_trace()
        df1.loc[index, 'HT_H_Def_Rating'] = team_ratings[HT][1]
        df1.loc[index, 'HT_A_Off_Rating'] = team_ratings[HT][2]
        df1.loc[index, 'HT_A_Def_Rating'] = team_ratings[HT][3]
        df1.loc[index, 'AT_H_Off_Rating'] = team_ratings[AT][0]
        df1.loc[index, 'AT_H_Def_Rating'] = team_ratings[AT][1]
        df1.loc[index, 'AT_A_Off_Rating'] = team_ratings[AT][2]
        df1.loc[index, 'AT_A_Def_Rating'] = team_ratings[AT][3]

        G_H_hat, G_A_hat = calculate_expected_goals(alpha, beta_H, gamma_H, beta_A, gamma_A,
                                                    team_ratings[HT][0], team_ratings[AT][3],
                                                    team_ratings[AT][2], team_ratings[HT][1])
        
        df1.loc[index, 'HT_EG'] = G_H_hat
        df1.loc[index, 'AT_EG'] = G_A_hat

        team_ratings[HT][0],team_ratings[HT][1],team_ratings[AT][2],team_ratings[AT][3] = \
            update_offensive_defensive_strengths(FTHG, G_H_hat, FTAG, G_A_hat,team_ratings[HT][0], team_ratings[HT][1], team_ratings[AT][2], team_ratings[AT][3],omega_o_H, omega_d_H, omega_o_A, omega_d_A)
    return df1,team_ratings

def berrar_rating_valid(dataframe,team_ratings_dict,x=(beta_H, beta_A, gamma_H, gamma_A, omega_o_H, omega_d_H, omega_o_A, omega_d_A)):
    df1=dataframe
    beta_H = x[0] # slope of the logistic function. bounds [0,5]
    beta_A = x[1] 
    gamma_H = x[2]  # bias, e.g., home advantage. bounds [-5,5]
    gamma_A = x[3] 
    omega_o_H = x[4]  # used to control the learning rate of the update rule for the offensive and defensive strengths of each team. bounds [0, 1.5]
    omega_d_H = x[5] 
    omega_o_A = x[6] 
    omega_d_A = x[7] 
    team_ratings=team_ratings_dict
    for index, row in df1.iterrows():
        HT = row['HT']
        FTHG = row['FTHG']
        AT = row['AT']
        FTAG = row['FTAG']

        #import pdb; pdb.set_trace()
        df1.loc[index, 'HT_H_Off_Rating'] = team_ratings[HT][0]
        #import pdb; pdb.set_trace()
        df1.loc[index, 'HT_H_Def_Rating'] = team_ratings[HT][1]
        df1.loc[index, 'HT_A_Off_Rating'] = team_ratings[HT][2]
        df1.loc[index, 'HT_A_Def_Rating'] = team_ratings[HT][3]
        df1.loc[index, 'AT_H_Off_Rating'] = team_ratings[AT][0]
        df1.loc[index, 'AT_H_Def_Rating'] = team_ratings[AT][1]
        df1.loc[index, 'AT_A_Off_Rating'] = team_ratings[AT][2]
        df1.loc[index, 'AT_A_Def_Rating'] = team_ratings[AT][3]

        G_H_hat, G_A_hat = calculate_expected_goals(alpha, beta_H, gamma_H, beta_A, gamma_A,
                                                    team_ratings[HT][0], team_ratings[AT][3],
                                                    team_ratings[AT][2], team_ratings[HT][1])
        
        df1.loc[index, 'HT_EG'] = G_H_hat
        df1.loc[index, 'AT_EG'] = G_A_hat

        team_ratings[HT][0],team_ratings[HT][1],team_ratings[AT][2],team_ratings[AT][3] = \
            update_offensive_defensive_strengths(FTHG, G_H_hat, FTAG, G_A_hat,team_ratings[HT][0], team_ratings[HT][1], team_ratings[AT][2], team_ratings[AT][3],omega_o_H, omega_d_H, omega_o_A, omega_d_A)
    return df1,team_ratings

def berrar_rating_valid_final(dataframe,team_ratings_dict,x=(beta_H, beta_A, gamma_H, gamma_A, omega_o_H, omega_d_H, omega_o_A, omega_d_A)):
    df1=dataframe
    beta_H = x[0] # slope of the logistic function. bounds [0,5]
    beta_A = x[1] 
    gamma_H = x[2]  # bias, e.g., home advantage. bounds [-5,5]
    gamma_A = x[3] 
    omega_o_H = x[4]  # used to control the learning rate of the update rule for the offensive and defensive strengths of each team. bounds [0, 1.5]
    omega_d_H = x[5] 
    omega_o_A = x[6] 
    omega_d_A = x[7] 
    team_ratings=team_ratings_dict

    HT = df1['HT']
    AT = df1['AT']


    G_H_hat, G_A_hat = calculate_expected_goals(alpha, beta_H, gamma_H, beta_A, gamma_A,
                                                team_ratings[HT][0], team_ratings[AT][3],
                                                team_ratings[AT][2], team_ratings[HT][1])
    
    return G_H_hat,G_A_hat




In [92]:
start_time = time.time()
for file in ["full"]:
    df = pd.read_csv(f"{file}.csv")
    
    #hyperparameter for PSO
    x_max = np.array([5, 5, 5, 5, 1.5, 1.5, 1.5, 1.5])
    x_min = np.array([0, 0, -5, -5, 0, 0, 0, 0])
    bounds = (x_min, x_max)
    options = {'c1': 0.5, 'c2': 0.3, 'w': 0.9}
    
    warnings.simplefilter('ignore')
    for league in df.Div.unique():
        # change date to datetime
        df1=df[df["Div"]==league]
        df1['Date'] = pd.to_datetime(df1['Date'], format='%Y-%m-%d')
        df1=df1.sort_values(by='Date')
        #team_to_index = {team_name: i for i, team_name in enumerate(np.unique(np.concatenate((df1['HT'].tolist(), df1['AT'].tolist()), axis=0)))}
        optimizer = GlobalBestPSO(n_particles=50, dimensions=8, options=options, bounds=bounds)
        cost, pos = optimizer.optimize(total_goal_prediction_error, 200)
        # pos=([ 4.01929603,  3.99244614,  1.10115083, -2.06364501,  0.80629929,
        #         0.57379537,  0.95790575,  1.00337636])
        df1,team_ratings=berrar_rating(df1,x=pos)
        df1.to_csv(f"../berrar_ratings/{file}_{league}.csv", index=False)
        # with open(f'berrar_ratings2/{file}_{league}_berrarratings_hyperparameters.pickle', 'wb') as handle:
        #     pickle.dump(pos, handle, protocol=pickle.HIGHEST_PROTOCOL)
        # with open(f'berrar_ratings2/{file}_{league}_team_ratings_dict.pickle', 'wb') as handle:
        #     pickle.dump(team_ratings, handle, protocol=pickle.HIGHEST_PROTOCOL)
print("--- %s seconds ---" % (time.time() - start_time))

2024-04-19 17:47:24,009 - pyswarms.single.global_best - INFO - Optimize for 200 iters with {'c1': 0.5, 'c2': 0.3, 'w': 0.9}
pyswarms.single.global_best: 100%|██████████|200/200, best_cost=1.46
2024-04-19 17:47:45,257 - pyswarms.single.global_best - INFO - Optimization finished | best cost: 1.4557169906706415, best pos: [ 0.05829863  0.16010909 -1.08000889 -1.60263112  0.83283565  0.1870689
  0.39763805  0.48785899]
2024-04-19 17:47:47,241 - pyswarms.single.global_best - INFO - Optimize for 200 iters with {'c1': 0.5, 'c2': 0.3, 'w': 0.9}
pyswarms.single.global_best: 100%|██████████|200/200, best_cost=1.21
2024-04-19 17:48:10,587 - pyswarms.single.global_best - INFO - Optimization finished | best cost: 1.2144983693616995, best pos: [ 0.26759074  0.04812612 -1.77634174 -1.93888561  0.51663264  0.53828577
  0.63811865  0.30511918]
2024-04-19 17:48:12,653 - pyswarms.single.global_best - INFO - Optimize for 200 iters with {'c1': 0.5, 'c2': 0.3, 'w': 0.9}
pyswarms.single.global_best: 100%|███

--- 242.18296098709106 seconds ---
