In [8]:
import numpy as np 
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn import metrics

In [9]:
df = pd.read_csv("C:\\Users\\Karol\\Desktop\\Algobetting-Ekstraklasa_OFFICIAL\\data\\merged_normalized.csv", encoding='utf-8-sig', index_col=[0])
odds1 = df.copy() #We will use it shortly
df = df.drop(df.columns[range(10)], axis=1) #So that we only have numerical variables
custom_map = {'Wygrali gospodarze':0,'Remis':1,'Wygrali goście':2}
df = df.dropna()
df['Kto_wygrał'] = df['Kto_wygrał'].map(custom_map)
#print(df.head(10))
#print(df['Kto_wygrał'].isnull().sum())
y = df['Kto_wygrał']
X = df.drop(['Kto_wygrał'], axis=1)

In [10]:
correlation_values = df.corr()['Kto_wygrał'].drop('Kto_wygrał') #Correlation between each variable and result doesn't seem to be very strong, but it won't stop us from achieving very meaningful results
correlation_values

Punkty_Gospodarze          -0.073269
Zwycięstwa_Gospodarze      -0.074656
Remisy_Gospodarze           0.022528
Porażki_Gospodarze          0.067577
Gole_zdobyte_Gospodarze    -0.068445
Gole_stracone_Gospodarze    0.081916
Punkty_Goście               0.092149
Zwycięstwa_Goście           0.096079
Remisy_Goście              -0.041026
Porażki_Goście             -0.077287
Gole_zdobyte_Goście         0.066314
Gole_stracone_Goście       -0.081316
Name: Kto_wygrał, dtype: float64

In [11]:
odds1 = odds1[['Oddsy_gospodarze', 'Oddsy_remis', 'Oddsy_goście', 'Kto_wygrał']]
custom_map = {'Wygrali gospodarze':0,'Remis':1,'Wygrali goście':2}
odds1['Kto_wygrał'] = odds1['Kto_wygrał'].map(custom_map)
odds1 = odds1.dropna()
odds1.head()

Unnamed: 0,Oddsy_gospodarze,Oddsy_remis,Oddsy_goście,Kto_wygrał
0,1.94,3.81,3.67,1.0
1,2.08,3.8,3.33,1.0
2,1.36,5.21,8.39,0.0
3,1.96,3.95,3.49,2.0
4,1.38,5.37,7.25,1.0


##### ExpectedReturnNaive algorithm picks scenario with the highest probability, and returns what we would get back by placing bets according to this strategy

In [12]:
def ExpectedReturnNaive(odds, X, y, i, j, t, wyg, acc):
    data = odds.copy()
    X_trainset, X_testset, y_trainset, y_testset = train_test_split(X, y, test_size=t, random_state=j)
    meczTree = DecisionTreeClassifier(criterion="log_loss", max_depth = i)
    meczTree.fit(X_trainset,y_trainset)
    tree_predictions = meczTree.predict(X_testset)
    indices = list(X_testset.index.astype(int))

    data_tp = data.loc[indices] #dane_tp = dane_tree_predictions; assign testset to bookmaker odds
    data_tp['Predicted_Winner'] = pd.Series(tree_predictions)
    data_tp = data_tp.dropna()
    data_tp['Predicted_Winner'] = data_tp['Predicted_Winner'].astype(int)
    data_tp['Wygrana'] = 0.0
    data_tp.loc[data_tp.Kto_wygrał == data_tp.Predicted_Winner,'Wygrana'] = 1
    data_tp.loc[data_tp.Kto_wygrał == 0,'Wygrana'] = data_tp.Oddsy_gospodarze * 100 * data_tp.Wygrana
    data_tp.loc[data_tp.Kto_wygrał == 1,'Wygrana'] = data_tp.Oddsy_remis * 100 * data_tp.Wygrana
    data_tp.loc[data_tp.Kto_wygrał == 2,'Wygrana'] = data_tp.Oddsy_goście * 100 * data_tp.Wygrana
    wyg = wyg + (data_tp['Wygrana'].sum())/len(data_tp['Wygrana'])
    acc = acc + 100*(round(metrics.accuracy_score(y_testset, tree_predictions),4))
    return wyg, acc, t

##### ExpectedReturnAdvanced algorithm calculates Expected Value of each bet (i.e. P(scenario A) x bookmaker odds for scenario A), picks the one with highest EV, and returns what it'd yield back per 100zł placed, over the course of 2015/16 - 2024/25 seasons

In [13]:
def ExpectedReturnAdvanced(df, odds, X, y, i, j, t, wyg, acc):

    X_trainset, X_testset, y_trainset, y_testset = train_test_split(X, y, test_size=t, random_state=j)
    indices = list(X_trainset.index)
    odds_X_trainset = odds.loc[indices]
    meczTree = DecisionTreeClassifier(criterion="log_loss", max_depth = i)
    meczTree.fit(X_trainset,y_trainset)

    tree_predictions = meczTree.predict_proba(X_testset)
    tree_predictions = pd.DataFrame(tree_predictions)
    indices2 = list(X_testset.index)
    odds_tree_predictions = odds.loc[indices2]
    odds_tree_predictions = odds_tree_predictions.reset_index(drop=True)
    table = odds_tree_predictions.merge(tree_predictions, left_index=True, right_index=True)

    table['Kto_wygrał'] = df['Kto_wygrał'] #It's a bit of a mass, but it works
    table['Potencjalny_return'] = 0
    table['Potencjalny_return'] = np.where(table['Kto_wygrał'] == 0, table['Oddsy_gospodarze']*100, table['Potencjalny_return'])
    table['Potencjalny_return'] = np.where(table['Kto_wygrał'] == 1, table['Oddsy_remis']*100, table['Potencjalny_return'])
    table['Potencjalny_return'] = np.where(table['Kto_wygrał'] == 2, table['Oddsy_goście']*100, table['Potencjalny_return'])
    table['Potencjalny_return'] = table['Potencjalny_return'].astype(int)
    table['Return_gospodarze'] = table['Oddsy_gospodarze'] * table[0]
    table['Return_remis'] = table['Oddsy_remis'] * table[1]
    table['Return_goście'] = table['Oddsy_goście'] * table[2]
    table["Max(EV)"] = table[['Return_gospodarze', 'Return_remis', 'Return_goście']].max(axis=1)
    table['Max(EV)'] = np.where(table['Max(EV)'] == table['Return_gospodarze'], 0, table['Max(EV)'])
    table['Max(EV)'] = np.where(table['Max(EV)'] == table['Return_remis'], 1, table['Max(EV)'])
    table['Max(EV)'] = np.where(table['Max(EV)'] == table['Return_goście'], 2, table['Max(EV)'])
    table['Max(EV)'] = table['Max(EV)'].astype(int)
    table['Actual_return'] = np.where(table['Max(EV)'] == table['Kto_wygrał'], table['Potencjalny_return'], 0)
    return_calkowity = table['Actual_return'].sum()
    return_per_100 = return_calkowity / len(table)
    accuracy_score = 100*(np.count_nonzero(table['Actual_return']))/len(table['Actual_return'])
    wyg = wyg + return_per_100
    acc = acc + accuracy_score
    #print(f"return_calkowity for max_depth={i}, random_state={j}: {return_calkowity}")
    #print(f"return_per_100 for max_depth={i}, random_state={j}: {return_per_100:.3f}")

    return wyg, acc, t

In [14]:
for i in range(1,20):
    return1 = 0
    accuracy1 = 0
    return2 = 0
    accuracy2 = 0
    for j in range(1, 51):
        return1, accuracy1, t = ExpectedReturnNaive(odds1, X, y, i, j, 0.2, return1, accuracy1)
        return2, accuracy2, t = ExpectedReturnAdvanced(df, odds1, X, y, i, j, 0.2, return2, accuracy2)
    print(f"Naive algorithm | Dla max_depth={i}: Average accuracy score = {(accuracy1/50):.3f}%, Average wygrana per 100zł: {(return1/50):.3f}") #50 repetitions, therefore to get an average we divide by 50
    print(f"Advanced algorithm | Dla max_depth={i}: Average accuracy score = {(accuracy2/50):.3f}%, Average wygrana per 100zł: {(return2/50):.3f}")
    print("\n")

Naive algorithm | Dla max_depth=1: Average accuracy score = 43.161%, Average wygrana per 100zł: 99.029
Advanced algorithm | Dla max_depth=1: Average accuracy score = 36.717%, Average wygrana per 100zł: 130.935


Naive algorithm | Dla max_depth=2: Average accuracy score = 43.692%, Average wygrana per 100zł: 97.577
Advanced algorithm | Dla max_depth=2: Average accuracy score = 36.277%, Average wygrana per 100zł: 128.043


Naive algorithm | Dla max_depth=3: Average accuracy score = 43.560%, Average wygrana per 100zł: 97.325
Advanced algorithm | Dla max_depth=3: Average accuracy score = 36.022%, Average wygrana per 100zł: 126.265


Naive algorithm | Dla max_depth=4: Average accuracy score = 43.273%, Average wygrana per 100zł: 97.507
Advanced algorithm | Dla max_depth=4: Average accuracy score = 35.506%, Average wygrana per 100zł: 123.940


Naive algorithm | Dla max_depth=5: Average accuracy score = 42.818%, Average wygrana per 100zł: 98.572
Advanced algorithm | Dla max_depth=5: Average acc

##### As we can see, although correlation between variables and results of each game seemed to be too low, decision tree trained on them is almost good enough to be profitable on its own.
##### Moreover, it turns out that it actually is good enough; using exact values of each scenario's probability, derived from the same exact decision tree, and combining them with bookmaker odds, is enough to make it an exceptionally good model.

In [None]:
#Does it actually work on unseen data though? We can check that by using data from season July - December 2025 (which none of the models have ever seen).
#Let's test it on ExpectedReturnAdvanced with parameters test_data=0.2, max_depth={3,4,5,6,7,8}.

#To be done