In [None]:
# start by loading the key packages
import numpy as np
import pandas as pd

First, let us load in the dataset

In [None]:
# read the aggregated match data from disk
df = pd.read_csv("../../data/processed/aggregate_data.csv")
# working frame: `drop` returns a new DataFrame, so the raw `df` stays untouched
# ("Unnamed: 0" is presumably an index column written by an earlier to_csv -- confirm)
df_copy = df.drop(columns=["Unnamed: 0"])
df_copy.head()

We are first going to split the dataframe into training and test sets (in the ratio 80:20). The test set will be used later, to compare the accuracy of different ML models.

In [None]:
# generate the training and test sets
# The sampled rows must keep their original index labels until the test set has
# been carved out: renumbering them first (ignore_index=True) would make the
# `drop` below remove rows 0..n-1 instead of the rows that were actually
# sampled, leaving the training and test sets overlapping.
train_rows = df_copy.sample(frac=0.8, random_state=42)  # fixed seed => reproducible split
df_test = df_copy.drop(train_rows.index)
# later cells select rows by position, so give the training set a clean 0..n-1 index
df_train = train_rows.reset_index(drop=True)

# sanity check: the two sets must partition the full data set
assert len(df_train) + len(df_test) == len(df_copy)

# save them for later comparisons
df_train.to_csv('../../data/processed/training_data.csv')
df_test.to_csv('../../data/processed/test_data.csv')

In the next step, we shall map the data into appropriate values. We shall assign labels of 1,0,-1 for a home win / draw / away win. For the toss, we use 1/-1 for home toss / away toss. 

We shall furthermore introduce two new variables, `rank_diff=away_rank-home_rank` and `rating_diff=home_rating-away_rating`. These are simpler to interpret than two separate ratings and rankings for the two teams.

In [None]:
# integer encodings for the two categorical columns
toss_dict = {"home": 1, "away": -1}
result_dict = {"home": 1, "draw": 0, "away": -1}

# look every label up in its dictionary; an unexpected label raises KeyError
result_map = list(map(result_dict.__getitem__, df_train["result"]))
toss_map = list(map(toss_dict.__getitem__, df_train["toss"]))

# derived features: rank and rating differences between the two teams
df_train["rank_diff"] = df_train["away_rank"].sub(df_train["home_rank"])
df_train["rating_diff"] = df_train["home_rating"].sub(df_train["away_rating"])

# overwrite the string columns with their numeric encodings
df_train["result"] = result_map
df_train["toss"] = toss_map
df_train.head()

In the next step we pre-process the features to have a mean of zero and a standard deviation of unity. We shall only use three features in the first instance: `toss`, `rank_diff` and `rating_diff`.

In [None]:
from sklearn import preprocessing

# first-pass feature set
feature_cols = ["rank_diff", "rating_diff", "toss"]
X_init = df_train[feature_cols]

# standardise each feature column; the fitted scaler is kept for later reuse
scaler = preprocessing.StandardScaler()
X_scaled = scaler.fit_transform(X_init)

# target: the encoded match result
Y = df_train["result"]

The next step is to split our data into training and validation sets. We shall use $k$-fold cross-validation with $k=5$.

In [None]:
from sklearn.model_selection import KFold

# 5 shuffled folds; the fixed random_state makes the fold membership (and hence
# the cross-validation scores) repeatable across kernel restarts
kf = KFold(n_splits=5, shuffle=True, random_state=42)

Now we initialize the logistic regression model. To start with, we shall just try multinomial logistic regression from `scikit-learn`, leaving every setting at its default apart from the regularisation parameter `C`.

In [None]:
from sklearn.linear_model import LogisticRegression

# C is scikit-learn's inverse regularisation strength; every other setting is
# left at the library default
logreg_params = {"C": 0.1}
logreg = LogisticRegression(**logreg_params)

Next up we shall train our model using the 5-fold cross validation method.

In [None]:
from sklearn import metrics

# one validation accuracy per fold; sized from the splitter so the array
# cannot fall out of step with the KFold configuration
n_folds = kf.get_n_splits()
logreg_score = np.zeros(n_folds)

# NOTE(review): X_scaled was standardised on all training rows before the
# split, so each validation fold contributes to the scaling statistics.
# Consider fitting the scaler inside each fold for a stricter estimate.
for i, (train_index, val_index) in enumerate(kf.split(X_init)):

    X_train, X_val = X_scaled[train_index], X_scaled[val_index]
    # the fold indices are row positions, so select the targets positionally
    # rather than by index label
    Y_train, Y_val = Y.iloc[train_index], Y.iloc[val_index]

    logreg.fit(X_train, Y_train)

    logreg_score[i] = logreg.score(X_val, Y_val)
    # folds are reported 1-based, matching the later cross-validation cells
    print(i + 1, logreg_score[i])

# mean validation accuracy across the folds
print(np.mean(logreg_score))

The accuracy is not particularly good! In fact, if we were to naively predict that the home team always wins, we would get an accuracy of 139/296=0.47, so we are hardly improving on the naive prediction...

Let us try to introduce some new parameters based on the teams playing. To do this, we need to transform the categorical team labels into binary arrays, which we shall do via a "one hot encoder" approach.

In [None]:
# drop the date column; errors="ignore" keeps this cell safe to re-run after
# the column has already been removed
df_train = df_train.drop(columns=["date"], errors="ignore")

# Only the last expression of a cell is displayed, so a single preview here
# both checks that the data set was not modified by mistake and confirms that
# the date column is gone.
df_train.head()

In [None]:
# one-hot encode the remaining categorical columns; the new indicator columns
# are prefixed 'ht' and 'at'
# (presumably the home-team and away-team labels -- confirm against the data)
df_dummy = pd.get_dummies(data=df_train, prefix=["ht", "at"])

df_dummy.head()

In [None]:
# make the training set equal to the dummy set
# Take a real copy rather than a second name for the same object, so that any
# later in-place edit of df_train cannot silently change df_dummy as well.
df_train = df_dummy.copy()

In [None]:
# candidate feature names: start from every column, then strike out the target
# and the raw rank/rating columns
col_list = list(df_train.columns)
for excluded in ("result", "home_rank", "away_rank", "home_rating", "away_rating"):
    col_list.remove(excluded)

In [None]:
# show the column names left after the target and the raw rank/rating columns
# were removed from the list
print(col_list)

In [None]:
# single-feature experiment: only the rank difference is used here
# (assign col_list instead to use every candidate feature)
feature_cols = ["rank_diff"]
print(feature_cols)

X_init = df_train[feature_cols]
# re-fit the standardiser on the new feature selection
scaler = preprocessing.StandardScaler()
X_scaled = scaler.fit_transform(X_init)

In [None]:
from sklearn.model_selection import KFold

# rebuild the 5-fold splitter; the fixed random_state makes the shuffled fold
# membership repeatable, so score changes reflect the features and not the split
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
from sklearn.linear_model import LogisticRegression

# same regularisation as before (C=0.1), now with an explicit solver
# tolerance of 0.1
logreg_params = {"C": 0.1, "tol": 0.1}
logreg = LogisticRegression(**logreg_params)

In [None]:
from sklearn import metrics

# one validation accuracy per fold; sized from the splitter so the array
# cannot fall out of step with the KFold configuration
n_folds = kf.get_n_splits()
logreg_score = np.zeros(n_folds)

for i, (train_index, val_index) in enumerate(kf.split(X_init)):

    X_train, X_val = X_scaled[train_index], X_scaled[val_index]
    # the fold indices are row positions, so select the targets positionally
    # rather than by index label
    Y_train, Y_val = Y.iloc[train_index], Y.iloc[val_index]

    logreg.fit(X_train, Y_train)

    logreg_score[i] = logreg.score(X_val, Y_val)
    print(i + 1, logreg_score[i])

# NOTE: despite its name, y_test_pred holds predictions for the *training*
# rows of the last fold; the held-out test set is not touched here.
y_test_pred = logreg.predict(X_train)
# training accuracy of the last fold, then the mean validation accuracy
print(logreg.score(X_train, Y_train))
print(np.mean(logreg_score))

We will simplify the problem by excluding drawn results for now.

In [None]:
# Remove drawn matches (encoded as result == 0).
# The mask has to come from df_train itself: the raw `df` still holds the
# original string labels and its rows are in a different order, so comparing
# `df.result` with 0 never selects any row.
# The index is reset so that row positions and index labels agree again, which
# the positional fold indices used in cross-validation rely on.
df_train = df_train[df_train["result"] != 0].reset_index(drop=True)
# refresh the target so it lines up row-for-row with the filtered features
Y = df_train["result"]

In [None]:
# preview the first rows of the training frame after the draw-removal step
df_train.head()

In [None]:
# use every candidate feature collected in col_list
# (a smaller alternative is ["rank_diff", "toss", "rating_diff"])
feature_cols = col_list

X_init = df_train[feature_cols]
# re-fit the standardiser on the full feature selection
scaler = preprocessing.StandardScaler()
X_scaled = scaler.fit_transform(X_init)

In [None]:
# one validation accuracy per fold; sized from the splitter so the array
# cannot fall out of step with the KFold configuration
n_folds = kf.get_n_splits()
logreg_score = np.zeros(n_folds)

for i, (train_index, val_index) in enumerate(kf.split(X_init)):

    X_train, X_val = X_scaled[train_index], X_scaled[val_index]
    # the fold indices are row positions, so select the targets positionally;
    # label-based lookup would pick the wrong rows whenever the index is not
    # a clean 0..n-1 range
    Y_train, Y_val = Y.iloc[train_index], Y.iloc[val_index]

    logreg.fit(X_train, Y_train)

    y_pred = logreg.predict(X_val)

    logreg_score[i] = logreg.score(X_val, Y_val)
    # fold number, validation accuracy, training accuracy
    print(i + 1, logreg_score[i], logreg.score(X_train, Y_train))

# NOTE: despite its name, y_test_pred holds predictions for the *training*
# rows of the last fold; the held-out test set is not touched here.
y_test_pred = logreg.predict(X_train)
print(np.mean(logreg_score))