In [1]:
import torch
import torch.nn.functional as F
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import numpy as np
import pandas as pd
from scipy.stats import zscore

In [3]:
# Leagues to load; one CSV per league is expected under data/final_dfs/.
country_list = ['england', 'germany', 'france', 'italy', 'spain']

# Columns discarded right after reading each file (the team1_*/team2_*
# variants that are selected further down are kept).
_dropped_columns = ['shot', 'corner', 'freeKickShots', 'redCard', 'yellowCard', 'offside']

dfs = []
for country in country_list:
    csv_path = f'data/final_dfs/{country}.csv'
    df = pd.read_csv(csv_path)
    df = df.drop(columns=_dropped_columns)
    dfs.append(df)

# Stack the per-league frames into a single frame with a fresh 0..n-1 index.
game_events = pd.concat(dfs, ignore_index=True)

In [4]:
# Sanity check: (n_rows, n_columns) of the concatenated frame.
game_events.shape

(1826, 23)

In [5]:
# Label first ('team1_win'), then the per-team features used by the model.
# Downstream code relies on this order: column 0 is read as the target and
# columns 1: as the inputs.
cols = ['team1_win', 'team1_height', 'team2_height', 'team1_shot', 'team1_corner', 'team1_freeKickShots', 'team1_redCard', 'team1_yellowCard', 'team1_offside', 'team2_shot', 'team2_corner', 'team2_freeKickShots', 'team2_redCard', 'team2_yellowCard', 'team2_offside']

# Take an explicit copy: df_dataset is modified in later cells, and writing
# into a selection of game_events triggers SettingWithCopyWarning (and must
# never leak changes back into the raw frame).
df_dataset = game_events[cols].copy()

In [6]:
# Convert the label to 0/1 integers, then store every column as float.
# A new frame is built with .assign() instead of the chained
# `df.loc[:]['col'] = ...` assignment, which writes to a temporary object and
# is silently ignored when pandas Copy-on-Write is active.
# Series.map(int) applies the same per-element int() conversion as before,
# without a row-wise DataFrame.apply.
df_dataset = df_dataset.assign(team1_win=df_dataset['team1_win'].map(int)).astype('float')

In [7]:
# Standardize every feature column with scipy's zscore; the label column is
# excluded so it stays 0/1.
# NOTE(review): the statistics are computed over all rows, i.e. before the
# train/test split performed later, so the test rows influence the scaling.
exclude = set(['team1_win'])

# Iterate the columns in frame order (not an unordered set difference) and
# build the result with .assign(): the previous `df.loc[:][key] = ...` form is
# a chained assignment that pandas may apply to a temporary copy only.
feature_cols = [key for key in df_dataset.columns if key not in exclude]
df_dataset = df_dataset.assign(
    **{key: zscore(df_dataset[key].astype(float)) for key in feature_cols}
)

In [8]:
# Inspect the prepared frame: label in 'team1_win', remaining columns are the model inputs.
df_dataset

Unnamed: 0,team1_win,team1_height,team2_height,team1_shot,team1_corner,team1_freeKickShots,team1_redCard,team1_yellowCard,team1_offside,team2_shot,team2_corner,team2_freeKickShots,team2_redCard,team2_yellowCard,team2_offside
0,0.0,0.391446,-0.830067,-0.664774,0.327491,-0.756350,-0.301959,-1.350896,0.418685,0.976781,1.150234,-0.696247,-0.311344,-1.484954,-0.615003
1,1.0,0.812152,1.833659,-0.261090,-0.866120,-0.756350,-0.301959,0.148640,-1.248142,-1.149085,-0.497082,0.681160,-0.311344,0.698826,-0.615003
2,0.0,0.017485,-0.185618,1.151803,0.625894,-0.756350,-0.301959,-0.601128,-1.248142,-0.440463,-0.826545,0.681160,-0.311344,-1.484954,-0.021767
3,1.0,-0.496712,0.845502,1.757329,0.029089,-0.756350,-0.301959,-1.350896,-0.692533,-1.857707,-1.156008,-0.696247,-0.311344,-1.484954,-0.615003
4,1.0,0.110975,0.587722,-1.068458,-0.567717,-0.756350,-0.301959,1.648175,1.529903,-1.149085,0.161845,0.681160,-0.311344,-1.484954,-0.021767
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1821,0.0,-0.169496,-1.259701,-0.462932,0.029089,-0.756350,3.101607,0.148640,1.529903,-0.204256,-0.167618,-0.696247,-0.311344,1.426752,-1.208240
1822,0.0,-0.403221,-0.013764,-0.261090,0.029089,0.482300,-0.301959,0.148640,-1.248142,-0.440463,-1.156008,-0.696247,3.052642,1.426752,-0.021767
1823,0.0,-1.525104,-0.271544,0.546277,0.029089,0.482300,-0.301959,0.898407,0.418685,0.031951,-0.497082,0.681160,-0.311344,-0.757028,-0.615003
1824,1.0,-0.262986,-2.376747,1.353645,-0.866120,1.720951,-0.301959,0.898407,-0.136924,-1.621500,-1.156008,0.681160,3.052642,0.698826,-0.615003


In [9]:
# Plain NumPy view of the frame; column order follows df_dataset
# (later cells read column 0 as the target and columns 1: as the features).
dataset = df_dataset.to_numpy()

In [10]:
# Sanity check: (n_rows, n_columns) of the array handed to the data loaders.
dataset.shape

(1826, 15)

In [61]:
# Column 0 is the label, so the feature count is taken from the remaining columns.
n_games, n_features = dataset[:, 1:].shape

# Hold out 25% of the games for evaluation; the sizes are computed once so the
# two parts are guaranteed to add up to n_games.
n_test = int(0.25 * n_games)
n_train = n_games - n_test

# Use a dedicated, seeded generator so the same games land in the test set on
# every run (otherwise accuracies from different runs are not comparable).
split_generator = torch.Generator().manual_seed(0)
trainset, testset = torch.utils.data.random_split(
    dataset, [n_train, n_test], generator=split_generator
)

train_loader = torch.utils.data.DataLoader(trainset, batch_size=128, shuffle=True)
# Evaluation does not depend on sample order, so the test loader is not shuffled.
test_loader = torch.utils.data.DataLoader(testset, batch_size=128, shuffle=False)

In [88]:
# --- Optimisation settings -------------------------------------------------
LEARNING_RATE = 1e-4       # step size handed to the optimizer
N_EPOCHS = 50              # number of passes over the training loader

# Batch sizes. NOTE(review): the data loaders visible in this notebook
# hard-code batch_size=128 and do not read these two values.
BATCH_SIZE_TRAIN = 64
BATCH_SIZE_TEST = 1_000

# --- Logging switches ------------------------------------------------------
LOG_INTERVAL = 10
LOG_TRAINING = False

# --- Backend / reproducibility ---------------------------------------------
torch.backends.cudnn.enabled = False
# Seeding is currently disabled; re-enable both lines for repeatable runs.
# random_seed = 1
# torch.manual_seed(random_seed)

In [89]:
# Widths of the three hidden layers of the fully connected classifier.
n_hidden_1 = 128  # first hidden layer
n_hidden_2 = 128  # second hidden layer
n_hidden_3 = 64   # third hidden layer

# Build the Linear -> ReLU stack from consecutive width pairs, then append
# the single-unit output layer. The final Sigmoid maps the output into (0, 1)
# so it can be read as the probability of the positive label.
_layer_widths = [n_features, n_hidden_1, n_hidden_2, n_hidden_3]
_hidden_stack = []
for _fan_in, _fan_out in zip(_layer_widths[:-1], _layer_widths[1:]):
    _hidden_stack.append(nn.Linear(_fan_in, _fan_out))
    _hidden_stack.append(nn.ReLU())

model = nn.Sequential(
    *_hidden_stack,
    nn.Linear(n_hidden_3, 1),
    nn.Sigmoid(),
)

In [90]:
# Binary cross-entropy on the sigmoid output of the network
# (nn.MSELoss() was the alternative tried earlier).
criterion = nn.BCELoss()

# Adam over all trainable parameters of the network.
optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [91]:
def eval(dataset):
    """Print and return the accuracy of the global ``model`` on ``dataset``.

    Each item of ``dataset`` is a row whose first entry is the 0/1 target and
    whose remaining entries are the input features. The whole dataset is
    scored in a single batch; a prediction is the sigmoid output rounded to
    0 or 1.

    The model is switched to evaluation mode for the forward pass and is then
    put back into whatever mode (train/eval) it was in before the call.

    Note: this function shadows the built-in ``eval`` inside the notebook.

    Args:
        dataset: indexable, sized collection of rows (e.g. a ``Subset`` or a
            2-D array) laid out as ``[target, feature_1, ..., feature_n]``.

    Returns:
        float: fraction of rows classified correctly.

    Raises:
        ValueError: if ``dataset`` is empty.
    """
    n_samples = len(dataset)
    if n_samples == 0:
        raise ValueError('cannot evaluate on an empty dataset')

    # One batch holding every row; order is irrelevant for accuracy.
    eval_loader = torch.utils.data.DataLoader(dataset, batch_size=n_samples, shuffle=False)
    batch = next(iter(eval_loader))
    targets = batch[:, 0].float()
    data = batch[:, 1:].float()

    # Remember the current mode so the caller's state is restored afterwards,
    # even if the forward pass raises.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            output = model(data)
        pred = torch.round(output.view(-1))
        correct = pred.eq(targets.view_as(pred)).sum().item()
    finally:
        model.train(was_training)

    accuracy = correct / n_samples
    print('Test Set Accuracy {}/{} ({})'.format(correct, n_samples, round(accuracy, 5)))
    return accuracy

def train():
    """Train the global ``model`` for ``N_EPOCHS`` epochs.

    Every batch from ``train_loader`` is a 2-D tensor whose first column is
    the 0/1 target and whose remaining columns are the features. For each
    batch the loss given by ``criterion`` is back-propagated and ``optimizer``
    takes one step. After every epoch the model is scored on ``testset`` via
    ``eval``.

    When ``LOG_TRAINING`` is true, the running mean loss of the current epoch
    is printed every ``LOG_INTERVAL`` batches; with the default
    ``LOG_TRAINING = False`` the printed output is unchanged.
    """
    for ep in range(1, N_EPOCHS + 1):
        model.train()
        print(f' -- Epoch {ep} --')
        loss_sum = 0.0

        for batch_idx, batch in enumerate(train_loader, start=1):
            targets = batch[:, 0].float()
            data = batch[:, 1:].float()

            optimizer.zero_grad()
            # Call the module itself (not .forward) so registered hooks run.
            pred = model(data)
            loss = criterion(pred.view(-1), targets)
            loss.backward()
            optimizer.step()

            loss_sum += loss.item()
            if LOG_TRAINING and batch_idx % LOG_INTERVAL == 0:
                print(f'    batch {batch_idx}: mean loss {loss_sum / batch_idx:.5f}')

        eval(testset)
        

In [92]:
# Baseline: score the freshly initialised (untrained) network on the held-out split.
print('BEFORE TRAINING')
eval(testset)

# Run the full training loop; it reports held-out accuracy after every epoch.
print('train...')
train()

BEFORE TRAINING
Test Set Accuracy 251/456 (0.55044)
train...
 -- Epoch 1 --
Test Set Accuracy 259/456 (0.56798)
 -- Epoch 2 --
Test Set Accuracy 260/456 (0.57018)
 -- Epoch 3 --
Test Set Accuracy 260/456 (0.57018)
 -- Epoch 4 --
Test Set Accuracy 260/456 (0.57018)
 -- Epoch 5 --
Test Set Accuracy 260/456 (0.57018)
 -- Epoch 6 --
Test Set Accuracy 260/456 (0.57018)
 -- Epoch 7 --
Test Set Accuracy 261/456 (0.57237)
 -- Epoch 8 --
Test Set Accuracy 265/456 (0.58114)
 -- Epoch 9 --
Test Set Accuracy 270/456 (0.59211)
 -- Epoch 10 --
Test Set Accuracy 273/456 (0.59868)
 -- Epoch 11 --
Test Set Accuracy 283/456 (0.62061)
 -- Epoch 12 --
Test Set Accuracy 289/456 (0.63377)
 -- Epoch 13 --
Test Set Accuracy 293/456 (0.64254)
 -- Epoch 14 --
Test Set Accuracy 300/456 (0.65789)
 -- Epoch 15 --
Test Set Accuracy 308/456 (0.67544)
 -- Epoch 16 --
Test Set Accuracy 306/456 (0.67105)
 -- Epoch 17 --
Test Set Accuracy 302/456 (0.66228)
 -- Epoch 18 --
Test Set Accuracy 304/456 (0.66667)
 -- Epoch 19