In [24]:
import torch
import torch.nn.functional as F
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import numpy as np
import pandas as pd
from scipy.stats import zscore

In [25]:
# Countries whose per-game event tables are combined below.
country_list = ['england', 'germany', 'france', 'italy', 'spain']

# Read one "<country>_adv.csv" frame per country.
# (An earlier variant read "<country>.csv" and dropped the aggregate
# shot/corner/freeKickShots/redCard/yellowCard/offside columns.)
dfs = [pd.read_csv(f'data/final_dfs/{country}_adv.csv') for country in country_list]

# Stack all frames row-wise and renumber the index from 0.
game_events = pd.concat(dfs, ignore_index=True)

In [26]:
# Quick sanity check of the combined table: (rows, columns) and the column names.
print(game_events.shape)
print(list(game_events.columns))

(1826, 441)
['game_id', 'team1_id', 'team1_name', 'team2_id', 'team2_name', 'team1_goals', 'team2_goals', 'team1_win', 'team2_win', 'team1_shot_5', 'team1_shot_10', 'team1_shot_15', 'team1_shot_20', 'team1_shot_25', 'team1_shot_30', 'team1_shot_35', 'team1_shot_40', 'team1_shot_45', 'team1_shot_45+', 'team1_shot_50', 'team1_shot_55', 'team1_shot_60', 'team1_shot_65', 'team1_shot_70', 'team1_shot_75', 'team1_shot_80', 'team1_shot_85', 'team1_shot_90', 'team1_shot_90+', 'team1_corner_5', 'team1_corner_10', 'team1_corner_15', 'team1_corner_20', 'team1_corner_25', 'team1_corner_30', 'team1_corner_35', 'team1_corner_40', 'team1_corner_45', 'team1_corner_45+', 'team1_corner_50', 'team1_corner_55', 'team1_corner_60', 'team1_corner_65', 'team1_corner_70', 'team1_corner_75', 'team1_corner_80', 'team1_corner_85', 'team1_corner_90', 'team1_corner_90+', 'team1_freeKickShots_5', 'team1_freeKickShots_10', 'team1_freeKickShots_15', 'team1_freeKickShots_20', 'team1_freeKickShots_25', 'team1_freeKickSh

In [27]:
# Spot-check a single column for missing values.
game_events['team2_redCard_20'].isna().any()

False

In [28]:
# Replace every missing value in the table with 0.
game_events = game_events.fillna(0)

In [29]:
# cols = ['team1_win', 'team1_height', 'team2_height', 'team1_shot', 'team1_corner', 'team1_freeKickShots', 'team1_redCard', 'team1_yellowCard', 'team1_offside', 'team2_shot', 'team2_corner', 'team2_freeKickShots', 'team2_redCard', 'team2_yellowCard', 'team2_offside']
# df_dataset = game_events[cols]

# Remove the id/name columns, the goal counts and team2_win; what remains is
# team1_win (used as the target downstream) followed by the feature columns.
drop_cols = ['game_id', 'team1_id', 'team1_name', 'team2_id', 'team2_name', 'team1_goals', 'team2_goals', 'team2_win']
df_dataset = game_events.drop(columns=drop_cols)

In [30]:
# Preview one feature column of the modelling frame.
df_dataset['team2_redCard_20']

0       0
1       0
2       0
3       0
4       0
       ..
1821    0
1822    0
1823    0
1824    0
1825    0
Name: team2_redCard_20, Length: 1826, dtype: int64

In [31]:
# Turn the label into 0/1 integers, then cast the whole frame to float.
#
# The previous version assigned through `df_dataset.loc[:]['team1_win'] = ...`.
# That is a chained assignment: it writes into the temporary object returned by
# `.loc[:]`, so the update is not guaranteed to reach `df_dataset`. It also ran
# a Python lambda over every row. `assign` + `astype` does the same conversion
# explicitly and vectorised.
df_dataset = df_dataset.assign(team1_win=df_dataset['team1_win'].astype(int)).astype('float')

In [32]:
# Z-score every feature column; the label column is left as is.
#
# The previous version wrote `df_dataset.loc[:][key] = zscore(...)`. That is a
# chained assignment into the temporary object returned by `.loc[:]`, so the
# scaled values are not guaranteed to land in `df_dataset` (under pandas
# Copy-on-Write they never do). Assign to the column directly instead.
#
# NOTE(review): mean/std are computed over all rows, i.e. before the
# train/test split performed in a later cell — confirm this is intended.
exclude = set(['team1_win'])
for key in [col for col in df_dataset.columns if col not in exclude]:
    values = df_dataset[key].to_numpy(dtype=float)
    # Constant columns have zero spread; skip them to avoid 0/0 -> NaN.
    if values.std() != 0:
        df_dataset[key] = zscore(values)

In [33]:
# List the columns that contain at least one NaN (empty list means none).
df_dataset.columns[df_dataset.isna().any()].tolist()

[]

In [34]:
# Convert the frame to a plain NumPy array (rows = games, columns as in df_dataset).
dataset = df_dataset.to_numpy()

In [35]:
# (rows, columns) of the array fed to the split below.
dataset.shape

(1826, 433)

In [36]:
# Column 0 holds the label, so the feature count is taken from the remaining columns.
n_games, n_features = dataset[:, 1:].shape

# Hold out 25% of the games (rounded down) for testing; the rest is for training.
n_test = int(0.25 * n_games)
n_train = n_games - n_test
trainset, testset = torch.utils.data.random_split(dataset, [n_train, n_test])

# Both loaders use the same batch size and reshuffle on every pass.
loader_kwargs = dict(batch_size=128, shuffle=True)
train_loader = torch.utils.data.DataLoader(trainset, **loader_kwargs)
test_loader = torch.utils.data.DataLoader(testset, **loader_kwargs)

In [37]:
# Total number of games (rows) before the train/test split.
n_games

1826

In [87]:
# Training configuration.
# N_EPOCHS: number of passes over train_loader made by train().
N_EPOCHS = 50
# NOTE(review): the loaders created earlier hard-code batch_size=128, and eval()
# uses one batch of len(dataset); the two batch-size constants below are not
# referenced in the visible cells — confirm whether they are still needed.
BATCH_SIZE_TRAIN = 32
BATCH_SIZE_TEST = 1000
# LEARNING_RATE: passed as `lr` to the Adam optimizer.
LEARNING_RATE = 0.00001

# NOTE(review): neither logging constant is read by train() in the visible cells.
LOG_TRAINING = False
LOG_INTERVAL = 10

# Seeding is currently disabled, so runs are not reproducible.
# random_seed = 1
torch.backends.cudnn.enabled = False
# torch.manual_seed(random_seed)

In [101]:
# Hidden-layer widths, ordered from the input side to the output side.
n_hidden_1 = 1024  # 1st hidden layer
n_hidden_2 = 512   # 2nd hidden layer
n_hidden_3 = 128   # 3rd hidden layer
n_hidden_4 = 64    # 4th hidden layer

# Fully connected binary classifier: four Tanh hidden layers (dropout after the
# 2nd and 3rd activations) and a single sigmoid output unit.
model = nn.Sequential(
    nn.Linear(n_features, n_hidden_1), nn.Tanh(),
    nn.Linear(n_hidden_1, n_hidden_2), nn.Tanh(), nn.Dropout(0.2),
    nn.Linear(n_hidden_2, n_hidden_3), nn.Tanh(), nn.Dropout(0.2),
    nn.Linear(n_hidden_3, n_hidden_4), nn.Tanh(),
    nn.Linear(n_hidden_4, 1), nn.Sigmoid(),
)

In [102]:
# Display the layer structure of the network.
model

Sequential(
  (0): Linear(in_features=432, out_features=1024, bias=True)
  (1): Tanh()
  (2): Linear(in_features=1024, out_features=512, bias=True)
  (3): Tanh()
  (4): Linear(in_features=512, out_features=128, bias=True)
  (5): Tanh()
  (6): Linear(in_features=128, out_features=64, bias=True)
  (7): Tanh()
  (8): Linear(in_features=64, out_features=1, bias=True)
  (9): Sigmoid()
)

In [103]:
# Adam over all model parameters with the configured learning rate.
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
# criterion = nn.MSELoss()
# Binary cross-entropy, applied to the network's sigmoid output in train().
criterion = nn.BCELoss()

In [104]:
def eval(dataset, verbose=False):
    """Print and return the accuracy of the global ``model`` on ``dataset``.

    The whole dataset is evaluated as a single, unshuffled batch. The model is
    switched to eval mode for the forward pass and then put back into whatever
    mode it was in before the call.

    NOTE: this name shadows the built-in ``eval``; it is kept because other
    cells call the function under this name.

    Args:
        dataset: indexable collection of numeric rows. Element 0 of each row is
            the 0/1 target, the remaining elements are the features.
        verbose: when True, additionally print predicted vs. true class counts.

    Returns:
        float: fraction of rows whose rounded model output equals the target.
    """
    n_samples = len(dataset)

    # One batch containing every row, in the dataset's own order.
    eval_loader = torch.utils.data.DataLoader(dataset, batch_size=n_samples, shuffle=False)
    batch = next(iter(eval_loader))
    targets = batch[:, 0].float()
    data = batch[:, 1:].float()

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            # Call the module itself (not .forward) so registered hooks run.
            output = model(data)
    finally:
        # Restore the caller's mode instead of unconditionally forcing train mode.
        model.train(was_training)

    # Threshold the sigmoid output into hard 0/1 predictions.
    pred = torch.round(output.view(-1))
    correct = pred.eq(targets.view_as(pred)).sum().item()
    accuracy = correct / n_samples

    print('Test Set Accuracy {}/{} ({})'.format(correct, n_samples, round(accuracy, 5)))

    if verbose:
        pred_team1_win = int((pred == 1.).sum().item())
        pred_team2_win = int((pred == 0.).sum().item())
        truth_team1_win = int((targets == 1.).sum().item())
        truth_team2_win = int((targets == 0.).sum().item())
        print('Team 1 Win: P: {}, T: {} -- Team 2 Win: P: {}, T: {}'.format(
            pred_team1_win, truth_team1_win, pred_team2_win, truth_team2_win))

    return accuracy

def train():
    """Train the global ``model`` for ``N_EPOCHS`` epochs.

    Each epoch makes one pass over the global ``train_loader`` and then reports
    accuracy on the global ``testset`` via ``eval``. For every batch, column 0
    is used as the target and the remaining columns as the features. Uses the
    global ``optimizer`` and ``criterion``. Returns nothing.
    """
    for ep in range(1, N_EPOCHS + 1):
        # Make sure dropout is active for the training pass.
        model.train()
        print(f' -- Epoch {ep} --')

        for batch in train_loader:
            targets = batch[:, 0].float()
            data = batch[:, 1:].float()

            optimizer.zero_grad()
            # Call the module itself (not .forward) so registered hooks run.
            pred = model(data)
            # Flatten (batch, 1) -> (batch,) to match the target shape.
            loss = criterion(pred.view(-1), targets)
            loss.backward()
            optimizer.step()

        eval(testset)
        

In [105]:
# Baseline: accuracy of the untrained network on the held-out set.
print('BEFORE TRAINING')
eval(testset)

# Run the full training loop; accuracy on the held-out set is printed after each epoch.
print('train...')
train()

BEFORE TRAINING
Test Set Accuracy 235/456 (0.51535)
train...
 -- Epoch 1 --
Test Set Accuracy 264/456 (0.57895)
 -- Epoch 2 --
Test Set Accuracy 277/456 (0.60746)
 -- Epoch 3 --
Test Set Accuracy 293/456 (0.64254)
 -- Epoch 4 --
Test Set Accuracy 296/456 (0.64912)
 -- Epoch 5 --
Test Set Accuracy 298/456 (0.65351)
 -- Epoch 6 --
Test Set Accuracy 302/456 (0.66228)
 -- Epoch 7 --
Test Set Accuracy 302/456 (0.66228)
 -- Epoch 8 --
Test Set Accuracy 304/456 (0.66667)
 -- Epoch 9 --
Test Set Accuracy 307/456 (0.67325)
 -- Epoch 10 --
Test Set Accuracy 308/456 (0.67544)
 -- Epoch 11 --
Test Set Accuracy 310/456 (0.67982)
 -- Epoch 12 --
Test Set Accuracy 311/456 (0.68202)
 -- Epoch 13 --
Test Set Accuracy 310/456 (0.67982)
 -- Epoch 14 --
Test Set Accuracy 311/456 (0.68202)
 -- Epoch 15 --
Test Set Accuracy 312/456 (0.68421)
 -- Epoch 16 --
Test Set Accuracy 313/456 (0.6864)
 -- Epoch 17 --
Test Set Accuracy 313/456 (0.6864)
 -- Epoch 18 --
Test Set Accuracy 313/456 (0.6864)
 -- Epoch 19 --