In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
% cd drive/MyDrive/sp/data
! ls

FakeData_EPL.csv   KaggleDataset_withBO.txt  PL_scraped_ord.csv
KaggleDataset.csv  old_FakeData_EPL.csv


In [161]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Dataset

In [34]:
dataset = pd.read_csv('KaggleDataset_withBO.txt')
dataset.tail(3)

Unnamed: 0,match_id,country,league,season,week,date,home_team,away_team,home_goal,away_goal,result,home_lineup,away_lineup,B365H,B365D,B365A,BWH,BWD,BWA,IWH,IWD,IWA,LBH,LBD,LBA
19524,24495,Spain,Spain LIGA BBVA,2015/2016,38,2016-05-15 00:00:00,Málaga CF,UD Las Palmas,4,1,win,Guillermo Ochoa - Miguel Torres - Raul Albento...,Raul Lizoain - David Garcia - Pedro Bigas - Pa...,1.8,3.75,4.5,1.83,3.7,4.0,1.85,3.45,4.0,1.8,3.6,4.33
19525,24496,Spain,Spain LIGA BBVA,2015/2016,38,2016-05-14 00:00:00,Atlético Madrid,RC Celta de Vigo,2,0,win,Jan Oblak - Juanfran - Stefan Savic - Diego Go...,Sergio Alvarez - Johny - Hugo Mallo - Sergi Go...,1.75,3.75,4.5,1.83,3.6,4.1,1.85,3.7,3.7,1.83,3.6,4.2
19526,24497,Spain,Spain LIGA BBVA,2015/2016,38,2016-05-15 00:00:00,Rayo Vallecano,Levante UD,3,1,win,Yoel Rodriguez - Quini - Antonio Amaya - Tito ...,Diego Marino - Ivan Lopez - David Navarro - Ca...,1.33,5.25,9.0,1.33,4.75,9.25,1.4,5.0,6.0,1.33,5.25,9.0


# Dataset Transform

In [74]:
teams = np.unique(dataset[['away_team', 'home_team']].values)
team_lblenc = LabelEncoder()
team_lblenc.fit(teams)

players = np.unique(
    np.concatenate(
      (
        np.stack(dataset['home_lineup'].apply(lambda lineup: lineup.split(' - '))).reshape(-1),
        np.stack(dataset['away_lineup'].apply(lambda lineup: lineup.split(' - '))).reshape(-1)
      )    
    )
)
player_lblenc = LabelEncoder()
player_lblenc.fit(players)

results = pd.unique(dataset['result'])
result_lblenc = LabelEncoder()
result_lblenc.fit(results)

LabelEncoder()

In [108]:
home_team_labels = team_lblenc.transform(dataset['home_team'])
away_team_labels = team_lblenc.transform(dataset['away_team'])

home_player_labels = player_lblenc.transform(
     np.stack(dataset['home_lineup'].apply(lambda lineup: lineup.split(' - '))).reshape(-1)
).reshape(-1, 11)
away_player_labels = player_lblenc.transform(
     np.stack(dataset['away_lineup'].apply(lambda lineup: lineup.split(' - '))).reshape(-1)
).reshape(-1, 11)

result_labels = result_lblenc.transform(dataset['result'])



In [146]:
str_home_players = np.char.mod('%d', home_player_labels)
str_home_player_labels = np.apply_along_axis(lambda arr: ' - '.join(arr), axis=1, arr=str_home_players)

str_away_players = np.char.mod('%d', away_player_labels)
str_away_player_labels = np.apply_along_axis(lambda arr: ' - '.join(arr), axis=1, arr=str_away_players)

In [155]:
meta_dict = {
        'home_team_label': home_team_labels,
        'away_team_label': away_team_labels,
        'result_label': result_labels,
        'home_lineup_label': str_home_player_labels,
        'away_lineup_label': str_away_player_labels,
}

for column in dataset.columns:
  meta_dict.update({column: dataset[column].values})

transformed_dataset = pd.DataFrame(
    meta_dict
)

# Dataset Split

In [None]:
train_groups = []
dev_groups = []
test_groups = []
for league, data in transformed_dataset.groupby('league'):
  train_set, dev_test = train_test_split(data, shuffle=False, stratify=None, train_size=0.84)
  dev_set, test_set = train_test_split(dev_test, shuffle=False, stratify=None, train_size=0.5)

  train_groups.append(train_set)
  dev_groups.append(dev_set)
  test_groups.append(test_set)

# Team Blade Chest Modeling

In [210]:
import torch
from torch.nn import Embedding,\
                     Module,\
                     Linear,\
                     Dropout,\
                     Tanh,\
                     BatchNorm1d,\
                     LogSoftmax,\
                     NLLLoss

In [361]:
#@title Model
class TeamBladeChest(Module):
  def __init__(self, num_teams, embedding_size, hidden_size, dropout=0.5):
    super(TeamBladeChest, self).__init__()
    self.num_teams = num_teams
    self.embedding_size = embedding_size
    self.hidden_size = hidden_size
    self.dropout=dropout

    self.team_embedder = Embedding(self.num_teams, self.embedding_size)
    self.emb_bn = BatchNorm1d(self.embedding_size)

    self.chest_transform = Linear(self.embedding_size, self.hidden_size, bias=False)
    self.chest_bn = BatchNorm1d(self.hidden_size)

    self.blade_transform = Linear(self.embedding_size, self.hidden_size, bias=False)
    self.blade_bn = BatchNorm1d(self.hidden_size)

    self.regularizer = Dropout(p=self.dropout)
    self.activation = Tanh()

    self.result_transform = Linear(1, 3)
    self.classifier = LogSoftmax(dim=-1)

  def _encode_team(self, team):
    embedding = self.team_embedder(team)
    embedding = self.emb_bn(embedding)

    blade = self.blade_transform(embedding)
    blade = self.blade_bn(blade)
    blade = self.activation(blade)
    blade = self.regularizer(blade)

    chest = self.chest_transform(embedding)
    chest = self.chest_bn(chest)
    chest = self.activation(chest)
    chest = self.regularizer(chest)

    return blade, chest

  def _matchup(self, home_blade, home_chest, away_blade, away_chest):
    return (home_blade * away_chest).sum(-1) - (away_blade * home_chest).sum(-1)

  def forward(self, home, away):
    home_blade, home_chest = self._encode_team(home)
    away_blade, away_chest = self._encode_team(away)

    matchup_score = self._matchup(home_blade, home_chest, away_blade, away_chest).reshape(-1, 1)

    result = self.result_transform(matchup_score)
    result = self.classifier(result)
    result = self.regularizer(result)

    return self.classifier(result)

In [362]:
#@title home, away, y
home = torch.from_numpy(transformed_dataset.loc[:, ['home_team_label']].values.reshape(-1))
away = torch.from_numpy(transformed_dataset.loc[:, ['away_team_label']].values.reshape(-1))
y = torch.from_numpy(transformed_dataset.loc[:, ['result_label']].values.reshape(-1))

In [368]:
#@title Hyperparameters
assert home.max() == away.max()
num_teams = home.max() + 1
embedding_size = 8
hidden_size = 24
tbc_model = TeamBladeChest(
    num_teams=num_teams,
    embedding_size=embedding_size,
    hidden_size=hidden_size,
    dropout=0.5
)

optimizer = torch.optim.Adam(tbc_model.parameters())
criterion = NLLLoss()
tbc_model

TeamBladeChest(
  (team_embedder): Embedding(254, 8)
  (emb_bn): BatchNorm1d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (chest_transform): Linear(in_features=8, out_features=24, bias=False)
  (chest_bn): BatchNorm1d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (blade_transform): Linear(in_features=8, out_features=24, bias=False)
  (blade_bn): BatchNorm1d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (regularizer): Dropout(p=0.5, inplace=False)
  (activation): Tanh()
  (result_transform): Linear(in_features=1, out_features=3, bias=True)
  (classifier): LogSoftmax(dim=-1)
)

In [342]:
#@title Algorithms
def train(model, optimizer, home, away, y, loss_fn):
  model.train()
  optimizer.zero_grad()

  output = model(home, away) # forward prop
  loss = loss_fn(output, y) # loss
  loss.backward() # backprop
  optimizer.step() # parameter updating

  return loss.item()


@torch.no_grad()
def evaluate(model, home, away, y):
  model.eval()

  output = model(home, away)
  prediction = output.argmax(-1)

  correct = (prediction == y).sum()

  return correct.item(), y.shape[0]


def fit(model, train_set, dev_set, optimizer, loss_fn, num_epochs):
  assert train_set[0].shape[-1] == train_set[1].shape[-1] == train_set[2].shape[0]
  assert dev_set[0].shape[-1] == dev_set[1].shape[-1] == dev_set[2].shape[0]

  train_home, train_away, train_y = train_set
  dev_home, dev_away, dev_y = dev_set

  print('Initial State')
  train_correct, train_all = evaluate(model, train_home.reshape(-1), train_away.reshape(-1), train_y.reshape(-1))
  dev_correct, dev_all = evaluate(model, dev_home.reshape(-1), dev_away.reshape(-1), dev_y.reshape(-1))
  print(f'Train Acc%: {train_correct / train_all * 100:.4f}')
  print(f'Dev   Acc%: {dev_correct / dev_all * 100:.4f}')

  for epoch in range(1, num_epochs + 1):
    print('-' * 60)
    print(f'Epoch {epoch}')
    epoch_loss = 0
    for home_batch, away_batch, y_batch in zip(train_home, train_away, train_y):
      batch_loss = train(model, optimizer, train_home, train_away, train_y, loss_fn)
      epoch_loss += batch_loss
    train_correct, train_all = evaluate(model, train_home.reshape(-1), train_away.reshape(-1), train_y.reshape(-1))
    dev_correct, dev_all = evaluate(model, dev_home.reshape(-1), dev_away.reshape(-1), dev_y.reshape(-1))
    
    print(f'Train Loss: {epoch_loss:.4f}')
    print(f'Train Acc%: {train_correct / train_all * 100:.4f}')
    print(f'Dev   Acc%: {dev_correct / dev_all * 100:.4f}')

In [369]:
#@title Train/Dev/Test
n_epochs = 5
for train_set, dev_set, test_set in zip(train_groups, dev_groups, test_groups):
  home_train = torch.from_numpy(train_set.loc[:, ['home_team_label']].values.reshape(-1))
  away_train = torch.from_numpy(train_set.loc[:, ['away_team_label']].values.reshape(-1))
  y_train = torch.from_numpy(train_set.loc[:, ['result_label']].values.reshape(-1))

  home_dev = torch.from_numpy(dev_set.loc[:, ['home_team_label']].values.reshape(-1))
  away_dev = torch.from_numpy(dev_set.loc[:, ['away_team_label']].values.reshape(-1))
  y_dev = torch.from_numpy(dev_set.loc[:, ['result_label']].values.reshape(-1))

  home_test = torch.from_numpy(test_set.loc[:, ['home_team_label']].values.reshape(-1))
  away_test = torch.from_numpy(test_set.loc[:, ['away_team_label']].values.reshape(-1))
  y_test = torch.from_numpy(test_set.loc[:, ['result_label']].values.reshape(-1))

  print(f'Fitting on the {np.unique(train_set["league"].values).item()} for {n_epochs} epochs')
  print('.' * 60)
  fit(
    tbc_model, 
    (home_train, away_train, y_train),
    (home_dev, away_dev, y_dev),
    optimizer,
    criterion,
    n_epochs
  )

  test_correct, test_all = evaluate(tbc_model, home_test.reshape(-1), away_test.reshape(-1), y_test.reshape(-1))
  print(f'Test Acc%: {test_correct / test_all * 100:.4f}')
  print('=' * 60)


Fitting on the Belgium Jupiler League for 5 epochs
............................................................
Initial State
Train Acc%: 26.4113
Dev   Acc%: 22.3404
------------------------------------------------------------
Epoch 1
Train Loss: 1830.1428
Train Acc%: 54.1331
Dev   Acc%: 44.6809
------------------------------------------------------------
Epoch 2
Train Loss: 1367.9903
Train Acc%: 54.9395
Dev   Acc%: 48.9362
------------------------------------------------------------
Epoch 3
Train Loss: 1348.5321
Train Acc%: 56.5524
Dev   Acc%: 50.0000
------------------------------------------------------------
Epoch 4
Train Loss: 1341.5893
Train Acc%: 57.3589
Dev   Acc%: 48.9362
------------------------------------------------------------
Epoch 5
Train Loss: 1335.7370
Train Acc%: 57.5605
Dev   Acc%: 47.8723
Test Acc%: 48.4211
Fitting on the England Premier League for 5 epochs
............................................................
Initial State
Train Acc%: 42.2813
Dev   Acc%: 44