In [1]:
from DataLoader import DataLoader
import GraphManager
from GNN import HeteroGNN
import numpy as np
import pandas as pd
import torch
import torch.optim
import torch_geometric
import torch_geometric.data
import networkx as nx
import matplotlib.pyplot as plt
import Utils
from tqdm import tqdm
import pickle
import torchinfo

In [2]:
# Resolve the shared lookups once; the enum name `HYPERPARAETERS` is spelled
# exactly as `Utils` exposes it.
device = Utils.GLOBALS.DEVICE.value
hparams = Utils.HYPERPARAETERS

# Dataset and graph builder.
# NOTE(review): 11 is presumably the number of players kept per lineup - confirm against DataLoader.
dl = DataLoader('data/KaggleDataset.csv', 11)
gm = GraphManager.GraphManager(dl, device)

# Model: the embedding table has one row per entity known to the data loader.
embedding_shape = [dl.entities.shape[0], hparams.EmbeddingDim.value]
model = HeteroGNN(
    embedding_dims=embedding_shape,
    conv_dims=hparams.ConvDims.value,
    fc_dims=hparams.FCDims.value,
    dropout=hparams.DropOuts.value,
).to(device)

# Loss and optimizer. NLLLoss expects log-probabilities as input.
criterion = torch.nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=hparams.LearningRate.value)

# True  -> load the per-league graphs from the Load folder.
# False -> build the graphs, store them in the Save folder, and use those.
already_saved = True

# True  -> resume from the model and metric lists stored in the Load folder.
# False -> start a new training run.
# ATTENTION: training results are written to the Save folder in both cases -
# copy any previous work out of it before starting.
continue_training = False

In [3]:
# Show a per-layer parameter count of the model, expanding nested modules up to depth 3.
torchinfo.summary(model, depth=3)

Layer (type:depth-idx)                   Param #
HeteroGNN                                --
├─Embedding: 1-1                         87,048
├─ModuleList: 1-2                        --
│    └─HeteroConv: 2-1                   --
│    │    └─ModuleDict: 3-1              1,360
│    └─HeteroConv: 2-2                   --
│    │    └─ModuleDict: 3-2              2,512
│    └─HeteroConv: 2-3                   --
│    │    └─ModuleDict: 3-3              2,512
├─ModuleList: 1-3                        --
│    └─Linear: 2-4                       528
│    └─Linear: 2-5                       136
│    └─Linear: 2-6                       27
├─LogSoftmax: 1-4                        --
Total params: 93,499
Trainable params: 93,499
Non-trainable params: 0

In [4]:
# Build and store one graph file per league; skipped when the graphs already exist.
if not already_saved:
    save_dir = Utils.GLOBALS.SavePath.value
    for league, league_df in dl.dataset.groupby('league'):
        target_file = f'{save_dir}{league}.gm'
        print(f'Making {league} Graphs...')
        gm.make(
            league_df,
            mode='CG',
            validation_portion=0.1,
            test_portion=0.05,
            saveto=target_file,
        )
        # Report how many graphs were produced and the size of each split.
        print(f'Graph List Length: {len(gm.graph_list)}')
        for split_name, split_mask in (
            ('Train', gm.train_mask),
            ('Validation', gm.validation_mask),
            ('Test', gm.test_mask),
        ):
            print(f'{split_name} Mask: {len(split_mask)}')
        print(f'Saved in: {target_file}')
        print('=' * 80)

In [5]:
def train_step(
    model: HeteroGNN,
    g: torch_geometric.data.HeteroData,
    criterion,
    optimizer: torch.optim.Optimizer
):
    """Run one optimisation step on a single graph.

    The model is switched to training mode, gradients are cleared, the loss
    of ``model(g)`` against ``g.y`` is back-propagated and the optimizer is
    stepped once.

    Returns:
        A tuple ``(loss, correct, total)``: the loss as a Python float, the
        number of argmax predictions equal to ``g.y`` (computed from the
        forward pass made before the parameter update), and the number of
        labels in ``g.y``.
    """
    model.train()
    optimizer.zero_grad()

    scores = model(g)
    targets = g.y

    step_loss = criterion(scores, targets)
    step_loss.backward()
    optimizer.step()

    # Accuracy bookkeeping reuses the scores from the forward pass above.
    n_hits = (scores.argmax(dim=-1) == targets).sum().item()
    n_samples = targets.shape[0]

    return step_loss.item(), n_hits, n_samples


@torch.no_grad()
def evaluation(model: HeteroGNN, g: torch_geometric.data.HeteroData):
    """Score the model on a single graph without tracking gradients.

    The model is put in eval mode for the forward pass and is afterwards
    returned to whatever mode it was in when the function was called (also
    when the forward pass raises), so evaluating a model that is already in
    eval mode no longer flips it back to training mode.

    Returns:
        A tuple ``(correct, total)``: the number of argmax predictions equal
        to ``g.y`` and the number of labels in ``g.y``.
    """
    was_training = model.training
    model.eval()
    try:
        out = model(g)
        pred = torch.argmax(out, dim=-1)
        correct = (pred == g.y).sum().item()
        total = g.y.shape[0]
    finally:
        # Restore the caller's mode instead of unconditionally forcing train mode.
        model.train(was_training)

    return correct, total


In [6]:
# Either resume from a stored checkpoint or start from freshly initialised weights.
if continue_training:
    # SECURITY: torch.load and pickle.load unpickle arbitrary objects - only
    # point LoadPath at files you produced yourself.
    # map_location lets a checkpoint saved on another device be restored on
    # the configured one.
    # NOTE(review): recent PyTorch releases default torch.load to
    # weights_only=True, which rejects a whole pickled module; pass
    # weights_only=False there if this load fails.
    model = torch.load(
        f'{Utils.GLOBALS.LoadPath.value}model.pth',
        map_location=Utils.GLOBALS.DEVICE.value,
    )
    with open(f'{Utils.GLOBALS.LoadPath.value}lists.pl', 'rb') as pf:
        loss_list, train_acc_list, eval_acc_list = pickle.load(pf)
else:
    model.reset_parameters()
    loss_list = []
    train_acc_list = []
    eval_acc_list = []

# The optimizer created earlier holds references to the parameters of the model
# object that existed at that time. After `model` is rebound to the loaded
# checkpoint those are the wrong tensors and the resumed model would never be
# updated, so bind a new optimizer to the current parameters. In the fresh-start
# branch this also drops any stale Adam state when the cell is re-run.
optimizer = torch.optim.Adam(model.parameters(), lr=Utils.HYPERPARAETERS.LearningRate.value)

In [7]:
NUM_ROUNDS = 2          # passes over the complete set of leagues
EPOCHS_PER_LEAGUE = 40  # consecutive epochs spent on one league within a pass


def _ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero (e.g. an empty mask)."""
    return numerator / denominator if denominator else float('nan')


def _masked_accuracy(model, gm, mask):
    """Return the accuracy of `model` over the graphs of `gm.graph_list` selected by `mask`."""
    n_correct = 0
    n_total = 0
    for idx in mask:
        correct, total = evaluation(model, gm.graph_list[idx])
        n_correct += correct
        n_total += total
    return _ratio(n_correct, n_total)


def _save_checkpoint():
    """Write the current model and the three metric histories to the Save folder."""
    torch.save(model, f'{Utils.GLOBALS.SavePath.value}model.pth')
    with open(f'{Utils.GLOBALS.SavePath.value}lists.pl', 'wb') as pf:
        pickle.dump((loss_list, train_acc_list, eval_acc_list), pf)


# The loop variable is `round_idx`, not `round`: a module-level variable named
# `round` would shadow the builtin for every cell executed afterwards.
for round_idx in range(NUM_ROUNDS):
    print(f'############################## Round {round_idx + 1} ##############################')
    for league, league_df in dl.dataset.groupby('league'):
        print(f'Training On: {league}')
        # Graph files live in the Load folder when they were pre-built,
        # otherwise in the Save folder where the build cell wrote them.
        graph_dir = Utils.GLOBALS.LoadPath.value if already_saved else Utils.GLOBALS.SavePath.value
        gm = GraphManager.load(f'{graph_dir}{league}.gm')
        try:
            for epoch in range(EPOCHS_PER_LEAGUE):
                t_loss = 0
                t_correct = 0
                t_total = 0
                for idx in gm.train_mask:
                    g = gm.graph_list[idx]
                    loss, correct, total = train_step(model, g, criterion, optimizer)
                    t_loss += loss
                    t_correct += correct
                    t_total += total

                avg_loss = _ratio(t_loss, len(gm.train_mask))
                train_acc = _ratio(t_correct, t_total)
                print(f'=================================== EPOCH {epoch + 1} ===================================')
                print(f'Average Loss: {avg_loss} - Train Accuracy: {train_acc: .3f}')
                loss_list.append(avg_loss)
                train_acc_list.append(train_acc)

                val_acc = _masked_accuracy(model, gm, gm.validation_mask)
                print(f'Validation Accuracy: {val_acc: .3f}')
                eval_acc_list.append(val_acc)

                if (epoch + 1) % Utils.GLOBALS.SaveEvery.value == 0:
                    _save_checkpoint()

        except KeyboardInterrupt:
            # Interrupting skips the remaining epochs of this league only; the
            # test evaluation below and the following leagues still run.
            print(f'Interrupted: skipping the remaining epochs for {league}.')

        # Persist the state reached for this league even when the last epochs
        # did not land on a SaveEvery multiple or training was interrupted.
        _save_checkpoint()

        test_acc = _masked_accuracy(model, gm, gm.test_mask)
        print(f'Test Accuracy: {test_acc: .3f}')

############################## Round 1 ##############################
Training On: Belgium Jupiler League
Average Loss: 1.3707029982046648 - Train Accuracy:  0.338
Validation Accuracy:  0.289
Average Loss: 1.125305737690492 - Train Accuracy:  0.388
Validation Accuracy:  0.355
Average Loss: 1.0910489624196833 - Train Accuracy:  0.413
Validation Accuracy:  0.461
Average Loss: 1.076831741766496 - Train Accuracy:  0.437
Validation Accuracy:  0.461
Average Loss: 1.0816125396526222 - Train Accuracy:  0.437
Validation Accuracy:  0.461
Average Loss: 1.0694368582783322 - Train Accuracy:  0.456
Validation Accuracy:  0.461
Average Loss: 1.0669629823077809 - Train Accuracy:  0.462
Validation Accuracy:  0.461
Average Loss: 1.0607432957851526 - Train Accuracy:  0.455
Validation Accuracy:  0.461
Average Loss: 1.0634945887507814 - Train Accuracy:  0.456
Validation Accuracy:  0.461
Average Loss: 1.0626356088753903 - Train Accuracy:  0.462
Validation Accuracy:  0.461
Average Loss: 1.0601830677552657 - T

In [8]:
# Report the test accuracy of the current model separately for every league.
for league, league_df in dl.dataset.groupby('league'):
    print(f'Testing On: {league}')
    # Graph files live in the Load folder when they were pre-built,
    # otherwise in the Save folder where the build cell wrote them.
    graph_dir = Utils.GLOBALS.LoadPath.value if already_saved else Utils.GLOBALS.SavePath.value
    gm = GraphManager.load(f'{graph_dir}{league}.gm')

    t_correct = 0
    t_total = 0

    for idx in gm.test_mask:
        g = gm.graph_list[idx]
        correct, total = evaluation(model, g)
        t_correct += correct
        t_total += total

    # An empty test mask yields NaN instead of raising ZeroDivisionError.
    test_acc = t_correct / t_total if t_total else float('nan')
    print(f'Test Accuracy: {test_acc: .3f}')

Testing On: Belgium Jupiler League
Test Accuracy:  0.479
Testing On: England Premier League
Test Accuracy:  0.401
Testing On: France Ligue 1
Test Accuracy:  0.527
Testing On: Germany 1. Bundesliga
Test Accuracy:  0.444
Testing On: Italy Serie A
Test Accuracy:  0.507
Testing On: Netherlands Eredivisie
Test Accuracy:  0.495
Testing On: Poland Ekstraklasa
Test Accuracy:  0.448
Testing On: Portugal Liga ZON Sagres
Test Accuracy:  0.447
Testing On: Scotland Premier League
Test Accuracy:  0.539
Testing On: Spain LIGA BBVA
Test Accuracy:  0.510
Testing On: Switzerland Super League
Test Accuracy:  0.536


## Test Cells - DO NOT RUN

In [8]:
# Scratch check: forward one stored graph through the model and display the raw output.
# The call is made outside torch.no_grad(), so the result keeps its autograd history.
model(gm.graph_list[3])

tensor([[-2.7374e+00, -8.8250e+00, -6.7085e-02],
        [-1.1629e+01, -2.9417e+01, -8.9407e-06]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward0>)

In [8]:
# Display the dataset rows that belong to the Belgian league.
dl.dataset.loc[dl.dataset['league'] == 'Belgium Jupiler League', :]

Unnamed: 0,league,season,week,home_team,away_team,result,home_lineup,away_lineup
0,Belgium Jupiler League,2008/2009,24,KV Mechelen,KRC Genk,win,"[Wouter Biebauw, Kenny van Hoevelen, Nana Asar...","[Davino Verhulst, Joao Carlos, Dimitri Daesela..."
1,Belgium Jupiler League,2008/2009,25,KSV Cercle Brugge,Club Brugge KV,loss,"[Bram Verbist, Denis Viane, Anthony Portier, F...","[Stijn Stijnen, Michael Klukowski, Antolin Alc..."
2,Belgium Jupiler League,2008/2009,25,RSC Anderlecht,SV Zulte-Waregem,win,"[Davy Schollen, Olivier Deschacht, Arnold Krui...","[Sammy Bossuyt, Karel D'Haene, Stijn Minne, Ba..."
3,Belgium Jupiler League,2008/2009,26,KV Mechelen,RSC Anderlecht,win,"[Wouter Biebauw, Kenny van Hoevelen, Nana Asar...","[Davy Schollen, Olivier Deschacht, Roland Juha..."
4,Belgium Jupiler League,2008/2009,26,SV Zulte-Waregem,KSV Roeselare,tie,"[Sammy Bossuyt, Karel D'Haene, Steve Colpaert,...","[Jurgen Sierens, Damir Mirvic, Mladen Lazarevi..."
...,...,...,...,...,...,...,...,...
1209,Belgium Jupiler League,2015/2016,30,SV Zulte-Waregem,Royal Excel Mouscron,win,"[Kenny Steppe, Henrik Dalsgaard, Christophe Le...","[Vagner, Noe Dussenne, Mickael Tirpan, Jean-Ch..."
1210,Belgium Jupiler League,2015/2016,30,Sporting Charleroi,KAA Gent,tie,"[Nicolas Penneteau, Damien Marcq, Gjoko Zajkov...","[Matz Sels, Lasse Nielsen, Rami Gershon, Nana ..."
1211,Belgium Jupiler League,2015/2016,30,Oud-Heverlee Leuven,Club Brugge KV,loss,"[Rudy Riou, Jordan Remacle, Kanu, Romain Reyna...","[Ludovic Butelle, Thomas Meunier, Bjorn Engels..."
1212,Belgium Jupiler League,2015/2016,30,KVC Westerlo,Waasland-Beveren,win,"[Kristof van Hout, Mitch Apau, Kenneth Schuerm...","[Laurent Henkinet, Hugo Sousa, Gary Coulibaly,..."


In [9]:
# Build a single HeteroData object from the Belgian league rows, passing the
# index labels of its last 20 rows as `supervision_indcs`.
belgium_rows = dl.dataset.loc[dl.dataset['league'] == 'Belgium Jupiler League', :]
ht = gm._gen_heterodata(belgium_rows, supervision_indcs=belgium_rows.index[-20:])

In [10]:
# Display the node and edge stores of the generated graph.
ht

HeteroData(
  [1mteam[0m={ x=[2428] },
  [1mplayer[0m={ x=[26708] },
  [1m(team, used, player)[0m={ edge_index=[2, 26708] },
  [1m(player, playedin, team)[0m={ edge_index=[2, 26708] },
  [1m(team, win, team)[0m={ edge_index=[2, 899] },
  [1m(team, loss, team)[0m={ edge_index=[2, 899] },
  [1m(team, tie, team)[0m={ edge_index=[2, 590] },
  [1m(team, before, team)[0m={ edge_index=[2, 2406] },
  [1m(team, after, team)[0m={ edge_index=[2, 2406] },
  [1m(player, before, player)[0m={ edge_index=[2, 25641] },
  [1m(player, after, player)[0m={ edge_index=[2, 25641] }
)

In [8]:
# Display the graph collapsed into one homogeneous Data object (node_type / edge_type record the original types).
ht.to_homogeneous()

Data(node_type=[96], x=[96], edge_index=[2, 184], edge_type=[184])

In [9]:
# Collapse the heterogeneous graph, convert it to networkx and compute a layout.
hm = ht.to_homogeneous()
g = torch_geometric.utils.to_networkx(hm)
pos = nx.spring_layout(g)  # one position per node

# Node captions: type id 0 is shown as 'team', every other id as 'player'.
node_labels = {
    node_id: ('team' if type_id == 0 else 'player')
    for node_id, type_id in enumerate(hm.node_type.cpu().numpy())
}

# Edge captions: only edge-type ids 0-3 receive a caption; edges of any other
# type are drawn without one.
edge_list = hm.edge_index.T.cpu().tolist()
edge_labels = {}
for endpoints, type_id in zip(edge_list, hm.edge_type.cpu().numpy()):
    if type_id in (0, 1):
        edge_labels[tuple(endpoints)] = 'used - played_in'
    elif type_id in (2, 3):
        edge_labels[tuple(endpoints)] = 'won - loss'

# Draw nodes, node captions, curved edges and edge captions on one axes,
# then write the (very large) figure to disk.
fig, ax = plt.subplots()
nx.draw_networkx_nodes(g, ax=ax, pos=pos)
nx.draw_networkx_labels(g, pos=pos, ax=ax, labels=node_labels)
nx.draw_networkx_edges(g, pos, ax=ax, connectionstyle='arc3,rad=0.1')
nx.draw_networkx_edge_labels(g, pos=pos, ax=ax, edge_labels=edge_labels)
fig.set_size_inches((100, 100))
fig.savefig('a.png')

NameError: name 'ht' is not defined

In [7]:
# Peek at the first edge only: print its position and its pair of node indices, then stop.
for i, e in enumerate(hm.edge_index.cpu().numpy().T):
    print(i)
    print(e)
    break

0
[0 2]


In [8]:
# Shape obtained when elements 2 and 3 of the converter output are stacked
# along a new second axis (one pair per dataset row).
np.stack(
    (
        dl.DatasetDataframetoNumpy(dl.dataset)[2],
        dl.DatasetDataframetoNumpy(dl.dataset)[3],
    ),
    axis=1,
).shape


(21309, 2, 11)

In [9]:
# Shape of element 2 of the converter output taken on its own.
dl.DatasetDataframetoNumpy(dl.dataset)[2].shape

(21309, 11)

In [10]:
# Pair elements 2 and 3 of the converter output row by row, flatten the pairs
# in row order and run the flat array through the data loader's labeler.
paired_lineups = np.stack(
    (
        dl.DatasetDataframetoNumpy(dl.dataset)[2],
        dl.DatasetDataframetoNumpy(dl.dataset)[3],
    ),
    axis=1,
)
team_node_features = dl.labeler.transform(paired_lineups.flatten())

In [11]:
# Print the 'week' value of the first dataset row only, then stop.
for i, r in dl.dataset.iterrows():
    print(r['week'])
    break


24


In [12]:
# Interleave elements 0 and 1 of the node-text output row by row
# (first label of row 0, second label of row 0, first label of row 1, ...).
np.stack(
    (
        dl.DatasetDataframetoNodeText(dl.dataset)[0],
        dl.DatasetDataframetoNodeText(dl.dataset)[1],
    ),
    axis=1,
).flatten()

array(['KV Mechelen*0', 'KRC Genk*0', 'KSV Cercle Brugge*1', ...,
       'BSC Young Boys*21307', 'FC Zürich*21308', 'FC Vaduz*21308'],
      dtype='<U34')

In [13]:
# Convert the dataset to node-text arrays once and reuse the result.
# NOTE(review): this assumes DatasetDataframetoNodeText returns the same arrays
# on every call for the same frame - confirm against DataLoader.
a = dl.DatasetDataframetoNodeText(dl.dataset)

# Interleave elements 0 and 1 row by row so both labels of a dataset row are adjacent.
team_labels = np.stack((a[0], a[1])).T.flatten()

# Map every team-node label to a consecutive integer id (two ids per dataset row).
t = pd.Series(np.arange(a[0].shape[0] * 2), index=team_labels)

In [14]:
# Map every player-node label to a consecutive integer id. The labels of
# elements 2 and 3 are paired row by row before being flattened.
n_player_slots = a[2].shape[0] * a[2].shape[1] * 2
player_labels = np.stack(
    (
        dl.DatasetDataframetoNodeText(dl.dataset)[2],
        dl.DatasetDataframetoNodeText(dl.dataset)[3],
    ),
    axis=1,
).flatten()
p = pd.Series(np.arange(n_player_slots), index=player_labels)

In [15]:
# Display the player ids as a plain array.
p.to_numpy()

array([     0,      1,      2, ..., 468795, 468796, 468797])

In [16]:
# Repeat each team id `dl.minimum_players_per_team` times in place, giving one team id per player slot.
aaa=np.repeat(t.to_numpy(), dl.minimum_players_per_team)

In [17]:
# Pair the repeated team ids with the player ids: after the transpose each row
# holds one (team id, player id) pair.
ttt = torch.stack((
    torch.tensor(aaa),
    torch.tensor(p)
)).T

In [18]:
# Check the length of the repeated team-id array.
aaa.shape

(468798,)

In [19]:
# Show the same consecutive id range reshaped to (rows of a[2], 2, columns of a[2]).
np.arange(a[2].shape[0] * a[2].shape[1] * 2).reshape(a[2].shape[0], -1, a[2].shape[1])

array([[[     0,      1,      2, ...,      8,      9,     10],
        [    11,     12,     13, ...,     19,     20,     21]],

       [[    22,     23,     24, ...,     30,     31,     32],
        [    33,     34,     35, ...,     41,     42,     43]],

       [[    44,     45,     46, ...,     52,     53,     54],
        [    55,     56,     57, ...,     63,     64,     65]],

       ...,

       [[468732, 468733, 468734, ..., 468740, 468741, 468742],
        [468743, 468744, 468745, ..., 468751, 468752, 468753]],

       [[468754, 468755, 468756, ..., 468762, 468763, 468764],
        [468765, 468766, 468767, ..., 468773, 468774, 468775]],

       [[468776, 468777, 468778, ..., 468784, 468785, 468786],
        [468787, 468788, 468789, ..., 468795, 468796, 468797]]])

In [20]:
# Index labels of the dataset rows whose result is 'win'.
dl.dataset.loc[dl.dataset['result']=='win', :].index.values

array([    0,     2,     3, ..., 21297, 21305, 21308])

In [22]:
# Node-text conversion restricted to the rows whose result is 'win'.
dl.DatasetDataframetoNodeText(
                dl.dataset.loc[dl.dataset['result'] == 'win', :]
            )

(array(['KV Mechelen*0', 'RSC Anderlecht*2', 'KV Mechelen*3', ...,
        'FC St. Gallen*21297', 'Lugano*21305', 'FC Zürich*21308'],
       dtype='<U34'),
 array(['KRC Genk*0', 'SV Zulte-Waregem*2', 'RSC Anderlecht*3', ...,
        'FC Zürich*21297', 'FC St. Gallen*21305', 'FC Vaduz*21308'],
       dtype='<U34'),
 array([['Wouter Biebauw@0', 'Kenny van Hoevelen@0', 'Nana Asare@0', ...,
         'Romeo van Dessel@0', 'Wouter Vrancken@0', 'Giuseppe Rossini@0'],
        ['Davy Schollen@2', 'Olivier Deschacht@2', 'Arnold Kruiswijk@2',
         ..., 'Mbark Boussoufa@2', 'Oleksandr Iakovenko@2',
         'Tom De Sutter@2'],
        ['Wouter Biebauw@3', 'Kenny van Hoevelen@3', 'Nana Asare@3', ...,
         'Romeo van Dessel@3', 'Bjoern Vleminckx@3', 'Giuseppe Rossini@3'],
        ...,
        ['Daniel Lopar@21297', 'Alain Wiss@21297', 'Silvan Hefti@21297',
         ..., 'Danijel Aleksic@21297', 'Marco Aratore@21297',
         'Edgar Salli@21297'],
        ['Mirko Salvi@21305', 'Frederic Vese