In [1]:
# #@title PyG Installation { form-width: "25%" }
# # enter these commands in CLI to install Pytorch-Geometric
# !pip install -q torch-scatter -f https://pytorch-geometric.com/whl/torch-1.9.0+cu111.html
# !pip install -q torch-sparse -f https://pytorch-geometric.com/whl/torch-1.9.0+cu111.html
# !pip install -q git+https://github.com/rusty1s/pytorch_geometric.git

In [2]:
#@title Module Imports { form-width: "20%" }
import pandas as pd
import torch
from torch.nn import Module,\
                     ModuleList,\
                     Embedding,\
                     BatchNorm1d,\
                     LogSoftmax,\
                     Softmax,\
                     Linear,\
                     NLLLoss,\
                     CrossEntropyLoss
from torch.optim import Adam
import torch.nn.functional as F
import torch_geometric as PyG
from torch_geometric.data import Data, HeteroData
from torch_geometric.nn.conv import RGCNConv, GINConv, GATConv, HeteroConv, GCNConv
from torch_geometric.utils import to_networkx
from collections import OrderedDict as od
import logging
import json
from typing import NoReturn
import typing
from new.Utils import Globals


In [3]:
#@title Global Variables
# Global Values

print(f'Device is {Globals.DEVICE.value}')

Device is cuda


In [4]:
#@title GNN Model { form-width: "10%" }
class GNN(Module):
  def __init__(self, embedding_dims: tuple, conv_dims: list, fully_connected_dims: list, dropout: dict)-> NoReturn:
    super(GNN, self).__init__()

    self.mode = None # 'train' or 'test' or 'dev' later 
    self.output_dim = 3 #home_result: win, lose, tie
    self.num_relations = 7 #win/lose/tie/play/use/after/before
    self.dropout = dropout

    #one-hot to latent
    self.embed = Embedding(embedding_dims[0], embedding_dims[1])

    conv_list = [
                  RGCNConv(embedding_dims[1], conv_dims[0], self.num_relations)
                ] + \
                [
                  RGCNConv(conv_dims[i], conv_dims[i+1], self.num_relations)
                  for i in range(len(conv_dims[:-1]))
                ]
  
    batch_norm_list = [
                         BatchNorm1d(conv_dims[i])
                         for i in range(len(conv_dims[:-1]))
                      ]

    fully_connected_list =   [
                                Linear(2*conv_dims[-1], fully_connected_dims[0])
                             ] + \
                             [
                                Linear(fully_connected_dims[i], fully_connected_dims[i+1])
                                for i in range(len(fully_connected_dims[:-1]))
                             ] + \
                             [
                                Linear(fully_connected_dims[-1], self.output_dim)
                             ]
    #graph conv layers
    self.conv_layers = ModuleList(conv_list)
    #batch normalization layers
    self.batch_norm_layers = ModuleList(batch_norm_list)
    #fully connected dense layers
    self.fully_connected_layers = ModuleList(fully_connected_list)

    self.classifier = LogSoftmax()

    
  def reset_parameters(self):
        for conv in self.conv_layers:
            conv.reset_parameters()
        for bn in self.batch_norm_layers:
            bn.reset_parameters()
        for fc in self.fully_connected_layers:
            fc.reset_parameters()
          

  def forward(self, x:torch.Tensor, edge_index:torch.Tensor, edge_type:torch.Tensor, home_list:list, away_list:list) -> torch.Tensor:
    x = self.embed(x)
    if self.training:
      x = F.dropout(x, p=self.dropout["emb"])

    for conv, bn in zip(self.conv_layers[:-1], self.batch_norm_layers):
      x = conv(x, edge_index=edge_index, edge_type=edge_type)
      x = bn(x)
      x = F.relu(x)
      if self.training:
        x = F.dropout(x, p=self.dropout["conv"])


    x = self.conv_layers[-1](x, edge_index, edge_type)
    if self.training:
      x = F.dropout(x, p=self.dropout["conv"])

    ##################################### End of Encoder 

    pred = list()
    for home_team, away_team in zip(home_list, away_list):
      h = torch.cat((x[home_team], x[away_team]))

      for fc in self.fully_connected_layers[:-1]:
        h = fc(h)
        h = F.relu(h)
        if self.training:
          h = F.dropout(h, p=self.dropout["fc"])

      h = self.fully_connected_layers[-1](h)
      if self.training:
        h = F.dropout(h, p=self.dropout["fc"])
      pred.append(self.classifier(h))

    return torch.stack(pred)

In [5]:
#@title HeteroGNN Model { form-width: "10%" }
class HeteroGNN(Module):
  def __init__(self, embedding_dims: tuple, conv_dims: list, fully_connected_dims: list, dropout: dict)-> NoReturn:
    super(HeteroGNN, self).__init__()

    self.mode = None # 'train' or 'test' or 'dev' later 
    self.output_dim = 3 #home_result: win, lose, tie
    self.num_relations = 7 #win/lose/tie/play/use/after/before
    self.dropout = dropout

    #one-hot to latent
    self.embed = Embedding(embedding_dims[0], embedding_dims[1])
    
    conv_list = [
                  HeteroConv(
                      {
                          ('team', 'won', 'team'): GCNConv(embedding_dims[-1], conv_dims[0]),
                          ('team', 'lost_to', 'team'): GCNConv(embedding_dims[-1], conv_dims[0]),
                          ('team', 'tied_with', 'team'): GCNConv(embedding_dims[-1], conv_dims[0]),
                          ('player', 'played_for', 'team'): GATConv(embedding_dims[-1], conv_dims[0], heads=1),
                          ('team', 'used', 'player'): GATConv(embedding_dims[-1], conv_dims[0], heads=1),
                          ('player', 'is_before', 'player'): GCNConv(embedding_dims[-1], conv_dims[0]),
                          ('player', 'is_after', 'player'): GCNConv(embedding_dims[-1], conv_dims[0]),
                          ('team', 'is_before', 'team'): GCNConv(embedding_dims[-1], conv_dims[0]),
                          ('team', 'is_after', 'team'): GCNConv(embedding_dims[-1], conv_dims[0])
                      }, aggr='sum'
                  )
                ] + \
                [
                  HeteroConv(
                      {
                          ('team', 'won', 'team'): GCNConv(conv_dims[i], conv_dims[i+1]),
                          ('team', 'lost_to', 'team'): GCNConv(conv_dims[i], conv_dims[i+1]),
                          ('team', 'tied_with', 'team'): GCNConv(conv_dims[i], conv_dims[i+1]),
                          ('player', 'played_for', 'team'): GATConv(conv_dims[i], conv_dims[i+1], heads=1),
                          ('team', 'used', 'player'): GATConv(conv_dims[i], conv_dims[i+1], heads=1),
                          ('player', 'is_before', 'player'): GCNConv(conv_dims[i], conv_dims[i+1]),
                          ('player', 'is_after', 'player'): GCNConv(conv_dims[i], conv_dims[i+1]),
                          ('team', 'is_before', 'team'): GCNConv(conv_dims[i], conv_dims[i+1]),
                          ('team', 'is_after', 'team'): GCNConv(conv_dims[i], conv_dims[i+1])
                      }, aggr='sum'
                  )
                  for i in range(len(conv_dims[:-1]))
                ]


              

  
    # batch_norm_list = [
    #                      BatchNorm1d(conv_dims[i])
    #                      for i in range(len(conv_dims[:-1]))
    #                   ]

    fully_connected_list =   [
                                Linear(2*conv_dims[-1], fully_connected_dims[0])
                             ] + \
                             [
                                Linear(fully_connected_dims[i], fully_connected_dims[i+1])
                                for i in range(len(fully_connected_dims[:-1]))
                             ] + \
                             [
                                Linear(fully_connected_dims[-1], self.output_dim)
                             ]
    #graph conv layers
    self.conv_layers = ModuleList(conv_list)
    #batch normalization layers

    # self.batch_norm_layers = ModuleList(batch_norm_list)

    #fully connected dense layers
    self.fully_connected_layers = ModuleList(fully_connected_list)

    self.classifier = LogSoftmax(dim=1)
      

  def reset_parameters(self):
      self.embed.reset_parameters()
      for conv in self.conv_layers:
          # for layer in conv:
          #   layer.reset_parameters()
          conv.reset_parameters()
      # for bn in self.batch_norm_layers:
      #     bn.reset_parameters()
      for fc in self.fully_connected_layers:
          fc.reset_parameters()


  def forward(self, data: HeteroData) -> torch.Tensor:
    x_dict = data.x_dict
    home_list = data.home_list
    away_list = data.away_list

    edge_index_dict = data.edge_index_dict
    x_dict = {key: self.embed(x) for key, x in x_dict.items()}
    
    if self.training:
      x_dict = {key: F.dropout(x, p=self.dropout["emb"]) for key, x in x_dict.items()}

    # for conv, bn in zip(self.conv_layers[:-1], self.batch_norm_layers):
    for conv in self.conv_layers[:-1]:
      x_dict = conv(x_dict, edge_index_dict=edge_index_dict)
      x_dict = {key: F.relu(x) for key, x in x_dict.items()}
      if self.training:
        x_dict = {key: F.dropout(x, p=self.dropout["conv"]) for key, x in x_dict.items()}

    x_dict = self.conv_layers[-1](x_dict, edge_index_dict=edge_index_dict)
    if self.training:
      x_dict = {key: F.dropout(x, p=self.dropout["conv"]) for key, x in x_dict.items()}

    ##################################### End of Encoder 
    h = torch.cat(
        (x_dict['team'][home_list], x_dict['team'][away_list]),
        dim=1
    )

    for fc in self.fully_connected_layers[:-1]:
      h = fc(h)
      h = F.relu(h)
      if self.training:
        h = F.dropout(h, p=self.dropout["fc"])

    h = self.fully_connected_layers[-1](h)
    if self.training:
      h = F.dropout(h, p=self.dropout["fc"])

    return self.classifier(h)

    

In [6]:
#@title home_result(row)
def home_result(row: str) -> int:
  if row == 'home':
    return Globals.WON.value
  elif row == 'tie':
    return Globals.TIED_WITH.value
  elif row == 'away':
    return Globals.LOST_TO.value

In [7]:
#@title remove_redundancy(players) { form-width: "15%" }
def remove_redundancy(players: list) -> list:
  new_players = list()

  for player in players:
    if 'Own' in player:
      player = player.replace('Own', '')
    if 'Pen. Scored' in player:
      player = player.replace('Pen. Scored', '')
    if 'Pen. Score' in player:
      player = player.replace('Pen. Score', '')
    if 'Own' in player or 'Scored' in player or 'Score' in player:
      print(player)
      #SHOULD NOT PRINT IF CODE IS CORRECT
    else:
      new_players.append(player.strip())
  return new_players

In [8]:
#@title extract_players(home_lineup, away_lineup) { form-width: "15%" }
def extract_players(home_lineup: str, away_lineup: str) -> list:
  home_players = home_lineup[:-2].split(' - ')
  away_players = away_lineup[:-2].split(' - ')
  
  return remove_redundancy(home_players), remove_redundancy(away_players)

In [9]:
#@title stats(df, show_players, show_teams, show_results) { form-width: "10%" }
def stats(df: pd.DataFrame, show_players: bool=False, show_teams: bool=False, show_results: bool=False) -> NoReturn:
  players_set = set()
  players_list = list()
  teams_set = set()

  teams_list = list()
  results = dict()
  for index, (h_team, a_team, result, h_lineup, a_lineup) in df.iterrows():
    home_players, away_players = extract_players(h_lineup, a_lineup)
    players_set.update(home_players + away_players)
    players_list.extend(home_players + away_players)
    if result == 'home':
      results.update({f'{h_team} #Wins': results.get(f'{h_team} #Wins', 0)+1})
      results.update({f'{a_team} #Losses': results.get(f'{a_team} #Losses', 0)+1})
    elif result == 'tie':
      results.update({f'{h_team} #Ties': results.get(f'{h_team} #Ties', 0)+1})
      results.update({f'{a_team} #Ties': results.get(f'{a_team} #Ties', 0)+1})
    else:
      results.update({f'{a_team} #Wins': results.get(f'{a_team} #Wins', 0)+1})
      results.update({f'{h_team} #Losses': results.get(f'{h_team} #Losses', 0)+1})

    teams_list.extend([h_team, a_team])
    teams_set.update([h_team, a_team])
    
  if show_players:
    for player in players_set:
      print(f'{player} played in {players_list.count(player)} matches.')
  if show_teams:
    for team in teams_set:
      print(f'{team} played {teams_list.count(team)} matches.')
  if show_results:
    results = od(sorted(results.items()))
    for key, val in results.items():
      print(f'{key}: {val}')

In [10]:
#@title extract_entities(df) { form-width: "15%" }
def extract_entities(df: pd.DataFrame) -> typing.Tuple[set, set]:
  players_set = set()
  players_list = list()
  teams_set = set()

  teams_list = list()
  # results = dict()
  for index, (season, week, h_team, a_team, result, h_lineup, a_lineup) in df.iterrows():
    home_players, away_players = extract_players(h_lineup, a_lineup)

    players_set.update(home_players + away_players)
    teams_set.update([h_team, a_team])
    
  
  return teams_set, players_set

In [11]:
#@title gen_entites(df) { form-width: "15%" }
def gen_entities(df: pd.DataFrame) -> dict:
  teams, players = extract_entities(df)
  entities = {entity: index for index, entity in enumerate(list(players) + list(teams))}
  return entities

In [12]:
#@title nodes_gen(df) OK_HETERO { form-width: "15%" }

def nodes_gen(df: pd.DataFrame) -> typing.Tuple[dict, dict]:
  player_nodes = dict()
  team_nodes = dict()
  player_node_counter = 0
  team_node_counter = 0

  for index, (season, week, h_team, a_team, result, h_lineup, a_lineup) in df.iterrows():
      home_players, away_players = extract_players(h_lineup, a_lineup)

      for player_index, player in enumerate(home_players):
        player_nodes[f'{player}@{index}'] = player_node_counter
        player_node_counter += 1
      for player_index, player in enumerate(away_players):
        player_nodes[f'{player}@{index}'] = player_node_counter
        player_node_counter += 1

      team_nodes[f'{h_team}*{index}'] = team_node_counter
      team_node_counter += 1

      team_nodes[f'{a_team}*{index}'] = team_node_counter
      team_node_counter += 1

  return player_nodes, team_nodes


In [13]:
#@title show_edges(df, edge, edge_type) USELESS { form-width: "15%" }
def show_edges(df: pd.DataFrame, edge: torch.Tensor, edge_type: torch.Tensor, tt:str) -> NoReturn:
  types = {
      0: 'Won',
      1: 'Lost To',
      2: 'Tied With',
      3: 'Played For',
      4: 'Used As Player',
      5: 'Is Before',
      6: 'Is After'
  }
  t = {'p': 0, 't':1}
  nodes = nodes_gen(df)[t[tt]]
  r = {k:v for v, k in nodes.items()}
  for i in range(edge_type.shape[0]):
    head = int(edge[0][i].item())
    tail = int(edge[1][i].item())
    relation = int(edge_type[i].item())
    arrow = f'=== {types[relation]} ===>'
    print(f'{r[head]:<32}   {arrow}   {r[tail]:>32}')

In [14]:
#@title home_won_gen(df) OK_HETERO { form-width: "15%" }
def home_won_gen(df: pd.DataFrame, full_data_frame=None) -> typing.Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
  home_winning_matches = df.loc[df['result'] == 'home']
  home_winners = home_winning_matches['home_team']
  away_losers = home_winning_matches['away_team']

  winning_hashes = list()
  losing_hashes = list()

  for home, away, match in zip(home_winners, away_losers, home_winners.index):
    winning_hashes.append(f'{home}*{match}')
    losing_hashes.append(f'{away}*{match}')

  winning_nodes = list()
  losing_nodes = list()

  if full_data_frame is None:
    full_data_frame = df
  _, team_nodes = nodes_gen(full_data_frame)

  for winner, loser in zip(winning_hashes, losing_hashes):
    winning_nodes.append(team_nodes[winner]) 
    losing_nodes.append(team_nodes[loser])

  won_edges = torch.tensor(
      [
      winning_nodes,
      losing_nodes
      ], 
      dtype=torch.long,
      device=Globals.DEVICE.value
  )

  lost_edges = torch.tensor(
      [
      losing_nodes,
      winning_nodes
      ],
      dtype=torch.long,
      device=Globals.DEVICE.value
  )

  return won_edges, lost_edges

In [15]:
#@title away_won_gen(df) OK_HETERO { form-width: "15%" }
def away_won_gen(df: pd.DataFrame, full_data_frame=None) -> typing.Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
  away_winning_matches = df.loc[df['result'] == 'away']
  away_winners = away_winning_matches['away_team']
  home_losers = away_winning_matches['home_team']

  winning_hashes = list()
  losing_hashes = list()

  for home, away, match in zip(home_losers, away_winners, away_winners.index):
    winning_hashes.append(f'{away}*{match}')
    losing_hashes.append(f'{home}*{match}')

  winning_nodes = list()
  losing_nodes = list()

  if full_data_frame is None:
    full_data_frame = df
  _, team_nodes = nodes_gen(full_data_frame)

  for winner, loser in zip(winning_hashes, losing_hashes):
    winning_nodes.append(team_nodes[winner]) 
    losing_nodes.append(team_nodes[loser])

  won_edges = torch.tensor(
      [
      winning_nodes,
      losing_nodes
      ],
      dtype=torch.long,
      device=Globals.DEVICE.value
  )

  lost_edges = torch.tensor(
      [
      losing_nodes,
      winning_nodes
      ],
      dtype=torch.long,
      device=Globals.DEVICE.value
  )
  
  return won_edges, lost_edges

In [16]:
#@title tied_gen(df) OK_HETERO { form-width: "15%" }
def tied_gen(df: pd.DataFrame, full_data_frame=None) -> typing.Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
  tied_matches = df.loc[df['result'] == 'tie']
  home_teams = tied_matches['home_team']
  away_teams = tied_matches['away_team']

  home_hashes = list()
  away_hashes = list()

  for home, away, match in zip(home_teams, away_teams, away_teams.index):
    away_hashes.append(f'{away}*{match}')
    home_hashes.append(f'{home}*{match}')

  home_nodes = list()
  away_nodes = list()

  if full_data_frame is None:
    full_data_frame = df
  _, team_nodes = nodes_gen(full_data_frame)

  for home, away in zip(home_hashes, away_hashes):
    home_nodes.append(team_nodes[home]) 
    away_nodes.append(team_nodes[away])

  home_tied_edges = torch.tensor(
      [
      home_nodes,
      away_nodes
      ],
      dtype=torch.long,
      device=Globals.DEVICE.value
  )

  away_tied_edges = torch.tensor(
      [
      away_nodes,
      home_nodes
      ], 
      dtype=torch.long,
      device=Globals.DEVICE.value
  )

  return home_tied_edges, away_tied_edges

In [17]:
#@title played_used_gen(df) OK_HETERO { form-width: "15%" }
def played_used_gen(df: pd.DataFrame) -> typing.Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
  team_nodes = list()
  player_nodes = list()

  p_nodes, t_nodes = nodes_gen(df)

  for index, (season, week, h_team, a_team, result, h_lineup, a_lineup) in df.iterrows():
    home_players, away_players = extract_players(h_lineup, a_lineup)

    for home_player, away_player in zip(home_players, away_players):
      player_nodes.append(p_nodes[f'{home_player}@{index}'])
      team_nodes.append(t_nodes[f'{h_team}*{index}'])
      player_nodes.append(p_nodes[f'{away_player}@{index}'])
      team_nodes.append(t_nodes[f'{a_team}*{index}'])

  played_in_edges = torch.tensor(
      [
       player_nodes,
       team_nodes
      ],
      dtype=torch.long,
      device=Globals.DEVICE.value
  )

  used_edges = torch.tensor(
      [
       team_nodes,
       player_nodes
      ],
      dtype=torch.long,
      device=Globals.DEVICE.value
  ) 

  return played_in_edges, used_edges

In [18]:
#@title players_before_after_gen(df) OK_HETERO { form-width: "15%" }
#TODO
def players_before_after_gen(df: pd.DataFrame) -> typing.Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
  player_match_hashes = list()

  for index, (season, week, h_team, a_team, result, h_lineup, a_lineup) in df.iterrows():
      home_players, away_players = extract_players(h_lineup, a_lineup)

      for player in home_players + away_players:
        player_match_hashes.append(f'{player}@{index}')



  sorted_hashes = sorted(
      player_match_hashes,
      key=lambda w: (w.split('@')[0], int(w.split('@')[1]))
  )

  before_nodes = list()
  after_nodes = list()

  player_nodes, _ = nodes_gen(df)

  for index, hash in enumerate(sorted_hashes):
    player, match = hash.split('@')
    before_node = player_nodes[hash]
    try:
      after_node = player_nodes[sorted_hashes[index+1]]
      before_name = player_match_hashes[before_node].split('@')[0]
      after_name = player_match_hashes[after_node].split('@')[0]
      if before_name == after_name:
        before_nodes.append(before_node)
        after_nodes.append(after_node)
    except:
      pass
  before_edges = torch.tensor(
      [
      before_nodes,
      after_nodes
      ], dtype=torch.long,
      device=Globals.DEVICE.value
  )

  after_edges = torch.tensor(
      [
      after_nodes,
      before_nodes
      ], dtype=torch.long,
      device=Globals.DEVICE.value
  )

  return before_edges, after_edges

In [19]:
#@title teams_before_after_gen(df) OK_HETERO { form-width: "15%" }
def teams_before_after_gen(df: pd.DataFrame) -> typing.Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
  team_match_hashes = list()

  for index, (season, week, h_team, a_team, result, h_lineup, a_lineup) in df.iterrows():
      team_match_hashes.append(f'{h_team}*{index}')
      team_match_hashes.append(f'{a_team}*{index}')

  sorted_hashes = sorted(
      team_match_hashes,
      key= lambda w: (w.split('*')[0], int(w.split('*')[1]))
  )

  before_nodes = list()
  after_nodes = list()

  _, team_nodes = nodes_gen(df)

  for index, hash in enumerate(sorted_hashes):
    team, match = hash.split('*')
    before_node = team_nodes[hash]
    try:
      after_node = team_nodes[sorted_hashes[index+1]]
      before_name = team_match_hashes[before_node].split('*')[0]
      after_name = team_match_hashes[after_node].split('*')[0]
      if before_name == after_name:
        before_nodes.append(before_node)
        after_nodes.append(after_node)
    except:
      pass
  before_edges = torch.tensor(
      [
      before_nodes,
      after_nodes
      ], dtype=torch.long,
      device=Globals.DEVICE.value
  )

  after_edges = torch.tensor(
      [
      after_nodes,
      before_nodes
      ], dtype=torch.long,
      device=Globals.DEVICE.value
  )

  return before_edges, after_edges

In [20]:
#@title complete_graph_gen(df, for_players, for_teams) OK_HETERO { form-width: "10%" }
def complete_graph_gen(df: pd.DataFrame, for_players: bool=True, for_teams: bool=True) -> dict:
  home_won, away_lost = home_won_gen(df)
  away_won, home_lost = away_won_gen(df)
  home_tied, away_tied = tied_gen(df)
  player_played, team_used = played_used_gen(df)

  if for_players:
    player_before, player_after = players_before_after_gen(df)
  if for_teams:
    team_before, team_after = teams_before_after_gen(df)
  won_edge_index = torch.cat(
      (home_won, away_won),
      dim=1
  )
  lost_edge_index = torch.cat(
      (away_lost, home_lost),
      dim=1
  )
  tied_edge_index = torch.cat(
      (home_tied, away_tied),
      dim=1
  )
  edge_index = {
      'won': won_edge_index,
      'lost': lost_edge_index,
      'tied': tied_edge_index,
      'played': player_played,
      'used': team_used,
      'p_after':player_after,
      'p_before': player_before,
      't_after': team_after,
      't_before': team_after
  }   
  return edge_index

In [21]:
#@title supervision_graph_gen(df, for_players, for_teams, log_supervision_matches) OK_HETERO { form-width: "10%" }
def supervision_graph_gen(df : pd.DataFrame, messaging: list, supervision: list, for_players: bool=True, for_teams: bool=True, log_supervision_matches: bool=False) -> typing.Tuple[torch.Tensor, torch.Tensor]:
  if log_supervision_matches:
    if model.mode == 'train':
      mode = 'training'
    elif model.mode == 'dev':
      mode = 'validating'
    elif model.mode == 'test':
      mode = 'testing'
    logging.info(
        f'Messaging on matches ({messaging[0] + 1} -> {messaging[-1] + 1:>5}),\ Model is {mode} on matches ({last_match+2} -> {last_match + 11})'
    )

  target_for_nodes = df

  home_won, away_lost = home_won_gen(df.loc[messaging], full_data_frame=target_for_nodes)
  away_won, home_lost = away_won_gen(df.loc[messaging], full_data_frame=target_for_nodes)
  home_tied, away_tied = tied_gen(df.loc[messaging], full_data_frame=target_for_nodes)

  player_played, team_used = played_used_gen(df)

  if for_players:
    player_before, player_after = players_before_after_gen(df)
  if for_teams:
    team_before, team_after = teams_before_after_gen(df)

  won_edge_index = torch.cat(
      (home_won, away_won),
      dim=1
  )
  lost_edge_index = torch.cat(
      (away_lost, home_lost),
      dim=1
  )
  tied_edge_index = torch.cat(
      (home_tied, away_tied),
      dim=1
  )
  edge_index = {
      'won': won_edge_index,
      'lost': lost_edge_index,
      'tied': tied_edge_index,
      'played': player_played,
      'used': team_used,
      'p_after':player_after,
      'p_before': player_before,
      't_after': team_after,
      't_before': team_after
  }  
  return edge_index

In [22]:
#@title data_gen(df, remove_supervision_links, for_players, for_teams, print_edges, log_supervision_matches) OK_HETERO { form-width: "10%" }
def data_gen(df: pd.DataFrame, messaging: list, supervision: list=None, remove_supervision_links: bool=True, for_players: bool=True, for_teams: bool=True, print_edges: bool=False, log_supervision_matches: bool=False) -> HeteroData:
  if print_edges:
    show_edges(df, edge_index, edge_type)
  if remove_supervision_links:
    edge_index = supervision_graph_gen(
        df,
        messaging=messaging,
        supervision=supervision,
        for_players=for_players,
        for_teams=for_teams,
        log_supervision_matches=log_supervision_matches
    )
    y = torch.tensor(
        df.loc[supervision]['result'].map(home_result).values,
        device=Globals.DEVICE.value
    )

  else:
    if supervision is None:
      supervision = df.index
    if messaging is None:
      messaging = df.index
    edge_index = complete_graph_gen(df, for_players, for_teams)
    y = torch.tensor(
        df.loc[supervision]['result'].map(home_result).values,
        device=Globals.DEVICE.value
    )

  data = HeteroData()
  data['player'].x = torch.unique(edge_index['played'][0]).to(Globals.DEVICE.value).type(torch.int64)
  data['team'].x = torch.unique(edge_index['used'][0]).to(Globals.DEVICE.value).type(torch.int64)
  
  data['team', 'won', 'team'].edge_index = edge_index['won']
  data['team', 'lost_to', 'team'].edge_index = edge_index['lost']
  data['team', 'tied_with', 'team'].edge_index = edge_index['tied']
  data['player', 'played_for', 'team'].edge_index = edge_index['played']
  data['team', 'used', 'player'].edge_index = edge_index['used']
  data['player', 'is_before', 'player'].edge_index = edge_index['p_before']
  data['player', 'is_after', 'player'].edge_index = edge_index['p_after']
  data['team', 'is_before', 'team'].edge_index = edge_index['t_before']
  data['team', 'is_after', 'team'].edge_index = edge_index['t_after']
  data.y = y

  return data

In [23]:
#@title visualzie_graph(df, width, height, title, remove_supervision_links) { form-width: "10%" }
def visualize_graph(df:pd.DataFrame, width: int=20, height: int=20, title: str=None, remove_supervision_links: bool=False) -> NoReturn:
  import networkx as nx
  import matplotlib.pyplot as plt
  nodes = nodes_gen(df)
  r = {k:v for v, k in nodes.items()}
  d = data_gen(df, remove_supervision_links=remove_supervision_links)
  G = to_networkx(d)
  types = {
        0: 'Won',
        1: 'Lost To',
        2: 'Tied With',
        3: 'Played For',
        4: 'Used As Player',
        5: 'Is Before',
        6: 'Is After'
  }

  type_color = {
      0: '#00ff00', #won
      1: '#ff0000', #lost to
      2: '#e6d70e', #tied with
      3: '#1338f0', #played for
      4: '#f01373', #used as player
      5: '#0f072e', #is before
      6: '#d909cb' #is after
  }

  double_edge_types = {
      0: '(Won[green] - Lost to[red])',
      1: '(Lost to[red] - Won[green])',
      2: '(Tied with[yellow])',
      3: '(Played for[blue] - Used as Player[pink])',
      4: '(Used as Player[pink] - Played for[blue])',
      5: '(Is Before[dark blue] - Is After[purple])',
      6: '(Is After[purple] - Is Before[dark blue])'
  }

  link_colors = dict(zip(
        types.values(),
        type_color.values()
      )
  )

  node_colors = {
      'player-color': '#8f0ba1',
      'team-color': '#02fae1'   
  }

  all_colors = link_colors.copy()
  all_colors.update(node_colors)

  

  for color_use in all_colors.keys():
      plt.scatter([],[], c=[all_colors[color_use]], label=f'{color_use}')

  edge_colors = list()
  edge_labels = dict()

  ######################################################## NOT OPTIMIZED
  for edge in G.edges():
    e = torch.tensor(edge, device=Globals.DEVICE.value)
    for index, node_node in enumerate(d.edge_index.t()):
      if torch.equal(e, node_node):
        edge_colors.append(type_color[d.edge_type[index].item()])
        label = double_edge_types[d.edge_type[index].item()]
        edge_labels.update({edge:label})
  colors = list()
  node_labels = dict()
  for node in G.nodes():
    if '@' in r[node]:
      colors.append(all_colors['player-color'])
      node_labels.update({node: r[node].split('@')[0]})
    elif '*' in r[node]:
      colors.append(all_colors['team-color'])
      node_labels.update({node:r[node].split('*')[0]})
  ######################################################## NOT OPTIMIZED

  fig = plt.gcf()
  fig.set_size_inches(width, height)
  pos = nx.spring_layout(G)
  nx.draw_networkx_nodes(G, pos, node_color=colors)
  nx.draw_networkx_labels(G, pos, labels=node_labels)
  nx.draw_networkx_edges(G, pos, edge_color=edge_colors, connectionstyle='arc3,rad=0.05')
  nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels)
  plt.legend()
  plt.title(title)
  fig.show()
  plt.show()

In [24]:
#@title batch_gen(df, entities, log_supervision_matches) OK_HETERO { form-width: "5%" }
def batch_gen(df: pd.DataFrame, entities: dict, messaging: list=None, supervision: list=None, remove_supervision_links: bool=True, log_supervision_matches: bool=False) -> HeteroData:
  graph = data_gen(
      df,
      messaging=messaging,
      supervision=supervision, 
      remove_supervision_links=remove_supervision_links,
      log_supervision_matches=log_supervision_matches
  )
  
  home_teams = list()
  away_teams = list()

  p_nodes, t_nodes = nodes_gen(df)
  nodes = {**p_nodes, **t_nodes}
  
  if supervision is None:
    supervision = df.index

  indices = dict()
  for hash, index in nodes.items():
    if '@' in hash:
      player = hash.split('@')[0]
      player_id = entities[player]
      indices.update({index:player_id})
    elif '*' in hash:
      team = hash.split('*')[0]
      team_id = entities[team]
      indices.update({index: team_id})
  for index, (season, week, h_team, a_team, result, h_lineup, a_lineup) in df.loc[supervision].iterrows():
      home_teams.append(nodes[f'{h_team}*{index}'])
      away_teams.append(nodes[f'{a_team}*{index}'])

  features_player = torch.tensor(
      [indices[i.item()] for i in graph['player'].x],
      device=Globals.DEVICE.value
  )
  features_team = torch.tensor(
      [indices[i.item()] for i in graph['team'].x],
      device=Globals.DEVICE.value
  )

  graph['player'].x = features_player
  graph['team'].x = features_team
  graph.home_list = home_teams
  graph.away_list = away_teams
  
  return graph

In [25]:
#@title train(model, dataset, optimizer, loss_fn) { form-width: "15%" }
def train(model: HeteroGNN, data: HeteroData, optimizer: torch.optim, loss_fn: torch.nn.modules.loss) -> typing.Tuple[float, int, int]:
  batch_loss = 0

  model.train()
  out = model(data)

  optimizer.zero_grad()
  loss = loss_fn(out, data.y)
  batch_loss = loss.item()
  loss.backward()
  optimizer.step()

  prediction = out.argmax(dim=-1)
  correct = (prediction == data.y).sum().item()
  all = data.y.shape[0]

  return batch_loss, correct, all

In [26]:
#@title evaluate(model, dataset) { form-width: "25px" }
@torch.no_grad()
def evaluate(model: HeteroGNN, data: HeteroConv) -> typing.Tuple[int, int]:
  model.eval()

  # for child in model.children():
  #   for ii in range(len(child)):
  #       if type(child[ii]) == BatchNorm1d:
  #           child[ii].track_running_stats = False

  out = model(data)
  prediction = out.argmax(dim=-1)
  correct = (prediction == data.y).sum().item()
  all = data.y.shape[0]
  model.train()

  return correct, all

In [27]:
#@title Dataset Download { form-width: "15%" }
import requests
from os import getcwd

url_epl = "https://raw.githubusercontent.com/jokecamp/FootballData/master/EPL%202011-2019/PL_scraped_ord.csv"
url_fk = "https://raw.githubusercontent.com/masoudmousavi/Sports-Analysis-with-GNNs/main/FakeData_EPL.csv?token=ARGPVT77P62L4SHH6LT2DCLBNRTYS"
# current_directory = getcwd()
filename_rl = 'dataset.csv'
filename_fk = 'data/old_FakeData_EPL.csv'
# req_rl = requests.get(url_epl)
# req_fk = requests.get(url_fk)

dataset_filename = filename_fk

# if req_rl.status_code == 200:
#   with open(filename_rl, 'wb') as fp:
#     fp.write(req_rl.content)
# else:
#   print(f'Error downloading file at {url_epl}')
# if req_fk.status_code == 200:
#   with open(filename_fk, 'wb') as fp:
#     fp.write(req_fk.content)
# else:
#   print(f'Error downloading file at {url_fk}')

In [28]:
#@title Dataset Loading and Cleaning { form-width: "15px" }
dataset = pd.read_csv(
    dataset_filename,
    encoding='latin-1',
    usecols=['season', 'match_week', 'home_team', 'away_team', 'result', 'home_lineup', 'away_lineup']
)
corrupted = dataset.loc[pd.isna(dataset['away_lineup']) | pd.isna(dataset['home_lineup'])]
dataset = dataset.drop(corrupted.index, axis=0)
dataset = dataset.reset_index(drop=True)


In [29]:
#@title Log { form-width: "15%" }
logging.basicConfig(
    filename='model-logs.log',
    filemode='w',
    level=logging.INFO
)


In [30]:
# #@title Hyperparameters File
# hp_file = open('hyperparameters.json', 'w')
# hyperparameters = {
#     "learning_rate": 1e-3,
#     "num_epochs": 200,
#     "fc_dropout":0.01,
#     "conv_dropout": 0.01,
#     "emb_dropout": 0.01,
#     "train_messaging_graph_size": 440,
#     "val_messaging_graph_size": 440,
#     "test_messaging_graph_size": 440,
#     "iter_size": 10,
#     "val_week_denom": 50,
#     "test_week_denom": 60,
#     "embedding_dim": 32,
#     "conv_dims":[
#           32,
#           32, 
#           32,
#           32
#     ],
#     "fully_connected_dims":[
#               32,
#               32
#     ]
# }

# json.dump(hyperparameters, hp_file, indent= 4)
# hp_file.close()

In [31]:
#@title Model and Model Hyperparameters { form-width: "15%" }
log_supervision_matches = True
with open('hyperparameters.json', 'r') as hp_file:
  hyperparameters = json.load(hp_file)
learning_rate = hyperparameters["learning_rate"]
num_epochs = hyperparameters["num_epochs"]
fc_dropout = hyperparameters["fc_dropout"]
conv_dropout = hyperparameters["conv_dropout"]
emb_dropout = hyperparameters["emb_dropout"]

remove_supervision_links = True

entities = gen_entities(dataset)

######################################## Scheme 4
train_messaging_graph_size = hyperparameters["train_messaging_graph_size"]
val_messaging_graph_size = hyperparameters["val_messaging_graph_size"]
test_messaging_graph_size = hyperparameters["test_messaging_graph_size"]
iter_size = hyperparameters["iter_size"]
val_week_denom = hyperparameters["val_week_denom"]
test_week_denom = hyperparameters["test_week_denom"]
######################################## Parameters

model = HeteroGNN(
    embedding_dims=(
        max(entities.values()) + 1,
        hyperparameters["embedding_dim"]
    ),
    conv_dims=hyperparameters["conv_dims"],
    fully_connected_dims=hyperparameters["fully_connected_dims"],
    dropout={
        "emb": emb_dropout,
        "conv": conv_dropout,
        "fc": fc_dropout
    }
).to(Globals.DEVICE.value)

print(model)

optimizer = Adam(
    model.parameters(),
    lr=learning_rate
)
criterion = NLLLoss()

HeteroGNN(
  (embed): Embedding(488, 32)
  (conv_layers): ModuleList(
    (0): HeteroConv(num_relations=9)
    (1): HeteroConv(num_relations=9)
    (2): HeteroConv(num_relations=9)
    (3): HeteroConv(num_relations=9)
  )
  (fully_connected_layers): ModuleList(
    (0): Linear(in_features=64, out_features=32, bias=True)
    (1): Linear(in_features=32, out_features=32, bias=True)
    (2): Linear(in_features=32, out_features=3, bias=True)
  )
  (classifier): LogSoftmax(dim=1)
)


In [None]:
#@title Data Batch Maker DO NOT RUN{ form-width: "15%" }
train_batches = list()
val_batches = list()
test_batches = list()

for i in range(train_messaging_graph_size, dataset.shape[0], iter_size):
      if i % val_week_denom == 0:
        ######################## Validation ########################
        from_match = i - val_messaging_graph_size
        to_match = i - 1
        model.mode = 'dev'

        validation_df = dataset.loc[from_match: to_match, :]
        val_graph_data = batch_gen(
              validation_df,
              entities=entities,
              messaging=validation_df.index,
              remove_supervision_links=remove_supervision_links,
              log_supervision_matches=log_supervision_matches
          )
        val_batches.append(val_graph_data)

      elif i % test_week_denom == 0:
        ######################## Test ########################
        model.eval()
        model.mode = 'test'
        
        from_match = i - test_messaging_graph_size
        to_match = i - 1

        test_df = dataset.loc[from_match: to_match, :]
        test_graph_data = batch_gen(
            test_df,
            entities=entities,
            messaging=test_df.index,
            remove_supervision_links=remove_supervision_links,
            log_supervision_matches=log_supervision_matches
        )
        
        test_batches.append(test_graph_data)

      else:
        ######################## Train ########################

        from_match = i - train_messaging_graph_size
        to_match = i - 1
        model.mode = 'train'

        train_df = dataset.loc[from_match: to_match, :]
        train_graph_data = batch_gen(
            train_df,
            entities=entities,
            messaging=train_df.index,
            remove_supervision_links=remove_supervision_links,
            log_supervision_matches=log_supervision_matches
        )

        train_batches.append(train_graph_data)

In [None]:
#@title Model Fitting Moving Partial Graph DO NOT RUN { form-width: "15%" }
try:
  train_losses = list()
  train_accuracies = list()
  val_accuracies = list()
  
  for epoch in range(num_epochs):
    epoch_loss = 0
    val_correct = 0
    val_all = 0
    train_all = 0
    train_correct = 0

    for index, train_graph_data in enumerate(train_batches):
       ######################## Train ########################
        model.train()
        model.mode = 'train'

        train_batch_loss, train_batch_correct, train_batch_all = train(
              model=model,
              data=train_graph_data,
              optimizer=optimizer,
              loss_fn=criterion
          )

        print(f'Batch {index + 1} of Epoch {epoch}: Accuracy: {train_batch_correct / train_batch_all:.4f}')

        epoch_loss += train_batch_loss
        train_correct += train_batch_correct
        train_all += train_batch_all

        ######################## Validation ########################
        model.eval()
        model.mode = 'dev'

        val_batch_correct, val_batch_all = evaluate(
            model=model,
            data=val_batches[index%len(val_batches)]
        )

        val_correct += val_batch_correct
        val_all += val_batch_all
      
    ########## end of epoch ###########
    print(f'{"="*32} Epoch {epoch + 1} {"="*32}')
    print(f'Train Loss:          {epoch_loss:.4f}')
    print(f'Train Cost:          {epoch_loss / train_all:.4f}')
    print(f'Train Accuracy:      {train_correct * 100 / train_all:.3f}%')
    print(f'Validation Accuracy: {val_correct * 100 / val_all:.3f}%')
    logging.info(f'{"="*32} Epoch {epoch + 1} {"="*32}')
    logging.info(f'Train Loss:          {epoch_loss:.4f}')
    logging.info(f'Train Cost:          {epoch_loss / train_all:.4f}')
    logging.info(f'Train Accuracy:      {train_correct * 100 / train_all:.3f}%')
    logging.info(f'Validation Accuracy: {val_correct * 100 / val_all:.3f}%')

    train_losses.append(epoch_loss)
    train_accuracies.append(train_correct * 100 / train_all)
    val_accuracies.append(val_correct * 100 / val_all)

except KeyboardInterrupt:
  pass

In [None]:
#@title Results DO NOT RUN{ form-width: "15%" }
import matplotlib.pyplot as plt
import numpy as np
t = [i for i in list(range(len(train_losses)))]
t = np.array(t)
y1 = np.array(train_losses)
y2 = np.array(train_accuracies)
y3 = np.array(val_accuracies)
fig = plt.gcf()
plt.plot(t, y1)
plt.title("Train Loss")
plt.xlabel("Epochs")
plt.ylabel("Loss")
#plt.legend(['Train', 'Validation'])
fig.set_size_inches(20, 10)

In [None]:
#@title Model Test DO NOT RUN{ form-width: "25%" }

test_correct = 0
test_all = 0

for test_graph_data in test_batches:
  model.eval()
  model.mode = 'test'

  test_batch_correct, test_batch_all = evaluate(
      model=model,
      data=test_graph_data
  )
  test_correct += test_batch_correct
  test_all += test_batch_all

print(f'Test Accuracy: {test_correct * 100 / test_all:.3f}%')
logging.info('=' * 70)
logging.info(f'Test Accuracy: {test_correct * 100 / test_all:.3f}%')



In [None]:
# @title Model Save DO NOT RUN
torch.save(model.state_dict(), 'model.pth')

In [32]:
#@title VERY IMPORTANT MAIN { form-width: "15%" }

# this is the dataframe from which you want to create your graph
dss = dataset.loc[0: 10]

# messaging which should be an iterable of indices contains matches that are in the message-passing graph
messaging = [0, 1, 3, 5, 6, 9, 10]

# supervision which should be an iterable of indices contains matches that are removed
# from the message-passing graph and are used for Link Prediction
# could also be None if remove_supervision_links is False
supervision=[2, 4, 7, 8] 

# hd is the HeteroData graph 
hd = batch_gen(
    dss,
    entities=entities, # must never be anything else
    messaging = messaging, 
    supervision=supervision, 
    remove_supervision_links=True
)

hd_ei = hd.edge_index_dict
print('"won" edge_list')
print(hd_ei[('team', 'won', 'team')].t())
print('#' * 64)
player_nodes, team_nodes = nodes_gen(dss)
player_hashes = {value: key for key, value in player_nodes.items()}
team_hashes = {value: key for key, value in team_nodes.items()}

print('Team nodes and hashes')
for i, j in team_nodes.items():
  print(f'{j:<40}{i}')
print('#' * 64)
print('Result of supervision matches')
print(hd.y)
print('#' * 64)
print('home teams')
print(hd.home_list)
print('#' * 64)
print('away teams')
print(hd.away_list)
print('#' * 64)
print('home nodes')
print([team_hashes[team_node] for team_node in hd.home_list])
print('#' * 64)
print('away nodes')
print([team_hashes[team_node] for team_node in hd.away_list])
dss


"won" edge_list
tensor([[ 6,  7],
        [10, 11],
        [20, 21],
        [ 1,  0],
        [ 3,  2],
        [13, 12],
        [19, 18]], device='cuda:0')
################################################################
Team nodes and hashes
0                                       Queens Park Rangers*0
1                                       Swansea City*0
2                                       Bolton Wanderers*1
3                                       Arsenal*1
4                                       Tottenham Hotspur*2
5                                       Fulham*2
6                                       Norwich City*3
7                                       West Bromwich Albion*3
8                                       Liverpool*4
9                                       Stoke City*4
10                                      Manchester United*5
11                                      Sunderland*5
12                                      Wolverhampton Wanderers*6
13              

Unnamed: 0,season,match_week,home_team,away_team,result,home_lineup,away_lineup
0,1990,1,Queens Park Rangers,Swansea City,away,Diniyar Bilyaletdinov - Ronald Zubar - Rafael ...,Ali Al Habsi - Aaron Ramsey - Grant Holt - Eri...
1,1990,1,Bolton Wanderers,Arsenal,away,Ramires - David N'Gog - Steven Taylor - Sam Ri...,Brad Guzan - Theo Walcott - Phil Neville - Cra...
2,1990,1,Tottenham Hotspur,Fulham,away,Ricardo Fuller - Titus Bramble - GaÃ«l Clichy ...,Peter Crouch - John Terry - Kevin Foley - Stew...
3,1990,1,Norwich City,West Bromwich Albion,home,Park Ji-Sung - Pavel Pogrebnyak - Magaye Gueye...,Jack Rodwell - Lucas Leiva - David Stockdale -...
4,1990,1,Liverpool,Stoke City,away,Stefan Savic - David Jones - Luka Modric - Luk...,Philippe Senderos - Scott Dann - Simon Vukcevi...
5,1990,1,Manchester United,Sunderland,home,Sam Hutchinson - Frank Lampard - Jay Bothroyd ...,David Goodwillie - Ross Barkley - Chris Smalli...
6,1990,1,Wolverhampton Wanderers,Aston Villa,away,Ryan Giggs - Danny Guthrie - Fabrice Muamba - ...,Shaun Wright-Phillips - Gabriel Obertan - Sylv...
7,1990,1,Chelsea,Newcastle United,home,Brett Emerton - Fitz Hall - Elliott Bennett - ...,Younes Kaboul - Tuncay Sanli - Christopher Sam...
8,1990,1,Everton,Manchester City,away,Danny Simpson - Fraizer Campbell - Jake Liverm...,Declan Rudd - Leighton Baines - David Junior H...
9,1990,1,Blackburn Rovers,Wigan Athletic,away,Mikel Arteta - Matt Upson - Jamie Mackie - Ben...,David Silva - Wayne Hennessey - Wes Brown - Da...


In [54]:
model(hd)

tensor([[-0.3599, -1.4278, -2.7743],
        [-0.9239, -0.5582, -3.4802],
        [-0.4334, -1.2613, -2.6826],
        [-0.5614, -1.0585, -2.4934]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward>)

In [32]:
weeks = []

for season, seasondf in dataset.groupby('season'):
    for w, weekdf in seasondf.groupby('match_week'):
        weeks.append(weekdf)
        if len(weeks) > 100: break
    if len(weeks) > 100: break

        


In [33]:
weeks[10]

Unnamed: 0,season,match_week,home_team,away_team,result,home_lineup,away_lineup
100,1990,11,Queens Park Rangers,Blackburn Rovers,home,Craig Gardner - Ahmed El Mohamady - Adam Hammi...,Yaya TourÃ© - Ryan Taylor - Gabriel Tamas - Ci...
101,1990,11,Wigan Athletic,Everton,tie,Jason Roberts - Rafael - Wes Brown - Anton Fer...,Bryan Ruiz - Fraizer Campbell - Yohan Cabaye -...
102,1990,11,Manchester City,Chelsea,home,Adam Drury - Dimitar Berbatov - Leighton Baine...,Maynor Figueroa - Fitz Hall - Joe Allen - Elli...
103,1990,11,Newcastle United,Wolverhampton Wanderers,tie,MÃ­chel Salgado - Christopher Samba - Cheick T...,Kevin Doyle - Vincent Kompany - Danny Guthrie ...
104,1990,11,Aston Villa,Manchester United,home,Peter Odemwingie - Steven Nzonzi - John Ruddy ...,Danny Higginbotham - Sam Hutchinson - Patrice ...
105,1990,11,Sunderland,Liverpool,away,Danny Welbeck - Glenn Whelan - AndrÃ© Santos -...,Michael Dawson - Luke Young - Martin Petrov - ...
106,1990,11,Stoke City,Norwich City,away,Shaun Maloney - Scott Dann - Stiliyan Petrov -...,Pavel Pogrebnyak - Mario Balotelli - Phil Jagi...
107,1990,11,West Bromwich Albion,Tottenham Hotspur,away,Adel Taarabt - John Heitinga - Ivan Klasnic - ...,Zat Knight - Branislav Ivanovic - Steven Fletc...
108,1990,11,Fulham,Bolton Wanderers,tie,Zak Whitbread - Nedum Onuoha - Peter Crouch - ...,Petr Cech - Marc Albrighton - Ben Foster - Tom...
109,1990,11,Arsenal,Swansea City,away,Jordan Henderson - Anders Lindegaard - Kieran ...,Aaron Ramsey - Gareth Barry - Emmerson Boyce -...


In [34]:
from tqdm.notebook import tqdm 

In [35]:
#Last week is supervision and removed
train_window_size = 30
eval_window_size = 10
first_batch = batch_gen(
        dataset.loc[
            weeks[0].index[0]: weeks[0].index[-1],
            :
        ],
        entities=entities, # must never be anything else
        messaging = [], 
        supervision= list(range(weeks[0].index[0], weeks[0].index[-1] + 1)), 
        remove_supervision_links=True
    )
train_graphs = []
train_graphs.append(first_batch)
eval_graphs = []
for weeknumber, w in enumerate(tqdm(weeks)):
    train_start_point = max(0, weeknumber - train_window_size + 1)
    train_end_point = weeknumber
    #print('train', train_start_point, train_end_point)
    #Call Train On weeks[startpoint: endpoint + 1]
    if train_end_point > 0:
        hd = batch_gen(
            dataset.loc[
                weeks[0].index[0]: weeks[train_end_point].index[-1]
            ],
            entities=entities, # must never be anything else
            messaging = list(range(weeks[0].index[0], weeks[train_end_point - 1].index[-1] + 1)), 
            supervision= list(range(weeks[train_end_point].index[0], weeks[train_end_point].index[-1] + 1)), 
            remove_supervision_links=True
        )
        train_graphs.append(hd)

    
    # print(f'train window: [{train_start_point}:{train_end_point + 1}) - window train loss: {train_window_loss: .4f} - window train accuracy: {train_window_acc*100: .2f}%')
    # gm.addWeekMatchresulttoGraph(weeks[train_end_point], dt)



    eval_start_point = train_end_point + 1
    eval_end_point = eval_start_point + eval_window_size - 1
    #print('eval', eval_start_point, eval_end_point)
    if eval_end_point > len(weeks) - 1:
        eval_end_point = len(weeks) - 1
        #print('eval cor', eval_start_point, eval_end_point)
        
    if weeknumber == len(weeks) - 1:
        continue

    #if eval_start_point == 1:
    ehd = batch_gen(
        dataset.loc[
            weeks[0].index[0]: weeks[eval_end_point].index[-1]
        ],
        entities=entities, # must never be anything else
        messaging = list(range(weeks[0].index[0], weeks[train_end_point].index[-1] + 1)), 
        supervision= list(range(weeks[eval_start_point].index[0], weeks[eval_end_point].index[-1] + 1)), 
        remove_supervision_links=True
    )
    eval_graphs.append(ehd)
    # else:
    #     gm.addWeekMatchNodestoGraph(weeks[eval_end_point], dt)
    #     gm.addWeekPlayerstoGraph(weeks[eval_end_point], dt)
    # #TODO Call Eval
    # eval_window_loss, eval_window_acc = eval_OnWindow(weeks[eval_start_point: eval_end_point + 1], gm)
    # print(f'eval window: [{eval_start_point}:{eval_end_point}) - window eval loss: {eval_window_loss: .4f} - window eval accuracy: {eval_window_acc*100: .2f}%')
    # #print(f'weeks[{start_point} : {end_point + 1}]')
    # #print(f'Week: {weeknumber}, Start: {start_point}, End: {end_point}')


  0%|          | 0/101 [00:00<?, ?it/s]

In [None]:
#All weeks are supervision - nothing removed
train_window_size = 30
eval_window_size = 10
first_batch = batch_gen(
        dataset.loc[
            weeks[0].index[0]: weeks[0].index[-1],
            :
        ],
        entities=entities, # must never be anything else
        messaging = list(range(weeks[0].index[0], weeks[0].index[-1] + 1)), 
        supervision= list(range(weeks[0].index[0], weeks[0].index[-1] + 1)), 
        remove_supervision_links=False
    )
train_graphs = []
train_graphs.append(first_batch)
eval_graphs = []
for weeknumber, w in enumerate(tqdm(weeks)):
    train_start_point = max(0, weeknumber - train_window_size + 1)
    train_end_point = weeknumber
    #print('train', train_start_point, train_end_point)
    #Call Train On weeks[startpoint: endpoint + 1]
    if train_end_point > 0:
        hd = batch_gen(
            dataset.loc[
                weeks[0].index[0]: weeks[train_end_point].index[-1]
            ],
            entities=entities, # must never be anything else
            messaging = list(range(weeks[0].index[0], weeks[train_end_point].index[-1] + 1)), 
            supervision= list(range(weeks[0].index[0], weeks[train_end_point].index[-1] + 1)), 
            remove_supervision_links=False
        )
        train_graphs.append(hd)

    
    # print(f'train window: [{train_start_point}:{train_end_point + 1}) - window train loss: {train_window_loss: .4f} - window train accuracy: {train_window_acc*100: .2f}%')
    # gm.addWeekMatchresulttoGraph(weeks[train_end_point], dt)



    eval_start_point = train_end_point + 1
    eval_end_point = eval_start_point + eval_window_size - 1
    #print('eval', eval_start_point, eval_end_point)
    if eval_end_point > len(weeks) - 1:
        eval_end_point = len(weeks) - 1
        #print('eval cor', eval_start_point, eval_end_point)
        
    if weeknumber == len(weeks) - 1:
        continue

    #if eval_start_point == 1:
    ehd = batch_gen(
        dataset.loc[
            weeks[0].index[0]: weeks[eval_end_point].index[-1]
        ],
        entities=entities, # must never be anything else
        messaging = list(range(weeks[0].index[0], weeks[train_end_point].index[-1] + 1)), 
        supervision= list(range(weeks[eval_start_point].index[0], weeks[eval_end_point].index[-1] + 1)), 
        remove_supervision_links=True
    )
    eval_graphs.append(ehd)
    # else:
    #     gm.addWeekMatchNodestoGraph(weeks[eval_end_point], dt)
    #     gm.addWeekPlayerstoGraph(weeks[eval_end_point], dt)
    # #TODO Call Eval
    # eval_window_loss, eval_window_acc = eval_OnWindow(weeks[eval_start_point: eval_end_point + 1], gm)
    # print(f'eval window: [{eval_start_point}:{eval_end_point}) - window eval loss: {eval_window_loss: .4f} - window eval accuracy: {eval_window_acc*100: .2f}%')
    # #print(f'weeks[{start_point} : {end_point + 1}]')
    # #print(f'Week: {weeknumber}, Start: {start_point}, End: {end_point}')


In [36]:
test_graph = batch_gen(
    dataset.loc[
        weeks[0].index[0]: weeks[len(weeks) - 1].index[-1] + 10
    ],
    entities=entities, # must never be anything else
    messaging = list(range(weeks[0].index[0],  weeks[len(weeks) - 1].index[-1] + 1)), 
    supervision= list(range(weeks[len(weeks) - 1].index[-1] + 1, weeks[len(weeks) - 1].index[-1] + 10 + 1)), 
    remove_supervision_links=True
)

In [37]:
for epoch in range(30):
    tcorrect = 0
    tloss = 0.0
    ttotal = 0
    ecorrect = 0
    etotal = 0
    for i in tqdm(range(len(eval_graphs)), leave= False):
        itloss, itcorrect, ittotal = train(model, train_graphs[i], optimizer, criterion)
        tloss += itloss
        tcorrect += itcorrect
        ttotal += ittotal

        iecorrect, ietotal = evaluate(model, eval_graphs[i])
        ecorrect += iecorrect
        etotal += ietotal
    itloss, itcorrect, ittotal = train(model, train_graphs[-1], optimizer, criterion)
    tloss += itloss
    tcorrect += itcorrect
    ttotal += ittotal

    print(f'epoch: {epoch} - train loss {tloss / len(train_graphs)} - train acc: {tcorrect / ttotal} - eval acc: {ecorrect/ etotal}')


  0%|          | 0/100 [00:00<?, ?it/s]

epoch: 0 - train loss 1.028381904753128 - train acc: 0.4801980198019802 - eval acc: 0.4630366492146597


  0%|          | 0/100 [00:00<?, ?it/s]

epoch: 1 - train loss 0.9872966172671555 - train acc: 0.5188118811881188 - eval acc: 0.5002094240837697


  0%|          | 0/100 [00:00<?, ?it/s]

epoch: 2 - train loss 0.9859013758083381 - train acc: 0.5118811881188119 - eval acc: 0.5101570680628272


  0%|          | 0/100 [00:00<?, ?it/s]

epoch: 3 - train loss 0.9547280049560094 - train acc: 0.5485148514851486 - eval acc: 0.533717277486911


  0%|          | 0/100 [00:00<?, ?it/s]

epoch: 4 - train loss 0.9171098098896517 - train acc: 0.5792079207920792 - eval acc: 0.5475392670157068


  0%|          | 0/100 [00:00<?, ?it/s]

epoch: 5 - train loss 0.8660227954387665 - train acc: 0.598019801980198 - eval acc: 0.5867015706806282


  0%|          | 0/100 [00:00<?, ?it/s]

epoch: 6 - train loss 0.8238770114903403 - train acc: 0.6475247524752475 - eval acc: 0.6108900523560209


  0%|          | 0/100 [00:00<?, ?it/s]

epoch: 7 - train loss 0.7887769120164437 - train acc: 0.6643564356435644 - eval acc: 0.6530890052356021


  0%|          | 0/100 [00:00<?, ?it/s]

epoch: 8 - train loss 0.7295259520559028 - train acc: 0.6861386138613862 - eval acc: 0.663979057591623


  0%|          | 0/100 [00:00<?, ?it/s]

epoch: 9 - train loss 0.6698221649863932 - train acc: 0.7178217821782178 - eval acc: 0.7007329842931938


  0%|          | 0/100 [00:00<?, ?it/s]

epoch: 10 - train loss 0.5994050483597387 - train acc: 0.7475247524752475 - eval acc: 0.726282722513089


  0%|          | 0/100 [00:00<?, ?it/s]

epoch: 11 - train loss 0.5485693306320965 - train acc: 0.7673267326732673 - eval acc: 0.7479581151832461


  0%|          | 0/100 [00:00<?, ?it/s]

epoch: 12 - train loss 0.5073806448738174 - train acc: 0.7871287128712872 - eval acc: 0.7641884816753927


  0%|          | 0/100 [00:00<?, ?it/s]

epoch: 13 - train loss 0.4505410390620184 - train acc: 0.807920792079208 - eval acc: 0.796020942408377


  0%|          | 0/100 [00:00<?, ?it/s]

epoch: 14 - train loss 0.4126871133794879 - train acc: 0.8465346534653465 - eval acc: 0.7962303664921466


  0%|          | 0/100 [00:00<?, ?it/s]

epoch: 15 - train loss 0.43850302511807715 - train acc: 0.8287128712871287 - eval acc: 0.792041884816754


  0%|          | 0/100 [00:00<?, ?it/s]

epoch: 16 - train loss 0.40128934228479274 - train acc: 0.8287128712871287 - eval acc: 0.8048167539267016


  0%|          | 0/100 [00:00<?, ?it/s]

epoch: 17 - train loss 0.33419535652098087 - train acc: 0.8663366336633663 - eval acc: 0.8339267015706806


  0%|          | 0/100 [00:00<?, ?it/s]

epoch: 18 - train loss 0.31896022917463046 - train acc: 0.8633663366336634 - eval acc: 0.8494240837696335


  0%|          | 0/100 [00:00<?, ?it/s]

epoch: 19 - train loss 0.3480239796756518 - train acc: 0.8712871287128713 - eval acc: 0.8419895287958116


  0%|          | 0/100 [00:00<?, ?it/s]

epoch: 20 - train loss 0.3026605173530481 - train acc: 0.8950495049504951 - eval acc: 0.8656544502617801


  0%|          | 0/100 [00:00<?, ?it/s]

epoch: 21 - train loss 0.23600103577027226 - train acc: 0.9168316831683169 - eval acc: 0.8760209424083769


  0%|          | 0/100 [00:00<?, ?it/s]

epoch: 22 - train loss 0.21033620749286716 - train acc: 0.9198019801980198 - eval acc: 0.886806282722513


  0%|          | 0/100 [00:00<?, ?it/s]

epoch: 23 - train loss 0.24097525511895962 - train acc: 0.9118811881188119 - eval acc: 0.8879581151832461


  0%|          | 0/100 [00:00<?, ?it/s]

epoch: 24 - train loss 0.24757891590015427 - train acc: 0.9148514851485149 - eval acc: 0.8735078534031414


  0%|          | 0/100 [00:00<?, ?it/s]

epoch: 25 - train loss 0.20041987890525176 - train acc: 0.9267326732673268 - eval acc: 0.8944502617801047


  0%|          | 0/100 [00:00<?, ?it/s]

epoch: 26 - train loss 0.19624797130778138 - train acc: 0.9297029702970298 - eval acc: 0.8852356020942408


  0%|          | 0/100 [00:00<?, ?it/s]

epoch: 27 - train loss 0.20453559263554025 - train acc: 0.9277227722772278 - eval acc: 0.8873298429319372


  0%|          | 0/100 [00:00<?, ?it/s]

epoch: 28 - train loss 0.16846260459780102 - train acc: 0.9306930693069307 - eval acc: 0.8958115183246074


  0%|          | 0/100 [00:00<?, ?it/s]

epoch: 29 - train loss 0.16506246460767665 - train acc: 0.9356435643564357 - eval acc: 0.9073298429319372


In [40]:
evaluate(model, test_graph)

(7, 10)

In [55]:
log_supervision_matches = True
with open('hyperparameters.json', 'r') as hp_file:
  hyperparameters = json.load(hp_file)
learning_rate = hyperparameters["learning_rate"]
num_epochs = hyperparameters["num_epochs"]
fc_dropout = hyperparameters["fc_dropout"]
conv_dropout = hyperparameters["conv_dropout"]
emb_dropout = hyperparameters["emb_dropout"]

remove_supervision_links = True

#entities = gen_entities(dataset)

######################################## Scheme 4
train_messaging_graph_size = hyperparameters["train_messaging_graph_size"]
val_messaging_graph_size = hyperparameters["val_messaging_graph_size"]
test_messaging_graph_size = hyperparameters["test_messaging_graph_size"]
iter_size = hyperparameters["iter_size"]
val_week_denom = hyperparameters["val_week_denom"]
test_week_denom = hyperparameters["test_week_denom"]
######################################## Parameters

model2 = HeteroGNN(
    embedding_dims=(
        max(entities.values()) + 1,
        hyperparameters["embedding_dim"]
    ),
    conv_dims=hyperparameters["conv_dims"],
    fully_connected_dims=hyperparameters["fully_connected_dims"],
    dropout={
        "emb": emb_dropout,
        "conv": conv_dropout,
        "fc": fc_dropout
    }
).to(Globals.DEVICE.value)

print(model2)

optimizer2 = Adam(
    model.parameters(),
    lr=learning_rate
)
criterion2 = NLLLoss()

HeteroGNN(
  (embed): Embedding(488, 32)
  (conv_layers): ModuleList(
    (0): HeteroConv(num_relations=9)
    (1): HeteroConv(num_relations=9)
    (2): HeteroConv(num_relations=9)
    (3): HeteroConv(num_relations=9)
  )
  (fully_connected_layers): ModuleList(
    (0): Linear(in_features=64, out_features=32, bias=True)
    (1): Linear(in_features=32, out_features=32, bias=True)
    (2): Linear(in_features=32, out_features=3, bias=True)
  )
  (classifier): LogSoftmax(dim=1)
)


In [56]:
for i in tqdm(range(len(eval_graphs)), leave= False):
    tcorrect = 0
    tloss = 0.0
    ttotal = 0
    ecorrect = 0
    etotal = 0
    for epoch in range(30):
        itloss, itcorrect, ittotal = train(model2, train_graphs[i], optimizer2, criterion2)
        tloss += itloss
        tcorrect += itcorrect
        ttotal += ittotal

        iecorrect, ietotal = evaluate(model2, eval_graphs[i])
        ecorrect += iecorrect
        etotal += ietotal
    # itloss, itcorrect, ittotal = train(model2, train_graphs[-1], optimizer, criterion)
    # tloss += itloss
    # tcorrect += itcorrect
    # ttotal += ittotal

    print(f'window: {i} - train loss {tloss / 30} - train acc: {tcorrect / ttotal} - eval acc: {ecorrect/ etotal}')


  0%|          | 0/100 [00:00<?, ?it/s]

window: 0 - train loss 1.2793646812438966 - train acc: 0.32666666666666666 - eval acc: 0.29
window: 1 - train loss 1.1261347909768422 - train acc: 0.3233333333333333 - eval acc: 0.29
window: 2 - train loss 1.1144853154818217 - train acc: 0.3933333333333333 - eval acc: 0.28
window: 3 - train loss 1.2660636186599732 - train acc: 0.19 - eval acc: 0.3
window: 4 - train loss 1.1652896801630657 - train acc: 0.23 - eval acc: 0.33
window: 5 - train loss 1.2050739725430806 - train acc: 0.27666666666666667 - eval acc: 0.33
window: 6 - train loss 1.1748838464419047 - train acc: 0.39666666666666667 - eval acc: 0.31
window: 7 - train loss 1.1706843733787538 - train acc: 0.38 - eval acc: 0.28
window: 8 - train loss 1.3565096497535705 - train acc: 0.25333333333333335 - eval acc: 0.32
window: 9 - train loss 1.3142128229141234 - train acc: 0.22333333333333333 - eval acc: 0.35
window: 10 - train loss 1.271288847923279 - train acc: 0.2633333333333333 - eval acc: 0.35
window: 11 - train loss 1.32577224572

In [57]:
evaluate(model2, test_graph)

(32, 100)

In [51]:
dff.iloc[-1].index

Index(['season', 'match_week', 'home_team', 'away_team', 'result',
       'home_lineup', 'away_lineup'],
      dtype='object')

In [63]:
weeks[0]['home_lineup'].iloc[0]

'Diniyar Bilyaletdinov - Ronald Zubar - Rafael van der Vaart - Peter LÃ¸venkrands - Cameron Jerome - Louis Saha - Alex - Mahamadou Diarra - James Collins - Damien Duff - Joleon Lescott - '

In [52]:
#@title Custom Train Method { form-width: "15%" }
model.reset_parameters()

dataframe_train = dataset.loc[0:6800, :]
data_frame_val = dataset.loc[6801:, :]

for _ in range(100):
  messaging_indcs = dataframe.sample(frac=0.85).index
  supervision_indcs = dataframe.drop(messaging_indcs).index
  hetero_data = batch_gen(
      dataframe_train, 
      entities, 
      remove_supervision_links=True,
      messaging=messaging_indcs,
      supervision=supervision_indcs
  )

  print(train(model, hetero_data, optimizer, criterion))

messaging_indcs = dataframe_val.sample(frac=0.85).index
supervision_indcs = dataframe_val.drop(messaging_indcs).index
# evaluate(model, hetero_data2)
hetero_data2 = batch_gen(
    dataframe_val, 
    entities, 
    remove_supervision_links=True,
    supervision=supervision_indcs
)
for _ in range(15):
  print(train(model, hetero_data, optimizer, criterion))


for _ in range(150):
  print(train(model, hetero_data2, optimizer, criterion))

NameError: name 'dataframe' is not defined

In [None]:
#@title Custom Training Method 2  { form-width: "15%" }

ds = dataset.loc[:6549, :]

hd = batch_gen(
    ds,
    entities=entities,
    messaging=ds.index,
    supervision=ds.sample(frac=0.3).index,
    remove_supervision_links=False)
for i in range(10000):
  for j in range(300, 3200, 100):

    ds = dataset.loc[j - 300:j - 1, :]

    r = ds.sample(frac=0.1).index

    hd = batch_gen(
        ds,
        entities=entities,
        messaging=ds.index,
        supervision=r,
        remove_supervision_links=False
    )
    
    print(f'Epoch {i+1}: {train(model, hd, optimizer, criterion)}')



In [33]:
#@title Custom Train Method 3 { form-width: "15%" }
df = dataset.loc[:9, :]

h_data = batch_gen(
    df,
    entities=entities,
    supervision=[2, 3, 4, 9, 6],
    remove_supervision_links=False
)

print(train(model, h_data, optimizer, criterion))
print(evaluate(model, h_data))

(0.14763778448104858, 4, 5)
(5, 5)
