In [4]:
import pandas as pd
import torch
from torch_geometric.data import Data, HeteroData
from torch_geometric.typing import EdgeType
import torch_geometric.transforms as T
from torch_geometric.transforms import RandomLinkSplit
from torch_geometric.utils import negative_sampling
from typing import Union, Optional, List
from copy import copy

In [5]:
seasons = [18, 19, 20, 21, 22]
players_df = pd.read_csv("player_data/player_data.csv")
matches_df = pd.read_csv("match_data/match_data.csv")

In [6]:
# Define player club position
#position_cols = [x for x in players_df.columns if "club_position_" in x]
positions = players_df["club_position"].unique()
player_club_position = players_df["club_position"].apply(lambda x: positions.index(x)).to_numpy()
player_club_position = torch.from_numpy(player_club_position).to(torch.float)
print(player_club_position)
print("Player Position Size")
print("====================")
print (player_club_position.size())   # 29 possible positions.

# Define player features:
player_feat = players_df.drop(columns=position_cols+["sofifa_id", "club_name"])
player_feat = player_feat.to_numpy(dtype="float64")
player_feat = torch.from_numpy(player_feat).to(torch.float)
print(player_feat)
print("Player Feature Size")
print("====================")
print (player_feat.size())   # 223 features in total.

AttributeError: 'numpy.ndarray' object has no attribute 'index'

In [4]:
# Define match features
feature_cols = ["FTHG", "FTAG", "HS", "AS", "HST", "AST", "HC", "AC", "HY", "AY", "HR", "AR"]
match_result = matches_df["FTR"].str.get_dummies()
match_feat = pd.concat((match_result, matches_df[feature_cols]), axis=1).to_numpy()
match_feat = torch.from_numpy(match_feat).to(torch.long)
print(match_feat)
print()
print("Match Feature Size")
print("====================")
print (match_feat.size())   # 3 results columns + 12 different match statistics.
print()

# Define match results:
#match_result = matches_df["FTR"].str.get_dummies().to_numpy()
#match_result = torch.from_numpy(match_result).to(torch.long)
#print(match_result)
#print()
#print("Match Result Size")
#print("====================")
#print (match_result.size())   # 3 possible results: Home win, Away win, Draw.
#print()

# Define match infos
info_cols = ["Season", "Week"]
match_info = matches_df[info_cols].to_numpy()
match_info = torch.from_numpy(match_info).to(torch.int)
print(match_info)
print()
print("Match Info Size")
print("====================")
print (match_info.size())   # 2 different types of information.
print()


tensor([[0, 1, 0,  ..., 0, 0, 0],
        [0, 0, 1,  ..., 0, 0, 0],
        [1, 0, 0,  ..., 0, 0, 0],
        ...,
        [1, 0, 0,  ..., 0, 0, 0],
        [0, 0, 1,  ..., 0, 0, 0],
        [0, 0, 1,  ..., 0, 0, 0]])

Match Feature Size
torch.Size([23457, 15])

tensor([[18,  1],
        [18,  1],
        [18,  1],
        ...,
        [22, 38],
        [22, 38],
        [22, 38]], dtype=torch.int32)

Match Info Size
torch.Size([23457, 2])



In [5]:
# Create a mapping from unique player indices to range [0, num_player_nodes):
players_df["PlayerID"] = players_df.apply(lambda row: str(row.season) + "_" + str(row.sofifa_id), axis=1)
unique_player_id = pd.DataFrame(data={
    'playerId': players_df["PlayerID"],
    'mappedID': pd.RangeIndex(len(players_df["PlayerID"])),
})
print("Mapping of player IDs to consecutive values:")
print("==========================================")
print(unique_player_id.head())
print()

# Replace team names with unique team IDs
team_ids = list(players_df["club_name"].unique())
team_season_ids = list(players_df.apply(lambda row: str(row.season) + "_" + str(team_ids.index(row.club_name) + 1), axis=1).unique())
players_df["club_id"] = players_df.apply(lambda row: str(row.season) + "_" + str(team_ids.index(row.club_name) + 1), axis=1)
matches_df["HomeID"] = matches_df.apply(lambda row: str(row.Season) + "_" + str(team_ids.index(row.HomeTeam) + 1), axis=1)
matches_df["AwayID"] = matches_df.apply(lambda row: str(row.Season) + "_" + str(team_ids.index(row.AwayTeam) + 1), axis=1)

# Create a mapping from unique club indices to range [0, num_match_nodes):
matches_df['MatchID'] = matches_df.apply(lambda row: str(row.Season) + "_" + str(team_ids.index(row.HomeTeam) + 1) + "_" + str(team_ids.index(row.AwayTeam) + 1), axis=1)
unique_club_id = pd.DataFrame(data={
    'clubId': team_season_ids,
    'mappedID': pd.RangeIndex(len(team_season_ids)),
})
print("Mapping of club IDs to consecutive values:")
print("===========================================")
print(unique_club_id.head())

Mapping of player IDs to consecutive values:
    playerId  mappedID
0   18_20801         0
1  18_158023         1
2  18_167495         2
3  18_176580         3
4  18_190871         4

Mapping of club IDs to consecutive values:
  clubId  mappedID
0   18_1         0
1   18_2         1
2   18_3         2
3   18_4         3
4   18_5         4


In [6]:
# Perform merge to obtain the edges from players and clubs:
player_club_id = torch.from_numpy(unique_player_id['mappedID'].values)
club_player_id = pd.merge(players_df['club_id'], unique_club_id,
                            left_on='club_id', right_on='clubId', how='left')
club_player_id = torch.from_numpy(club_player_id['mappedID'].values)

print()
print("Player_club_id:")
print("=================================================")
print(player_club_id)
print()
print("Club_player_id:")
print("=================================================")
print(club_player_id)

# With this, we are ready to construct our `edge_index` in COO format
# following PyG semantics:
edge_index_player_to_club = torch.stack([player_club_id, club_player_id], dim=0)
assert edge_index_player_to_club.size() == (2, len(player_club_id))

print()
print("Final edge indices pointing from players to clubs:")
print("=================================================")
print(edge_index_player_to_club)



Player_club_id:
tensor([    0,     1,     2,  ..., 36816, 36817, 36818])

Club_player_id:
tensor([   0,    1,    2,  ..., 1239, 1211, 1197])

Final edge indices pointing from players to clubs:
tensor([[    0,     1,     2,  ..., 36816, 36817, 36818],
        [    0,     1,     2,  ...,  1239,  1211,  1197]])


In [7]:
# Perform merge to obtain the edges from home teams to away teams:
home_club_id = pd.merge(matches_df['HomeID'], unique_club_id,
                            left_on='HomeID', right_on='clubId', how='left')
home_club_id = torch.from_numpy(home_club_id['mappedID'].values)
away_club_id = pd.merge(matches_df['AwayID'], unique_club_id,
                            left_on='AwayID', right_on='clubId', how='left')
away_club_id = torch.from_numpy(away_club_id['mappedID'].values)

print()
print("Home_club_id:")
print("=================================================")
print(home_club_id)
print()
print("Away_club_id:")
print("=================================================")
print(away_club_id)


# With this, we are ready to construct our `edge_index` in COO format
# following PyG semantics:
edge_index_club_to_club = torch.stack([home_club_id, away_club_id], dim=0)
assert edge_index_club_to_club.size() == (2, len(matches_df))

print()
print("Final edge indices pointing from home to away teams:")
print("=================================================")
print(edge_index_club_to_club)


Home_club_id:
tensor([ 109,  190,  172,  ..., 1080, 1112, 1192])

Away_club_id:
tensor([  92,  194,  198,  ..., 1146, 1183, 1246])

Final edge indices pointing from home to away teams:
tensor([[ 109,  190,  172,  ..., 1080, 1112, 1192],
        [  92,  194,  198,  ..., 1146, 1183, 1246]])


In [8]:
class LinkSplit(RandomLinkSplit):
    def __init__(
        self,
        num_val: Union[int, float] = 0.1,
        num_test: Union[int, float] = 0.2,
        is_undirected: bool = False,
        key: str = 'edge_label',
        split_labels: bool = False,
        add_negative_train_samples: bool = True,
        neg_sampling_ratio: float = 1.0,
        disjoint_train_ratio: Union[int, float] = 0.0,
        edge_types: Optional[Union[EdgeType, List[EdgeType]]] = None,
        rev_edge_types: Optional[Union[EdgeType, List[EdgeType]]] = None,
    ):
        super().__init__(
            num_val,
            num_test,
            is_undirected,
            key,
            split_labels,
            add_negative_train_samples,
            neg_sampling_ratio,
            disjoint_train_ratio,
            edge_types,
            rev_edge_types
        )
    def __call__(self, data: Union[Data, HeteroData]) -> Union[Data, HeteroData]:
        edge_types = self.edge_types
        rev_edge_types = self.rev_edge_types

        train_data, val_data, test_data = copy(data), copy(data), copy(data)

        if isinstance(data, HeteroData):
            if edge_types is None:
                raise ValueError(
                    "The 'RandomLinkSplit' transform expects 'edge_types' to "
                    "be specified when operating on 'HeteroData' objects")

            if not isinstance(edge_types, list):
                edge_types = [edge_types]
                rev_edge_types = [rev_edge_types]

            stores = [data[edge_type] for edge_type in edge_types]
            train_stores = [train_data[edge_type] for edge_type in edge_types]
            val_stores = [val_data[edge_type] for edge_type in edge_types]
            test_stores = [test_data[edge_type] for edge_type in edge_types]
        else:
            rev_edge_types = [None]
            stores = [data._store]
            train_stores = [train_data._store]
            val_stores = [val_data._store]
            test_stores = [test_data._store]

        for item in zip(stores, train_stores, val_stores, test_stores,
                        rev_edge_types):
            store, train_store, val_store, test_store, rev_edge_type = item

            is_undirected = self.is_undirected
            is_undirected &= not store.is_bipartite()
            is_undirected &= (rev_edge_type is None
                              or store._key == data[rev_edge_type]._key)

            edge_index = store.edge_index
            if is_undirected:
                mask = edge_index[0] <= edge_index[1]
                perm = mask.nonzero(as_tuple=False).view(-1)
                perm = perm[torch.randperm(perm.size(0), device=perm.device)]
            else:
                device = edge_index.device
                #perm = torch.randperm(edge_index.size(1), device=device)
                perm = torch.arange(start=0, end=edge_index.size(1), device=device)

            num_val = self.num_val
            if isinstance(num_val, float):
                num_val = int(num_val * perm.numel())
            num_test = self.num_test
            if isinstance(num_test, float):
                num_test = int(num_test * perm.numel())
            
            num_train = perm.numel() - num_val - num_test

            if num_train <= 0:
                raise ValueError("Insufficient number of edges for training")

            train_edges = perm[:num_train]
            val_edges = perm[num_train:num_train + num_val]
            test_edges = perm[num_train + num_val:]
            train_val_edges = perm[:num_train + num_val]
            
            num_disjoint = self.disjoint_train_ratio
            if isinstance(num_disjoint, float):
                num_disjoint = int(num_disjoint * train_edges.numel())
            if num_train - num_disjoint <= 0:
                raise ValueError("Insufficient number of edges for training")

            # Create data splits:
            self._split(train_store, train_edges[num_disjoint:], is_undirected,
                        rev_edge_type)
            self._split(val_store, train_edges, is_undirected, rev_edge_type)
            self._split(test_store, train_val_edges, is_undirected,
                        rev_edge_type)

            # Create negative samples:
            num_neg_train = 0
            if self.add_negative_train_samples:
                if num_disjoint > 0:
                    num_neg_train = int(num_disjoint * self.neg_sampling_ratio)
                else:
                    num_neg_train = int(num_train * self.neg_sampling_ratio)
            num_neg_val = int(num_val * self.neg_sampling_ratio)
            num_neg_test = int(num_test * self.neg_sampling_ratio)

            num_neg = num_neg_train + num_neg_val + num_neg_test

            size = store.size()
            if store._key is None or store._key[0] == store._key[-1]:
                size = size[0]
            neg_edge_index = negative_sampling(edge_index, size,
                                               num_neg_samples=num_neg,
                                               method='sparse')

            # Adjust ratio if not enough negative edges exist
            if neg_edge_index.size(1) < num_neg:
                num_neg_found = neg_edge_index.size(1)
                ratio = num_neg_found / num_neg
                warnings.warn(
                    f"There are not enough negative edges to satisfy "
                    "the provided sampling ratio. The ratio will be "
                    f"adjusted to {ratio:.2f}.")
                num_neg_train = int((num_neg_train / num_neg) * num_neg_found)
                num_neg_val = int((num_neg_val / num_neg) * num_neg_found)
                num_neg_test = num_neg_found - num_neg_train - num_neg_val

            # Create labels:
            if num_disjoint > 0:
                train_edges = train_edges[:num_disjoint]
            self._create_label(
                store,
                train_edges,
                neg_edge_index[:, num_neg_val + num_neg_test:],
                out=train_store,
            )
            self._create_label(
                store,
                val_edges,
                neg_edge_index[:, :num_neg_val],
                out=val_store,
            )
            self._create_label(
                store,
                test_edges,
                neg_edge_index[:, num_neg_val:num_neg_val + num_neg_test],
                out=test_store,
            )

        return train_data, val_data, test_data


In [16]:
data = HeteroData()

# Save node indices:
data["player"].node_id = torch.arange(len(unique_player_id))
data["club"].node_id = torch.arange(len(unique_club_id))

# Add the node features and edge indices:
data["player"].x = player_feat
data["player", "plays_for", "club"].edge_index = edge_index_player_to_club
data["player", "plays_for", "club"].attr = player_club_position
data["club", "plays_match_against", "club"].edge_index = edge_index_club_to_club
data["club", "plays_match_against", "club"].edge_label = match_feat


# We also need to make sure to add the reverse edges from clubs to players
# in order to let a GNN be able to pass messages in both directions.
# We can leverage the `T.ToUndirected()` transform for this from PyG:
data = T.ToUndirected()(data)
print(data)

assert data.node_types == ["player", "club"]
assert data.edge_types == [("player", "plays_for", "club"),
                           ("club", "plays_match_against", "club"),
                           ("club", "rev_plays_for", "player")]
assert data["player"].num_nodes == 36819
assert data["player"].num_features == 223
assert data["club"].num_nodes == 1269
assert data["club"].num_features == 0
assert data["player", "plays_for", "club"].num_edges == 36819
assert data["club", "rev_plays_for", "player"].num_edges == 36819
assert data["club", "plays_match_against", "club"].num_edges == 23740


HeteroData(
  [1mplayer[0m={
    node_id=[36819],
    x=[36819, 223]
  },
  [1mclub[0m={ node_id=[1269] },
  [1m(player, plays_for, club)[0m={
    edge_index=[2, 36819],
    attr=[36819, 29]
  },
  [1m(club, plays_match_against, club)[0m={
    edge_index=[2, 23740],
    edge_label=[23740, 15]
  },
  [1m(club, rev_plays_for, player)[0m={
    edge_index=[2, 36819],
    attr=[36819, 29]
  }
)


In [17]:
cutoff_club_id = unique_club_id[unique_club_id.clubId=="22_1"]["mappedID"].iloc[0]
train_mask = data["plays_match_against"].edge_index < cutoff_club_id

num_train_nodes = train_mask.sum(axis=1).numpy()[0]
num_val_nodes = int((data["plays_match_against"].num_edges - num_train_nodes) / 2)
num_test_nodes = data["plays_match_against"].num_edges - num_train_nodes - num_val_nodes


In [18]:
# Split data in train, validate and test sets

transform = LinkSplit(
    num_val=num_val_nodes,
    num_test=num_test_nodes,
    key="edge_label",
    edge_types="plays_match_against",
    add_negative_train_samples=False,
    disjoint_train_ratio=0.3
)
train_data, val_data, test_data = transform(data)

train_data, val_data, test_data

(HeteroData(
   [1mplayer[0m={
     node_id=[36819],
     x=[36819, 223]
   },
   [1mclub[0m={ node_id=[1269] },
   [1m(player, plays_for, club)[0m={
     edge_index=[2, 36819],
     attr=[36819, 29]
   },
   [1m(club, plays_match_against, club)[0m={
     edge_index=[2, 13347],
     edge_label=[5719, 15],
     edge_label_index=[2, 5719]
   },
   [1m(club, rev_plays_for, player)[0m={
     edge_index=[2, 36819],
     attr=[36819, 29]
   }
 ),
 HeteroData(
   [1mplayer[0m={
     node_id=[36819],
     x=[36819, 223]
   },
   [1mclub[0m={ node_id=[1269] },
   [1m(player, plays_for, club)[0m={
     edge_index=[2, 36819],
     attr=[36819, 29]
   },
   [1m(club, plays_match_against, club)[0m={
     edge_index=[2, 19066],
     edge_label=[4674, 15],
     edge_label_index=[2, 4674]
   },
   [1m(club, rev_plays_for, player)[0m={
     edge_index=[2, 36819],
     attr=[36819, 29]
   }
 ),
 HeteroData(
   [1mplayer[0m={
     node_id=[36819],
     x=[36819, 223]
   },
   [1mcl

In [19]:
torch.save(train_data, "train_data.pt")
torch.save(val_data, "val_data.pt")
torch.save(test_data, "test_data.pt")