<a href="https://colab.research.google.com/github/ugoetudo/valenced-rel-extraction/blob/main/Graph_NN_Experiments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
from torch import Tensor
print(torch.__version__)

2.6.0+cu124


In [2]:
import os
os.environ['TORCH'] = torch.__version__

!pip install torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install pyg-lib -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install git+https://github.com/pyg-team/pytorch_geometric.git

Looking in links: https://data.pyg.org/whl/torch-2.6.0+cu124.html
Collecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-2.6.0%2Bcu124/torch_scatter-2.1.2%2Bpt26cu124-cp311-cp311-linux_x86_64.whl (10.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-scatter
Successfully installed torch-scatter-2.1.2+pt26cu124
Looking in links: https://data.pyg.org/whl/torch-2.6.0+cu124.html
Collecting torch-sparse
  Downloading https://data.pyg.org/whl/torch-2.6.0%2Bcu124/torch_sparse-0.6.18%2Bpt26cu124-cp311-cp311-linux_x86_64.whl (5.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.0/5.0 MB[0m [31m40.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch-sparse
Successfully installed torch-sparse-0.6.18+pt26cu124
Looking in links: https://data.pyg.org/whl/torch-2.6.0+cu124.html
Collecting pyg-lib
  Downloading https://data.pyg.org

In [3]:
from torch_geometric.data import download_url, extract_zip

url = 'https://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
extract_zip(download_url(url, '.'),'.')

movies_path = './ml-latest-small/movies.csv'
ratings_path = './ml-latest-small/ratings.csv'

Downloading https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Extracting ./ml-latest-small.zip


In [4]:
import pandas as pd

print(pd.read_csv(movies_path).head())
print(pd.read_csv(ratings_path).head())

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


In [5]:
movies_df = pd.read_csv(movies_path, index_col='movieId')
genres = movies_df['genres'].str.get_dummies('|')
movie_feat = torch.from_numpy(genres.values).to(torch.float)
movie_feat.size()

torch.Size([9742, 20])

In [6]:
ratings_df = pd.read_csv(ratings_path)

unique_user_id = ratings_df['userId'].unique()
unique_user_id = pd.DataFrame(data={'userId': unique_user_id,
                                    'mappedID': pd.RangeIndex(
                                                          len(unique_user_id))})
unique_user_id.head()

unique_movie_id = ratings_df['movieId'].unique()
unique_movie_id = pd.DataFrame({'movieId': unique_movie_id,
                                'mappedID':pd.RangeIndex(len(unique_movie_id))})

ratings_user_id = pd.merge(ratings_df, unique_user_id, on='userId', how='left')
ratings_user_id = torch.from_numpy(ratings_user_id['mappedID'].values).to(torch.long)

ratings_movie_id = pd.merge(ratings_df, unique_movie_id, on='movieId', how='left')
ratings_movie_id = torch.from_numpy(ratings_movie_id['mappedID'].values).to(torch.long)

edge_index_user_to_movie = torch.stack([ratings_user_id,
                                        ratings_movie_id], dim=0)
print(edge_index_user_to_movie)

tensor([[   0,    0,    0,  ...,  609,  609,  609],
        [   0,    1,    2,  ..., 3121, 1392, 2873]])


In [7]:
len(unique_movie_id)

9724

In [8]:
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T

data = HeteroData()

data["user"].node_id = torch.arange(len(unique_user_id))
data["movie"].node_id = torch.arange(len(unique_movie_id))

data["movie"].x = movie_feat[:len(unique_movie_id),:]
data["user", "rates", "movie"].edge_index = edge_index_user_to_movie
data = T.ToUndirected()(data)
data


HeteroData(
  user={ node_id=[610] },
  movie={
    node_id=[9724],
    x=[9724, 20],
  },
  (user, rates, movie)={ edge_index=[2, 100836] },
  (movie, rev_rates, user)={ edge_index=[2, 100836] }
)

In [9]:
transform = T.RandomLinkSplit(num_val=0.1,
                              num_test=0.2,
                              is_undirected=True,
                              disjoint_train_ratio=0.7,
                              edge_types=('user', 'rates', 'movie'),
                              rev_edge_types=('movie', 'rev_rates', 'user'),
                              add_negative_train_samples=True)

train_data, val_data, test_data = transform(data)

In [10]:
train_data

HeteroData(
  user={ node_id=[610] },
  movie={
    node_id=[9724],
    x=[9724, 20],
  },
  (user, rates, movie)={
    edge_index=[2, 21176],
    edge_label=[98820],
    edge_label_index=[2, 98820],
  },
  (movie, rev_rates, user)={ edge_index=[2, 21176] }
)

In [11]:
val_data["user", "rates", "movie"].edge_label.long().bincount()

tensor([10083, 10083])

In [12]:
from torch_geometric.loader import LinkNeighborLoader

edge_label_index = train_data["user", "rates", "movie"].edge_label_index
edge_label = train_data["user", "rates", "movie"].edge_label

train_loader = LinkNeighborLoader(
    data=train_data,
    num_neighbors=[20,10],
    # neg_sampling_ratio=0.7,
    edge_label_index=(("user", "rates", "movie"), edge_label_index),
    edge_label=edge_label,
    batch_size=128,
    shuffle=True,
)

sampled_data = next(iter(train_loader))

print(sampled_data)

HeteroData(
  user={
    node_id=[581],
    n_id=[581],
    num_sampled_nodes=[3],
  },
  movie={
    node_id=[1901],
    x=[1901, 20],
    n_id=[1901],
    num_sampled_nodes=[3],
  },
  (user, rates, movie)={
    edge_index=[2, 6952],
    edge_label=[128],
    edge_label_index=[2, 128],
    e_id=[6952],
    num_sampled_edges=[2],
    input_id=[128],
  },
  (movie, rev_rates, user)={
    edge_index=[2, 3907],
    e_id=[3907],
    num_sampled_edges=[2],
  }
)


In [22]:
from torch_geometric.nn import SAGEConv, to_hetero, GATConv

# class GNN(torch.nn.Module):
#   def __init__(self, hidden_channels):
#     super().__init__()
#     self.conv1 = GATConv(hidden_channels, hidden_channels, add_self_loops=False)
#     self.conv2 = GATConv(hidden_channels, hidden_channels, add_self_loops=False)

#   def forward(self, x: Tensor, edge_index: Tensor) -> Tensor:
#     x = self.conv1(x, edge_index).relu()
#     x = self.conv2(x, edge_index)
#     return x

class GNN(torch.nn.Module):
  def __init__(self, hidden_channels):
    super().__init__()
    self.conv1 = SAGEConv(hidden_channels, hidden_channels)
    self.conv2 = SAGEConv(hidden_channels, hidden_channels)

  def forward(self, x: Tensor, edge_index: Tensor) -> Tensor:
    x = self.conv1(x, edge_index).relu()
    x = self.conv2(x, edge_index)
    return x

class Classifier(torch.nn.Module):
  def forward(self, x_user: Tensor,
              x_movie: Tensor, edge_label_index: Tensor) -> Tensor:
    edge_feat_user = x_user[edge_label_index[0]]
    edge_feat_movie = x_movie[edge_label_index[1]]
    return (edge_feat_user * edge_feat_movie).sum(dim=-1)

class Model(torch.nn.Module):
  def __init__(self, hidden_channels):
    super().__init__()
    self.movie_lin = torch.nn.Linear(20, hidden_channels)
    self.user_emb = torch.nn.Embedding(data["user"].num_nodes, hidden_channels)
    self.movie_emb = torch.nn.Embedding(data["movie"].num_nodes, hidden_channels)
    self.gnn = GNN(hidden_channels)
    self.gnn = to_hetero(self.gnn, metadata=data.metadata())
    self.classifier = Classifier()

  def forward(self, data):
    x_dict = {
        "user": self.user_emb(data["user"].node_id),
        "movie": self.movie_lin(data["movie"].x) + self.movie_emb(data["movie"].node_id),
    }
    x_dict = self.gnn(x_dict, data.edge_index_dict)

    pred = self.classifier(x_dict["user"], x_dict["movie"],
                           data["user", "rates", "movie"].edge_label_index)

    return pred

model = Model(hidden_channels=32)
print(model)


Model(
  (movie_lin): Linear(in_features=20, out_features=32, bias=True)
  (user_emb): Embedding(610, 32)
  (movie_emb): Embedding(9724, 32)
  (gnn): GraphModule(
    (conv1): ModuleDict(
      (user__rates__movie): SAGEConv(32, 32, aggr=mean)
      (movie__rev_rates__user): SAGEConv(32, 32, aggr=mean)
    )
    (conv2): ModuleDict(
      (user__rates__movie): SAGEConv(32, 32, aggr=mean)
      (movie__rev_rates__user): SAGEConv(32, 32, aggr=mean)
    )
  )
  (classifier): Classifier()
)


In [23]:
import tqdm
import torch.nn.functional as F

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")

model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(1, 6):
  total_loss = total_examples = 0
  for sampled_data in tqdm.tqdm(train_loader):
    optimizer.zero_grad()
    sampled_data = sampled_data.to(device)
    model.forward(sampled_data)
    ground_truth = sampled_data["user", "rates", "movie"].edge_label.to(device)
    pred = model(sampled_data)

    loss = F.binary_cross_entropy_with_logits(pred, ground_truth)
    loss.backward()
    optimizer.step()
    total_loss += float(loss) * pred.numel()
    total_examples += pred.numel()
  print(f"Epoch: {epoch}, Loss: {total_loss / total_examples}")


Device: cpu


100%|██████████| 773/773 [00:21<00:00, 35.87it/s]


Epoch: 1, Loss: 0.43448015175423527


100%|██████████| 773/773 [00:20<00:00, 37.71it/s]


Epoch: 2, Loss: 0.35173130966263055


100%|██████████| 773/773 [00:21<00:00, 36.74it/s]


Epoch: 3, Loss: 0.3303050024286018


100%|██████████| 773/773 [00:23<00:00, 33.05it/s]


Epoch: 4, Loss: 0.3154119852220836


100%|██████████| 773/773 [00:20<00:00, 37.31it/s]

Epoch: 5, Loss: 0.30321256601118024





In [24]:
edge_label_index = val_data["user", "rates", "movie"].edge_label_index
edge_label = val_data["user", "rates", "movie"].edge_label

val_loader = LinkNeighborLoader(
    data=val_data,
    num_neighbors=[20, 10],
    edge_label_index=(("user", "rates", "movie"), edge_label_index),
    edge_label=edge_label,
    batch_size=3 * 128,
    shuffle=False,
)

sampled_data = next(iter(val_loader))

print("Sampled mini-batch:")
print("===================")
print(sampled_data)

Sampled mini-batch:
HeteroData(
  user={
    node_id=[605],
    n_id=[605],
    num_sampled_nodes=[3],
  },
  movie={
    node_id=[2639],
    x=[2639, 20],
    n_id=[2639],
    num_sampled_nodes=[3],
  },
  (user, rates, movie)={
    edge_index=[2, 18156],
    edge_label=[384],
    edge_label_index=[2, 384],
    e_id=[18156],
    num_sampled_edges=[2],
    input_id=[384],
  },
  (movie, rev_rates, user)={
    edge_index=[2, 7851],
    e_id=[7851],
    num_sampled_edges=[2],
  }
)


In [25]:
from sklearn.metrics import roc_auc_score, classification_report

preds = []
ground_truths = []
for sampled_data in tqdm.tqdm(val_loader):
    with torch.no_grad():
        pred = model(sampled_data.to(device))
        ground_truth = sampled_data["user", "rates", "movie"].edge_label.to(device)
        preds.append(pred.cpu())
        ground_truths.append(ground_truth.cpu())

pred = torch.cat(preds, dim=0).cpu().numpy()
ground_truth = torch.cat(ground_truths, dim=0).cpu().numpy()
auc = roc_auc_score(ground_truth, pred)
print()
print(f"Validation AUC: {auc:.4f}")
print(classification_report(ground_truth, pred > 0))

100%|██████████| 53/53 [00:01<00:00, 45.09it/s]


Validation AUC: 0.9029
              precision    recall  f1-score   support

         0.0       0.87      0.76      0.81     10083
         1.0       0.79      0.89      0.83     10083

    accuracy                           0.82     20166
   macro avg       0.83      0.82      0.82     20166
weighted avg       0.83      0.82      0.82     20166




