<a href="https://colab.research.google.com/github/tamara-kostova/IIS/blob/master/lab2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install torch
!pip install torch_geometric
!pip install torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.2.0+cpu.html

Looking in links: https://data.pyg.org/whl/torch-2.2.0+cpu.html


In [4]:
from torch_geometric.datasets import AmazonBook

data = AmazonBook('data/AmazonBook')
dataset = data[0]

# **EXERCISE 1**

In [3]:
from torch_geometric.utils import to_networkx
networkx_graph = to_networkx(AmazonBook('/content/data/Amazon')[0]).to_undirected()

In [4]:
import numpy as np
edges = np.array(networkx_graph.edges())
length: int = edges.shape[0]
size: int = int(length*0.2)

test_edges = np.random.choice(edges.flatten(), size=(size, edges.shape[1]), replace=False)
networkx_graph.remove_edges_from(test_edges)

In [5]:
import networkx as nx
index = nx.jaccard_coefficient(networkx_graph, test_edges)
index = list(index)
index = np.array(index)
y_hats = (index[:, -1] > 0.5).astype(int)

In [6]:
num_nodes = networkx_graph.number_of_nodes()

negative_edges = set()
set_of_edges = set(map(tuple, edges))

while len(negative_edges) < size:
    random_edge = tuple(np.random.randint(0, num_nodes, size=2))
    if random_edge not in set_of_edges and random_edge not in negative_edges:
        negative_edges.add(random_edge)

negative_edges = list(negative_edges)

In [7]:
neg_index = list(nx.jaccard_coefficient(networkx_graph, negative_edges))
neg_index = np.array(neg_index)
y_hats_neg = (neg_index[:, -1] > 0.5).astype(int)

In [8]:
y_score = np.concatenate([y_hats, y_hats_neg])
true_positive = np.ones(test_edges.shape[0])
true_negative = np.zeros(test_edges.shape[0])
y_true = np.concatenate([true_positive, true_negative])

In [9]:
from sklearn.metrics import average_precision_score
average_precision_score(y_true, y_score)

0.5000100109350213

# **EXERCISE 2**

In [6]:
import torch
from torch_geometric.nn import SAGEConv

class GNNEncoder(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv((-1, -1), hidden_channels)
        self.conv2 = SAGEConv((-1, -1), out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x

In [7]:
from torch_geometric.nn import Linear
class EdgeDecoder(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.lin1 = Linear(2 * hidden_channels, hidden_channels)
        self.lin2 = Linear(hidden_channels, 1)

    def forward(self, z_dict, edge_label_index):
        row, col = edge_label_index
        z = torch.cat([z_dict['user'][row], z_dict['book'][col]], dim=-1)

        z = self.lin1(z).relu()
        z = self.lin2(z)
        return z.view(-1)

In [8]:
class Model(torch.nn.Module):
    def __init__(self, hidden_channels, data):
        super().__init__()
        self.encoder = GNNEncoder(hidden_channels, hidden_channels)
        self.encoder = to_hetero(self.encoder, data.metadata(), aggr='sum')
        self.decoder = EdgeDecoder(hidden_channels)

    def forward(self, x_dict, edge_index_dict, edge_label_index):
        z_dict = self.encoder(x_dict, edge_index_dict)
        return self.decoder(z_dict, edge_label_index)

## Train

In [9]:
from torch.nn.functional import mse_loss
from torch_geometric.nn import to_hetero

def train_link_prediction(model, train_data, val_data, optimizer, epochs=5):
    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        pred = model(train_data.x_dict, train_data.edge_index_dict,
                     train_data['user', 'book'].edge_label_index)
        target = train_data['user', 'book'].edge_label
        loss = mse_loss(pred, target)
        loss.backward()
        optimizer.step()

        model.eval()
        pred = model(val_data.x_dict, val_data.edge_index_dict,
                     val_data['user', 'book'].edge_label_index)
        pred = pred.clamp(min=0, max=5)
        target = val_data['user', 'book'].edge_label.float()
        val_loss = mse_loss(pred, target).sqrt()

        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Val Loss: {val_loss:.4f}')

In [10]:
from torch.optim import SGD
from torch.nn import CrossEntropyLoss
from torch_geometric.nn import to_hetero
from torch_geometric.loader import NeighborLoader

In [2]:
from torch_geometric.transforms import RandomLinkSplit

train_val_test_split = RandomLinkSplit(num_val=0.2,
                                           num_test=0.2,
                                           add_negative_train_samples=True,
                                           edge_types=('user', 'rates', 'book'),
                                           rev_edge_types=('book', 'rated_by', 'user'))

In [11]:
num_users = dataset['user'].num_nodes
num_books = dataset['book'].num_nodes
dataset['user'].x = torch.ones(num_users, 1)
dataset['book'].x = torch.ones(num_books, 1)

In [12]:
train_data, val_data, test_data = train_val_test_split(dataset)

model = Model(hidden_channels=128, data=dataset)

optimizer = SGD(model.parameters(), lr=0.001)

train_link_prediction(model, train_data, val_data, optimizer, 100)

Epoch: 000, Loss: 0.4805, Val Loss: 0.6764
Epoch: 001, Loss: 0.4575, Val Loss: 0.6608
Epoch: 002, Loss: 0.4366, Val Loss: 0.6466
Epoch: 003, Loss: 0.4180, Val Loss: 0.6335
Epoch: 004, Loss: 0.4012, Val Loss: 0.6214
Epoch: 005, Loss: 0.3860, Val Loss: 0.6102
Epoch: 006, Loss: 0.3723, Val Loss: 0.6000
Epoch: 007, Loss: 0.3599, Val Loss: 0.5906
Epoch: 008, Loss: 0.3487, Val Loss: 0.5820
Epoch: 009, Loss: 0.3387, Val Loss: 0.5742
Epoch: 010, Loss: 0.3296, Val Loss: 0.5671
Epoch: 011, Loss: 0.3215, Val Loss: 0.5606
Epoch: 012, Loss: 0.3142, Val Loss: 0.5547
Epoch: 013, Loss: 0.3076, Val Loss: 0.5494
Epoch: 014, Loss: 0.3017, Val Loss: 0.5445
Epoch: 015, Loss: 0.2964, Val Loss: 0.5400
Epoch: 016, Loss: 0.2915, Val Loss: 0.5360
Epoch: 017, Loss: 0.2872, Val Loss: 0.5324
Epoch: 018, Loss: 0.2833, Val Loss: 0.5291
Epoch: 019, Loss: 0.2798, Val Loss: 0.5261
Epoch: 020, Loss: 0.2767, Val Loss: 0.5234
Epoch: 021, Loss: 0.2739, Val Loss: 0.5210
Epoch: 022, Loss: 0.2713, Val Loss: 0.5188
Epoch: 023,

## Test

In [13]:
from sklearn.metrics import classification_report
def test_link_prediction(model, test_data, optimizer, epochs=5):
    with torch.inference_mode():
        out = model(test_data.x_dict, test_data.edge_index_dict,
                      test_data['user', 'book'].edge_label_index).clamp(min=0, max=5)
        target = test_data['user', 'book'].edge_label.float()

        print(classification_report(y_true=target.cpu().numpy(), y_pred=out.round().detach().cpu().numpy()))

In [15]:
test_link_prediction(model, test_data, optimizer, 100)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         0.0       0.50      1.00      0.67    476146
         1.0       0.00      0.00      0.00    476146

    accuracy                           0.50    952292
   macro avg       0.25      0.50      0.33    952292
weighted avg       0.25      0.50      0.33    952292



  _warn_prf(average, modifier, msg_start, len(result))


# **EXERCISE 3**

## Train

In [16]:
import torch

def train_light_gcn(dataset, train_loader, model, optimizer, num_users, num_books, epochs=1):
    for epoch in range(epochs):
        total_loss, total_examples = 0, 0

        for node_ids in train_loader:
            pos_edge_label_index = dataset.edge_index[:, node_ids]
            neg_edge_label_index = torch.stack([pos_edge_label_index[0],
                                                torch.randint(num_users, num_users + num_books,
                                                              (node_ids.numel(),)).to(device)],
                                               dim=0)
            edge_label_index = torch.cat([pos_edge_label_index, neg_edge_label_index], dim=1)

            optimizer.zero_grad()

            pos_rank, neg_rank = model(dataset.edge_index, edge_label_index).chunk(2)

            loss = model.recommendation_loss(pos_rank, neg_rank, node_id=edge_label_index.unique())
            loss.backward()
            optimizer.step()

            total_loss += float(loss) * pos_rank.numel()
            total_examples += pos_rank.numel()

            print(f'Epoch: {epoch:03d}, Loss: {total_loss / total_examples:.4f}')

In [17]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
num_users, num_books = dataset['user'].num_nodes, dataset['book'].num_nodes
dataset = dataset.to_homogeneous().to(device)

In [18]:
from torch_geometric.transforms import RandomLinkSplit
train_test_split = RandomLinkSplit(num_val=0.01,
                                           num_test=0.98)
train_data, _, _ = train_test_split(dataset)
train_edge_label_index = train_data.edge_index

In [19]:
from torch_geometric.nn import LightGCN

data_loader = torch.utils.data.DataLoader(
    range(train_edge_label_index.size(1)),
    shuffle=True,
    batch_size=16,
)
model = LightGCN(
    num_nodes=dataset.num_nodes,
    embedding_dim=64,
    num_layers=2,
).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
train_light_gcn(dataset.to(device), data_loader, model, optimizer, num_users, num_books, 1)

Epoch: 000, Loss: 0.6931
Epoch: 000, Loss: 0.6928
Epoch: 000, Loss: 0.6906
Epoch: 000, Loss: 0.6857
Epoch: 000, Loss: 0.6767
Epoch: 000, Loss: 0.6698
Epoch: 000, Loss: 0.6628
Epoch: 000, Loss: 0.6563
Epoch: 000, Loss: 0.6503
Epoch: 000, Loss: 0.6517
Epoch: 000, Loss: 0.6480
Epoch: 000, Loss: 0.6217
Epoch: 000, Loss: 0.6088
Epoch: 000, Loss: 0.6030
Epoch: 000, Loss: 0.6010
Epoch: 000, Loss: 0.5934
Epoch: 000, Loss: 0.5861
Epoch: 000, Loss: 0.5796
Epoch: 000, Loss: 0.5716
Epoch: 000, Loss: 0.5604
Epoch: 000, Loss: 0.5636
Epoch: 000, Loss: 0.5546
Epoch: 000, Loss: 0.5527
Epoch: 000, Loss: 0.5556
Epoch: 000, Loss: 0.5537
Epoch: 000, Loss: 0.5485
Epoch: 000, Loss: 0.5535
Epoch: 000, Loss: 0.5474
Epoch: 000, Loss: 0.5444
Epoch: 000, Loss: 0.5368
Epoch: 000, Loss: 0.5347
Epoch: 000, Loss: 0.5324
Epoch: 000, Loss: 0.5283
Epoch: 000, Loss: 0.5225
Epoch: 000, Loss: 0.5192
Epoch: 000, Loss: 0.5276
Epoch: 000, Loss: 0.5222
Epoch: 000, Loss: 0.5166
Epoch: 000, Loss: 0.5097
Epoch: 000, Loss: 0.5049


## Test

In [20]:
from torch_geometric.utils import degree
batch_size=16
@torch.no_grad()
def test_light_gcn(k: int, data, model, device):
    train_edge_label_index = data.edge_label_index
    emb = model.get_embedding(data.edge_index)
    user_emb, book_emb = emb[:num_users], emb[num_users:]

    precision = recall = total_examples = 0
    for start in range(0, num_users, batch_size):
        end = start + batch_size
        logits = user_emb[start:end] @ book_emb.t()

        mask = ((train_edge_label_index[0] >= start) &
                (train_edge_label_index[0] < end))
        logits[train_edge_label_index[0, mask] - start,
               train_edge_label_index[1, mask] - num_users] = float('-inf')

        ground_truth = torch.zeros_like(logits, dtype=torch.bool)
        mask = ((data.edge_label_index[0] >= start) &
                (data.edge_label_index[0] < end))
        ground_truth[data.edge_label_index[0, mask] - start,
                     data.edge_label_index[1, mask] - num_users] = True
        node_count = degree(data.edge_label_index[0, mask] - start,
                            num_nodes=logits.size(0))

        topk_index = logits.topk(k, dim=-1).indices
        isin_mat = ground_truth.gather(1, topk_index)

        precision += float((isin_mat.sum(dim=-1) / k).sum())
        recall += float((isin_mat.sum(dim=-1) / node_count.clamp(1e-6)).sum())
        total_examples += int((node_count > 0).sum())

    return precision / total_examples, recall / total_examples


In [21]:
for k in [1, 10, 20]:
  precision, recall = test_light_gcn(k, train_data, model, device)
  print(f'Precision@ {k: precision}')
  print(f'Recall@ {k: recall}')

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
