<a href="https://colab.research.google.com/github/tamara-kostova/IIS/blob/master/lab2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install torch
!pip install torch_geometric
!pip install torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.2.0+cpu.html

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [2]:
from torch_geometric.datasets import AmazonBook

data = AmazonBook('data/AmazonBook')
dataset = data[0]

Downloading https://raw.githubusercontent.com/gusye1234/LightGCN-PyTorch/master/data/amazon-book/user_list.txt
Downloading https://raw.githubusercontent.com/gusye1234/LightGCN-PyTorch/master/data/amazon-book/item_list.txt
Downloading https://raw.githubusercontent.com/gusye1234/LightGCN-PyTorch/master/data/amazon-book/train.txt
Downloading https://raw.githubusercontent.com/gusye1234/LightGCN-PyTorch/master/data/amazon-book/test.txt
Processing...
Done!


# **EXERCISE 1**

In [None]:
from torch_geometric.utils import to_networkx
networkx_graph = to_networkx(AmazonBook('/content/data/Amazon')[0]).to_undirected()

In [None]:
import numpy as np
edges = np.array(networkx_graph.edges())
length: int = edges.shape[0]
size: int = int(length*0.2)

test_edges = np.random.choice(edges.flatten(), size=(size, edges.shape[1]), replace=False)
networkx_graph.remove_edges_from(test_edges)

In [None]:
import networkx as nx
index = nx.jaccard_coefficient(networkx_graph, test_edges)
index = list(index)
index = np.array(index)
y_hats = (index[:, -1] > 0.5).astype(int)

In [None]:
num_nodes = networkx_graph.number_of_nodes()

negative_edges = set()
set_of_edges = set(map(tuple, edges))

while len(negative_edges) < size:
    random_edge = tuple(np.random.randint(0, num_nodes, size=2))
    if random_edge not in set_of_edges and random_edge not in negative_edges:
        negative_edges.add(random_edge)

negative_edges = list(negative_edges)

In [None]:
neg_index = list(nx.jaccard_coefficient(networkx_graph, negative_edges))
neg_index = np.array(neg_index)
y_hats_neg = (neg_index[:, -1] > 0.5).astype(int)

In [None]:
y_score = np.concatenate([y_hats, y_hats_neg])
true_positive = np.ones(test_edges.shape[0])
true_negative = np.zeros(test_edges.shape[0])
y_true = np.concatenate([true_positive, true_negative])

In [None]:
from sklearn.metrics import average_precision_score
average_precision_score(y_true, y_score)

0.5000100109350213

# **EXERCISE 2**

In [None]:
import torch
from torch_geometric.nn import SAGEConv

class GNNEncoder(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv((-1, -1), hidden_channels)
        self.conv2 = SAGEConv((-1, -1), out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x

In [None]:
from torch_geometric.nn import Linear
class EdgeDecoder(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.lin1 = Linear(2 * hidden_channels, hidden_channels)
        self.lin2 = Linear(hidden_channels, 1)

    def forward(self, z_dict, edge_label_index):
        row, col = edge_label_index
        z = torch.cat([z_dict['user'][row], z_dict['book'][col]], dim=-1)

        z = self.lin1(z).relu()
        z = self.lin2(z)
        return z.view(-1)

In [None]:
class Model(torch.nn.Module):
    def __init__(self, hidden_channels, data):
        super().__init__()
        self.encoder = GNNEncoder(hidden_channels, hidden_channels)
        self.encoder = to_hetero(self.encoder, data.metadata(), aggr='sum')
        self.decoder = EdgeDecoder(hidden_channels)

    def forward(self, x_dict, edge_index_dict, edge_label_index):
        z_dict = self.encoder(x_dict, edge_index_dict)
        return self.decoder(z_dict, edge_label_index)

## Train

In [None]:
from torch.nn.functional import mse_loss
from torch_geometric.nn import to_hetero

def train_link_prediction(model, train_data, val_data, optimizer, epochs=5):
    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        pred = model(train_data.x_dict, train_data.edge_index_dict,
                     train_data['user', 'book'].edge_label_index)
        target = train_data['user', 'book'].edge_label
        loss = mse_loss(pred, target)
        loss.backward()
        optimizer.step()

        model.eval()
        pred = model(val_data.x_dict, val_data.edge_index_dict,
                     val_data['user', 'book'].edge_label_index)
        pred = pred.clamp(min=0, max=5)
        target = val_data['user', 'book'].edge_label.float()
        val_loss = mse_loss(pred, target).sqrt()

        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Val Loss: {val_loss:.4f}')

In [None]:
from torch.optim import SGD
from torch.nn import CrossEntropyLoss
from torch_geometric.nn import to_hetero
from torch_geometric.loader import NeighborLoader

In [None]:
from torch_geometric.transforms import RandomLinkSplit

train_val_test_split = RandomLinkSplit(num_val=0.2,
                                           num_test=0.2,
                                           add_negative_train_samples=True,
                                           edge_types=('user', 'rates', 'book'),
                                           rev_edge_types=('book', 'rated_by', 'user'))

In [None]:
num_users = dataset['user'].num_nodes
num_books = dataset['book'].num_nodes
dataset['user'].x = torch.ones(num_users, 1)
dataset['book'].x = torch.ones(num_books, 1)

In [None]:
train_data, val_data, test_data = train_val_test_split(dataset)

model = Model(hidden_channels=128, data=dataset)

optimizer = SGD(model.parameters(), lr=0.001)

train_link_prediction(model, train_data, val_data, optimizer, 100)

Epoch: 000, Loss: 0.4805, Val Loss: 0.6764
Epoch: 001, Loss: 0.4575, Val Loss: 0.6608
Epoch: 002, Loss: 0.4366, Val Loss: 0.6466
Epoch: 003, Loss: 0.4180, Val Loss: 0.6335
Epoch: 004, Loss: 0.4012, Val Loss: 0.6214
Epoch: 005, Loss: 0.3860, Val Loss: 0.6102
Epoch: 006, Loss: 0.3723, Val Loss: 0.6000
Epoch: 007, Loss: 0.3599, Val Loss: 0.5906
Epoch: 008, Loss: 0.3487, Val Loss: 0.5820
Epoch: 009, Loss: 0.3387, Val Loss: 0.5742
Epoch: 010, Loss: 0.3296, Val Loss: 0.5671
Epoch: 011, Loss: 0.3215, Val Loss: 0.5606
Epoch: 012, Loss: 0.3142, Val Loss: 0.5547
Epoch: 013, Loss: 0.3076, Val Loss: 0.5494
Epoch: 014, Loss: 0.3017, Val Loss: 0.5445
Epoch: 015, Loss: 0.2964, Val Loss: 0.5400
Epoch: 016, Loss: 0.2915, Val Loss: 0.5360
Epoch: 017, Loss: 0.2872, Val Loss: 0.5324
Epoch: 018, Loss: 0.2833, Val Loss: 0.5291
Epoch: 019, Loss: 0.2798, Val Loss: 0.5261
Epoch: 020, Loss: 0.2767, Val Loss: 0.5234
Epoch: 021, Loss: 0.2739, Val Loss: 0.5210
Epoch: 022, Loss: 0.2713, Val Loss: 0.5188
Epoch: 023,

## Test

In [None]:
from sklearn.metrics import classification_report
def test_link_prediction(model, test_data, optimizer, epochs=5):
    with torch.inference_mode():
        out = model(test_data.x_dict, test_data.edge_index_dict,
                      test_data['user', 'book'].edge_label_index).clamp(min=0, max=5)
        target = test_data['user', 'book'].edge_label.float()

        print(classification_report(y_true=target.cpu().numpy(), y_pred=out.round().detach().cpu().numpy()))

In [None]:
test_link_prediction(model, test_data, optimizer, 100)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         0.0       0.50      1.00      0.67    476146
         1.0       0.00      0.00      0.00    476146

    accuracy                           0.50    952292
   macro avg       0.25      0.50      0.33    952292
weighted avg       0.25      0.50      0.33    952292



  _warn_prf(average, modifier, msg_start, len(result))


# **EXERCISE 3**

## Train

In [3]:
import torch

def train_light_gcn(dataset, train_loader, model, optimizer, num_users, num_books, epochs=1):
    for epoch in range(epochs):
        total_loss, total_examples = 0, 0

        for node_ids in train_loader:
            pos_edge_label_index = dataset.edge_index[:, node_ids]
            neg_edge_label_index = torch.stack([pos_edge_label_index[0],
                                                torch.randint(num_users, num_users + num_books,
                                                              (node_ids.numel(),)).to(device)],
                                               dim=0)
            edge_label_index = torch.cat([pos_edge_label_index, neg_edge_label_index], dim=1)

            optimizer.zero_grad()

            pos_rank, neg_rank = model(dataset.edge_index, edge_label_index).chunk(2)

            loss = model.recommendation_loss(pos_rank, neg_rank, node_id=edge_label_index.unique())
            loss.backward()
            optimizer.step()

            total_loss += float(loss) * pos_rank.numel()
            total_examples += pos_rank.numel()

            print(f'Epoch: {epoch:03d}, Loss: {total_loss / total_examples:.4f}')

In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
num_users, num_books = dataset['user'].num_nodes, dataset['book'].num_nodes
dataset = dataset.to_homogeneous().to(device)

In [10]:
from torch_geometric.transforms import RandomLinkSplit
train_test_split = RandomLinkSplit(num_val=0.01,
                                           num_test=0.98)
train_data, val_data, test_data = train_test_split(dataset)
train_edge_label_index = train_data.edge_index

In [8]:
from torch_geometric.nn import LightGCN

data_loader = torch.utils.data.DataLoader(
    range(train_edge_label_index.size(1)),
    shuffle=True,
    batch_size=16,
)
model = LightGCN(
    num_nodes=dataset.num_nodes,
    embedding_dim=64,
    num_layers=2,
).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
train_light_gcn(dataset.to(device), data_loader, model, optimizer, num_users, num_books, 1)

Epoch: 000, Loss: 0.6931
Epoch: 000, Loss: 0.6928
Epoch: 000, Loss: 0.6881
Epoch: 000, Loss: 0.6866
Epoch: 000, Loss: 0.6847
Epoch: 000, Loss: 0.6737
Epoch: 000, Loss: 0.6711
Epoch: 000, Loss: 0.6655
Epoch: 000, Loss: 0.6525
Epoch: 000, Loss: 0.6462
Epoch: 000, Loss: 0.6393
Epoch: 000, Loss: 0.6395
Epoch: 000, Loss: 0.6239
Epoch: 000, Loss: 0.6161
Epoch: 000, Loss: 0.6036
Epoch: 000, Loss: 0.5954
Epoch: 000, Loss: 0.5874
Epoch: 000, Loss: 0.5838
Epoch: 000, Loss: 0.5691
Epoch: 000, Loss: 0.5625
Epoch: 000, Loss: 0.5536
Epoch: 000, Loss: 0.5448
Epoch: 000, Loss: 0.5365
Epoch: 000, Loss: 0.5275
Epoch: 000, Loss: 0.5210
Epoch: 000, Loss: 0.5116
Epoch: 000, Loss: 0.5054
Epoch: 000, Loss: 0.5005
Epoch: 000, Loss: 0.5145
Epoch: 000, Loss: 0.5087
Epoch: 000, Loss: 0.5036
Epoch: 000, Loss: 0.5087
Epoch: 000, Loss: 0.5085
Epoch: 000, Loss: 0.5010
Epoch: 000, Loss: 0.5005
Epoch: 000, Loss: 0.4988
Epoch: 000, Loss: 0.4935
Epoch: 000, Loss: 0.4915
Epoch: 000, Loss: 0.4854
Epoch: 000, Loss: 0.4870


## Test

In [11]:
data_loader = torch.utils.data.DataLoader(range(test_data.edge_index.size(1)),
                             shuffle=True,
                             batch_size=16)

In [24]:
from torch_geometric.utils import degree
batch_size=16
@torch.no_grad()
def test_light_gcn(model, data_loader, num_users, num_books, k):
  model.eval()
  tp = fp = fn = 0
  total_precision = total_recall = total_examples = 0
  with torch.no_grad():
    for node_ids in data_loader:
        pos_edge_label_index = dataset.edge_index[:, node_ids]
        generated = torch.randint(num_users, num_users + num_books,
                                      (node_ids.numel(),))
        neg_edge_label_index = torch.stack([pos_edge_label_index[0],
                                            generated.to('cuda' if torch.cuda.is_available() else 'cpu')],
                                          dim=0)
        edge_label_index = torch.cat([pos_edge_label_index, neg_edge_label_index], dim=1)

        pos_rank, neg_rank = model(dataset.edge_index, edge_label_index).chunk(2)

        actual_k = min(k, pos_rank.size(0))

        _, pos_indices = torch.topk(pos_rank.squeeze(), actual_k, largest=True)
        _, neg_indices = torch.topk(neg_rank.squeeze(), actual_k, largest=True)

        tp = torch.sum(pos_indices < actual_k).item()
        fp = torch.sum(neg_indices < actual_k).item()
        fn = actual_k * node_ids.numel() - tp

        if tp + fp > 0:
            precision = tp / (tp + fp)
        else:
            precision = 0

        if tp + fn > 0:
            recall = tp / (tp + fn)
        else:
            recall = 0

        total_precision += precision
        total_recall += recall
        total_examples += 1

    # Average over all batches
    avg_precision = total_precision / total_examples
    avg_recall = total_recall / total_examples

    print(f'Precision@{k}: {avg_precision:.4f}, Recall@{k}: {avg_recall:.4f}')


In [25]:
for k in [1, 10, 20]:
  test_light_gcn(model, data_loader, num_users, num_books, k)

Precision@1: 0.0595, Recall@1: 0.0039
Precision@10: 0.5008, Recall@10: 0.0392
Precision@20: 0.5000, Recall@20: 0.0625
