In [1]:
!pip install torch_geometric
!pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.5.0+cu124.html
!pip install mlflow

Collecting torch_geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.6.1
Looking in links: https://data.pyg.org/whl/torch-2.5.0+cu124.html
Collecting pyg_lib
  Downloading https://data.pyg.org/whl/torch-2.5.0%2Bcu124/pyg_lib-0.4.0%2Bpt25cu124-cp311-cp311-linux_x86_64.whl (2.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch_scatter
  Downloading https://data.pyg.org/w

# Download dataset

In [4]:
import re
import gdown
import joblib

def download_dataset(url: str):
    id: str = re.search(r'/d/([^/]+)', url).group(1)
    gdown.download(f'https://drive.google.com/uc?id={id}', quiet=False)

download_dataset("https://drive.google.com/file/d/15VKSqJfM4LlSXumD4BHl0BnsPbLZy2sh/view?usp=drive_link")
download_dataset("https://drive.google.com/file/d/1TUE4UPZVpnB9UkM0dxtL-jHJrVSVhAks/view?usp=drive_link")
download_dataset("https://drive.google.com/file/d/1GTYAFbYp429GxHPNhaVDnj7OC2Hjb0Sf/view?usp=drive_link")

15VKSqJfM4LlSXumD4BHl0BnsPbLZy2sh


Downloading...
From (original): https://drive.google.com/uc?id=15VKSqJfM4LlSXumD4BHl0BnsPbLZy2sh
From (redirected): https://drive.google.com/uc?id=15VKSqJfM4LlSXumD4BHl0BnsPbLZy2sh&confirm=t&uuid=e9f16f0a-d786-4a14-98f6-9e6058866b85
To: /content/train.pt
100%|██████████| 117M/117M [00:00<00:00, 153MB/s] 


1TUE4UPZVpnB9UkM0dxtL-jHJrVSVhAks


Downloading...
From: https://drive.google.com/uc?id=1TUE4UPZVpnB9UkM0dxtL-jHJrVSVhAks
To: /content/test.pt
100%|██████████| 6.50M/6.50M [00:00<00:00, 35.8MB/s]


1GTYAFbYp429GxHPNhaVDnj7OC2Hjb0Sf


Downloading...
From: https://drive.google.com/uc?id=1GTYAFbYp429GxHPNhaVDnj7OC2Hjb0Sf
To: /content/val.pt
100%|██████████| 6.52M/6.52M [00:00<00:00, 29.0MB/s]


# Download Label Encoder

In [7]:
download_dataset('https://drive.google.com/file/d/1dMKbq9sawiAVH9Z4Nh2c2ZsZ3nle1-1Q/')
label_encoder = joblib.load('product_id_encoder.joblib')

1dMKbq9sawiAVH9Z4Nh2c2ZsZ3nle1-1Q


Downloading...
From: https://drive.google.com/uc?id=1dMKbq9sawiAVH9Z4Nh2c2ZsZ3nle1-1Q
To: /content/product_id_encoder.joblib
100%|██████████| 10.0M/10.0M [00:00<00:00, 85.1MB/s]


# Import Library

In [22]:
import torch_geometric.data as pyg_data
import torch
import pandas as pd
from tqdm import tqdm
import torch_geometric as pyg
import torch.nn as nn
import torch.nn.functional as F
import math
import torch.optim as optim
import copy
import numpy as np

In [36]:
class GraphDataset(pyg_data.InMemoryDataset):
    def __init__(self, root, file_name, sequences=None, transform=None, pre_transform=None):
        self.sequences = sequences
        self.file_name = file_name
        super().__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        return []

    @property
    def processed_file_names(self):
        return [f'{self.file_name}.pt']

    def download(self):
        pass

    def process(self):
        if self.sequences is None:
            self.data, self.slices = torch.load(f'{self.file_name}.pt')
            return

        sessions = self.sequences
        data_list = []

        # Add tqdm progress bar for session processing
        for session in tqdm(sessions, desc="Processing sessions", unit="session"):
            session, y = session[:-1], session[-1]
            codes, uniques = pd.factorize(pd.Series(session))
            senders, receivers = codes[:-1], codes[1:]

            # Build Data instance
            edge_index = torch.tensor([senders, receivers], dtype=torch.long)
            x = torch.tensor(uniques, dtype=torch.long).unsqueeze(1)
            y = torch.tensor([y], dtype=torch.long)
            data_list.append(pyg_data.Data(x=x, edge_index=edge_index, y=y))

         # Save processed data
        if len(data_list) > 0:
            data, slices = self.collate(data_list)
            torch.save((data, slices), self.processed_paths[0])
        else:
            raise ValueError("No data to process!")

    def len(self):
        return len(self.sequences)

# Model/layer definition

In [10]:
class GatedSessionGraphConv(pyg.nn.conv.MessagePassing):
    def __init__(self, out_channels, aggr: str = 'add', **kwargs):
        super().__init__(aggr=aggr, **kwargs)
        self.out_channels = out_channels
        self.gru = torch.nn.GRUCell(out_channels, out_channels, bias=False)

    def forward(self, x, edge_index):
        m = self.propagate(edge_index, x=x, size=None)
        x = self.gru(m, x)
        return x

    def message(self, x_j):
        return x_j

    def message_and_aggregate(self, adj_t, x):
        return torch.matmul(adj_t, x, reduce=self.aggr)

In [11]:
class SRGNN(nn.Module):
    def __init__(self, hidden_size, n_items):
        super(SRGNN, self).__init__()
        self.hidden_size = hidden_size
        self.n_items = n_items

        self.embedding = nn.Embedding(self.n_items, self.hidden_size)
        self.gated = GatedSessionGraphConv(self.hidden_size)

        self.q = nn.Linear(self.hidden_size, 1)
        self.W_1 = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
        self.W_2 = nn.Linear(self.hidden_size, self.hidden_size)
        self.W_3 = nn.Linear(2 * self.hidden_size, self.hidden_size, bias=False)

    def reset_parameters(self):
        stdv = 1.0 / math.sqrt(self.hidden_size)
        for weight in self.parameters():
            weight.data.uniform_(-stdv, stdv)

    def forward(self, data):
        x, edge_index, batch_map = data.x, data.edge_index, data.batch

        # (0)
        embedding = self.embedding(x).squeeze()

        # (1)-(5)
        v_i = self.gated(embedding, edge_index)

        # Divide nodes by session
        sections = list(torch.bincount(batch_map).cpu())
        v_i_split = torch.split(v_i, sections)

        v_n, v_n_repeat = [], []
        for session in v_i_split:
            v_n.append(session[-1])
            v_n_repeat.append(
                session[-1].view(1, -1).repeat(session.shape[0], 1))
        v_n, v_n_repeat = torch.stack(v_n), torch.cat(v_n_repeat, dim=0)

        q1 = self.W_1(v_n_repeat)
        q2 = self.W_2(v_i)

        # (6)
        alpha = self.q(F.sigmoid(q1 + q2))
        s_g_split = torch.split(alpha * v_i, sections)

        s_g = []
        for session in s_g_split:
            s_g_session = torch.sum(session, dim=0)
            s_g.append(s_g_session)
        s_g = torch.stack(s_g)

        # (7)
        s_l = v_n
        s_h = self.W_3(torch.cat([s_l, s_g], dim=-1))

        # (8)
        z = torch.mm(self.embedding.weight, s_h.T).T
        return z

# Load Dataset

In [37]:
train_dataset = GraphDataset('/content', 'train')
test_dataset = GraphDataset('/content', 'test')
val_dataset = GraphDataset('/content', 'val')

  self.data, self.slices = torch.load(self.processed_paths[0])


In [38]:
train_dataset[0]

TypeError: object of type 'NoneType' has no len()

In [20]:
# Define the hyperparameters.
# Code taken from 2021 Fall CS224W Colab assignments.
args = {
    'batch_size': 100,
    'hidden_dim': 32,
    'epochs': 100,
    'l2_penalty': 0.00001,
    'weight_decay': 0.1,
    'step': 30,
    'lr': 0.001,
    'num_items': label_encoder.classes_.shape[0]
}

class objectview(object):
    def __init__(self, d):
        self.__dict__ = d

args = objectview(args)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Define training loop

In [25]:
def train(args, train_data, val_data):
    train_loader = pyg_data.DataLoader(train_data, batch_size=args.batch_size, shuffle=True, drop_last=True)
    val_loader = pyg_data.DataLoader(val_data, batch_size=args.batch_size, shuffle=False, drop_last=True)

    # Build model
    model = SRGNN(args.hidden_dim, args.num_items).to(device)

    # Get training components
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.l2_penalty)
    scheduler = optim.lr_scheduler.StepLR(optimizer,
                                          step_size=args.step,
                                          gamma=args.weight_decay)
    criterion = nn.CrossEntropyLoss()

    # Train
    losses = []
    test_accs = []
    top_k_accs = []

    best_acc = 0
    best_model = None

    for epoch in range(args.epochs):
        total_loss = 0
        model.train()
        for _, batch in enumerate(tqdm(train_loader)):
            batch.to(device)
            optimizer.zero_grad()

            pred = model(batch)
            label = batch.y
            loss = criterion(pred, label)

            loss.backward()
            optimizer.step()
            total_loss += loss.item() * batch.num_graphs

        total_loss /= len(train_loader.dataset)
        losses.append(total_loss)

        scheduler.step()

        if epoch % 1 == 0:
          test_acc, top_k_acc = test(val_loader, model, is_validation=True)
          print(test_acc)
          test_accs.append(test_acc)
          top_k_accs.append(top_k_acc)
          if test_acc > best_acc:
            best_acc = test_acc
            best_model = copy.deepcopy(model)
        else:
          test_accs.append(test_accs[-1])

    return test_accs, top_k_accs, losses, best_model, best_acc, val_loader

In [24]:
def test(loader, test_model, is_validation=False, save_model_preds=False):
    test_model.eval()

    # Define K for Hit@K metrics.
    k = 20
    correct = 0
    top_k_correct = 0

    for _, data in enumerate(tqdm(loader)):
        data.to(device)
        with torch.no_grad():
            # max(dim=1) returns values, indices tuple; only need indices
            score = test_model(data)
            pred = score.max(dim=1)[1]
            label = data.y

        if save_model_preds:
          data = {}
          data['pred'] = pred.view(-1).cpu().detach().numpy()
          data['label'] = label.view(-1).cpu().detach().numpy()

          df = pd.DataFrame(data=data)
          # Save locally as csv
          df.to_csv('pred.csv', sep=',', index=False)

        correct += pred.eq(label).sum().item()

        # We calculate Hit@K accuracy only at test time.
        if not is_validation:
            score = score.cpu().detach().numpy()
            for row in range(pred.size(0)):
                top_k_pred = np.argpartition(score[row], -k)[-k:]
                if label[row].item() in top_k_pred:
                    top_k_correct += 1

    if not is_validation:
        return correct / len(loader), top_k_correct / len(loader)
    else:
        return correct / len(loader), 0

In [27]:
test_accs, top_k_accs, losses, best_model, best_acc, test_loader = train(args, train_dataset, val_dataset)

print(test_accs, top_k_accs)
print("Maximum test set accuracy: {0}".format(max(test_accs)))
print("Minimum loss: {0}".format(min(losses)))

# plt.title(dataset.name)
plt.plot(losses, label="training loss" + " - ")
plt.plot(test_accs, label="test accuracy" + " - ")
plt.legend()
plt.show()



TypeError: object of type 'NoneType' has no len()