In [1]:
import torch
import numpy as np
import pandas as pd
import torch.nn.functional as F

from torch_geometric.loader import DataLoader
from torch_geometric.data import Data, InMemoryDataset

In [2]:
PROJECT_DIR = "/home2/vishva.saravanan/projects/GINConv"

# TASK 1: Data Preprocessing
**input**: None  
**output**: preprocessed dataframe

- find and load dataset from file
- preprocess it for easy extraction of features

In [3]:
### DELETE THIS CELL ###

"""
sample dataset
    link: https://www.kaggle.com/chadgostopp/recsys-challenge-2015
    files: yoochoose-buys.dat, yoochoose-clicks.dat
"""

from sklearn.preprocessing import LabelEncoder

# read a sample dataset from file
df = pd.read_csv(f"{PROJECT_DIR}/yoochoose-clicks.dat", header=None)
df.columns = ["session_id", "timestamp", "item_id", "category"]
buy_df = pd.read_csv(f"{PROJECT_DIR}/yoochoose-buys.dat", header=None)
buy_df.columns = ["session_id", "timestamp", "item_id", "price", "quantity"]

# encode item ids as labels
item_encoder = LabelEncoder()
df["item_id"] = item_encoder.fit_transform(df.item_id)

# subsample data
sampled_session_id = np.random.choice(df.session_id.unique(), 10000, replace=False)
df = df.loc[df.session_id.isin(sampled_session_id)]

# to determine the ground truth, we simply check if a session_id in `clicks` is also present in `buys`
df["label"] = df.session_id.isin(buy_df.session_id)

# preview processed data
df.head()

### DELETE THIS CELL ###

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,session_id,timestamp,item_id,category,label
824,257,2014-04-06T14:59:22.715Z,39928,0,False
825,257,2014-04-06T14:59:36.535Z,39932,0,False
826,257,2014-04-06T15:01:10.731Z,39940,0,False
4954,1448,2014-04-02T10:47:50.884Z,22965,0,False
4955,1448,2014-04-02T10:48:22.612Z,45480,0,False


# TASK 2: Dataset Construction
**input**: dataframe  
**output**: train_dataset, val_dataset, test_dataset

- fit it to the required format (implement these methods: raw_file_names, processed_file_names, download, process)
- split it into the required train, validation and testing sets

In [4]:
### DELETE THIS CELL ###

from tqdm import tqdm

class YooChooseBinaryDataset(InMemoryDataset):
    def __init__(self, root, transform=None, pre_transform=None):
        super(YooChooseBinaryDataset, self).__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        return []

    @property
    def processed_file_names(self):
        return [f"{PROJECT_DIR}/yoochoose_click_binary_1M_sess.dataset"]

    def download(self):
        pass

    def process(self):
        data_list = []

        # process by session_id
        grouped = df.groupby('session_id')
        for session_id, group in tqdm(grouped):     # tqdm is just a progress bar
            sess_item_id = LabelEncoder().fit_transform(group.item_id)
            group = group.reset_index(drop=True)    # TODO: what does this do?
            group["sess_item_id"] = sess_item_id
            
            # TODO: wtaf does this do
            node_features = group.loc[group.session_id==session_id, ["sess_item_id", "item_id"]].sort_values("sess_item_id").item_id.drop_duplicates().values

            # converts into a column vector
            node_features = torch.LongTensor(node_features).unsqueeze(1)

            # TODO: wat dis
            target_nodes = group.sess_item_id.values[1:]
            source_nodes = group.sess_item_id.values[:-1]

            edge_index = torch.tensor([source_nodes, target_nodes], dtype=torch.long)
            x = node_features
            y = torch.FloatTensor([group.label.values[0]])

            data = Data(x=x, y=y, edge_index=edge_index)
            data_list.append(data)
        
        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])

### DELETE THIS CELL ###

In [5]:
### DELETE THIS CELL ###

dataset = YooChooseBinaryDataset(root=PROJECT_DIR)
train_dataset = dataset[:8000]
val_dataset = dataset[8000:9000]
test_dataset = dataset[9000:]

### DELETE THIS CELL ###

# TASK 3: Building the GINConv Layer
**input**: None  
**output**: GINConv class that can replace the built in torch_geometric.nn.GINConv

- implement the GINConv layer as described in the paper
- do NOT use the MessagePassing parent class (I guess)

In [6]:
### DELETE THIS CELL ###

from torch_geometric.nn import MessagePassing
from torch_geometric.utils import remove_self_loops, add_self_loops

class GINConv(MessagePassing):
    def __init__(self, in_channels, out_channels):
        super(GINConv, self).__init__(aggr="max")
        self.lin = torch.nn.Linear(in_channels, out_channels)
        self.act = torch.nn.ReLU()
        self.update_lin = torch.nn.Linear(in_channels + out_channels, in_channels, bias=False)
        self.update_act = torch.nn.ReLU()
    
    def forward(self, x, edge_index):
        edge_index, _ = remove_self_loops(edge_index)
        edge_index, _ = add_self_loops(edge_index, num_nodes=x.size(0))
        
        return self.propagate(edge_index, size=(x.size(0), x.size(0)), x=x)

    def message(self, x_j):
        x_j = self.lin(x_j)
        x_j = self.act(x_j)

        return x_j
    
    def update(self, aggr_out, x):
        new_embedding = torch.cat([aggr_out, x], dim=1)
        new_embedding = self.update_lin(new_embedding)
        new_embedding = self.update_act(new_embedding)

        return new_embedding

### DELETE THIS CELL ###

# TASK 4: Building the Network
**input**: GINConv, pooling and FC layers, torch functions (F)  
**output**: Net class with the neural network structure (implement these methods: forward)

- implement the network as described in the paper

In [7]:
### DELETE THIS CELL ###

from torch_geometric.nn import TopKPooling
from torch_geometric.nn import global_mean_pool as gap, global_max_pool as gmp

embedding_dim = 128     # TODO: what's this for? what is item embedding?

class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()

        self.conv1 = GINConv(embedding_dim, 128)
        self.pool1 = TopKPooling(128, ratio=0.8)
        self.conv2 = GINConv(128, 128)
        self.pool2 = TopKPooling(128, ratio=0.8)
        self.conv3 = GINConv(128, 128)
        self.pool3 = TopKPooling(128, ratio=0.8)

        self.item_embedding = torch.nn.Embedding(num_embeddings=df.item_id.max() + 1, embedding_dim=embedding_dim)

        self.lin1 = torch.nn.Linear(256, 128)
        self.act1 = torch.nn.ReLU()
        self.lin2 = torch.nn.Linear(128, 64)
        self.act2 = torch.nn.ReLU()
        self.lin3 = torch.nn.Linear(64, 1)

        self.bn1 = torch.nn.BatchNorm1d(128)    # TODO: what's Batch Norm 1d
        self.bn2 = torch.nn.BatchNorm1d(64)

    def forward(self, data):
        x, edge_index, batch = data.x, data.edge_index, data.batch
        x = self.item_embedding(x)
        x = x.squeeze(1)    # removes all 1s from x

        # 1st conv layer
        x = F.relu(self.conv1(x, edge_index))   # TODO: why relu-ing the output?

        # 1st pooling layer
        x, edge_index, _, batch, _, _ = self.pool1(x, edge_index, None, batch)
        x1 = torch.cat([gmp(x, batch), gap(x, batch)], dim=1)

        # 2nd conv layer
        x = F.relu(self.conv2(x, edge_index))

        # 2nd pooling layer
        x, edge_index, _, batch, _, _ = self.pool2(x, edge_index, None, batch)
        x2 = torch.cat([gmp(x, batch), gap(x, batch)], dim=1)

        # 3rd conv layer
        x = F.relu(self.conv3(x, edge_index))

        # 3rd pooling layer
        x, edge_index, _, batch, _, _ = self.pool3(x, edge_index, None, batch)
        x3 = torch.cat([gmp(x, batch), gap(x, batch)], dim=1)

        x = x1 + x2 + x3

        # first n-1 fully connected layers
        x = self.lin1(x)
        x = self.act1(x)
        x = self.lin2(x)
        x = self.act2(x)

        # dropout regularization
        x = F.dropout(x, p=0.5, training=self.training)

        # final fully connected layer with sigmoid activation and no 1s
        x = torch.sigmoid(self.lin3(x)).squeeze(1)
        
        return x

### DELETE THIS CELL ###

# TASK 5: Training & Validating the Network
**input**: network model  
**output**: trained network model

- code to train, validate and test the model for variable number of epochs

In [15]:
# hyperparameters and training config
device = torch.device("cpu")   # torch.device("cuda" if torch.cuda.is_available() else "cpu")
lr = 0.005
batch_size = 256
num_epochs = 1

In [9]:
# model configuration
model = Net().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
loss_fn = torch.nn.BCELoss()

# instantiate dataloaders
train_loader = DataLoader(train_dataset, batch_size=batch_size)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [10]:
# function to train the model for 1 epoch
def train(model, dataloader, optimizer, loss_fn):
    model.train()

    total_loss = 0
    for data in dataloader:
        data = data.to(device)
        optimizer.zero_grad()
        output = model(data)
        label = data.y.to(device)
        loss = loss_fn(output, label)
        loss.backward()
        total_loss += data.num_graphs * loss.item()
        optimizer.step()

    return total_loss / len(train_dataset)

In [11]:
from sklearn.metrics import roc_auc_score

# function to validate performance of model on a dataset
def evaluate(model, dataloader):
    model.eval()

    predictions = []
    labels = []

    with torch.no_grad():
        for data in dataloader:
            data = data.to(device)
            pred = model(data).detach().cpu().numpy()
            label = data.y.detach().cpu().numpy()
            predictions.append(pred)
            labels.append(label)

    predictions = np.hstack(predictions)
    labels = np.hstack(labels)

    return roc_auc_score(labels, predictions)

In [12]:
# train the model
for epoch in range(num_epochs):
    loss = train(model, train_loader, optimizer, loss_fn)
    train_auc = evaluate(model, train_loader)
    val_auc = evaluate(model, val_loader)
    test_auc = evaluate(model, test_loader)
    print(f"Epoch: {epoch+1:03d}, Loss: {loss:.5f}, Train Auc: {train_auc:.5f}, Val Auc: {val_auc:.5f}, Test Auc: {test_auc:.5f}")

Epoch: 001, Loss: 0.28952, Train Auc: 0.66500, Val Auc: 0.36370, Test Auc: 0.39197
