In [262]:
from pathlib import Path
import collections
import math
import os
import os.path as osp
from tqdm import tqdm
from typing import List
import random
import time
import zipfile

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
pd.options.display.max_rows = 10
from sklearn import metrics
from tensorly import decomposition

import numpy as np
import pandas as pd
import torch
from torch.functional import tensordot
from torch import nn, optim, Tensor
import torch_geometric
from torch_geometric.data import Dataset, Data
from torch_geometric.nn import MessagePassing
from torch_geometric.typing import Adj

In [218]:
BASE_PATH = Path('data/movie-lens/ml-1m')
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
RATING_THRESHOLD = 3.
N_USERS = 200
N_ITEMS = 500
EMBEDDING_DIM = 64

In [168]:
tmp = pd.read_csv(
    BASE_PATH/'users.dat',
    sep='::',
    header=None,
    engine='python',
    encoding='latin-1',
    # usecols=[0, 1, 2],
    # names=['a', 'b', 'c'],
)

In [92]:
tmp.head()

Unnamed: 0,0,1,2,3,4
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [254]:
def binarize(data, thresh):
    ratings = data['adj_mat']
    ratings[(ratings < thresh)] = 0
    ratings[(ratings >= thresh)] = 1
    data['adj_mat'] = ratings
    return data

class MovieLensDataset(Dataset):
    def __init__(self, root_dir, transform=None, n_users=100, n_items=200):
        self.root_dir = root_dir
        self.n_users = n_users
        self.n_items = n_items
        self.transform = transform
        self._load()
        self._to_graph()
        
    def _read_table(self, path, cols, nrows=None, usecols=None):
        df = pd.read_table(
            path,
            sep='::',
            header=None,
            engine='python',
            encoding='latin-1',
            usecols=usecols,
            names=cols,
            nrows=nrows,
        )
        return df
        
    def _load(self):
        self.movies = self._read_table(
            path=self.root_dir/'movies.dat',
            cols=['movie_id', 'title', 'genres'],

        )
        self.users = self._read_table(
            path=self.root_dir/'users.dat',
            cols=['user_id', 'gender', 'age', 'occupation', 'zip'],
            nrows=self.n_users,

        )
        self.ratings = self._read_table(
            path=self.root_dir/'ratings.dat',
            usecols=[0, 1, 2],
            cols=['user_id', 'movie_id', 'rating'],

        )
        self.df = pd.merge(
            pd.merge(self.ratings, self.users), 
            self.movies,
        )
    
    def __getitem__(self, idx):
        assert idx <= self.data.shape[0], 'Index out of range'
        return self.data.iloc[idx, :]
    
    def _to_graph(self):
        adj_mat = pd.pivot_table(
            data=self.ratings, 
            index='user_id',
            columns='movie_id',
            values='rating',
        )
        adj_mat = adj_mat.fillna(0)
        adj_mat = torch.tensor(adj_mat.values, device=DEVICE)
        
        self.n_users, self.n_items = adj_mat.shape
        
        self.data = Data(
            adj_mat=adj_mat,
            raw_edge_index=adj_mat.clone(),
            ratings=self.ratings,
            users=self.users['user_id'],
            items=self.movies['movie_id'],
        )
        
        if self.transform:
            self.data = self.transform(self.data)
            
    def _split(self, ratio=0.8):
        n_edges = self.n_users * self.n_items
        # why?
        num_train_replaced = round((1-ratio) * n_edges)
        num_val_show = round((1-ratio) * n_edges)

        user_mask = np.random.randint(0, self.n_users, num_train_replaced)
        movie_mask = np.random.randint(0, self.n_items, num_train_replaced)
        
        val_user_mask = np.random.choice(user_mask, num_val_show)
        val_movie_mask = np.random.choice(movie_mask, num_val_show)

        train_mask = torch.ones(self.n_users, self.n_items)
        train_mask[user_mask, movie_mask] = 0

        val_mask = train_mask.clone()
        val_mask[val_user_mask, val_movie_mask] = 1

        test_mask = torch.ones_like(train_mask)

        return train_mask, val_mask, test_mask

In [308]:
class LightGCNConv(MessagePassing):
    def __init__(self, n_users, n_items, **kwargs):
        super().__init__(**kwargs)
        self.n_users = n_users
        self.n_items = n_items
        
    def forward(self, x, edge_index):
        # sparse matrix to adjacency matrix : users X items
        adj_mat = torch.zeros(self.n_users, self.n_items, device=x.device)
        adj_mat[edge_index[:, 0], edge_index[:, 1]] = 1
        
        user_neighbour_count = adj_mat.sum(axis=1)
        item_neighbout_count = adj_mat.sum(axis=0)
        
        weights = adj_mat / torch.sqrt(
            user_neighbour_count.repeat(self.n_items, 1).T * item_neightbor_counts.repeat(self.n_users, 1),
        )
        weights = torch.nan_to_num(weights, nan=0)
        
        user_embeddings = x[:self.n_users]
        item_embeddings = x[self.n_users:]
        out= torch.concat(
            (weights.T @ user_embeddings, weights @ item_embeddings),
            axis=0,
        )
        return out

In [309]:
class LightGCN(nn.Module):
    def __init__(self, n_users, n_items, n_layers, embed_dim):
        super().__init__()
        self.n_users = n_users
        self.n_items = n_items
        self.embed_dim = embed_dim
        self.n_layers = n_layers
        
        self.embeddings = nn.Embedding(
            num_embeddings=n_users + n_items,
            embedding_dim=embed_dim,
        )
        
        # experiment: try xavier initialization?
        nn.init.normal_(self.embeddings.weight, std=0.1)
        self.sigmoid = nn.Sigmoid()
        
        self.convs = nn.ModuleList()
        self.convs.append(
            LightGCNConv(
                n_users=n_users,
                n_items=n_items,
            )
        )
        
        for i in range(1, n_layers):
            self.convs.append(
                LightGCNConv(
                    n_users=n_users,
                    n_items=n_items,
                )
            )
            
        self.device = DEVICE
        self.convs.to(DEVICE)
        
    def reset_params(self):
        for conv in self.convs:
            conv.reset_parameters()
            
    def forward(self, x, edge_index):
        embed_lis = []
        print(f'edge_index: {edge_index.shape}')
        # adjacency matrix to sparse 
        edge_index = torch.nonzero(edge_index)
        for i in range(self.n_layers):
            print(f'embed before: {x.shape}')
            x = self.convs[i](x, edge_index)
            print(f'embed after: {x.shape}')
            if self.device is not None:
                x = x.to(self.device)
            embed_lis.append(x)
        embed_lis = torch.stack(embed_lis)
        
        self.alpha = 1 / (1 + self.n_layers) * torch.ones(embed_lis.shape)
        if self.device is not None:
            self.alpha = self.alpha.to(self.device)
            embed_lis = embed_lis.to(self.device)
            
        # sum along K layers
        x = (embed_lis * self.alpha).sum(dim=0)  
        return x

In [310]:
def transform_ratings(data):
    return binarize(data, thresh=RATING_THRESHOLD)

def get_user_rating(model, users, data):
    embeddings = model(
        model.embeddings.weight.clone(),
        data['adj_mat'],
    )
    user_embeddings = embeddings[:len(data['users'])]
    item_embeddings = embeddings[len(data['users']):]
    user_embeddings = user_embeddings[users.long()]
    rating = model.f(torch.matmul(user_embeddings, item_embeddings.t()))
    return rating

def get_embedding(model, users, pos, neg, data, mask):
    n_user = len(data['users'])
    embeddings = model(
        model.embeddings.weight.clone(),
        data['adj_mat'] * mask,
    )
    user_embeddings = embeddings[:len(data['users'])]
    item_embeddings = embeddings[len(data['users']):]
    
    user_embeddings = user_embeddings[users]
    pos_embeddings = user_embeddings[pos]
    neg_embeddings = user_embeddings[neg]
    
    users_emb_ego = model.emb(users)
    pos_emb_ego = model.embeddings(pos + n_user)
    neg_emb_ego = model.embeddings(neg + n_user)
    
    return user_embeddings, pos_embeddings, neg_embeddings, users_emb_ego, pos_emb_ego, neg_emb_ego

def _sample_pos_neg(data, mask, num_samples_per_user):
    samples = []
    all_items = set(range(len(data['items'])))
    for user_index, user in enumerate(data['users']):
        pos_items = set(
            torch.nonzero(data['adj_mat'][user_index])[:, 0].tolist(),
        )
        unknown_items = all_items.difference(
                set(
                    torch.nonzero(
                        data['raw_edge_index'][user_index],
                    )[:, 0].tolist(),
                ),
        )
        neg_items = all_items.difference(
            set(pos_items),
        ).difference(set(unknown_items))
        
        unmasked_items = set(torch.nonzero(mask[user_index])[:, 0].tolist())
        
        if len(unknown_items.union(pos_items)) == 0 or len(unknown_items.union(neg_items)) == 0:
            continue
            
        for _ in range(num_samples_per_user):
            if len(pos_items.intersection(unmasked_items)) == 0:
                pos_item_index = random.choice(
                    list(unknown_items.intersection(unmasked_items)))
            else:
                pos_item_index = random.choice(
                    list(pos_items.intersection(unmasked_items)))
            if len(neg_items.intersection(unmasked_items)) == 0:
                neg_item_index = random.choice(
                    list(unknown_items.intersection(unmasked_items)))
            else:
                neg_item_index = random.choice(
                    list(neg_items.intersection(unmasked_items)))
            samples.append((user_index, pos_item_index, neg_item_index))

    return torch.tensor(samples, dtype=torch.int32)

def sample_pos_neg(data, train_mask, val_mask, test_mask, num_samples_per_user):
    train_samples = _sample_pos_neg(data, train_mask, num_samples_per_user)
    val_samples = _sample_pos_neg(data, val_mask, num_samples_per_user)
    test_samples = _sample_pos_neg(data, test_mask, num_samples_per_user)
    return train_samples, val_samples, test_samples

def bpr_loss(model, users, pos, neg, data, mask):
    assert len(users) == len(pos) and len(users) == len(neg)
    (users_emb, pos_emb, neg_emb, 
    userEmb0,  posEmb0, negEmb0) = get_embedding(model, users.long(), pos.long(),
                                                neg.long(), data, mask)
    reg_loss = (1/2)*(userEmb0.norm(2).pow(2) + 
                        posEmb0.norm(2).pow(2)  +
                        negEmb0.norm(2).pow(2))/float(len(users))
    pos_scores = torch.mul(users_emb, pos_emb)
    pos_scores = torch.sum(pos_scores, dim=1)
    neg_scores = torch.mul(users_emb, neg_emb)
    neg_scores = torch.sum(neg_scores, dim=1)
    
    loss = torch.mean(torch.nn.functional.softplus(neg_scores - pos_scores))
    
    return loss, reg_loss

def personalized_topk(pred, K, user_indices, edge_index):
    per_user_preds = collections.defaultdict(list)
    for index, user in enumerate(user_indices):
        per_user_preds[user.item()].append(pred[index].item())
    precisions = 0.0
    recalls = 0.0
    for user, preds in per_user_preds.items():
        while len(preds) < K:
            preds.append(random.choice(range(edge_index.shape[1])))
        top_ratings, top_items = torch.topk(torch.tensor(preds), K)
        correct_preds = edge_index[user, top_items].sum().item()
        total_pos = edge_index[user].sum().item()
        precisions += correct_preds / K
        recalls += correct_preds / total_pos if total_pos != 0 else 0
    num_users = len(user_indices.unique())
    return precisions / num_users, recalls / num_users

In [255]:
ds = MovieLensDataset(
    root_dir=BASE_PATH,
    transform=transform_ratings,
    n_users=N_USERS,
    n_items=N_ITEMS,
)
train_mask, val_mask, test_mask = ds._split()

In [304]:
config_dict = {
    'num_samples_per_user': 500,
    'num_users': 200,

    'epochs': 100,
    'batch_size': 128,
    'lr': 0.001,
    'weight_decay': 0.1,

    'embedding_size': 64,
    'num_layers': 5,
    'K': 10,
    'mf_rank': 8,

    'minibatch_per_print': 100,
    'epochs_per_print': 1,

    'val_frac': 0.2,
    'test_frac': 0.1,

    'model_name': 'model.pth'
}
n_users = len(ds.data['users'].unique())
n_items = len(ds.data['items'].unique())
print(f'#Users: {n_users}')
print(f'#Items: {n_items}')

#Users: 200
#Items: 3883


In [266]:
samples_train, samples_val, samples_test = sample_pos_neg(
    ds.data, train_mask,
    val_mask, test_mask,
    500,
)

In [311]:
model = LightGCN(
    n_users=n_users,
    n_items=n_items,
    n_layers=5,
    embed_dim=64,
)
model.to(DEVICE)

samples_train=samples_train.to(DEVICE)
samples_val=samples_val.to(DEVICE)
samples_test=samples_test.to(DEVICE)
train_mask=train_mask.to(DEVICE)
val_mask=val_mask.to(DEVICE)
test_mask=test_mask.to(DEVICE)
data = ds.data.to(DEVICE)

num_samples_per_user = config_dict["num_samples_per_user"]
epochs = config_dict["epochs"]
batch_size = config_dict["batch_size"]
lr = config_dict["lr"]
weight_decay = config_dict["weight_decay"]

K = config_dict["K"]

print(f'#Training samples: {len(samples_train)}',
      f'#Validation samples: {len(samples_val)}',
      f'#Test samples: {len(samples_test)}')

optimizer = optim.Adam(model.parameters(), lr=0.001)
print('Optimizer:', optimizer)

#Training samples: 100000 #Validation samples: 100000 #Test samples: 100000
Optimizer: Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.001
    maximize: False
    weight_decay: 0
)


In [312]:
epochs_tracked = []
train_topks = []
val_topks = []

for epoch in range(epochs):
    print("Training on the {} epoch".format(epoch))
    model.train()
    loss_sum = 0
    # Shuffle the order of rows.
    samples_train = samples_train[torch.randperm(samples_train.size()[0])]
    for batch_idx in range(math.ceil(len(samples_train) / batch_size)):
        optimizer.zero_grad()

        current_batch = samples_train[batch_idx*batch_size: (batch_idx+1)*batch_size]
        # Shuffle the order of rows.
        current_batch = current_batch[torch.randperm(current_batch.size()[0])]
        users = current_batch[:, 0]
        pos = current_batch[:, 1]
        neg = current_batch[:, 2]

        loss, reg_loss = bpr_loss(
            model, users, 
            pos, neg, 
            data, train_mask,
        )
        reg_loss = reg_loss * weight_decay
        loss = loss + reg_loss
        loss_sum += loss.detach()

        loss.backward()
        optimizer.step()

        if batch_idx % config_dict["minibatch_per_print"] == 0:
            all_users = torch.linspace(start=0,
                                       end=n_users - 1, steps=n_users).long()
            user_indices = current_batch[:, 0]
            user_indices = user_indices.repeat(2).long()
            item_indices = torch.cat(
                (current_batch[:, 1], current_batch[:, 2])).long()
            pred = get_user_rating(
                model,
                all_users,
                data,
            )[user_indices, item_indices]
            truth = data['edge_index'][user_indices, item_indices]
            topk_precision, topk_recall = personalized_topk(
                pred, K, user_indices, data["edge_index"],
            )

            print("Training on epoch {} minibatch {}/{} completed\n".format(epoch, batch_idx+1,
                                                                            math.ceil(len(samples_train) / batch_size)),
                  "bpr_loss on current minibatch is {}, and regularization loss is {}.\n".format(round(float(loss.detach().cpu()), 6),
                                                                                                 round(float(reg_loss.detach().cpu()), 6)),
                  "Top K precision = {}, recall = {}.".format(topk_precision, topk_recall))

    if epoch % config_dict["epochs_per_print"] == 0:
        epochs_tracked.append(epoch)

        # evaluation on both the trainisng and validation set
        model.eval()
        # predict on the training set
        users = samples_train[:, 0:1]
        user_indices = samples_train[:, 0]
        user_indices = user_indices.repeat(2).long()
        item_indices = torch.cat(
            (samples_train[:, 1], samples_train[:, 2])).long()
        pred = get_user_rating(
            model,
            users[:,0],
            data,
        )[user_indices, item_indices]
        truth = data["edge_index"][users.long()[:,0]][user_indices, item_indices]
        train_topk_precision, train_topk_recall = personalized_topk(pred, K, user_indices, data["edge_index"])
        train_topks.append((train_topk_precision, train_topk_recall))

        # predict on the validation set
        users_val = samples_val[:, 0:1]
        pos_val = samples_val[:, 1:2]
        neg_val = samples_val[:, 2:3]

        loss_val, reg_loss_val = bpr_loss(
            model, users_val, pos_val, neg_val, data, val_mask,
        )
        reg_loss_val = reg_loss_val * weight_decay

        # predict on the validation set
        user_indices = samples_val[:, 0]
        user_indices = user_indices.repeat(2).long()
        item_indices = torch.cat((samples_val[:, 1], samples_val[:, 2])).long()
        pred_val = get_user_rating(
            model,
            users_val[:,0],
            data,
        )[user_indices, item_indices]
        truth_val = data["edge_index"][users_val.long()[:,0]][user_indices, item_indices]
        val_topk_precision, val_topk_recall = personalized_topk(
            pred_val, K, user_indices, data["edge_index"],
        )
        val_topks.append((val_topk_precision, val_topk_recall))

        print("\nTraining on {} epoch completed.\n".format(epoch),
              "Average bpr_loss on train set is {} for the current epoch.\n".format(round(float(loss_sum/len(samples_train)), 6)),
              "Training top K precision = {}, recall = {}.\n".format(train_topk_precision, train_topk_recall),
              "Average bpr_loss on the validation set is {}, and regularization loss is {}.\n".format(round(float((loss_val+reg_loss_val)/len(samples_val)), 6),
                                                                                                      round(float(reg_loss_val/len(samples_val)), 6)),
              "Validation top K precision = {}, recall = {}.\n".format(val_topk_precision, val_topk_recall))

Training on the 0 epoch
edge_index: torch.Size([6040, 3706])
embed before: torch.Size([4083, 64])


IndexError: index 3079 is out of bounds for dimension 0 with size 200