# Graph-Learning-Based Recommender System on MovieLens

### Group 9

- AGARWAL, Sahil
- WEI, Yuanjing
- ZHANG, Yujun yzhanglo@connect.ust.hk

Group project of COMP4222@HKUST in 2022 Fall.

# 1 Environment Configuration

In [5]:
import os
os.path.abspath("")


'/data/yzhanglo'

In [7]:
# change the path in the following
try:
    from google.colab import drive
    drive.mount('/content/drive')
    %cd '/content/drive/MyDrive/4222Group9'
except:
    %cd '/data/yzhanglo/4222project'

import comp4222
import recommenders
%pwd
%ls

/data/yzhanglo/4222project
backup_main.ipynb  [0m[38;5;27mLightGCN[0m/      [38;5;27mml-latest-small[0m/  requirements.txt
[38;5;27mcomp4222[0m/          lightgcn.yaml  movielens.ipynb
[38;5;27mKGAT_folder[0m/       main.ipynb     [38;5;9mmovielens.zip[0m
LICENSE            [38;5;27mml-100k[0m/       [38;5;27mrecommenders[0m/


In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# import tensorflow as tf
import torch
from tensorboardX import SummaryWriter

# easier to print by putting variable as a single line
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# make matplotlib figures appear inline in the notebook rather than in a new window.
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# Pandas display defaults: show at most 10 rows and format floats with two decimals.
pd.set_option("display.max_rows", 10)
pd.set_option("display.float_format", '{:.2f}'.format)
def mask(df, key, function):
  """Index `df` with the result of `function(df[key])`.

  `function` receives the whole `df[key]` column; whatever it returns is used
  directly as the indexer into `df` (typically a boolean Series that keeps the
  matching rows).
  """
  selector = function(df[key])
  return df[selector]

def flatten_cols(df):
  """Collapse multi-level column labels of `df` into single strings, in place.

  Each tuple label (as produced by a MultiIndex) becomes its levels joined by
  a single space, with surrounding whitespace stripped, e.g.
  ("rating", "mean") -> "rating mean" and ("movie_id", "") -> "movie_id".
  Non-string levels are converted with `str` before joining. Labels that are
  not tuples are already flat and are left untouched; without this check a
  plain string label such as "abc" would be split into "a b c".

  Returns the same `df` object so the call can be chained.
  """
  def _flatten(label):
    # MultiIndex columns expose their labels as tuples, one entry per level.
    if isinstance(label, tuple):
      return ' '.join(str(level) for level in label).strip()
    return label

  df.columns = [_flatten(label) for label in df.columns.values]
  return df

# Attach the two helpers defined above as DataFrame methods so they can be
# called as df.mask(key, function) and df.flatten_cols().
# NOTE(review): pandas already ships a DataFrame.mask(cond, other=...) method;
# this assignment replaces it for the whole session. Confirm nothing relies on
# the built-in behaviour, or consider a non-clashing name.
pd.DataFrame.mask = mask
pd.DataFrame.flatten_cols = flatten_cols

# http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
# Reload imported modules automatically before each cell is executed, so edits
# to the project's .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [9]:
# Report which GPU PyTorch will use. torch.cuda.current_device() raises when no
# CUDA device is visible, so it is only queried after the availability check.
cuda_available = torch.cuda.is_available()
cuda_available
if cuda_available:
    device_index = torch.cuda.current_device()
    print(device_index, torch.cuda.get_device_name(device_index))
else:
    print("CUDA is not available; PyTorch will run on the CPU")
# torch.cuda.set_device(0)

True

0 NVIDIA GeForce RTX 2080 Ti


# 2 MovieLens


We're using ml-100k from MovieLens. It contains 100,000 ratings from 943 users on 1,682 movies, and each user has rated at least 20 movies. The data was collected during the seven-month period from September 19th, 1997 through April 22nd, 1998. The README is available [here](https://files.grouplens.org/datasets/movielens/ml-100k-README.txt).

P.S. For now, we use the dataset-loading code provided by the LightGCN and Microsoft Recommenders repositories.

## Data Loading

In [None]:
# Download MovieLens data.
from os import path
from urllib.request import urlretrieve
import zipfile

dataset_name = "ml-latest-small"
archive_path = "movielens.zip"

# Fetch the archive only when it is not already cached in the working directory.
downloaded = False
if not path.exists(archive_path):
    urlretrieve(f"https://files.grouplens.org/datasets/movielens/{dataset_name}.zip", archive_path)
    downloaded = True

# Extract after a fresh download (as before) and also when the archive is
# cached but the dataset folder is missing, e.g. after an interrupted run.
# The `with` block closes the archive handle once extraction is finished.
if downloaded or not path.isdir(dataset_name):
    with zipfile.ZipFile(archive_path, "r") as archive:
        archive.extractall()

In [None]:
# Movie metadata table from the downloaded dataset folder.
movies = pd.read_csv(f"{dataset_name}/movies.csv")

# Genre labels, one per line.
# NOTE(review): presumably the values that occur in the movies table's genre
# column; not used elsewhere in this notebook. Verify before relying on it.
genre_cols = [
    "(no genres listed)",
    "Action",
    "Adventure",
    "Animation",
    "Children",
    "Comedy",
    "Crime",
    "Documentary",
    "Drama",
    "Fantasy",
    "Film-Noir",
    "Horror",
    "Musical",
    "Mystery",
    "Romance",
    "Sci-Fi",
    "Thriller",
    "War",
    "Western",
]
movies

In [None]:
# Load the tags table from the downloaded dataset folder and display it.
tags = pd.read_csv(f"{dataset_name}/tags.csv")
tags

In [None]:
# Load the ratings table from the downloaded dataset folder and display it.
# NOTE(review): the benchmark cells later in this notebook rebind `ratings` to
# the rating-evaluation result (or None), so this frame is lost once they run.
ratings = pd.read_csv(f"{dataset_name}/ratings.csv")
ratings

## Data Exploration

In [None]:
#%pip install altair
import altair as alt
# Use the default data transformer with the row limit disabled (max_rows=None).
alt.data_transformers.enable('default', max_rows=None)
# NOTE(review): the 'colab' renderer is enabled unconditionally, although the
# setup cell also supports a local (non-Colab) run; confirm charts render there.
alt.renderers.enable('colab')

# 3 Preliminaries

There are not many symbols to define at this stage.

# 4 Models Definition

In [5]:
# Fix the random seeds so that our runs are reproducible.
import random

SEED = 42
random.seed(SEED)                 # Python's built-in RNG
np.random.seed(SEED)              # NumPy's global RNG
# torch.manual_seed returns the default Generator; binding it keeps its repr
# out of the cell output.
_default_generator = torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)  # every CUDA device

<torch._C.Generator at 0x7f734df11e10>

## LightGCN

### Tuning alpha_ks

In [9]:
from LightGCN.code.main_lgcn import run_lightgcn
# input 0: original layer-stacking weights, 1: modified layer-stacking weights
# The log printed by this call echoes the argument as "stacking_func: 0".
run_lightgcn(0)

Parameter containing:
tensor([[0.2500],
        [0.2500],
        [0.2500],
        [0.2500]], requires_grad=True)
[0;30;43mstacking_func: 0[0m
[0;30;43muse NORMAL distribution initilizer[0m
loading adjacency matrix
lgn is already to go(dropout:0)
load and save to ./LightGCN/code/checkpoints/lgn-movielens-3-64.pth.tar
we have tensorboard!
[0;30;43m[TEST][0m
{'precision': array([0.01688596, 0.01798246, 0.01758041]), 'recall': array([0.01235403, 0.02833054, 0.04183197]), 'ndcg': array([0.01855914, 0.02388089, 0.02859607])}
EPOCH[1/10] loss0.691-|Sample:0.74|
EPOCH[2/10] loss0.680-|Sample:1.04|
EPOCH[3/10] loss0.626-|Sample:0.70|
EPOCH[4/10] loss0.525-|Sample:0.68|
EPOCH[5/10] loss0.433-|Sample:0.91|
EPOCH[6/10] loss0.371-|Sample:0.66|
EPOCH[7/10] loss0.336-|Sample:0.88|
EPOCH[8/10] loss0.316-|Sample:0.60|
EPOCH[9/10] loss0.308-|Sample:0.65|
EPOCH[10/10] loss0.296-|Sample:1.03|


In [7]:
from LightGCN.code.main_lgcn import run_lightgcn
# Variant 1: the modified layer-stacking weights (the log echoes "stacking_func: 1").
run_lightgcn(1)

[0;30;43mstacking_func: 1[0m
[0;30;43muse NORMAL distribution initilizer[0m
loading adjacency matrix
lgn is already to go(dropout:0)
load and save to ../code/checkpoints/lgn-movielens-3-64.pth.tar
[0;30;43m[TEST][0m
{'precision': array([0.01765351, 0.01699561, 0.01736111]), 'recall': array([0.01354759, 0.02529469, 0.03854851]), 'ndcg': array([0.02188969, 0.02450799, 0.02936132])}
EPOCH[1/10] loss0.658-|Sample:0.82|
EPOCH[2/10] loss0.547-|Sample:0.90|
EPOCH[3/10] loss0.372-|Sample:0.73|
EPOCH[4/10] loss0.297-|Sample:0.78|
EPOCH[5/10] loss0.264-|Sample:0.95|
EPOCH[6/10] loss0.248-|Sample:0.73|
EPOCH[7/10] loss0.239-|Sample:0.79|
EPOCH[8/10] loss0.223-|Sample:0.72|
EPOCH[9/10] loss0.217-|Sample:0.79|
EPOCH[10/10] loss0.212-|Sample:0.77|


In [None]:
# TODO

In [None]:
from LightGCN.code.main_lgcn import run_lightgcn
# NOTE(review): variant 2 is not described anywhere in this notebook and the cell
# has not been executed; document what this stacking option does (TODO).
run_lightgcn(2)

In [None]:
from LightGCN.code.main_lgcn import run_lightgcn
# NOTE(review): variant 3 is not described anywhere in this notebook and the cell
# has not been executed; document what this stacking option does (TODO).
run_lightgcn(3)

# 5 Hyperparameter Tuning

In [10]:
# Benchmark grid for this section: a single algorithm on a single dataset size.
algorithms = ["lightgcn"]
data_sizes = ["100k"]  # Movielens data size: 100k, 1m, 10m, or 20m

## Sanity Check by Overfitting on Small Data

In [13]:
from LightGCN.code.main_lgcn import sanity_check

Parameter containing:
tensor([[0.2500],
        [0.2500],
        [0.2500],
        [0.2500]], requires_grad=True)
[0;30;43mCpp extension not loaded[0m
>>SEED: 2020
[0;30;43mloading [LightGCN/data/movielens][0m
44140 interactions for training
11235 interactions for testing
movielens Sparsity : 0.03502087022514546
movielens is ready to go
{'A_n_fold': 100,
 'A_split': False,
 'alphas': Parameter containing:
tensor([[0.2500],
        [0.2500],
        [0.2500],
        [0.2500]], requires_grad=True),
 'bigdata': False,
 'bpr_batch_size': 64,
 'decay': 0.0001,
 'dropout': 0,
 'keep_prob': 0.6,
 'latent_dim_rec': 64,
 'lightGCN_n_layers': 3,
 'lr': 0.0001,
 'multicore': 0,
 'pretrain': 0,
 'stacking_func': 3,
 'test_u_batch_size': 45}
cores for test: 10
comment: lgn
tensorboard: 1
LOAD: 0
Weight path: ./checkpoints
Test Topks: [20, 40, 60]
using bpr loss


In [12]:
#!pip install pytorch-lightning
from pytorch_lightning import Trainer, seed_everything


# Seed the RNGs through Lightning's helper before the model is built.
seed_everything(42, workers=True)

# sanity_check is imported from LightGCN.code.main_lgcn in the previous cell.
# NOTE(review): assumes it returns a module that Trainer.fit accepts; confirm.
model = sanity_check()
# Train for up to 10000 epochs with overfit_batches=0.01; per the section title,
# the aim is to check that the model can overfit a small amount of data.
# NOTE(review): this rebinds `trainer`, which the benchmark cells below index as
# `trainer[algo]`; running this cell first would break them.
trainer = Trainer(max_epochs=10000, overfit_batches=0.01)
trainer.fit(model)

ModuleNotFoundError: No module named 'pytorch'

## Training

In [None]:
data_sizes = ["100k"] # Movielens data size: 100k, 1m, 10m, or 20m
algorithms = [ "lightgcn"]
%%time

# For each data size and each algorithm, a recommender is evaluated. 
cols = ["Data", "Algo", "K", "Train time (s)", "Predicting time (s)", "RMSE", "MAE", "R2", "Explained Variance", "Recommending time (s)", "MAP", "nDCG@k", "Precision@k", "Recall@k"]
df_results = pd.DataFrame(columns=cols)

for data_size in data_sizes:
    # Load the dataset
    df = movielens.load_pandas_df(
        size=data_size,
        header=[DEFAULT_USER_COL, DEFAULT_ITEM_COL, DEFAULT_RATING_COL, DEFAULT_TIMESTAMP_COL]
    )
    print("Size of Movielens {}: {}".format(data_size, df.shape))
    
    # Split the dataset
    df_train, df_test = python_stratified_split(df,
                                                ratio=0.75, 
                                                min_rating=1, 
                                                filter_by="item", 
                                                col_user=DEFAULT_USER_COL, 
                                                col_item=DEFAULT_ITEM_COL
                                                )
   
    # Loop through the algos
    for algo in algorithms:
        print(f"\nComputing {algo} algorithm on Movielens {data_size}")
        if algo == 'kgat':
            model, data, Ks, device, time_train = train_kgat()
            _, metrics_dict_kgat = evaluate_kgat(model, data, Ks, device)
            print(metrics_dict_kgat)
            # Record results
            #summary = generate_summary('100k', algo, DEFAULT_K, time_train, time_rating, ratings, time_ranking, rankings)
            #df_results.loc[df_results.shape[0] + 1] = summary
            
        else:
            # Data prep for training set
            train = prepare_training_data.get(algo, lambda x,y:(x,y))(df_train, df_test)
            
            # Get model parameters
            model_params = params[algo]
            
            # Train the model
            model, time_train = trainer[algo](model_params, train)
            print(f"Training time: {time_train}s")
                    
            # Predict and evaluate
            train, test = prepare_metrics_data.get(algo, lambda x,y:(x,y))(df_train, df_test)
            
            if "rating" in metrics[algo]:   
                # Predict for rating
                preds, time_rating = rating_predictor[algo](model, test)
                print(f"Rating prediction time: {time_rating}s")
                
                # Evaluate for rating
                ratings = rating_evaluator[algo](test, preds)
            else:
                ratings = None
                time_rating = np.nan
            
            if "ranking" in metrics[algo]:
                # Predict for ranking
                top_k_scores, time_ranking = ranking_predictor[algo](model, test, train)
                print(f"Ranking prediction time: {time_ranking}s")
                
                # Evaluate for rating
                rankings = ranking_evaluator[algo](test, top_k_scores, DEFAULT_K)
            else:
                rankings = None
                time_ranking = np.nan
                
            # Record results
            summary = generate_summary(data_size, algo, DEFAULT_K, time_train, time_rating, ratings, time_ranking, rankings)
            df_results.loc[df_results.shape[0] + 1] = summary
        
print("\nComputation finished")

## Training Plot

Click "Launch TensorBoard Session" in `main_lgcn.py` to view the training plots.

# 6 Comparisons on Movielens and Movie

In [None]:
# Benchmark grid for the comparison: every listed algorithm on every listed size.
algorithms = ["als", "ncf", "lightgcn", "kgat"]  # use ["lightgcn"] for a single-model run
data_sizes = ["100k", "1m"]  # Movielens data size: 100k, 1m, 10m, or 20m

In [None]:
%%time

# For each data size and each algorithm, a recommender is evaluated and its
# summary is appended as one row of df_results.
# NOTE(review): movielens, python_stratified_split, the DEFAULT_* constants, params,
# trainer, metrics, the prepare_*/predictor/evaluator lookup tables, generate_summary
# and train_kgat/evaluate_kgat are not defined in the visible part of this notebook;
# they must be imported or defined before this cell is run.
cols = ["Data", "Algo", "K", "Train time (s)", "Predicting time (s)", "RMSE", "MAE", "R2", "Explained Variance", "Recommending time (s)", "MAP", "nDCG@k", "Precision@k", "Recall@k"]
df_results = pd.DataFrame(columns=cols)

for data_size in data_sizes:
    # Load the dataset
    df = movielens.load_pandas_df(
        size=data_size,
        header=[DEFAULT_USER_COL, DEFAULT_ITEM_COL, DEFAULT_RATING_COL, DEFAULT_TIMESTAMP_COL]
    )
    print("Size of Movielens {}: {}".format(data_size, df.shape))
    
    # Split the dataset
    df_train, df_test = python_stratified_split(df,
                                                ratio=0.75, 
                                                min_rating=1, 
                                                filter_by="item", 
                                                col_user=DEFAULT_USER_COL, 
                                                col_item=DEFAULT_ITEM_COL
                                                )
   
    # Loop through the algos
    for algo in algorithms:
        print(f"\nComputing {algo} algorithm on Movielens {data_size}")
        if algo == 'kgat':
            # KGAT uses its own train/evaluate entry points. train_kgat() takes no
            # arguments here, so it does not receive data_size or the split above.
            # Its metrics are only printed; no row is added to df_results.
            model, data, Ks, device, time_train = train_kgat()
            _, metrics_dict_kgat = evaluate_kgat(model, data, Ks, device)
            print(metrics_dict_kgat)
            # Record results
            #summary = generate_summary('100k', algo, DEFAULT_K, time_train, time_rating, ratings, time_ranking, rankings)
            #df_results.loc[df_results.shape[0] + 1] = summary
            
        else:
            # Data prep for training set; algorithms without an entry fall back to
            # a function that returns (df_train, df_test) unchanged.
            train = prepare_training_data.get(algo, lambda x,y:(x,y))(df_train, df_test)
            
            # Get model parameters
            model_params = params[algo]
            
            # Train the model
            model, time_train = trainer[algo](model_params, train)
            print(f"Training time: {time_train}s")
                    
            # Predict and evaluate; the same identity fallback applies here.
            train, test = prepare_metrics_data.get(algo, lambda x,y:(x,y))(df_train, df_test)
            
            if "rating" in metrics[algo]:   
                # Predict for rating
                preds, time_rating = rating_predictor[algo](model, test)
                print(f"Rating prediction time: {time_rating}s")
                
                # Evaluate for rating
                # NOTE(review): rebinds the notebook-level `ratings` name used for the ratings table.
                ratings = rating_evaluator[algo](test, preds)
            else:
                ratings = None
                time_rating = np.nan
            
            if "ranking" in metrics[algo]:
                # Predict for ranking
                top_k_scores, time_ranking = ranking_predictor[algo](model, test, train)
                print(f"Ranking prediction time: {time_ranking}s")
                
                # Evaluate for ranking
                rankings = ranking_evaluator[algo](test, top_k_scores, DEFAULT_K)
            else:
                rankings = None
                time_ranking = np.nan
                
            # Record results as the next row of df_results (row labels start at 1).
            summary = generate_summary(data_size, algo, DEFAULT_K, time_train, time_rating, ratings, time_ranking, rankings)
            df_results.loc[df_results.shape[0] + 1] = summary
        
print("\nComputation finished")

## Print the result summary

In [None]:
df_results

# 7 Credit and Reference

1. https://github.com/microsoft/recommenders