LightGCN: Simplifying and Powering Graph Convolution Network for Recommendation
- Link: [Paper](https://arxiv.org/pdf/2002.02126.pdf)

In [1]:
import sys
sys.path.append('../../')
import random
import time
from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import torch
from torch import optim
from torch_sparse import SparseTensor

from helper.lite_gcn_utils import *
from models.lite_gcn import LightGCN

In [2]:
BASE_PATH = Path('../../raw_data/movie-lens/ml-latest-small')
RATING_THRESHOLD = 4.

In [3]:
df_movies = pd.read_csv(BASE_PATH/'movies.csv', index_col='movieId')
df_links = pd.read_csv(BASE_PATH/'links.csv')
df_tags = pd.read_csv(BASE_PATH/'tags.csv')
df_ratings = pd.read_csv(BASE_PATH/'ratings.csv', index_col='userId')

In [4]:
user_map, new_user_map = map_to_index(df_ratings)
item_map, new_item_map = map_to_index(df_movies)
edge_index = get_edges(
    df=df_ratings,
    edge_col='rating',
    item_col='movieId',
    user_map=user_map,
    item_map=item_map,
    thresh=RATING_THRESHOLD,
)

num_users, num_movies = len(user_map), len(item_map)
num_interactions = edge_index.shape[1]

all_indices = [i for i in range(num_interactions)]
train_indices, test_indices = train_test_split(
    all_indices, test_size=0.2, random_state=1,
)
val_indices, test_indices = train_test_split(
    test_indices, test_size=0.5, random_state=1,
)

train_edge_index = edge_index[:, train_indices]
val_edge_index = edge_index[:, val_indices]
test_edge_index = edge_index[:, test_indices]

train_sparse_edge_index = SparseTensor(
    row=train_edge_index[0], col=train_edge_index[1], 
    sparse_sizes=(
        num_users + num_movies, num_users + num_movies,
    ),
)
val_sparse_edge_index = SparseTensor(
    row=val_edge_index[0], col=val_edge_index[1],
    sparse_sizes=(
        num_users + num_movies, num_users + num_movies,
    ),
)
test_sparse_edge_index = SparseTensor(
    row=test_edge_index[0], col=test_edge_index[1],
    sparse_sizes=(
        num_users + num_movies, num_users + num_movies,
    ),
)

In [5]:
EMBEDDING_DIM = 64
ITERATIONS = 20000
BATCH_SIZE = 1024
LR = 1e-3
ITERS_PER_EVAL = 200
ITERS_PER_LR_DECAY = 200
K = 20
LAMBDA = 1e-6

model = LightGCN(
    num_users, 
    num_movies,
    emb_dim=EMBEDDING_DIM,
    num_layers=5,
)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device {device}')

model = model.to(device)
model.train()

optimizer = optim.Adam(model.parameters(), lr=LR)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.95)

edge_index = edge_index.to(device)
train_edge_index = train_edge_index.to(device)
train_sparse_edge_index = train_sparse_edge_index.to(device)

val_edge_index = val_edge_index.to(device)
val_sparse_edge_index = val_sparse_edge_index.to(device)

Using device cpu


In [6]:
train_losses = []
val_losses = []

for i in range(ITERATIONS):
    users_emb_final, users_emb_0, items_emb_final, items_emb_0 = model.forward(
        train_sparse_edge_index,
    )
    user_indices, pos_item_indices, neg_item_indices = sample(
        BATCH_SIZE, train_edge_index,
    )
    user_indices, pos_item_indices, neg_item_indices = user_indices.to(device), pos_item_indices.to(device), neg_item_indices.to(device)
    users_emb_final, users_emb_0 = users_emb_final[user_indices], users_emb_0[user_indices]
    pos_items_emb_final, pos_items_emb_0 = items_emb_final[pos_item_indices], items_emb_0[pos_item_indices]
    neg_items_emb_final, neg_items_emb_0 = items_emb_final[neg_item_indices], items_emb_0[neg_item_indices]

    # loss computation
    train_loss = bpr_loss(
        users_emb_final, users_emb_0,
        pos_items_emb_final, pos_items_emb_0,
        neg_items_emb_final, neg_items_emb_0,
        LAMBDA,
    )

    optimizer.zero_grad()
    train_loss.backward()
    optimizer.step()

    if i % ITERS_PER_EVAL == 0:
        model.eval()
        val_loss, recall, precision = evaluation(
            model, val_edge_index, 
            val_sparse_edge_index, [train_edge_index],
            K, LAMBDA,
        )
        print(f"[{i}/{ITERATIONS}] train_loss: {round(train_loss.item(), 5)}, val_loss: {round(val_loss, 5)}, val_recall@{K}: {round(recall, 5)}, val_precision@{K}: {round(precision, 5)}")
        train_losses.append(train_loss.item())
        val_losses.append(val_loss)
        model.train()

    if i % ITERS_PER_LR_DECAY == 0 and i != 0:
        scheduler.step()

[0/20000] train_loss: -0.69508, val_loss: -0.70245, val_recall@20: 0.00091, val_precision@20: 0.0009
[200/20000] train_loss: -0.70131, val_loss: -0.71328, val_recall@20: 0.01325, val_precision@20: 0.0038
[400/20000] train_loss: -0.77043, val_loss: -0.80261, val_recall@20: 0.07197, val_precision@20: 0.02315
[600/20000] train_loss: -1.0555, val_loss: -1.1387, val_recall@20: 0.08206, val_precision@20: 0.02676
[800/20000] train_loss: -1.67893, val_loss: -1.81786, val_recall@20: 0.08218, val_precision@20: 0.02712
[1000/20000] train_loss: -2.55888, val_loss: -2.79091, val_recall@20: 0.08303, val_precision@20: 0.02731
[1200/20000] train_loss: -3.58853, val_loss: -3.89793, val_recall@20: 0.08241, val_precision@20: 0.02685
[1400/20000] train_loss: -4.67084, val_loss: -5.14783, val_recall@20: 0.0826, val_precision@20: 0.02685
[1600/20000] train_loss: -5.74373, val_loss: -6.38108, val_recall@20: 0.08258, val_precision@20: 0.02685
[1800/20000] train_loss: -7.11533, val_loss: -7.64594, val_recall@2

[15400/20000] train_loss: -36.87823, val_loss: -40.90218, val_recall@20: 0.07527, val_precision@20: 0.02523
[15600/20000] train_loss: -37.62657, val_loss: -40.84883, val_recall@20: 0.07527, val_precision@20: 0.02523
[15800/20000] train_loss: -38.36256, val_loss: -41.11439, val_recall@20: 0.07527, val_precision@20: 0.02523
[16000/20000] train_loss: -38.04086, val_loss: -41.10722, val_recall@20: 0.07527, val_precision@20: 0.02523
[16200/20000] train_loss: -37.79985, val_loss: -40.95839, val_recall@20: 0.07527, val_precision@20: 0.02523
[16400/20000] train_loss: -37.75718, val_loss: -41.07351, val_recall@20: 0.07527, val_precision@20: 0.02523
[16600/20000] train_loss: -37.17892, val_loss: -41.06867, val_recall@20: 0.07527, val_precision@20: 0.02523
[16800/20000] train_loss: -37.93659, val_loss: -41.22693, val_recall@20: 0.07527, val_precision@20: 0.02523
[17000/20000] train_loss: -36.6132, val_loss: -41.17287, val_recall@20: 0.07527, val_precision@20: 0.02523
[17200/20000] train_loss: -36

In [7]:
# evaluate on test set
model.eval()
test_edge_index = test_edge_index.to(device)
test_sparse_edge_index = test_sparse_edge_index.to(device)

test_loss, test_recall, test_precision = evaluation(
            model, test_edge_index, test_sparse_edge_index, [train_edge_index, val_edge_index], K, LAMBDA)

print(f"[test_loss: {round(test_loss, 5)}, test_recall@{K}: {round(test_recall, 5)}, test_precision@{K}: {round(test_precision, 5)}")

[test_loss: -17.23666, test_recall@20: 0.07764, test_precision@20: 0.02717


In [8]:
model.eval()
movieid_title = pd.Series(df_movies.title.values).to_dict()
movieid_genres = pd.Series(df_movies.genres.values).to_dict()

user_pos_items = get_user_positive_items(edge_index)

def get_recommendations(df_ratings, df_movies, user_id, n, new_item_map):
    new_user_id = user_map[user_id]
    user_embed = model.user_emb.weight[new_user_id]
    scores = model.item_emb.weight @ user_embed
    
    df_ratings = df_ratings[(df_ratings.movieId == user_id) & (df_ratings.rating >= RATING_THRESHOLD)]
    top_rated = pd.merge(
        df_movies, df_ratings, 
        left_on=df_movies.index, right_on=df_ratings.index,
    ).sort_values(by='rating', ascending=False)[['title', 'genres', 'rating']]
    
    values, indices = torch.topk(scores, k=len(user_pos_items[new_user_id]))
    recommended = []
    m = min(n, indices.shape[0])
    
    for i in range(m):
        if indices[i] not in user_pos_items[new_user_id]:
            recommended.append(new_item_map[indices[i].cpu().item()])
        
    rec_movies = df_movies.iloc[recommended, :][['title', 'genres']]
    
    return top_rated.sample(n), rec_movies.head(n), rec_movies[::-1].head(n)
   

In [9]:
USER_ID = 2
NUM_RECS = 5

rated, rec, least_rec = get_recommendations(df_ratings, df_movies, USER_ID, NUM_RECS, new_item_map)
rated

Unnamed: 0,title,genres,rating
38,Snow White and the Seven Dwarfs (1937),Animation|Children|Drama|Fantasy|Musical,4.0
11,Free Willy 2: The Adventure Home (1995),Adventure|Children|Drama,4.0
1,Tom and Huck (1995),Adventure|Children,4.0
23,When a Man Loves a Woman (1994),Drama|Romance,4.0
6,Unforgettable (1996),Mystery|Sci-Fi|Thriller,4.0


In [10]:
rec

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
338,Virtuosity (1995),Action|Sci-Fi|Thriller
360,I Love Trouble (1994),Action|Comedy
412,"Age of Innocence, The (1993)",Drama
735,Cemetery Man (Dellamorte Dellamore) (1994),Horror
3439,Teenage Mutant Ninja Turtles II: The Secret of...,Action|Children|Fantasy
