Reference links
- Colab PyG setting: https://gist.github.com/ameya98/b193856171d11d37ada46458f60e73e7
- LightGCN implementation: https://colab.research.google.com/drive/1VfP6JlWbX_AJnx88yN1tM3BYE6XAADiy?usp=sharing#scrollTo=IaZK6fwHzyd
- LightGCN implementation & Yelp2018 dataset: https://github.com/gusye1234/LightGCN-PyTorch

In [None]:
import torch

def format_pytorch_version(version):
  return version.split('+')[0]

TORCH_version = torch.__version__
TORCH = format_pytorch_version(TORCH_version)

def format_cuda_version(version):
  return 'cu' + version.replace('.', '')

CUDA_version = torch.version.cuda
CUDA = format_cuda_version(CUDA_version)

!pip install torch-scatter     -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-sparse      -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-cluster     -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-spline-conv -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-geometric 

In [3]:
# import required modules
import warnings
warnings.filterwarnings('ignore')
import random
from tqdm.auto import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import os

import torch
from torch import nn, optim, Tensor
from torch_sparse import SparseTensor, matmul

from torch_geometric.utils import negative_sampling, structured_negative_sampling
from torch_geometric.data import download_url, extract_zip
from torch_geometric.nn.conv.gcn_conv import gcn_norm
from torch_geometric.nn.conv import MessagePassing
from torch_geometric.typing import Adj

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
cd/content/drive/MyDrive/LG_AI_Ground/code

/content/drive/MyDrive/LG_AI_Ground/code


In [6]:
# 경로 설정
data_path = '../data'
saved_path = './saved'
output_path = './submission'

In [49]:
train_file = data_path + '/yelp_train.txt'
test_file = data_path + '/yelp_test.txt'

In [55]:
user_list = data_path + '/yelp_user_list.txt'
item_list = data_path + '/yelp_item_list.txt'

In [58]:
num_users = 0
num_items = 0
with open(user_list) as f:
    for row in f.readlines():
        num_users += 1
with open(item_list) as f:
    for row in f.readlines():
        num_items += 1
print(num_users, num_items)

31669 38049


In [46]:
train_edge_index = []
with open(train_file) as f:
    for row in f.readlines():
        if len(row) > 0:
            all = row.strip('\n').split(' ')
            id = int(all[0])
            items = [int(i) for i in all[1:]]
            for item in items:
                train_edge_index.append([id, item])

In [50]:
train_edge_index = torch.tensor(np.array(train_edge_index).T)
print(train_edge_index.shape)
train_edge_index

torch.Size([2, 1237259])


tensor([[    0,     0,     0,  ..., 31667, 31667, 31667],
        [    0,     1,     2,  ..., 29149,  4927, 31751]])

In [52]:
test_edge_index = []
with open(test_file) as f:
    for row in f.readlines():
        if len(row) > 0:
            all = row.strip('\n').split(' ')
            id = int(all[0])
            items = [int(i) for i in all[1:]]
            for item in items:
                test_edge_index.append([id, item])
test_edge_index = torch.tensor(np.array(test_edge_index).T)
print(test_edge_index.shape)
test_edge_index

torch.Size([2, 324147])


tensor([[    0,     0,     0,  ..., 31667, 31667, 31667],
        [  795,   694,  1531,  ...,  9585, 36460, 26812]])

In [None]:
edge_label = torch.tensor([0.8 for i in range(edge_index.shape[1])])
edge_label.shape

torch.Size([899252])

In [53]:
print(train_edge_index.shape)
print(test_edge_index.shape)

torch.Size([2, 1237259])
torch.Size([2, 324147])


In [59]:
train_sparse_edge_index = SparseTensor(row=train_edge_index[0], col=train_edge_index[1], sparse_sizes=(
    num_users + num_items, num_users + num_items))
test_sparse_edge_index = SparseTensor(row=test_edge_index[0], col=test_edge_index[1], sparse_sizes=(
    num_users + num_items, num_users + num_items))

In [60]:
print(train_sparse_edge_index)
print(test_sparse_edge_index)

SparseTensor(row=tensor([    0,     0,     0,  ..., 31667, 31667, 31667]),
             col=tensor([    0,     1,     2,  ..., 31216, 31751, 33573]),
             size=(69718, 69718), nnz=1237259, density=0.03%)
SparseTensor(row=tensor([    0,     0,     0,  ..., 31667, 31667, 31667]),
             col=tensor([  694,   795,  1531,  ..., 26812, 33811, 36460]),
             size=(69718, 69718), nnz=324147, density=0.01%)


#### Model

In [61]:
def sample_mini_batch(batch_size, edge_index):
    edges = structured_negative_sampling(edge_index, contains_neg_self_loops=False)
    edges = torch.stack(edges, dim=0)
    indices = random.choices([i for i in range(edges[0].shape[0])], k=batch_size)
    batch = edges[:, indices]
    user_indices, pos_item_indices, neg_item_indices = batch[0], batch[1], batch[2]
    return user_indices, pos_item_indices, neg_item_indices

In [62]:
embedding_dim = 256
num_layers = 3

In [63]:
class LightGCN(MessagePassing):
    def __init__(self, num_users, num_items, embedding_dim=embedding_dim, K=num_layers, add_self_loops=False):
        
        super().__init__()
        self.num_users, self.num_items = num_users, num_items
        self.embedding_dim, self.K = embedding_dim, K
        self.add_self_loops = add_self_loops

        self.users_emb = nn.Embedding(num_embeddings=self.num_users, embedding_dim=self.embedding_dim) # e_u^0
        self.items_emb = nn.Embedding(num_embeddings=self.num_items, embedding_dim=self.embedding_dim) # e_i^0
        # self.linear = nn.Linear(embedding_dim, embedding_dim//2)
        # self.dropout = nn.Dropout(p=0.5)

        # nn.init.normal_(self.users_emb.weight, std=0.1)
        # nn.init.normal_(self.items_emb.weight, std=0.1)
        # nn.init.normal_(self.linear.weight, std=0.1)

    def forward(self, edge_index: SparseTensor):
        
        edge_index_norm = gcn_norm(edge_index, add_self_loops=self.add_self_loops)
        emb_0 = torch.cat([self.users_emb.weight, self.items_emb.weight])
        embs = [emb_0]
        emb_k = emb_0

        for i in range(self.K):
            emb_k = self.propagate(edge_index_norm, x=emb_k)
            embs.append(emb_k)

        embs = torch.stack(embs, dim=1)
        emb_final = torch.mean(embs, dim=1) 
        # emb_final = self.linear(self.dropout(emb_final.relu()))
        users_emb_final, items_emb_final = torch.split(emb_final, [self.num_users, self.num_items]) 
        return users_emb_final, self.users_emb.weight, items_emb_final, self.items_emb.weight

    def message(self, x_j: Tensor) -> Tensor:
        return x_j

    def message_and_aggregate(self, adj_t: SparseTensor, x: Tensor) -> Tensor:
        return matmul(adj_t, x)

In [64]:
def bpr_loss(users_emb_final, users_emb_0, pos_items_emb_final, pos_items_emb_0, neg_items_emb_final, neg_items_emb_0, lambda_test):
    reg_loss = lambda_test * (users_emb_0.norm(2).pow(2) +
                             pos_items_emb_0.norm(2).pow(2) +
                             neg_items_emb_0.norm(2).pow(2)) # L2 loss

    pos_scores = torch.mul(users_emb_final, pos_items_emb_final)
    pos_scores = torch.sum(pos_scores, dim=-1) # predicted scores of positive samples
    neg_scores = torch.mul(users_emb_final, neg_items_emb_final)
    neg_scores = torch.sum(neg_scores, dim=-1) # predicted scores of negative samples

    loss = -torch.mean(torch.nn.functional.softplus(pos_scores - neg_scores)) + reg_loss
    return loss

In [65]:
# helper function to get N_u
def get_user_positive_items(edge_index):
    user_pos_items = {}
    for i in range(edge_index.shape[1]):
        user = edge_index[0][i].item()
        item = edge_index[1][i].item()
        if user not in user_pos_items:
            user_pos_items[user] = []
        user_pos_items[user].append(item)
    return user_pos_items

In [66]:
# computes recall@K and precision@K
def RecallPrecision_ATk(groundTruth, r, k):
    num_correct_pred = torch.sum(r, dim=-1) # number of correctly predicted items per user
    # number of items liked by each user in the test set
    user_num_liked = torch.Tensor([len(groundTruth[i]) for i in range(len(groundTruth))])
    recall = torch.mean(num_correct_pred / user_num_liked)
    precision = torch.mean(num_correct_pred) / k
    return recall.item(), precision.item()

In [67]:
# computes NDCG@K
def NDCGatK_r(groundTruth, r, k):
    assert len(r) == len(groundTruth)
    test_matrix = torch.zeros((len(r), k))
    for i, items in enumerate(groundTruth):
        length = min(len(items), k)
        test_matrix[i, :length] = 1
    max_r = test_matrix
    idcg = torch.sum(max_r * 1. / torch.log2(torch.arange(2, k + 2)), axis=1)
    dcg = r * (1. / torch.log2(torch.arange(2, k + 2)))
    dcg = torch.sum(dcg, axis=1)
    idcg[idcg == 0.] = 1.
    ndcg = dcg / idcg
    ndcg[torch.isnan(ndcg)] = 0.
    return torch.mean(ndcg).item()

In [68]:
def get_metrics(model, edge_index, exclude_edge_indices, k):
    
    user_embedding = model.users_emb.weight
    item_embedding = model.items_emb.weight

    # get ratings between every user and item - shape is num users x num items
    rating = torch.matmul(user_embedding, item_embedding.T)

    for exclude_edge_index in exclude_edge_indices:
        user_pos_items = get_user_positive_items(exclude_edge_index)
        exclude_users = []
        exclude_items = []
        for user, items in user_pos_items.items():
            exclude_users.extend([user] * len(items))
            exclude_items.extend(items)

        # set ratings of excluded edges to large negative value
        rating[exclude_users, exclude_items] = -(1 << 10)

    # get the top k recommended items for each user
    _, top_K_items = torch.topk(rating, k=k)

    # get all unique users in evaluated split
    users = edge_index[0].unique()
    test_user_pos_items = get_user_positive_items(edge_index)
    test_user_pos_items_list = [test_user_pos_items[user.item()] for user in users]

    # determine the correctness of topk predictions
    r = []
    for user in users:
        ground_truth_items = test_user_pos_items[user.item()]
        label = list(map(lambda x: x in ground_truth_items, top_K_items[user]))
        r.append(label)
    r = torch.Tensor(np.array(r).astype('float'))

    recall, precision = RecallPrecision_ATk(test_user_pos_items_list, r, k)
    ndcg = NDCGatK_r(test_user_pos_items_list, r, k)

    return recall, precision, ndcg

In [69]:
# wrapper function to evaluate model
def evaluation(model, edge_index, sparse_edge_index, exclude_edge_indices, k, lambda_test):

    users_emb_final, users_emb_0, items_emb_final, items_emb_0 = model.forward(sparse_edge_index)
    edges = structured_negative_sampling(edge_index, contains_neg_self_loops=False)

    user_indices, pos_item_indices, neg_item_indices = edges[0], edges[1], edges[2]
    users_emb_final, users_emb_0 = users_emb_final[user_indices], users_emb_0[user_indices]

    pos_items_emb_final, pos_items_emb_0 = items_emb_final[pos_item_indices], items_emb_0[pos_item_indices]
    neg_items_emb_final, neg_items_emb_0 = items_emb_final[neg_item_indices], items_emb_0[neg_item_indices]

    loss = bpr_loss(users_emb_final, users_emb_0, pos_items_emb_final, pos_items_emb_0, neg_items_emb_final, neg_items_emb_0, lambda_test).item()
    recall, precision, ndcg = get_metrics(model, edge_index, exclude_edge_indices, k)

    return loss, recall, precision, ndcg

In [80]:
# define contants
ITERATIONS = 1000
BATCH_SIZE = 256
LR = 0.025
ITERS_PER_eval = 200
ITERS_PER_LR_DECAY = 1000
K = 20
LAMBDA = 1e-3

In [81]:
model = LightGCN(num_users, num_items)

In [82]:
# setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device {device}.")

model = model.to(device)
model.train()

optimizer = optim.Adam(model.parameters(), lr=LR)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.95)

# edge_index = edge_index.to(device)
train_edge_index = train_edge_index.to(device)
train_sparse_edge_index = train_sparse_edge_index.to(device)

test_edge_index = test_edge_index.to(device)
test_sparse_edge_index = test_sparse_edge_index.to(device)

Using device cuda.


In [83]:
# training loop
train_losses = []
test_losses = []

for iter in tqdm(range(ITERATIONS)):
    # forward propagation
    users_emb_final, users_emb_0, items_emb_final, items_emb_0 = model.forward(train_sparse_edge_index)

    # mini batching
    user_indices, pos_item_indices, neg_item_indices = sample_mini_batch(BATCH_SIZE, train_edge_index)
    user_indices, pos_item_indices, neg_item_indices = user_indices.to(device), pos_item_indices.to(device), neg_item_indices.to(device)
    
    users_emb_final, users_emb_0 = users_emb_final[user_indices], users_emb_0[user_indices]

    pos_items_emb_final, pos_items_emb_0 = items_emb_final[pos_item_indices], items_emb_0[pos_item_indices]
    neg_items_emb_final, neg_items_emb_0 = items_emb_final[neg_item_indices], items_emb_0[neg_item_indices]

    # loss computation
    train_loss = bpr_loss(users_emb_final, users_emb_0, pos_items_emb_final,pos_items_emb_0, neg_items_emb_final, neg_items_emb_0, LAMBDA)
    optimizer.zero_grad()
    train_loss.backward()
    optimizer.step()

    if iter % ITERS_PER_eval == 0 and iter != 0:
        model.eval()
        test_loss, recall, precision, ndcg = evaluation(model, test_edge_index, test_sparse_edge_index, [train_edge_index], K, LAMBDA)
        print(f"[Iter. {iter}/{ITERATIONS}] train_loss: {round(train_loss.item()/(10^6), 5)}, test_loss: {round(test_loss/(10^6), 5)}, test_recall@{K}: {round(recall, 5)}, test_precision@{K}: {round(precision, 5)}, test_ndcg@{K}: {round(ndcg, 5)}")
        train_losses.append(train_loss.item())
        test_losses.append(test_loss)
        
        model.train()

    if iter % ITERS_PER_LR_DECAY == 0 and iter != 0:
        scheduler.step()

  0%|          | 0/1000 [00:00<?, ?it/s]

OutOfMemoryError: ignored

In [None]:
iters = [iter * ITERS_PER_EVAL for iter in range(len(train_losses))]
plt.plot(iters, train_losses, label='train')
plt.plot(iters, test_losses, label='validation')
plt.xlabel('iteration')
plt.ylabel('loss')
plt.title('training and validation loss curves')
plt.legend()
plt.show()

### Make New Recommendatios for a Given User

In [None]:
model.eval()
user_pos_items = get_user_positive_items(edge_index)

In [None]:
idx_to_item = {v:k for k, v in item_mapping.items()}

In [None]:
def make_predictions(user_id, num_recs):
    user = user_mapping[user_id]
    e_u = model.users_emb.weight[user]
    scores = model.items_emb.weight @ e_u

    exclude_items = user_pos_items[user]
    scores[exclude_items] = -(1 << 10)
    values, indices = torch.topk(scores, k=num_recs)
    rec_item_list = [idx_to_item[i] for i in indices.cpu().numpy()]
    return rec_item_list

In [None]:
rec_all = []
for user_id in tqdm(list(user_mapping.keys())):
    rec_item_list = make_predictions(user_id, 25)
    rec_all.append(rec_item_list)
pred_df = pd.DataFrame()
pred_df['profile_id'] = list(user_mapping.keys())
pred_df['predicted_list'] = rec_all
pred_df

  0%|          | 0/8311 [00:00<?, ?it/s]

Unnamed: 0,profile_id,predicted_list
0,3,"[38, 124, 125, 241, 65, 339, 224, 347, 1880, 1..."
1,5,"[339, 347, 1880, 329, 36, 55, 416, 981, 2054, ..."
2,7,"[16, 15, 19, 18, 17, 38, 124, 125, 241, 65, 33..."
3,12,"[16, 15, 19, 18, 17, 38, 124, 125, 241, 65, 33..."
4,16,"[16, 15, 19, 18, 17, 38, 125, 241, 224, 347, 1..."
...,...,...
8306,33022,"[16, 15, 19, 18, 17, 38, 124, 125, 241, 65, 33..."
8307,33023,"[16, 15, 19, 18, 17, 38, 241, 339, 224, 347, 1..."
8308,33026,"[16, 15, 19, 18, 17, 38, 124, 125, 241, 65, 33..."
8309,33027,"[16, 15, 19, 18, 17, 38, 124, 125, 241, 65, 33..."
