In [105]:
import numpy as np

from lightfm.datasets import fetch_stackexchange

data = fetch_stackexchange('crossvalidated',
                           test_set_fraction=0.1,
                           indicator_features=False,
                           tag_features=True)

train = data['train']
test = data['test']
item_features = data['item_features']
tag_labels = data['item_feature_labels']

print('There are %s distinct tags, with values like %s.' % (item_features.shape[1], tag_labels[:3].tolist()))

There are 1246 distinct tags, with values like ['bayesian', 'prior', 'elicitation'].


In [96]:
import sys
import os

import itertools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scrapbook as sb

import lightfm
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm import cross_validation

# Import LightFM's evaluation metrics
from lightfm.evaluation import precision_at_k as lightfm_prec_at_k
from lightfm.evaluation import recall_at_k as lightfm_recall_at_k

print("System version: {}".format(sys.version))
print("LightFM version: {}".format(lightfm.__version__))

System version: 3.11.2 (v3.11.2:878ead1ac1, Feb  7 2023, 10:02:41) [Clang 13.0.0 (clang-1300.0.29.30)]
LightFM version: 1.17


In [97]:
# default number of recommendations
K = 10
# percentage of data used for testing
TEST_PERCENTAGE = 0.2
# model learning rate
LEARNING_RATE = 0.001
# no of latent factors
NO_COMPONENTS = 200
# no of epochs to fit model
NO_EPOCHS = 200
# no of threads to fit model
NO_THREADS = 32
# regularisation for both user and item features
ITEM_ALPHA = 1e-6
USER_ALPHA = 1e-6

# seed for pseudonumber generations
SEED = 42

In [98]:
data = pd.read_csv('../data/interim/100k_clean.csv', index_col=0)
data

Unnamed: 0,user id,item id,rating,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,...,Fantasy_sum,Film-Noir_sum,Horror_sum,Musical_sum,Mystery_sum,Romance_sum,Sci-Fi_sum,Thriller_sum,War_sum,Western_sum
0,1,168,5,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1,172,5,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,165,5,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,1,0
3,1,156,4,0,0,0,0,0,0,1,...,0,0,0,0,0,1,1,0,1,0
4,1,196,5,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55370,943,1074,4,0,0,0,0,0,1,0,...,0,0,8,2,2,15,8,26,10,2
55371,943,41,4,0,0,0,0,0,1,0,...,0,0,8,2,2,15,8,26,10,2
55372,943,237,4,0,0,0,0,0,0,0,...,0,0,8,2,2,15,8,26,10,2
55373,943,151,4,0,0,1,0,1,1,0,...,0,0,8,2,2,16,8,26,10,2


In [99]:
dataset = Dataset()

In [100]:
dataset.fit(users=data['user id'], 
            items=data['item id'])

# quick check to determine the number of unique users and items in the data
num_users, num_topics = dataset.interactions_shape()
print(f'Num users: {num_users}, num_topics: {num_topics}.')

Num users: 942, num_topics: 1447.


In [101]:
(interactions, weights) = dataset.build_interactions(data.iloc[:, 0:3].values)

In [90]:
train_interactions, test_interactions = cross_validation.random_train_test_split(
    interactions, test_percentage=TEST_PERCENTAGE,
    random_state=np.random.RandomState(SEED))

In [91]:
print(f"Shape of train interactions: {train_interactions.shape}")
print(f"Shape of test interactions: {test_interactions.shape}")

Shape of train interactions: (942, 1447)
Shape of test interactions: (942, 1447)


In [92]:
model1 = LightFM(loss='warp', no_components=NO_COMPONENTS, 
                learning_rate=LEARNING_RATE,     
                random_state=np.random.RandomState(SEED))

In [93]:
model1.fit(interactions=train_interactions,
          epochs=NO_EPOCHS)

<lightfm.lightfm.LightFM at 0x17ff01d90>

In [108]:
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score

# Evaluate precision at k on the test data
precision = precision_at_k(model1, test_interactions, k=K).mean()
recall = lightfm_recall_at_k(model1, test_interactions, k=K).mean()

train_auc = auc_score(model1, train_interactions).mean()
test_auc = auc_score(model1, test_interactions).mean()

print(f'Precision at {K}: {precision}')
print(f'Recall at {K}: {recall}')
print(f'train AUC: {train_auc}')
print(f'test AUC: {test_auc}')

Precision at 10: 0.0803474485874176
Recall at 10: 0.079106510713317
train AUC: 0.8545700907707214
test AUC: 0.8395184278488159
