In [1]:
!pip install lightfm



In [2]:
import time
import itertools
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from lightfm import LightFM
from lightfm.data import Dataset

In [3]:
%%time
dataset = pd.read_csv('http://files.grouplens.org/datasets/movielens/ml-100k/u.data', names=['user_id', 'item_id', 'rating', 'timestamp'], sep="\t")
uq_users = np.sort(dataset.user_id.unique().tolist())
uq_items = np.sort(dataset.item_id.unique().tolist())
n_users = len(uq_users)
n_items = len(uq_items)

CPU times: user 60.7 ms, sys: 32.4 ms, total: 93.1 ms
Wall time: 499 ms


In [4]:
topk = 10
rank_list = [i+1 for i in range(topk)]
latent = 50

In [5]:
print('dataset:', dataset.head())
print('user number:', n_users)
print('item number:', n_items)

dataset:    user_id  item_id  rating  timestamp
0      196      242       3  881250949
1      186      302       3  891717742
2       22      377       1  878887116
3      244       51       2  880606923
4      166      346       1  886397596
user number: 943
item number: 1682


In [6]:
%%time
lightfm_dataset = Dataset()
lightfm_dataset.fit(users=uq_users, items=uq_items)
user_id_map, _, item_id_map, _ = lightfm_dataset.mapping()
list_dataset = list(dataset[['user_id', 'item_id', 'rating']].itertuples(index=False, name=None))
interactions, weights = lightfm_dataset.build_interactions(list_dataset)

CPU times: user 545 ms, sys: 19.4 ms, total: 565 ms
Wall time: 587 ms


In [7]:
%%time
model = LightFM(no_components=latent, 
                loss='bpr',
                learning_rate=0.05)
model.fit(weights,
          epochs=5,
          num_threads=4)

CPU times: user 2.34 s, sys: 14.7 ms, total: 2.36 s
Wall time: 1.7 s


In [8]:
%%time
prediction = model.predict(user_ids=dataset.user_id.map(user_id_map).values,
                          item_ids=dataset.item_id.map(item_id_map).values,
                          num_threads=4)

CPU times: user 42.2 ms, sys: 29 µs, total: 42.2 ms
Wall time: 29.5 ms


In [9]:
%%time
df_recommend_list = pd.DataFrame(columns=['user_id', 'item_id', 'score', 'rank'])
for user_id in uq_users:
  #i_list = list(set(uq_items) - set(dataset[dataset['user_id']==user_id]['item_id'].tolist()))
  i_list = uq_items
  u_list = [user_id] * len(i_list)
  df_predict = pd.DataFrame()
  df_predict['user_id'] = u_list
  df_predict['item_id'] = i_list
  prediction = model.predict(user_ids=df_predict.user_id.map(user_id_map).values,
                            item_ids=df_predict.item_id.map(item_id_map).values,
                            num_threads=4)
  df_predict['score'] = prediction
  df_recommend = df_predict.sort_values('score', ascending=False)[:topk]
  df_recommend['rank'] = rank_list
  df_recommend_list = df_recommend_list.append(df_recommend, ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


CPU times: user 15.9 s, sys: 198 ms, total: 16.1 s
Wall time: 21.6 s


In [10]:
def map_user(y_true, y_pred):
  n_hit = 0
  precision = 0
  for i, item in enumerate(y_pred, 1):
    if item in y_true:
      n_hit += 1
      precision += n_hit / i
  avg_precision = precision / (n_hit+1e-6)
  return avg_precision

def hr_user(y_true, y_pred):
  for item in y_pred:
    if item in y_true:
      return 1
  return 0

def mrr_user(y_true, y_pred):
  for i, item in enumerate(y_pred, 1):
    if item in y_true:
      return 1 / i
  return 0

def ndcg_user(y_true, y_pred):
  def dcg(y_true, y_pred):
    dcg_score = 0.0
    for i, item in enumerate(y_pred): 
      if item in y_true:
        discount = np.log2(i+2)
        dcg_score += 1.0 / discount
    return dcg_score
  actual = dcg(y_true, y_pred)
  best = dcg(y_true, y_true) + 1e-6
  return actual / best

In [11]:
def calc_precision_oriented_score():
  map = 0
  hr = 0.0
  mrr = 0.0
  ndcg = 0.0
  for user_id in uq_users:
    y_true = dataset[dataset['user_id']==user_id]['item_id'].tolist()
    y_pred = df_recommend_list[df_recommend_list['user_id']==user_id].sort_values('rank')['item_id'].tolist()
    map += map_user(y_true, y_pred)
    hr += hr_user(y_true, y_pred)
    mrr += mrr_user(y_true, y_pred)
    ndcg += ndcg_user(y_true, y_pred)
  map /= n_users
  hr /= n_users
  mrr /= n_users
  ndcg /= n_users
  return map, hr, mrr, ndcg

In [12]:
%%time
map, hr, mrr, ndcg = calc_precision_oriented_score()
print('map', map)
print('hr', hr)
print('mrr', mrr)
print('ndcg', ndcg)

map 0.7980949237105699
hr 0.9968186638388123
mrr 0.886840798532209
ndcg 0.19695888214524865
CPU times: user 4.44 s, sys: 61.8 ms, total: 4.5 s
Wall time: 4.52 s


In [13]:
def prefs(item_id):
  return len(dataset[dataset['item_id']==item_id])

def prefs_both(item_id1, item_id2):
  df_both = dataset[(dataset['item_id']==item_id1) | (dataset['item_id']==item_id2)]
  series_user = df_both["user_id"].value_counts()
  return series_user[series_user==2].count()

prefs_dict = {}
for item_id in uq_items:
  prefs_dict[item_id] = prefs(item_id)

def diversity_user(user_id):
  diversity_score = 0.0
  rec_list_user = df_recommend_list[df_recommend_list['user_id']==user_id]['item_id']
  for x1, x2 in list(itertools.combinations(rec_list_user, 2)):
    pref_both = prefs_both(x1, x2)
    if pref_both != 0:
      diversity_score += np.sqrt(prefs_dict[x1]) * np.sqrt(prefs_dict[x2]) / pref_both
  return diversity_score

def novelty_user(user_id):
  novelty_score = 0.0
  rec_list_user = df_recommend_list[df_recommend_list['user_id']==user_id]['item_id']
  for rec_item in rec_list_user:
    pref = prefs(rec_item)
    if pref != 0:
      novelty_score += np.log2(n_users/pref) / topk
  return novelty_score

def serendipity_user(user_id):
  serendipity_score = 0.0
  rec_list_user = df_recommend_list[df_recommend_list['user_id']==user_id]['item_id']
  con_list_user = dataset[dataset['user_id']==user_id]['item_id']
  for rec_item, con_item in list(itertools.product(rec_list_user, con_list_user)):
    pref_both = prefs_both(rec_item, con_item)
    if pref_both != 0:
      serendipity_score += np.sqrt(prefs_dict[rec_item]) * np.sqrt(prefs_dict[con_item]) / pref_both
  return serendipity_score / len(con_list_user)

In [14]:
def calc_diversity_oriented_score():
  diversity = 0.0
  novelty = 0.0
  serendipity = 0.0
  for user_id in uq_users:
    diversity += diversity_user(user_id)
    novelty += novelty_user(user_id)
    serendipity += serendipity_user(user_id)
  
  diversity /= n_users
  novelty /= n_users
  serendipity /= n_users
  uniquness = len(np.unique(df_recommend_list['item_id'])) / topk

  return diversity, novelty, serendipity, uniquness

In [None]:
%%time
diversity, novelty, serendipity, uniquness = calc_diversity_oriented_score()
print('diversity', diversity)
print('novelty', novelty)
print('serendipity,', serendipity,)
print('uniquness', uniquness)

diversity 82.83069308114501
novelty 1.5550977698423956
serendipity, 31.485339156202045
uniquness 13.0
CPU times: user 42min 58s, sys: 1min 29s, total: 44min 27s
Wall time: 44min 35s
