In [1]:
!pip install lightfm



In [2]:
import time
import itertools
import pandas as pd
import numpy as np
from lightfm import LightFM
from lightfm.data import Dataset

In [3]:
%%time
ML100K_URL = 'http://files.grouplens.org/datasets/movielens/ml-100k/u.data'
dataset = pd.read_csv(
  ML100K_URL, 
  names=["user_id", "item_id", "rating", "timestamp"], 
  sep="\t"
)
uq_users = np.sort(dataset.user_id.unique().tolist())
uq_items = np.sort(dataset.item_id.unique().tolist())
n_users = len(uq_users)
n_items = len(uq_items)

CPU times: user 30.2 ms, sys: 10.5 ms, total: 40.6 ms
Wall time: 1.06 s


In [4]:
topk = 10
rank_list = [i+1 for i in range(topk)]
latent = 50

In [5]:
print('dataset:', dataset.head())

dataset:    user_id  item_id  rating  timestamp
0      196      242       3  881250949
1      186      302       3  891717742
2       22      377       1  878887116
3      244       51       2  880606923
4      166      346       1  886397596


In [6]:
%%time
lightfm_dataset = Dataset()
lightfm_dataset.fit(
  users=uq_users, 
  items=uq_items
)
user_id_map, _, item_id_map, _ = \
  lightfm_dataset.mapping()
list_dataset = list(
  dataset[
    ['user_id', 'item_id', 'rating']
  ].itertuples(index=False, name=None)
)
interactions, weights = \
  lightfm_dataset.build_interactions(list_dataset)

CPU times: user 332 ms, sys: 3.89 ms, total: 336 ms
Wall time: 339 ms


In [7]:
%%time
model = LightFM(
  no_components=latent, 
  loss='bpr', 
  learning_rate=0.05
)
model.fit(
  weights, 
  epochs=5, 
  num_threads=4
)

CPU times: user 1.61 s, sys: 6.29 ms, total: 1.62 s
Wall time: 1.05 s


In [8]:
%%time
prediction = model.predict(
  user_ids=dataset.user_id.map(user_id_map).values,
  item_ids=dataset.item_id.map(item_id_map).values,
  num_threads=4)

CPU times: user 34.4 ms, sys: 0 ns, total: 34.4 ms
Wall time: 22.8 ms


In [9]:
%%time
df_recommend_list = pd.DataFrame(
  columns=['user_id', 'item_id', 'score', 'rank']
)

for user_id in uq_users:
  """
  精度指向評価のため、既知のアイテムは除外しない
  i_list = list(
    set(uq_items) - 
    set(dataset[
      dataset['user_id']==user_id
    ]['item_id'].tolist()))
  """
  i_list = uq_items
  u_list = [user_id] * len(i_list)
  df_predict = pd.DataFrame()
  df_predict['user_id'] = u_list
  df_predict['item_id'] = i_list
  prediction = model.predict(
    user_ids=df_predict.user_id.map(user_id_map).values,
    item_ids=df_predict.item_id.map(item_id_map).values,
    num_threads=4
  )
  df_predict['score'] = prediction
  
  df_recommend = df_predict.sort_values(
    'score', ascending=False
  )[:topk]
  df_recommend['rank'] = rank_list

  df_recommend_list = \
    df_recommend_list.append(
      df_recommend, 
      ignore_index=True
    )

CPU times: user 9.11 s, sys: 108 ms, total: 9.21 s
Wall time: 11.1 s


In [10]:
df_recommend_list

Unnamed: 0,user_id,item_id,score,rank
0,1,174,-0.659326,1
1,1,50,-0.962921,2
2,1,172,-1.026047,3
3,1,56,-1.202690,4
4,1,204,-1.261539,5
...,...,...,...,...
9425,943,403,0.864149,6
9426,943,204,0.816371,7
9427,943,82,0.804565,8
9428,943,385,0.715285,9


In [11]:
def recall_user(y_true, y_pred):
  num_hit = len(set(y_true) & set(y_pred))
  return num_hit / len(y_true)

def mrr_user(y_true, y_pred):
  for i, item in enumerate(y_pred, 1):
    if item in y_true:
      return 1 / i
  return 0

def ndcg_user(y_true, y_pred):
  def dcg(y_true, y_pred):
    dcg_score = 0.0
    for i, item in enumerate(y_pred): 
      if item in y_true:
        discount = np.log2(i+2)
        dcg_score += 1.0 / discount
    return dcg_score
  actual = dcg(y_true, y_pred)
  best = dcg(y_true, y_true) + 1e-6
  return actual / best

def hr_user(y_true, y_pred):
  for item in y_pred:
    if item in y_true:
      return 1
  return 0

def precision_user(y_true, y_pred):
  num_hit = len(set(y_true) & set(y_pred))
  return num_hit / len(y_pred)
    
def map_user(y_true, y_pred):
  n_hit = 0
  precision = 0
  for i, item in enumerate(y_pred, 1):
    if item in y_true:
      n_hit += 1
      precision += n_hit / i
  avg_precision = precision / (n_hit+1e-6)
  return avg_precision

In [12]:
def calc_accuracy_oriented_score():
  # 初期化
  recall = 0.0
  mrr = 0.0
  ndcg = 0.0
  hr = 0.0
  precision = 0.0
  map = 0.0
  
  # ユーザごとに、精度指向評価を計算する
  for user_id in uq_users:
    # 正解データを取得する
    y_true = dataset[
      dataset['user_id']==user_id
    ]['item_id'].tolist()
    # 推薦結果を取得する
    y_pred = df_recommend_list[
      df_recommend_list['user_id']==user_id
    ].sort_values('rank')['item_id'].tolist()

    # ユーザごとのスコアを算出し加算する
    recall += recall_user(y_true, y_pred)
    mrr += mrr_user(y_true, y_pred)
    ndcg += ndcg_user(y_true, y_pred)
    hr += hr_user(y_true, y_pred)
    precision += precision_user(y_true, y_pred)
    map += map_user(y_true, y_pred)
    
  #平均をとる
  recall /= n_users
  mrr /= n_users
  ndcg /= n_users
  hr /= n_users
  precision /= n_users
  map /= n_users
  
  return recall, mrr, ndcg, hr, precision, map

In [13]:
%%time
recall, mrr, ndcg, hr, precision, map = \
  calc_accuracy_oriented_score()
print('recall', recall)
print('mrr', mrr)
print('ndcg', ndcg)
print('hr', hr)
print('precision', precision)
print('map', map)

recall 0.10907170504173386
mrr 0.8856187614671182
ndcg 0.19829736952976917
hr 0.9968186638388123
precision 0.6790031813361594
map 0.7985117671492827
CPU times: user 3.91 s, sys: 20.6 ms, total: 3.93 s
Wall time: 5.26 s


In [14]:
def prefs(item_id):
  return len(dataset[dataset['item_id']==item_id])

def prefs_both(item_id1, item_id2):
  df_both = dataset[
    (dataset['item_id']==item_id1) | 
    (dataset['item_id']==item_id2)
  ]
  series_user = df_both['user_id'].value_counts()
  return series_user[series_user==2].count()

prefs_dict = {}
for item_id in uq_items:
  prefs_dict[item_id] = prefs(item_id)

In [15]:
def diversity_user(user_id):
  diversity_score = 0.0
  rec_list_user = \
    df_recommend_list[
      df_recommend_list['user_id']==user_id
    ]['item_id']
  for x1, x2 in list(
    itertools.combinations(rec_list_user, 2)
  ):
    pref_both = prefs_both(x1, x2)
    if pref_both != 0:
      diversity_score += \
        np.sqrt(prefs_dict[x1]) * \
        np.sqrt(prefs_dict[x2]) / pref_both
  return diversity_score

def novelty_user(user_id):
  novelty_score = 0.0
  rec_list_user = df_recommend_list[
    df_recommend_list['user_id']==user_id
  ]['item_id']
  for rec_item in rec_list_user:
    pref = prefs(rec_item)
    if pref != 0:
      novelty_score += np.log2(n_users/pref) / topk
  return novelty_score

def serendipity_user(user_id):
  serendipity_score = 0.0
  rec_list_user = df_recommend_list[
    df_recommend_list['user_id']==user_id
  ]['item_id']
  con_list_user = dataset[
    dataset['user_id']==user_id
  ]['item_id']
  for rec_item, con_item in list(
    itertools.product(rec_list_user, con_list_user)
  ):
    pref_both = prefs_both(rec_item, con_item)
    if pref_both != 0:
      serendipity_score += \
        np.sqrt(prefs_dict[rec_item]) * \
        np.sqrt(prefs_dict[con_item]) / pref_both
  return serendipity_score / len(con_list_user)

In [16]:
def calc_diversity_oriented_score():
  #初期化
  diversity = 0.0
  novelty = 0.0
  serendipity = 0.0
  
  # ユーザごとのスコアを算出し加算する
  for user_id in uq_users:
    diversity += diversity_user(user_id)
    novelty += novelty_user(user_id)
    serendipity += serendipity_user(user_id)
  
  # 平均を取る
  diversity /= n_users
  novelty /= n_users
  serendipity /= n_users
  
  # Uniquenessの計算
  uniquness = \
    len(np.unique(
      df_recommend_list['item_id'])
    ) / topk

  return diversity, novelty, serendipity, uniquness

In [17]:
%%time
diversity, novelty, serendipity, uniquness = \
  calc_diversity_oriented_score()
print('diversity', diversity)
print('novelty', novelty)
print('serendipity', serendipity)
print('uniquness', uniquness)

diversity 83.78732385006832
novelty 1.581585173415766
serendipity 31.584996560166864
uniquness 12.8
CPU times: user 19min 35s, sys: 2.44 s, total: 19min 37s
Wall time: 19min 41s
