In [1]:
import time
import itertools
import pandas as pd
import numpy as np

In [2]:
%%time
dataset = pd.read_csv('http://files.grouplens.org/datasets/movielens/ml-100k/u.data', names=["user_id", "item_id", "rating", "timestamp"], sep="\t")
uq_users = np.sort(dataset.user_id.unique().tolist())
uq_items = np.sort(dataset.item_id.unique().tolist())
n_users = len(uq_users)
n_items = len(uq_items)

CPU times: user 34.1 ms, sys: 23.8 ms, total: 57.8 ms
Wall time: 938 ms


In [3]:
dataset['rating'] /= 5
topk = 10
rank_list = [i+1 for i in range(topk)]

In [4]:
import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Embedding, Flatten, Input, Dot, Dense, Dropout, Concatenate
from tensorflow.keras import layers, regularizers, optimizers

class NeuMF(Model):
  def __init__(self, num_users, num_items, mf_dim=50, layers=[50, 100, 50, 1], reg_layers=[1e-6, 1e-6, 1e-6, 1e-6], reg_mf=[1e-6, 1e-6]):
    super(NeuMF, self).__init__()
    self.MF_Embedding_User = Embedding(
      input_dim=num_users,
      output_dim=mf_dim,
      name='mf_embedding_user',
      embeddings_initializer='random_uniform',
      embeddings_regularizer=regularizers.l2(reg_mf[0]),
    )
    self.MF_Embedding_Item = Embedding(
      input_dim=num_items,
      output_dim=mf_dim,
      name='mf_embedding_item',
      embeddings_initializer='random_uniform',
      embeddings_regularizer=regularizers.l2(reg_mf[1]),
    )
    self.MLP_Embedding_User = Embedding(
      input_dim=num_users,
      output_dim=int(layers[0] / 2),
      name='mlp_embedding_user',
      embeddings_initializer='random_uniform',
      embeddings_regularizer=regularizers.l2(reg_layers[0]),
    )
    self.MLP_Embedding_Item = Embedding(
      input_dim=num_items,
      output_dim=int(layers[0] / 2),
      name='mlp_embedding_item',
      embeddings_initializer='random_uniform',
      embeddings_regularizer=regularizers.l2(reg_layers[0]),
    )
    self.flatten = Flatten()
    self.mf_vector = Dot(axes=1)
    self.mlp_vector = Concatenate(axis=-1)
    self.dropout = Dropout(0.2)
    self.layer1 = Dense(
      layers[1],
      name='layer1',
      activation='relu',
      kernel_regularizer=regularizers.l2(reg_layers[1]),
    )
    self.layer2 = Dense(
      layers[2],
      name='layer2',
      activation='relu',
      kernel_regularizer=regularizers.l2(reg_layers[2]),
    )
    self.layer3 = Dense(
      layers[3],
      name='layer3',
      activation='relu',
      kernel_regularizer=regularizers.l2(reg_layers[3]),
    )
    self.predict_vector = Concatenate(axis=-1)
    self.layer4 = Dense(
      1,
      activation='sigmoid',
      kernel_initializer='lecun_uniform',
      name='prediction'
    )

  @tf.function
  def call(self, inputs):
    # Embedding
    MF_Embedding_User = self.MF_Embedding_User(inputs[0])
    MF_Embedding_Item = self.MF_Embedding_Item(inputs[1])
    MLP_Embedding_User = self.MLP_Embedding_User(inputs[0])
    MLP_Embedding_Item = self.MLP_Embedding_Item(inputs[1])

    # MF(GMF)
    mf_user_latent = self.flatten(MF_Embedding_User)
    mf_item_latent = self.flatten(MF_Embedding_Item)
    mf_vector = self.mf_vector([mf_user_latent, mf_item_latent])

    # MLP
    mlp_user_latent = self.flatten(MLP_Embedding_User)
    mlp_item_latent = self.flatten(MLP_Embedding_Item)
    mlp_vector = self.mlp_vector([mlp_user_latent, mlp_item_latent])
    mlp_vector = self.dropout(mlp_vector)
    mlp_vector = self.layer1(mlp_vector)
    mlp_vector = self.dropout(mlp_vector)
    mlp_vector = self.layer2(mlp_vector)
    mlp_vector = self.dropout(mlp_vector)
    mlp_vector = self.layer3(mlp_vector)

    # NeuMF
    vector = self.predict_vector([mf_vector, mlp_vector])
    output = self.layer4(vector)

    return output

In [5]:
%%time
model = NeuMF(n_users, n_items)
model.compile(optimizer=optimizers.Adam(lr=0.001), loss='mean_squared_error')
history = model.fit([dataset.user_id, dataset.item_id], dataset.rating, epochs=10)

Epoch 1/10


  super(Adam, self).__init__(name, **kwargs)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 2min 12s, sys: 14.1 s, total: 2min 27s
Wall time: 2min 25s


In [6]:
%%time
df_recommend_list = pd.DataFrame(columns=['user_id', 'item_id', 'score', 'rank'])
for user_id in uq_users:
  #i_list = list(set(uq_items) - set(dataset[dataset['user_id']==user_id]['item_id'].tolist()))
  i_list = uq_items
  u_list = [user_id] * len(i_list)
  df_predict = pd.DataFrame()
  df_predict['user_id'] = u_list
  df_predict['item_id'] = i_list
  prediction = model.predict([df_predict.user_id, df_predict.item_id])
  df_predict['score'] = prediction
  df_recommend = df_predict.sort_values('score', ascending=False)[:topk]
  df_recommend['rank'] = rank_list
  df_recommend_list = df_recommend_list.append(df_recommend, ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


CPU times: user 2min 37s, sys: 15.7 s, total: 2min 53s
Wall time: 2min 43s


In [7]:
df_recommend_list

Unnamed: 0,user_id,item_id,score,rank
0,1,511,0.998069,1
1,1,313,0.996873,2
2,1,50,0.996505,3
3,1,285,0.995812,4
4,1,181,0.994023,5
...,...,...,...,...
9425,943,313,0.835199,6
9426,943,114,0.834619,7
9427,943,50,0.829663,8
9428,943,127,0.826389,9


In [8]:
def map_user(y_true, y_pred):
  n_hit = 0
  precision = 0
  for i, item in enumerate(y_pred, 1):
    if item in y_true:
      n_hit += 1
      precision += n_hit / i
  avg_precision = precision / (n_hit+1e-6)
  return avg_precision

def hr_user(y_true, y_pred):
  for item in y_pred:
    if item in y_true:
      return 1
  return 0

def mrr_user(y_true, y_pred):
  for i, item in enumerate(y_pred, 1):
    if item in y_true:
      return 1 / i
  return 0

def ndcg_user(y_true, y_pred):
  def dcg(y_true, y_pred):
    dcg_score = 0.0
    for i, item in enumerate(y_pred): 
      if item in y_true:
        discount = np.log2(i+2)
        dcg_score += 1.0 / discount
    return dcg_score
  actual = dcg(y_true, y_pred)
  best = dcg(y_true, y_true) + 1e-6
  return actual / best

In [9]:
def calc_precision_oriented_score():
  map = 0.0
  hr = 0.0
  mrr = 0.0
  ndcg = 0.0
  for user_id in uq_users:
    y_true = dataset[dataset['user_id']==user_id]['item_id'].tolist()
    y_pred = df_recommend_list[df_recommend_list['user_id']==user_id].sort_values('rank')['item_id'].tolist()
    map += map_user(y_true, y_pred)
    hr += hr_user(y_true, y_pred)
    mrr += mrr_user(y_true, y_pred)
    ndcg += ndcg_user(y_true, y_pred)
  map /= n_users
  hr /= n_users
  mrr /= n_users
  ndcg /= n_users
  return map, hr, mrr, ndcg

In [10]:
%%time
map, hr, mrr, ndcg = calc_precision_oriented_score()
print('map', map)
print('hr', hr)
print('mrr', mrr)
print('ndcg', ndcg)

map 0.5093442512082469
hr 0.9331919406150583
mrr 0.5758651887761107
ndcg 0.0891852694572674
CPU times: user 3.46 s, sys: 44.2 ms, total: 3.51 s
Wall time: 3.47 s


In [11]:
def prefs(item_id):
  return len(dataset[dataset['item_id']==item_id])

def prefs_both(item_id1, item_id2):
  df_both = dataset[(dataset['item_id']==item_id1) | (dataset['item_id']==item_id2)]
  series_user = df_both["user_id"].value_counts()
  return series_user[series_user==2].count()

prefs_dict = {}
for item_id in uq_items:
  prefs_dict[item_id] = prefs(item_id)

In [12]:
def diversity_user(user_id):
  diversity_score = 0.0
  rec_list_user = df_recommend_list[df_recommend_list['user_id']==user_id]['item_id']
  for x1, x2 in list(itertools.combinations(rec_list_user, 2)):
    pref_both = prefs_both(x1, x2)
    if pref_both != 0:
      diversity_score += np.sqrt(prefs_dict[x1]) * np.sqrt(prefs_dict[x2]) / pref_both
  return diversity_score

def novelty_user(user_id):
  novelty_score = 0.0
  rec_list_user = df_recommend_list[df_recommend_list['user_id']==user_id]['item_id']
  for rec_item in rec_list_user:
    pref = prefs(rec_item)
    if pref != 0:
      novelty_score += np.log2(n_users/pref) / topk
  return novelty_score

def serendipity_user(user_id):
  serendipity_score = 0.0
  rec_list_user = df_recommend_list[df_recommend_list['user_id']==user_id]['item_id']
  con_list_user = dataset[dataset['user_id']==user_id]['item_id']
  for rec_item, con_item in list(itertools.product(rec_list_user, con_list_user)):
    pref_both = prefs_both(rec_item, con_item)
    if pref_both != 0:
      serendipity_score += np.sqrt(prefs_dict[rec_item]) * np.sqrt(prefs_dict[con_item]) / pref_both
  return serendipity_score / len(con_list_user)

In [13]:
def calc_diversity_oriented_score():
  diversity = 0.0
  novelty = 0.0
  serendipity = 0.0
  for user_id in uq_users:
    diversity += diversity_user(user_id)
    novelty += novelty_user(user_id)
    serendipity += serendipity_user(user_id)
  
  diversity /= n_users
  novelty /= n_users
  serendipity /= n_users
  uniquness = len(np.unique(df_recommend_list['item_id'])) / topk

  return diversity, novelty, serendipity, uniquness

In [14]:
%%time
diversity, novelty, serendipity, uniquness = calc_diversity_oriented_score()
print('diversity', diversity)
print('novelty', novelty)
print('serendipity', serendipity,)
print('uniquness', uniquness)

diversity 162.83190360447153
novelty 2.4028247742208735
serendipity 48.63501732765133
uniquness 64.6
CPU times: user 29min 24s, sys: 7.87 s, total: 29min 32s
Wall time: 29min 25s
