In [1]:
import time
import pandas as pd
import numpy as np
from typing import List, Tuple

In [2]:
%%time
ML100K_URL = 'http://files.grouplens.org/datasets/movielens/ml-100k/u.data'
dataset = pd.read_csv(
  ML100K_URL, 
  names=["user_id", "item_id", "rating", "timestamp"], 
  sep="\t"
)

# ユニークなユーザID、アイテムIDのリストを作成する
uq_users = np.sort(dataset.user_id.unique().tolist())
uq_items = np.sort(dataset.item_id.unique().tolist())

# ユーザ数、アイテム数を取得する
n_users = len(uq_users)
n_items = len(uq_items)

CPU times: user 33.3 ms, sys: 17.8 ms, total: 51.1 ms
Wall time: 1.1 s


In [3]:
# 標準化
dataset['rating'] /= 5

topk = 10
rank_list = [i+1 for i in range(topk)]

In [4]:
import tensorflow as tf
from tensorflow.keras import regularizers, optimizers
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Flatten, Input, Dot, Dense, Dropout, Concatenate

class NeuMF(Model):
  def __init__(
    self, 
    num_users: int, #ユーザ数
    num_items: int, #アイテム数
    gmf_latent: int = 50,
    gmf_regs: List[float] = [1e-6, 1e-6],
    ncf_layers: List[int] = [50, 100, 50, 1],
    ncf_regs: List[float] = [1e-6, 1e-6, 1e-6, 1e-6]
  ) -> None:

    super(NeuMF, self).__init__()
    self.GMF_User_Embedding_Layer = Embedding(
      input_dim=num_users,
      output_dim=gmf_latent,
      name='gmf_user_embedding',
      embeddings_initializer='random_uniform',
      embeddings_regularizer=regularizers.l2(
        gmf_regs[0]
      ),
    )
    self.GMF_Item_Embedding_Layer = Embedding(
      input_dim=num_items,
      output_dim=gmf_latent,
      name='gmf_item_embedding',
      embeddings_initializer='random_uniform',
      embeddings_regularizer=regularizers.l2(
        gmf_regs[1]
      ),
    )
    self.NCF_User_Embedding_Layer = Embedding(
      input_dim=num_users,
      output_dim=int(ncf_layers[0]),
      name='ncf_user_embedding',
      embeddings_initializer='random_uniform',
      embeddings_regularizer=regularizers.l2(
        ncf_regs[0]
      ),
    )
    self.NCF_Item_Embedding_Layer = Embedding(
      input_dim=num_items,
      output_dim=int(ncf_layers[0]),
      name='ncf_item_embedding',
      embeddings_initializer='random_uniform',
      embeddings_regularizer=regularizers.l2(
        ncf_regs[0]
      ),
    )
    self.flatten = Flatten()
    self.gmf_vector = Dot(axes=1)
    self.ncf_vector = Concatenate(axis=-1)
    self.dropout = Dropout(0.2)
    self.layer1 = Dense(
      ncf_layers[1],
      name='layer1',
      activation='relu',
      kernel_regularizer=regularizers.l2(
        ncf_regs[1]
      ),
    )
    self.layer2 = Dense(
      ncf_layers[2],
      name='layer2',
      activation='relu',
      kernel_regularizer=regularizers.l2(
        ncf_regs[2]
      ),
    )
    self.layer3 = Dense(
      ncf_layers[3],
      name='layer3',
      activation='relu',
      kernel_regularizer=regularizers.l2(
        ncf_regs[3]
      ),
    )
    self.predict_vector = Concatenate(axis=-1)
    self.layer4 = Dense(
      1,
      activation='sigmoid',
      kernel_initializer='lecun_uniform',
      name='prediction'
    )

  @tf.function
  def call(self, inputs: Tuple[int, int]) \
    -> tf.Tensor:
    # ユーザ・アイテムをEmbeddingする(GMF用とNCF用)
    # shape: (batch_size, 1, gmf_latent)
    GMF_User_Embedding = \
      self.GMF_User_Embedding_Layer(inputs[0])
    # shape: (batch_size, 1, gmf_latent)
    GMF_Item_Embedding = \
      self.GMF_Item_Embedding_Layer(inputs[1])
    NCF_User_Embedding = \
      self.NCF_User_Embedding_Layer(inputs[0])
    NCF_Item_Embedding = \
      self.NCF_Item_Embedding_Layer(inputs[1])

    # GMF(Generalized Matrix Factorization)
    # 平滑化
    # shape: (batch_size, gmf_latent)
    gmf_user_latent = \
      self.flatten(GMF_User_Embedding)
    # shape: (batch_size, gmf_latent)
    gmf_item_latent = \
      self.flatten(GMF_Item_Embedding)
    # ユーザ・アイテムembeddingのドット積をとる
    # shape: (batch_size, 1)
    gmf_vector = self.gmf_vector(
      [gmf_user_latent, gmf_item_latent]
    )

    # NCF(Neural Collaborative Filtering)
    # shape: (batch_size, embedding_dim)
    ncf_user_latent = \
      self.flatten(NCF_User_Embedding)
    # shape: (batch_size, embedding_dim)
    ncf_item_latent = \
      self.flatten(NCF_Item_Embedding)
    # embedding層の結合
    # shape: (batch_size, embedding_dim * 2)
    ncf_vector = self.ncf_vector(
      [ncf_user_latent, ncf_item_latent]
    )
    # shape: (batch_size, embedding_dim * 2)
    ncf_vector = self.dropout(ncf_vector)
    # 多層パーセプトロン
    # shape: (batch_size, layer1_dim)
    ncf_vector = self.layer1(ncf_vector)
    # shape: (batch_size, layer1_dim)
    ncf_vector = self.dropout(ncf_vector)
    # shape: (batch_size, layer2_dim)
    ncf_vector = self.layer2(ncf_vector)
    # shape: (batch_size, layer2_dim)
    ncf_vector = self.dropout(ncf_vector)
    # shape: (batch_size, layer3_dim)
    ncf_vector = self.layer3(ncf_vector)

    # NeuMF(GMFとNCFを結合する)
    # shape: (batch_size, 2)
    vector = self.predict_vector(
      [gmf_vector, ncf_vector]
    )
    # shape: (batch_size, 1)
    output = self.layer4(vector)

    return output

In [5]:
%%time
model = NeuMF(n_users, n_items)
model.compile(
  optimizer=optimizers.Adam(lr=0.001), 
  loss='mean_squared_error'
)
history = model.fit(
  [dataset.user_id, dataset.item_id], 
  dataset.rating, 
  epochs=10
)

Epoch 1/10


  super(Adam, self).__init__(name, **kwargs)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 1min 37s, sys: 6.76 s, total: 1min 44s
Wall time: 2min 26s


In [6]:
%%time
# 推薦結果を格納するテーブルを作成する
df_recommend_list = pd.DataFrame(
  columns=['user_id', 'item_id', 'score', 'rank']
)

# 各ユーザに対して、トップ10アイテムを絞り込む
for user_id in uq_users:

  # すでに接触したアイテムを除外する
  i_list = list(
    set(uq_items) - 
    set(dataset[
      dataset['user_id']==user_id
    ]['item_id'].tolist()))
  
  u_list = [user_id] * len(i_list)
  df_predict = pd.DataFrame()
  df_predict['user_id'] = u_list
  df_predict['item_id'] = i_list

  # 対象ユーザと未接触アイテムでスコア予測を行う
  prediction = model.predict(
    [df_predict.user_id, df_predict.item_id]
  )
  df_predict['score'] = prediction

  # スコアでの並び替え、トップ10で絞り込み
  df_recommend = df_predict.sort_values(
    'score', ascending=False
  )[:topk]
  df_recommend['rank'] = rank_list

  # 全体テーブルに格納する
  df_recommend_list = \
    df_recommend_list.append(
      df_recommend, 
      ignore_index=True
    )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


CPU times: user 2min, sys: 8.17 s, total: 2min 8s
Wall time: 2min 10s


In [7]:
df_recommend_list

Unnamed: 0,user_id,item_id,score,rank
0,1,408,0.996449,1
1,1,285,0.992711,2
2,1,963,0.988443,3
3,1,511,0.988160,4
4,1,483,0.988052,5
...,...,...,...,...
9425,943,313,0.814256,6
9426,943,357,0.808628,7
9427,943,134,0.807480,8
9428,943,199,0.807473,9
