In [1]:
import time
from typing import List, Optional, Tuple

import numpy as np
import pandas as pd

In [2]:
%%time
# Download the MovieLens 100K ratings file (tab-separated, no header row).
# Fetched over HTTPS so the download cannot be tampered with in transit.
ML100K_URL = 'https://files.grouplens.org/datasets/movielens/ml-100k/u.data'
dataset = pd.read_csv(
  ML100K_URL,
  names=["user_id", "item_id", "rating", "timestamp"],
  sep="\t"
)

# Sorted arrays of the distinct user IDs and item IDs.
# np.sort accepts the array returned by unique() directly; no list round-trip.
uq_users = np.sort(dataset.user_id.unique())
uq_items = np.sort(dataset.item_id.unique())

# Number of distinct users and items.
n_users = len(uq_users)
n_items = len(uq_items)

CPU times: user 36.6 ms, sys: 14.7 ms, total: 51.4 ms
Wall time: 514 ms


In [3]:
# Normalization: divide ratings by MAX_RATING so the regression targets are
# comparable with the model's sigmoid output.
# NOTE(review): 5 is presumably the top of the rating scale — confirm.
MAX_RATING = 5

# Keep the untouched ratings in their own column and always derive the scaled
# column from it, so re-running this cell does not divide the ratings again.
if 'rating_raw' not in dataset.columns:
  dataset['rating_raw'] = dataset['rating']
dataset['rating'] = dataset['rating_raw'] / MAX_RATING

# Number of recommendations per user and the matching rank labels 1..topk.
topk = 10
rank_list = list(range(1, topk + 1))

In [4]:
import tensorflow as tf
from tensorflow.keras import regularizers, optimizers
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Flatten, Dense, Dropout, Concatenate

class NCF(Model):
  """Neural collaborative filtering model: user/item embeddings plus an MLP.

  User IDs and item IDs are looked up in two separate embedding tables. The
  two latent vectors are flattened, concatenated, and passed through three
  ReLU Dense layers (with dropout before the first two hidden transitions)
  and a final single-unit sigmoid layer.

  Args:
    num_users: Number of rows in the user embedding table. IDs fed to the
      model must lie in [0, num_users).
    num_items: Number of rows in the item embedding table. IDs fed to the
      model must lie in [0, num_items).
    ncf_layers: Four sizes: [embedding_dim, layer1_units, layer2_units,
      layer3_units]. Defaults to [50, 100, 50, 1].
    ncf_regs: Four L2 coefficients: [embeddings, layer1 kernel, layer2 kernel,
      layer3 kernel]. Defaults to [1e-6, 1e-6, 1e-6, 1e-6].
  """

  def __init__(
    self,
    num_users: int,
    num_items: int,
    ncf_layers: Optional[List[int]] = None,
    ncf_regs: Optional[List[float]] = None
  ) -> None:
    super().__init__()

    # Resolve defaults here rather than in the signature so no mutable list
    # object is shared between instances.
    if ncf_layers is None:
      ncf_layers = [50, 100, 50, 1]
    if ncf_regs is None:
      ncf_regs = [1e-6, 1e-6, 1e-6, 1e-6]

    # Both embedding tables share the same width and L2 coefficient.
    self.NCF_User_Embedding_Layer = Embedding(
      input_dim=num_users,
      output_dim=ncf_layers[0],
      name='ncf_user_embedding',
      embeddings_initializer='random_uniform',
      embeddings_regularizer=regularizers.l2(
        ncf_regs[0]
      )
    )
    self.NCF_Item_Embedding_Layer = Embedding(
      input_dim=num_items,
      output_dim=ncf_layers[0],
      name='ncf_item_embedding',
      embeddings_initializer='random_uniform',
      embeddings_regularizer=regularizers.l2(
        ncf_regs[0]
      )
    )
    self.flatten = Flatten()
    self.ncf_vector = Concatenate(axis=-1)
    # A single Dropout layer is reused at every dropout point in call().
    self.dropout = Dropout(0.2)
    self.layer1 = Dense(
      ncf_layers[1],
      activation='relu',
      name='layer1',
      kernel_regularizer=regularizers.l2(
        ncf_regs[1]
      ),
    )
    self.layer2 = Dense(
      ncf_layers[2],
      activation='relu',
      name='layer2',
      kernel_regularizer=regularizers.l2(
        ncf_regs[2]
      ),
    )
    # NOTE(review): with the default ncf_layers this layer has a single ReLU
    # unit feeding the sigmoid output — confirm that bottleneck is intended.
    self.layer3 = Dense(
      ncf_layers[3],
      activation='relu',
      name='layer3',
      kernel_regularizer=regularizers.l2(
        ncf_regs[3]
      ),
    )
    self.layer4 = Dense(
      1,
      name='prediction',
      activation='sigmoid',
      kernel_initializer='lecun_uniform'
    )

  # Deliberately not wrapped in tf.function: Keras already runs call() inside
  # the graph functions it builds for fit/evaluate/predict.
  def call(
    self,
    inputs: Tuple[tf.Tensor, tf.Tensor],
    training: Optional[bool] = None
  ) -> tf.Tensor:
    """Score (user, item) pairs.

    Args:
      inputs: Pair (user_ids, item_ids) of integer ID tensors.
      training: Keras training flag; forwarded to the dropout layer so that
        dropout is only applied in training mode.

    Returns:
      Tensor of sigmoid scores with one value per input pair.
    """
    user_ids, item_ids = inputs[0], inputs[1]

    # Embed users and items.
    # assumes IDs arrive as (batch_size, 1), giving
    # (batch_size, 1, embedding_dim) — TODO confirm
    user_embedding = self.NCF_User_Embedding_Layer(user_ids)
    item_embedding = self.NCF_Item_Embedding_Layer(item_ids)

    # Flatten to (batch_size, embedding_dim).
    user_latent = self.flatten(user_embedding)
    item_latent = self.flatten(item_embedding)

    # Concatenate the two latent vectors: (batch_size, embedding_dim * 2).
    hidden = self.ncf_vector([user_latent, item_latent])
    hidden = self.dropout(hidden, training=training)

    # Multi-layer perceptron.
    # (batch_size, layer1_units)
    hidden = self.layer1(hidden)
    hidden = self.dropout(hidden, training=training)
    # (batch_size, layer2_units)
    hidden = self.layer2(hidden)
    hidden = self.dropout(hidden, training=training)
    # (batch_size, layer3_units)
    hidden = self.layer3(hidden)

    # (batch_size, 1)
    return self.layer4(hidden)

In [None]:
# The embedding tables are indexed directly by the raw IDs, so each table
# needs (largest ID + 1) rows. Sizing them by the count of distinct IDs leaves
# the largest ID out of range whenever IDs do not start at 0
# (MovieLens 100K IDs start at 1 according to the dataset README).
num_user_rows = int(dataset['user_id'].max()) + 1
num_item_rows = int(dataset['item_id'].max()) + 1

model = NCF(num_user_rows, num_item_rows)
model.compile(
  # `lr` is a deprecated alias; `learning_rate` is the supported argument.
  optimizer=optimizers.Adam(learning_rate=0.005),
  loss='mean_squared_error'
)
history = model.fit(
  [dataset.user_id, dataset.item_id],
  dataset.rating,
  epochs=10
)

  super(Adam, self).__init__(name, **kwargs)


Epoch 1/10


  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
%%time
# Build the table of top-k recommendations for every user.
result_columns = ['user_id', 'item_id', 'score', 'rank']

# Items each user has already interacted with, computed once up front instead
# of filtering the whole dataset inside the loop.
seen_items_by_user = dataset.groupby('user_id')['item_id'].unique()

# Per-user result frames are collected in a list and concatenated once at the
# end; growing a DataFrame row-block by row-block is quadratic and
# DataFrame.append no longer exists in pandas 2.x.
recommend_frames = []

for user_id in uq_users:

  # Exclude items the user has already interacted with.
  # np.setdiff1d returns a sorted array, so candidate order is deterministic.
  candidate_items = np.setdiff1d(uq_items, seen_items_by_user.loc[user_id])
  if len(candidate_items) == 0:
    continue
  user_ids = np.full(len(candidate_items), user_id)

  # Score every (user, unseen item) pair.
  scores = model.predict(
    [user_ids, candidate_items],
    batch_size=4096,
    verbose=0
  ).ravel()

  df_predict = pd.DataFrame({
    'user_id': user_ids,
    'item_id': candidate_items,
    'score': scores,
  })

  # Sort by score and keep the top-k rows; mergesort is stable, so ties keep
  # the (sorted) item order.
  df_recommend = df_predict.sort_values(
    'score', ascending=False, kind='mergesort'
  ).head(topk)

  # Rank labels follow the actual number of rows, which can be smaller than
  # topk when the user has few unseen items.
  df_recommend = df_recommend.assign(
    rank=np.arange(1, len(df_recommend) + 1)
  )
  recommend_frames.append(df_recommend)

# Combine everything into the overall table.
if recommend_frames:
  df_recommend_list = pd.concat(
    recommend_frames, ignore_index=True
  )[result_columns]
else:
  df_recommend_list = pd.DataFrame(columns=result_columns)

In [None]:
# Show the combined recommendation table as this cell's output.
df_recommend_list