In [1]:
import time
import pandas as pd
import numpy as np
from typing import List, Tuple

In [2]:
%%time
ML100K_URL = 'http://files.grouplens.org/datasets/movielens/ml-100k/u.data'
dataset = pd.read_csv(
  ML100K_URL, 
  names=["user_id", "item_id", "rating", "timestamp"], 
  sep="\t"
)

# ユニークなユーザID、アイテムIDのリストを作成する
uq_users = np.sort(dataset.user_id.unique().tolist())
uq_items = np.sort(dataset.item_id.unique().tolist())

# ユーザ数、アイテム数を取得する
n_users = len(uq_users)
n_items = len(uq_items)

CPU times: user 46.3 ms, sys: 22.5 ms, total: 68.8 ms
Wall time: 833 ms


In [3]:
# 標準化
dataset['rating'] /= 5

topk = 10
rank_list = [i+1 for i in range(topk)]
latent = 50

In [5]:
import tensorflow as tf
from tensorflow.keras import regularizers, optimizers
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Flatten, Dot, Dense

class GMF(Model):

  def __init__(
    self,
    num_users: int, #ユーザ数
    num_items: int, #アイテム数
    gmf_latent: int, #潜在因子の次元
    gmf_regs: List[float] = [1e-6, 1e-6], #正則化
  ) -> None:

    super(GMF, self).__init__()
    self.GMF_User_Embedding_Layer = Embedding(
      input_dim=num_users,
      output_dim=gmf_latent,
      name='gmf_user_embedding',
      embeddings_initializer='random_uniform',
      embeddings_regularizer=regularizers.l2(
        gmf_regs[0]
      )
    )
    self.GMF_Item_Embedding_Layer = Embedding(
      input_dim=num_items,
      output_dim=gmf_latent,
      name='gmf_item_embedding',
      embeddings_initializer='random_uniform',
      embeddings_regularizer=regularizers.l2(
        gmf_regs[1]
      )
    )
    self.flatten = Flatten()
    self.gmf_vector = Dot(axes=1)
    self.dense = Dense(
      1,
      activation='sigmoid',
      kernel_initializer='lecun_uniform',
      name='prediction'
    )

  @tf.function
  def call(self, inputs: Tuple[int, int]) \
    -> tf.Tensor:
    
    # ユーザ・アイテムをEmbeddingする
    # shape: (batch_size, 1, gmf_latent)
    GMF_User_Embedding = \
      self.GMF_User_Embedding_Layer(inputs[0])
    # shape: (batch_size, 1, gmf_latent)
    GMF_Item_Embedding = \
      self.GMF_Item_Embedding_Layer(inputs[1])

    # 平滑化
    # shape: (batch_size, gmf_latent)
    gmf_user_latent = \
      self.flatten(GMF_User_Embedding)
    # shape: (batch_size, gmf_latent)
    gmf_item_latent = \
      self.flatten(GMF_Item_Embedding)

    # ユーザ・アイテムembeddingのドット積(行列の掛け算)をとる
    # shape: (batch_size, 1)
    gmf_vector = self.gmf_vector(
      [gmf_user_latent, gmf_item_latent]
    )

    # 予測レイヤー
    # shape: (batch_size, 1)
    output = self.dense(gmf_vector)

    return output

In [6]:
%%time
model = GMF(n_users, n_items, latent)
model.compile(
  optimizer=optimizers.Adam(lr=0.001), 
  loss='mean_squared_error'
)
model.fit(
  [dataset.user_id, dataset.item_id], 
  dataset.rating, 
  epochs=10
)

  super(Adam, self).__init__(name, **kwargs)


Epoch 1/10


  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 1min 10s, sys: 6.06 s, total: 1min 16s
Wall time: 1min 9s


In [None]:
%%time
# 推薦結果を格納するテーブルを作成する
df_recommend_list = pd.DataFrame(
  columns=['user_id', 'item_id', 'score', 'rank']
)

# 各ユーザに対して、トップ10アイテムを絞り込む
for user_id in uq_users:

  # すでに接触したアイテムを除外する
  i_list = list(
    set(uq_items) - 
    set(dataset[
      dataset['user_id']==user_id
    ]['item_id'].tolist()))
  
  u_list = [user_id] * len(i_list)
  df_predict = pd.DataFrame()
  df_predict['user_id'] = u_list
  df_predict['item_id'] = i_list

  # 対象ユーザと未接触アイテムでスコア予測を行う
  prediction = model.predict(
    [df_predict.user_id, df_predict.item_id]
  )
  df_predict['score'] = prediction

  #  スコアでの並び替え、トップ10で絞り込み
  df_recommend = df_predict.sort_values(
    'score', ascending=False
  )[:topk]
  df_recommend['rank'] = rank_list

  # 全体テーブルに格納する
  df_recommend_list = \
    df_recommend_list.append(
      df_recommend, 
      ignore_index=True
    )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


CPU times: user 1min 47s, sys: 7.5 s, total: 1min 55s
Wall time: 1min 44s


In [8]:
df_recommend_list

Unnamed: 0,user_id,item_id,score,rank
0,1,408,0.996779,1
1,1,647,0.991531,2
2,1,483,0.989693,3
3,1,474,0.983405,4
4,1,275,0.983385,5
...,...,...,...,...
9425,943,1177,0.639878,6
9426,943,1176,0.639878,7
9427,943,1175,0.639878,8
9428,943,1174,0.639878,9
