In [1]:
!pip install lightfm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import time
from zipfile import ZipFile
from urllib.request import urlretrieve
import itertools
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from lightfm import LightFM
from lightfm.data import Dataset

In [3]:
ML1M_URL = 'http://files.grouplens.org/datasets/movielens/ml-1m.zip'
# MovieLens 1Mデータをダウンロード
urlretrieve(ML1M_URL, 'movielens.zip')
# Zipを解凍
ZipFile('movielens.zip', 'r').extractall()

In [4]:
#　ユーザ特徴量データ
df_user = pd.read_csv(
  'ml-1m/users.dat', 
  sep='::', 
  names=[
    'user_id', 'sex', 'age_group', 
    'occupation', 'zip_code'
  ]
)
# アイテム特徴量データ
df_item = pd.read_csv(
  'ml-1m/movies.dat', 
  sep='::', 
  names=['item_id', 'title', 'genres']
)
# ユーザ-アイテムインタラクションデータ
dataset = pd.read_csv(
  'ml-1m/ratings.dat', 
  sep='::', 
  names=['user_id', 'item_id', 'rating', 'timestamp']
)
uq_users = np.sort(dataset.user_id.unique().tolist())
uq_items = np.sort(dataset.item_id.unique().tolist())
n_users = len(uq_users)
n_items = len(uq_items)

  return func(*args, **kwargs)


In [5]:
topk = 10
rank_list = [i+1 for i in range(topk)]
latent = 50

In [6]:
print('df_user:', df_user.head())
print('df_item:', df_item.head())
print('dataset:', dataset.head())
print('user list:', uq_users)
print('item list:', uq_items)
print('user number:', n_users)
print('item number:', n_items)

df_user:    user_id sex  age_group  occupation zip_code
0        1   F          1          10    48067
1        2   M         56          16    70072
2        3   M         25          15    55117
3        4   M         45           7    02460
4        5   M         25          20    55455
df_item:    item_id                               title                        genres
0        1                    Toy Story (1995)   Animation|Children's|Comedy
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy
2        3             Grumpier Old Men (1995)                Comedy|Romance
3        4            Waiting to Exhale (1995)                  Comedy|Drama
4        5  Father of the Bride Part II (1995)                        Comedy
dataset:    user_id  item_id  rating  timestamp
0        1     1193       5  978300760
1        1      661       3  978302109
2        1      914       3  978301968
3        1     3408       4  978300275
4        1     2355       5  97882

In [7]:
# 各ユーザの特徴量を格納するリスト
uq_user_features = []
# ユーザの特徴量カラムとなる集合(重複なし)
set_user_features = set()

# 各ユーザに対してLightFM用の特徴量データを作成する
for row in df_user.itertuples():
  user_features = {}
  dict_row = []

  # 基点ユーザの特徴量をone-hot化しリストとして持つ
  user_features['sex-'+row.sex] = 1
  user_features['age_group-'+str(row.age_group)] = 1
  user_features['occupation-'+str(row.occupation)] = 1

  # ユーザー特徴量カラムに追加する
  set_user_features.add('sex-'+row.sex)
  set_user_features.add(
    'age_group-'+str(row.age_group)
  )
  set_user_features.add(
    'occupation-'+str(row.occupation)
  )
  
  # ユーザーIDと特徴量をリストに格納する
  dict_row.append(row.user_id)
  dict_row.append(user_features)
  uq_user_features.append(dict_row)

In [8]:
uq_user_features

[[1, {'age_group-1': 1, 'occupation-10': 1, 'sex-F': 1}],
 [2, {'age_group-56': 1, 'occupation-16': 1, 'sex-M': 1}],
 [3, {'age_group-25': 1, 'occupation-15': 1, 'sex-M': 1}],
 [4, {'age_group-45': 1, 'occupation-7': 1, 'sex-M': 1}],
 [5, {'age_group-25': 1, 'occupation-20': 1, 'sex-M': 1}],
 [6, {'age_group-50': 1, 'occupation-9': 1, 'sex-F': 1}],
 [7, {'age_group-35': 1, 'occupation-1': 1, 'sex-M': 1}],
 [8, {'age_group-25': 1, 'occupation-12': 1, 'sex-M': 1}],
 [9, {'age_group-25': 1, 'occupation-17': 1, 'sex-M': 1}],
 [10, {'age_group-35': 1, 'occupation-1': 1, 'sex-F': 1}],
 [11, {'age_group-25': 1, 'occupation-1': 1, 'sex-F': 1}],
 [12, {'age_group-25': 1, 'occupation-12': 1, 'sex-M': 1}],
 [13, {'age_group-45': 1, 'occupation-1': 1, 'sex-M': 1}],
 [14, {'age_group-35': 1, 'occupation-0': 1, 'sex-M': 1}],
 [15, {'age_group-25': 1, 'occupation-7': 1, 'sex-M': 1}],
 [16, {'age_group-35': 1, 'occupation-0': 1, 'sex-F': 1}],
 [17, {'age_group-50': 1, 'occupation-1': 1, 'sex-M': 1}],


In [9]:
# シーケンスをリスト化する関数
def flatten_sequences(sequences):
  sequences = [
    i if type(i) == list else [i] for i in sequences
  ]
  flattened = list(
    itertools.chain.from_iterable(sequences)
  )
  return flattened

# |で区切られた複数のジャンル情報をリスト化
df_item['genre'] = df_item['genres'].str.split('|')
df_item = df_item[df_item['item_id'].isin(uq_items)]
df_item['genre'] = df_item[['genre']].values.tolist()
df_item['genre'] = \
  df_item['genre'].apply(flatten_sequences)

# 各アイテムの特徴量を格納するリスト
uq_item_features = []
# アイテムの特徴量カラムとなる集合（重複なし）
set_item_features = set()

# 各アイテムに対してLightFM用の特徴量データを作成する
for row in df_item.itertuples():
  item_features = {}
  dict_row = []

  for s in row.genre:
    # 各アイテムの特徴量をone-hot化しリストとして持つ
    item_features[s] = 1
    # アイテム特徴量カラムに追加する
    set_item_features.add(s)

  # アイテムIDと特徴量をリストに格納する
  dict_row.append(row.item_id)
  dict_row.append(item_features)
  uq_item_features.append(dict_row) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


In [10]:
uq_item_features

[[1, {'Animation': 1, "Children's": 1, 'Comedy': 1}],
 [2, {'Adventure': 1, "Children's": 1, 'Fantasy': 1}],
 [3, {'Comedy': 1, 'Romance': 1}],
 [4, {'Comedy': 1, 'Drama': 1}],
 [5, {'Comedy': 1}],
 [6, {'Action': 1, 'Crime': 1, 'Thriller': 1}],
 [7, {'Comedy': 1, 'Romance': 1}],
 [8, {'Adventure': 1, "Children's": 1}],
 [9, {'Action': 1}],
 [10, {'Action': 1, 'Adventure': 1, 'Thriller': 1}],
 [11, {'Comedy': 1, 'Drama': 1, 'Romance': 1}],
 [12, {'Comedy': 1, 'Horror': 1}],
 [13, {'Animation': 1, "Children's": 1}],
 [14, {'Drama': 1}],
 [15, {'Action': 1, 'Adventure': 1, 'Romance': 1}],
 [16, {'Drama': 1, 'Thriller': 1}],
 [17, {'Drama': 1, 'Romance': 1}],
 [18, {'Thriller': 1}],
 [19, {'Comedy': 1}],
 [20, {'Action': 1}],
 [21, {'Action': 1, 'Comedy': 1, 'Drama': 1}],
 [22, {'Crime': 1, 'Drama': 1, 'Thriller': 1}],
 [23, {'Thriller': 1}],
 [24, {'Drama': 1, 'Sci-Fi': 1}],
 [25, {'Drama': 1, 'Romance': 1}],
 [26, {'Drama': 1}],
 [27, {'Drama': 1}],
 [28, {'Romance': 1}],
 [29, {'Advent

In [11]:
%%time
# LightFM用のデータセットの宣言
lightfm_dataset = Dataset()
# ユーザー、アイテム、各特徴量へのenocoder生成
lightfm_dataset.fit(
  users=uq_users, 
  items=uq_items, 
  user_features=set_user_features, 
  item_features=set_item_features
)
# ユーザー特徴量データセットのビルド
u_feat = \
  lightfm_dataset.build_user_features(
    uq_user_features
  )
# アイテム特徴量データセットのビルド
i_feat = \
  lightfm_dataset.build_item_features(
    uq_item_features
  )
# マッピング情報の作成
user_id_map, user_feature_map, \
  item_id_map, item_feature_map = \
  lightfm_dataset.mapping()
# インタラクションデータのリスト化
list_dataset = list(
  dataset[
    ['user_id', 'item_id', 'rating']
  ].itertuples(index=False, name=None)
)
# インタラクションデータのビルド
interactions, weights = \
  lightfm_dataset.build_interactions(
    list_dataset
  )

CPU times: user 2.91 s, sys: 101 ms, total: 3.01 s
Wall time: 3.03 s


In [12]:
%%time
# モデルの宣言
model = LightFM(
  no_components=latent, #潜在因子の次元
  loss='bpr',           #損失関数はBPR
  learning_rate=0.01,   #学習率
  user_alpha=0.01,      #user特徴量の正則化
  item_alpha=0.01       #item特徴量の正則化
)
# モデルの学習
model.fit(
  weights,              #explicitデータ
  user_features=u_feat, #user特徴量情報
  item_features=i_feat, #item特徴量情報
  epochs=5,             #エポック数
  num_threads=4         #並列化数を指定
)

CPU times: user 57.7 s, sys: 52.7 ms, total: 57.8 s
Wall time: 35.2 s


In [13]:
%%time
prediction = model.predict(
  user_ids=\
    dataset.user_id.map(user_id_map).values,
  item_ids=\
    dataset.item_id.map(item_id_map).values,
  user_features=u_feat,
  item_features=i_feat, 
  num_threads=4)

CPU times: user 662 ms, sys: 8.97 ms, total: 671 ms
Wall time: 371 ms


In [14]:
%%time
# ユニークなユーザーIDのリストを作成する
uq_users = np.sort(dataset.user_id.unique().tolist())
# ユニークなアイテムIDのリストを作成する
uq_items = np.sort(dataset.item_id.unique().tolist())

# 推薦結果を格納するテーブルを作成する
df_recommend_list = pd.DataFrame(
  columns=['user_id', 'item_id', 'score', 'rank']
)

# 各ユーザーに対して、トップ10アイテムを絞り込む
for user_id in uq_users:

  # すでに接触したアイテムを除外する
  i_list = list(
    set(uq_items) - 
    set(dataset[
      dataset['user_id']==user_id
    ]['item_id'].tolist())
  )

  u_list = [user_id] * len(i_list)
  df_predict = pd.DataFrame()
  df_predict['user_id'] = u_list
  df_predict['item_id'] = i_list
  
  # 対象ユーザーと未接触アイテムでスコア予測を行う
  prediction = model.predict(
    user_ids=\
      df_predict.user_id.map(user_id_map).values,
    item_ids=\
      df_predict.item_id.map(item_id_map).values,
    user_features=u_feat, 
    item_features=i_feat, 
    num_threads=4
  )
  df_predict['score'] = prediction

  #  スコアでの並び替え、トップ10で絞り込み
  df_recommend = df_predict.sort_values(
    'score', ascending=False
  )[:topk]
  df_recommend['rank'] = rank_list

  # 全体テーブルに格納する
  df_recommend_list = \
    df_recommend_list.append(
      df_recommend, 
      ignore_index=True
    )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


CPU times: user 2min 34s, sys: 2.26 s, total: 2min 36s
Wall time: 2min 29s


In [15]:
df_recommend_list

Unnamed: 0,user_id,item_id,score,rank
0,1,480,-1.355063,1
1,1,1198,-1.363643,2
2,1,589,-1.368019,3
3,1,2571,-1.368536,4
4,1,2628,-1.371501,5
...,...,...,...,...
60395,6040,2916,-3.816613,6
60396,6040,1527,-3.816704,7
60397,6040,1356,-3.819051,8
60398,6040,316,-3.820175,9
