In [1]:
!pip install lightfm

Collecting lightfm
  Downloading lightfm-1.16.tar.gz (310 kB)
[?25l[K     |█                               | 10 kB 19.5 MB/s eta 0:00:01[K     |██▏                             | 20 kB 28.4 MB/s eta 0:00:01[K     |███▏                            | 30 kB 35.7 MB/s eta 0:00:01[K     |████▎                           | 40 kB 29.3 MB/s eta 0:00:01[K     |█████▎                          | 51 kB 16.7 MB/s eta 0:00:01[K     |██████▍                         | 61 kB 18.7 MB/s eta 0:00:01[K     |███████▍                        | 71 kB 12.2 MB/s eta 0:00:01[K     |████████▌                       | 81 kB 13.3 MB/s eta 0:00:01[K     |█████████▌                      | 92 kB 13.6 MB/s eta 0:00:01[K     |██████████▋                     | 102 kB 13.5 MB/s eta 0:00:01[K     |███████████▋                    | 112 kB 13.5 MB/s eta 0:00:01[K     |████████████▊                   | 122 kB 13.5 MB/s eta 0:00:01[K     |█████████████▊                  | 133 kB 13.5 MB/s eta 0:00:01[K 

In [2]:
import time
from zipfile import ZipFile
from urllib.request import urlretrieve
import itertools
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from lightfm import LightFM
from lightfm.data import Dataset

In [3]:
%%time
urlretrieve("http://files.grouplens.org/datasets/movielens/ml-1m.zip", "movielens.zip")
ZipFile("movielens.zip", "r").extractall()
df_user = pd.read_csv("ml-1m/users.dat", sep="::", names=["user_id", "sex", "age_group", "occupation", "zip_code"])
df_item = pd.read_csv("ml-1m/movies.dat", sep="::", names=["item_id", "title", "genres"])
dataset = pd.read_csv("ml-1m/ratings.dat", sep="::", names=['user_id', 'item_id', 'rating', 'timestamp'])
uq_users = np.sort(dataset.user_id.unique().tolist())
uq_items = np.sort(dataset.item_id.unique().tolist())
n_users = len(uq_users)
n_items = len(uq_items)

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  """


CPU times: user 5.45 s, sys: 227 ms, total: 5.68 s
Wall time: 7.4 s


In [4]:
topk = 10
rank_list = [i+1 for i in range(topk)]
latent = 50

In [5]:
print('df_user:', df_user.head())
print('df_item:', df_item.head())
print('dataset:', dataset.head())
print('user list:', uq_users)
print('item list:', uq_items)
print('user number:', n_users)
print('item number:', n_items)

df_user:    user_id sex  age_group  occupation zip_code
0        1   F          1          10    48067
1        2   M         56          16    70072
2        3   M         25          15    55117
3        4   M         45           7    02460
4        5   M         25          20    55455
df_item:    item_id                               title                        genres
0        1                    Toy Story (1995)   Animation|Children's|Comedy
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy
2        3             Grumpier Old Men (1995)                Comedy|Romance
3        4            Waiting to Exhale (1995)                  Comedy|Drama
4        5  Father of the Bride Part II (1995)                        Comedy
dataset:    user_id  item_id  rating  timestamp
0        1     1193       5  978300760
1        1      661       3  978302109
2        1      914       3  978301968
3        1     3408       4  978300275
4        1     2355       5  97882

In [6]:
uq_user_features = []
user_features_set = set()
for row in df_user.itertuples():
  row_user_features = {}
  dict_row = []

  row_user_features['sex-'+row.sex] = 1
  row_user_features['age_group-'+str(row.age_group)] = 1
  row_user_features['occupation-'+str(row.occupation)] = 1

  user_features_set.add('sex-'+row.sex)
  user_features_set.add('age_group-'+str(row.age_group))
  user_features_set.add('occupation-'+str(row.occupation))

  dict_row.append(row.user_id)
  dict_row.append(row_user_features)
  uq_user_features.append(dict_row)

In [7]:
def flatten_sequences(sequences):
  sequences = [i if type(i) == list else [i] for i in sequences]
  flattened = list(itertools.chain.from_iterable(sequences))
  return flattened

df_item['genre'] = df_item['genres'].str.split('|')
df_item = df_item[df_item['item_id'].isin(uq_items)]
df_item['genre'] = df_item[['genre']].values.tolist()
df_item['genre'] = df_item['genre'].apply(flatten_sequences)
uq_item_features = []
item_features_set = set()
for row in df_item.itertuples():
  row_item_features = {}
  dict_row = []
  for s in row.genre:
      row_item_features[s] = 1
      item_features_set.add(s)
  dict_row.append(row.item_id)
  dict_row.append(row_item_features)
  uq_item_features.append(dict_row)  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [8]:
%%time
lightfm_dataset = Dataset()
lightfm_dataset.fit(users=uq_users, items=uq_items, user_features=user_features_set, item_features=item_features_set)
user_features = lightfm_dataset.build_user_features(uq_user_features)
item_features = lightfm_dataset.build_item_features(uq_item_features)
user_id_map, user_feature_mapping, item_id_map, item_feature_mapping = lightfm_dataset.mapping()
list_dataset = list(dataset[['user_id', 'item_id', 'rating']].itertuples(index=False, name=None))
interactions, weights = lightfm_dataset.build_interactions(list_dataset)

CPU times: user 3.34 s, sys: 47 ms, total: 3.39 s
Wall time: 3.71 s


In [9]:
%%time
model = LightFM(no_components=latent, 
                loss='bpr',
                learning_rate=0.01,
                user_alpha=0.01,
                item_alpha=0.01)
model.fit(weights,
          user_features=user_features,
          item_features=item_features, 
          epochs=5,
          num_threads=4)

CPU times: user 44.6 s, sys: 45.5 ms, total: 44.7 s
Wall time: 30.1 s


In [10]:
%%time
prediction = model.predict(user_ids=dataset.user_id.map(user_id_map).values,
                          item_ids=dataset.item_id.map(item_id_map).values,
                          user_features=user_features,
                          item_features=item_features, 
                          num_threads=4)

CPU times: user 602 ms, sys: 1.04 ms, total: 603 ms
Wall time: 324 ms


In [11]:
%%time
df_recommend_list = pd.DataFrame(columns=['user_id', 'item_id', 'score', 'rank'])
for user_id in uq_users:
  i_list = list(set(uq_items) - set(dataset[dataset['user_id']==user_id]['item_id'].tolist()))
  u_list = [user_id] * len(i_list)
  df_predict = pd.DataFrame()
  df_predict['user_id'] = u_list
  df_predict['item_id'] = i_list
  prediction = model.predict(user_ids=df_predict.user_id.map(user_id_map).values,
                            item_ids=df_predict.item_id.map(item_id_map).values,
                            user_features=user_features,
                            item_features=item_features, 
                            num_threads=4)
  df_predict['score'] = prediction
  df_recommend = df_predict.sort_values('score', ascending=False)[:topk]
  df_recommend['rank'] = rank_list
  df_recommend_list = df_recommend_list.append(df_recommend, ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


CPU times: user 3min 1s, sys: 2.17 s, total: 3min 3s
Wall time: 3min 2s


In [12]:
df_recommend_list

Unnamed: 0,user_id,item_id,score,rank
0,1,480,-1.336260,1
1,1,1198,-1.344770,2
2,1,2571,-1.350574,3
3,1,589,-1.351244,4
4,1,1580,-1.353306,5
...,...,...,...,...
60395,6040,2916,-3.755513,6
60396,6040,1527,-3.756342,7
60397,6040,1356,-3.758591,8
60398,6040,316,-3.759297,9
