In [54]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from typing import List, Tuple, Dict

dataset_path = "/home/kaba/development/lightfm_sandbox/ml-1m/"
raw_data_df = pd.read_csv(f"{dataset_path}ml100k.csv")

In [55]:
# 必要な７種の変数を用意する。
all_user_ids = sorted(raw_data_df['user_id'].unique())
all_item_ids = sorted(raw_data_df['item_id'].unique())

# ユーザー特徴量のカラム名
user_features_columns = [col for col in raw_data_df.columns if col.startswith('user_')]
unique_user_features_df = pd.unique(pd.concat([raw_data_df[col] for col in user_features_columns], ignore_index=True))
unique_user_features_list = unique_user_features_df.tolist()

# アイテム特徴量のカラム名
item_features_columns = [col for col in raw_data_df.columns if col.startswith('item_')]
unique_item_features_df = pd.unique(pd.concat([raw_data_df[col] for col in item_features_columns], ignore_index=True))
unique_item_features_list = unique_item_features_df.tolist()

data = list(zip(raw_data_df['user_id'], raw_data_df['item_id']))


In [56]:
def build_feature_data(df: pd.DataFrame, target_column_name: str) -> List[Tuple[int, Dict]]:
    """カテゴリ変数は0.5、連続変数は0-1に正規化して特徴量辞書を生成"""
    result = []
    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
    continuous_cols = df.select_dtypes(include=['int', 'float']).columns.tolist()
    # user_id, item_idは正規化する必要がないため除外（あれば）
    continuous_cols.remove(target_column_name)
    print(continuous_cols)
    print(df.head())

    # user_id, item_idは正規化する必要がないため除外
    for col in ['user_id', 'item_id']:
        if col in continuous_cols:
            continuous_cols.remove(col)

    # 連値を0-1スケーリング
    if continuous_cols:
        scaler = MinMaxScaler()
        df[continuous_cols] = scaler.fit_transform(df[continuous_cols])

    for idx, row in df.iterrows():
        features = {}

        # カテゴリ変数
        for col in categorical_cols:
            val = row[col]
            if pd.notnull(val):
                features[f"{col}_{val}"] = 0.5

        # 連続値
        for col in continuous_cols:
            features[col] = row[col]
        id = row[target_column_name]

        result.append((id, features))
    return result

In [57]:
# ユーザー特徴量抽出
user_df = raw_data_df[user_features_columns]
user_features_data = build_feature_data(user_df, 'user_id')

# アイテム特徴量抽出
item_df = raw_data_df[item_features_columns]
item_features_data = build_feature_data(item_df, 'item_id')

['user_age']
   user_id  user_age user_gender user_occupation user_zip_code
0      196        49           M          writer         55105
1      186        39           F       executive         00000
2       22        25           M          writer         40206
3      244        28           M      technician         80525
4      166        47           M        educator         55113


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[continuous_cols] = scaler.fit_transform(df[continuous_cols])


['item_video_release_date', 'item_genre_unknown', 'item_genre_action', 'item_genre_adventure', 'item_genre_animation', 'item_genre_childrens', 'item_genre_comedy', 'item_genre_crime', 'item_genre_documentary', 'item_genre_drama', 'item_genre_fantasy', 'item_genre_film-noir', 'item_genre_horror', 'item_genre_musical', 'item_genre_mystery', 'item_genre_romance', 'item_genre_sci-fi', 'item_genre_thriller', 'item_genre_war', 'item_genre_western']
   item_id                  item_title item_release_date  \
0      242                Kolya (1996)       24-Jan-1997   
1      302    L.A. Confidential (1997)       01-Jan-1997   
2      377         Heavyweights (1994)       01-Jan-1994   
3       51  Legends of the Fall (1994)       01-Jan-1994   
4      346         Jackie Brown (1997)       01-Jan-1997   

   item_video_release_date                                      item_imdb_url  \
0                      NaN    http://us.imdb.com/M/title-exact?Kolya%20(1996)   
1                      NaN  ht

  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[continuous_cols] = scaler.fit_transform(df[continuous_cols])


In [58]:
from lightfm.data import Dataset

dataset = Dataset()

# 全てのユーザー、アイテムをリストアップ。
dataset.fit(users=all_user_ids, items=all_item_ids, user_features=unique_user_features_list, item_features=unique_item_features_list)
print(dataset)

# weightは不要のため＿として記述。
interactions, _ = dataset.build_interactions(data=data)
print('interactions: ', interactions)

user_features = dataset.build_user_features(user_features_data)
item_features = dataset.build_item_features(item_features_data)


<lightfm.data.Dataset object at 0x7f5f3db94370>
interactions:  <COOrdinate sparse matrix of dtype 'int32'
	with 100000 stored elements and shape (943, 1682)>
  Coords	Values
  (195, 241)	1
  (185, 301)	1
  (21, 376)	1
  (243, 50)	1
  (165, 345)	1
  (297, 473)	1
  (114, 264)	1
  (252, 464)	1
  (304, 450)	1
  (5, 85)	1
  (61, 256)	1
  (285, 1013)	1
  (199, 221)	1
  (209, 39)	1
  (223, 28)	1
  (302, 784)	1
  (121, 386)	1
  (193, 273)	1
  (290, 1041)	1
  (233, 1183)	1
  (118, 391)	1
  (166, 485)	1
  (298, 143)	1
  (290, 117)	1
  (307, 0)	1
  :	:
  (536, 442)	1
  (617, 627)	1
  (486, 290)	1
  (112, 974)	1
  (942, 390)	1
  (863, 684)	1
  (749, 322)	1
  (278, 63)	1
  (645, 749)	1
  (653, 369)	1
  (616, 581)	1
  (912, 689)	1
  (659, 228)	1
  (420, 497)	1
  (494, 1090)	1
  (805, 420)	1
  (675, 537)	1
  (720, 261)	1
  (912, 208)	1
  (377, 77)	1
  (879, 475)	1
  (715, 203)	1
  (275, 1089)	1
  (12, 224)	1
  (11, 202)	1


ValueError: Feature user_gender_M not in feature mapping. Call fit first.

In [None]:
interactions = dataset.build_interactions(data)

user_features = dataset.build_user_features(data)
item_features = dataset.build_item_features(data)


TypeError: 'int' object is not iterable

In [None]:
from lightfm import LightFM

# モデルの作成
model = LightFM(no_components=100, loss="warp", random_state=123)

# 学習
res = model.fit(interactions=interactions, user_features=user_features, item_features=item_features)

res


<lightfm.lightfm.LightFM at 0x7f2208d599f0>

In [None]:
from lightfm.cross_validation import random_train_test_split

# interactionsを引数にとり、
train_interactions, test_interactions = random_train_test_split(interactions=interactions, test_percentage=0.2)

print(train_interactions, test_interactions)

<COOrdinate sparse matrix of dtype 'float32'
	with 1 stored elements and shape (3, 3)>
  Coords	Values
  (2, 2)	1.0 <COOrdinate sparse matrix of dtype 'float32'
	with 1 stored elements and shape (3, 3)>
  Coords	Values
  (0, 1)	1.0


In [None]:
# modelのevaluation

from lightfm.evaluation import precision_at_k


precision_at_k = precision_at_k(model=model, test_interactions=test_interactions, train_interactions=train_interactions, k=10, user_features=all_user_features, item_features=all_item_features)

precision_at_k

AttributeError: 'list' object has no attribute 'tocsr'