In [16]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from typing import List, Tuple, Dict

dataset_path = "/home/kaba/development/lightfm_sandbox/ml-1m/"
raw_data_df = pd.read_csv(f"{dataset_path}ml100k.csv")
# すべての値がNaNのカラムを削除
raw_data_df = raw_data_df.dropna(axis=1, how='all')


In [17]:
# 必要な７種の変数を用意する。
all_user_ids = sorted(raw_data_df['user_id'].unique())
all_item_ids = sorted(raw_data_df['item_id'].unique())

# ユーザー特徴量のカラム名
user_features_columns = [
    col for col in raw_data_df.columns
    if col.startswith('user_') and col != 'user_id'
]
unique_user_features_df = pd.unique(pd.concat([raw_data_df[col] for col in user_features_columns], ignore_index=True))
unique_user_features_list = unique_user_features_df.tolist()
unique_user_features_list = []

for col in user_features_columns:
    # NAN二も対応すべくuser_ageのような、target+カラム名のようなケースも考慮する。
    unique_user_features_list.append(f"{col}")
    unique_values = raw_data_df[col].dropna().unique()
    for val in unique_values:
        unique_user_features_list.append(f"{col}_{val}")

# アイテム特徴量のカラム名
item_features_columns = [
    col for col in raw_data_df.columns
    if col.startswith('item_') and col != 'item_id'
]
unique_item_features_df = pd.unique(pd.concat([raw_data_df[col] for col in item_features_columns], ignore_index=True))
unique_item_features_list = unique_item_features_df.tolist()

unique_item_features_list = []

for col in item_features_columns:
    # NAN二も対応すべくuser_ageのような、target+カラム名のようなケースも考慮する。
    unique_item_features_list.append(f"{col}")
    unique_values = raw_data_df[col].dropna().unique()
    for val in unique_values:
        unique_item_features_list.append(f"{col}_{val}")

data = list(zip(raw_data_df['user_id'], raw_data_df['item_id']))


In [18]:
def build_feature_data(df: pd.DataFrame, target_column_name: str) -> List[Tuple[int, Dict]]:
    """カテゴリ変数は0.5、連続変数は0-1に正規化して特徴量辞書を生成"""
    result = []
    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
    continuous_cols = df.select_dtypes(include=['int', 'float']).columns.tolist()
    # user_id, item_idは正規化する必要がないため除外（あれば）
    continuous_cols.remove(target_column_name)

    # user_id, item_idは正規化する必要がないため除外
    for col in ['user_id', 'item_id']:
        if col in continuous_cols:
            continuous_cols.remove(col)

    # 連値を0-1スケーリング
    if continuous_cols:
        scaler = MinMaxScaler()
        df[continuous_cols] = scaler.fit_transform(df[continuous_cols])

    for idx, row in df.iterrows():
        features = {}

        # カテゴリ変数
        for col in categorical_cols:
            val = row[col]
            if pd.notnull(val):
                features[f"{col}_{val}"] = 0.5

        # 連続値
        for col in continuous_cols:
            features[col] = row[col]
        id = row[target_column_name]

        result.append((id, features))
    return result

In [19]:
# ユーザー特徴量抽出
user_df = raw_data_df[user_features_columns + ['user_id']]
user_features_data = build_feature_data(user_df, 'user_id')

# アイテム特徴量抽出
item_df = raw_data_df[item_features_columns + ['item_id']]
item_features_data = build_feature_data(item_df, 'item_id')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[continuous_cols] = scaler.fit_transform(df[continuous_cols])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[continuous_cols] = scaler.fit_transform(df[continuous_cols])


In [20]:
import math

for user_id, feature_dict in item_features_data:
    for key, value in feature_dict.items():
        if isinstance(value, float) and math.isnan(value):
            print(f"user_id={user_id}, feature={key}, value=NaN")


In [21]:
from lightfm.data import Dataset

dataset = Dataset()

# 全てのユーザー、アイテムをリストアップ。
dataset.fit(users=all_user_ids, items=all_item_ids, user_features=unique_user_features_list, item_features=unique_item_features_list)
print(dataset)

# weightは不要のため＿として記述。
interactions, _ = dataset.build_interactions(data=data)
print('interactions: ', interactions)

user_features = dataset.build_user_features(user_features_data)
item_features = dataset.build_item_features(item_features_data)


<lightfm.data.Dataset object at 0x7f9838869570>
interactions:  <COOrdinate sparse matrix of dtype 'int32'
	with 100000 stored elements and shape (943, 1682)>
  Coords	Values
  (195, 241)	1
  (185, 301)	1
  (21, 376)	1
  (243, 50)	1
  (165, 345)	1
  (297, 473)	1
  (114, 264)	1
  (252, 464)	1
  (304, 450)	1
  (5, 85)	1
  (61, 256)	1
  (285, 1013)	1
  (199, 221)	1
  (209, 39)	1
  (223, 28)	1
  (302, 784)	1
  (121, 386)	1
  (193, 273)	1
  (290, 1041)	1
  (233, 1183)	1
  (118, 391)	1
  (166, 485)	1
  (298, 143)	1
  (290, 117)	1
  (307, 0)	1
  :	:
  (536, 442)	1
  (617, 627)	1
  (486, 290)	1
  (112, 974)	1
  (942, 390)	1
  (863, 684)	1
  (749, 322)	1
  (278, 63)	1
  (645, 749)	1
  (653, 369)	1
  (616, 581)	1
  (912, 689)	1
  (659, 228)	1
  (420, 497)	1
  (494, 1090)	1
  (805, 420)	1
  (675, 537)	1
  (720, 261)	1
  (912, 208)	1
  (377, 77)	1
  (879, 475)	1
  (715, 203)	1
  (275, 1089)	1
  (12, 224)	1
  (11, 202)	1


In [22]:
user_features_data

results = [entry for entry in user_features_data if entry[0] == 242]
print(results)

# 1件だけ欲しい場合は以下でも可
result = next((entry for entry in user_features_data if entry[0] == 196), None)

print(result)


[(242, {'user_gender_M': 0.5, 'user_occupation_educator': 0.5, 'user_zip_code_31404': 0.5, 'user_age': 0.3939393939393939}), (242, {'user_gender_M': 0.5, 'user_occupation_educator': 0.5, 'user_zip_code_31404': 0.5, 'user_age': 0.3939393939393939}), (242, {'user_gender_M': 0.5, 'user_occupation_educator': 0.5, 'user_zip_code_31404': 0.5, 'user_age': 0.3939393939393939}), (242, {'user_gender_M': 0.5, 'user_occupation_educator': 0.5, 'user_zip_code_31404': 0.5, 'user_age': 0.3939393939393939}), (242, {'user_gender_M': 0.5, 'user_occupation_educator': 0.5, 'user_zip_code_31404': 0.5, 'user_age': 0.3939393939393939}), (242, {'user_gender_M': 0.5, 'user_occupation_educator': 0.5, 'user_zip_code_31404': 0.5, 'user_age': 0.3939393939393939}), (242, {'user_gender_M': 0.5, 'user_occupation_educator': 0.5, 'user_zip_code_31404': 0.5, 'user_age': 0.3939393939393939}), (242, {'user_gender_M': 0.5, 'user_occupation_educator': 0.5, 'user_zip_code_31404': 0.5, 'user_age': 0.3939393939393939}), (242, {

In [23]:
print(user_features)
# data = user_features
# for datum in data:
#     print(datum)
#     foo = len(datum)
#     print(foo)

<Compressed Sparse Row sparse matrix of dtype 'float32'
	with 4715 stored elements and shape (943, 1826)>
  Coords	Values
  (0, 0)	0.002087418222799897
  (0, 943)	0.14624591171741486
  (0, 1006)	0.28388887643814087
  (0, 1011)	0.28388887643814087
  (0, 1145)	0.28388887643814087
  (1, 1)	0.007287987042218447
  (1, 943)	0.31492921710014343
  (1, 1007)	0.22592759132385254
  (1, 1019)	0.22592759132385254
  (1, 1250)	0.22592759132385254
  (2, 2)	0.010516253300011158
  (2, 943)	0.1376672238111496
  (2, 1006)	0.28393885493278503
  (2, 1009)	0.28393885493278503
  (2, 1284)	0.28393885493278503
  (3, 3)	0.023157894611358643
  (3, 943)	0.1431579291820526
  (3, 1006)	0.2778947353363037
  (3, 1011)	0.2778947353363037
  (3, 1282)	0.2778947353363037
  (4, 4)	0.0030080669093877077
  (4, 943)	0.20737437903881073
  (4, 1007)	0.2632058560848236
  (4, 1019)	0.2632058560848236
  (4, 1138)	0.2632058560848236
  :	:
  (938, 938)	0.01128590852022171
  (938, 943)	0.1591997891664505
  (938, 1007)	0.2765047550201

In [25]:
interactions = dataset.build_interactions(data)

user_features = dataset.build_user_features(user_features_data)
item_features = dataset.build_item_features(item_features_data)

In [26]:
from lightfm import LightFM

# モデルの作成
model = LightFM(no_components=100, loss="warp", random_state=123)

# 学習
res = model.fit(interactions=interactions, user_features=user_features, item_features=item_features)

res


AttributeError: 'tuple' object has no attribute 'tocoo'

In [None]:
from lightfm.cross_validation import random_train_test_split

# interactionsを引数にとり、
train_interactions, test_interactions = random_train_test_split(interactions=interactions, test_percentage=0.2)

print(train_interactions, test_interactions)

<COOrdinate sparse matrix of dtype 'float32'
	with 1 stored elements and shape (3, 3)>
  Coords	Values
  (2, 2)	1.0 <COOrdinate sparse matrix of dtype 'float32'
	with 1 stored elements and shape (3, 3)>
  Coords	Values
  (0, 1)	1.0


In [None]:
# modelのevaluation

from lightfm.evaluation import precision_at_k


precision_at_k = precision_at_k(model=model, test_interactions=test_interactions, train_interactions=train_interactions, k=10, user_features=all_user_features, item_features=all_item_features)

precision_at_k

AttributeError: 'list' object has no attribute 'tocsr'