In [1]:
import pandas as pd

movies = pd.read_csv(
    "ml-1m/movies.dat",
    sep="::",
    encoding="latin1",
    engine="python",
    header=None,
    names=["movie_id", "title", "genres"],
)
ratings = pd.read_csv(
    "ml-1m/ratings.dat",
    sep="::",
    encoding="latin1",
    engine="python",
    header=None,
    names=["user_id", "movie_id", "rating", "date"],
)
users = pd.read_csv(
    "ml-1m/users.dat",
    sep="::",
    encoding="latin1",
    engine="python",
    header=None,
    names=["user_id", "gender", "age", "occupation", "zip"],
)

In [None]:
# ref : https://recruit.gmo.jp/engineer/jisedai/blog/python_movie_recommendation/
# ref : https://recruit.gmo.jp/engineer/jisedai/blog/movielens_fmm/

# 公開年の取り出し
movies["release"] = (
    movies["title"].str.findall(r"\((\d{4})\)$").apply(lambda x: x[0]).astype(int)
)
# ジャンルのリスト化
movies["genres"] = movies["genres"].str.split("|")

# UNIX時間→日付へ
ratings["date"] = pd.to_datetime(ratings["date"], unit="s")

# ageカテゴリとoccupation（職業）カテゴリの変換
age_map = {
    1: " -18",
    18: "18-24",
    25: "25-34",
    35: "35-44",
    45: "45-49",
    50: "50-55",
    56: "56+",
}
occupation_map = {
    0: "other",
    1: "academic/educator",
    2: "artist",
    3: "clerical/admin",
    4: "college/grad student",
    5: "customer service",
    6: "doctor/health care",
    7: "executive/managerial",
    8: "farmer",
    9: "homemaker",
    10: "K-12 student",
    11: "lawyer",
    12: "programmer",
    13: "retired",
    14: "sales/marketing",
    15: "scientist",
    16: "self-employed",
    17: "technician/engineer",
    18: "tradesman/craftsman",
    19: "unemployed",
    20: "writer",
}
users["age"] = users["age"].map(age_map)
users["occupation"] = users["occupation"].map(occupation_map)

In [3]:
from sklearn.model_selection import train_test_split


def split_dataframe(df, bool_seires):
    return (
        df[bool_seires].reset_index(drop=True),
        df[~bool_seires].reset_index(drop=True),
    )


# users     : ランダムに選んだ9割のユーザー
# users_new : ランダムに選んだ残り１割のユーザー
users, users_new = train_test_split(users, train_size=0.1, random_state=0)

# movies     : 1998年までの映画データ
# movies_new : 1999年以降の映画データ
movies, movies_new = split_dataframe(movies, movies["release"] < 1999)

# ratings     : ランダムに選んだ9割のユーザー　かつ　 1998年までの映画の評価データ
# ratings_new : ランダムに選んだ残り１割のユーザーの評価データ、1999年以降の映画評価データ
ratings, ratings_new = split_dataframe(
    ratings,
    (~ratings["user_id"].isin(users_new["user_id"]))
    & (~ratings["movie_id"].isin(movies_new["movie_id"])),
)

In [4]:
# interactionsの重みが違う場合
interactions = [
    ("user_A", "item_X", 0),
    ("user_A", "item_Y", 5),
    ("user_A", "item_Z", 1),
    ("user_B", "item_X", 1),
    ("user_C", "item_Y", -2),
]
# interactionsの重みが全て同じ場合
interactions = [
    ("user_A", "item_X"),
    ("user_A", "item_Y"),
    ("user_A", "item_Z"),
    ("user_B", "item_X"),
    ("user_C", "item_Y"),
]

In [5]:
# user_featuresが2値のみの場合
user_features = (
    ["user_A", ["user_feat1", "user_feat2"]],
    ["user_B", ["user_feat3", "user_feat4", "user_feat2"]],
    ["user_C", ["user_feat1", "user_feat4"]],
)
# user_featuresに連続値を含む場合
user_features = (
    ["user_A", {"user_feat1":1, "user_feat2":2}],
    ["user_B", {"user_feat3":1, "user_feat4":0.5, "user_feat2":1}],
    ["user_C", {"user_feat1":1, "user_feat4":10}],
)

In [6]:
# items_featuresが2値のみの場合
items_features = (
    ["item_X", ["item_feat1"]],
    ["item_Y", ["item_feat2", "item_feat3", "item_feat4"]],
    ["item_Z", ["item_feat1", "item_feat3", "item_feat4"]]
)

# items_featuresが連続値を含む場合
items_features = (
    ["item_X", {"item_feat1": 1}],
    ["item_Y", {"item_feat2":1, "item_feat3":2, "item_feat4":3}],
    ["item_Z", {"item_feat1":1, "item_feat3":6, "item_feat4":0.1}],
)

In [None]:
import itertools

import numpy as np
from lightfm.data import Dataset


def lightfm_data(rating, items, users, dataset=Dataset()):
    def get_item_features(item: pd.DataFrame):
        item_features = []
        for genres, release in zip(item["genres"], item["release"]):
            features = {value: 1 for value in genres}
            features.update({"release": release})
            item_features += [features]

        uq_item_features = np.unique(list(itertools.chain.from_iterable(item_features)))
        item_features = list(zip(item["movie_id"], item_features))
        uq_items = np.unique(item["movie_id"])

        return item_features, uq_items, uq_item_features

    def get_user_features(users: pd.DataFrame):
        user_features = [
            [row["user_id"], [row["gender"], row["age"], row["occupation"]]]
            for _, row in users.iterrows()
        ]

        uq_user_features = np.unique(users[["gender", "age", "occupation"]].unstack())
        uq_users = np.unique(users["user_id"])

        return user_features, uq_users, uq_user_features

    user_features, uq_users, uq_user_features = get_user_features(users)
    item_features, uq_items, uq_item_features = get_item_features(items)

    dataset.fit_partial(
        uq_users,
        uq_items,
        item_features=uq_item_features,
        user_features=uq_user_features,
    )

    interactions, weights = dataset.build_interactions(
        list(zip(rating["user_id"], rating["movie_id"], rating["rating"]))
    )

    user_features = dataset.build_user_features(user_features)
    item_features = dataset.build_item_features(item_features)

    return interactions, weights, user_features, item_features, dataset


interactions, weights, user_features, item_features, dataset = lightfm_data(
    ratings, movies, users
)
user_id_map, user_feature_map, item_id_map, item_feature_map = dataset.mapping()

In [22]:
user_feature_map

{np.int64(1): 0,
 np.int64(22): 1,
 np.int64(25): 2,
 np.int64(26): 3,
 np.int64(64): 4,
 np.int64(94): 5,
 np.int64(95): 6,
 np.int64(100): 7,
 np.int64(101): 8,
 np.int64(113): 9,
 np.int64(127): 10,
 np.int64(137): 11,
 np.int64(147): 12,
 np.int64(152): 13,
 np.int64(154): 14,
 np.int64(161): 15,
 np.int64(165): 16,
 np.int64(169): 17,
 np.int64(175): 18,
 np.int64(181): 19,
 np.int64(198): 20,
 np.int64(200): 21,
 np.int64(202): 22,
 np.int64(208): 23,
 np.int64(210): 24,
 np.int64(217): 25,
 np.int64(238): 26,
 np.int64(257): 27,
 np.int64(276): 28,
 np.int64(281): 29,
 np.int64(282): 30,
 np.int64(293): 31,
 np.int64(308): 32,
 np.int64(330): 33,
 np.int64(335): 34,
 np.int64(344): 35,
 np.int64(376): 36,
 np.int64(401): 37,
 np.int64(405): 38,
 np.int64(424): 39,
 np.int64(431): 40,
 np.int64(434): 41,
 np.int64(438): 42,
 np.int64(450): 43,
 np.int64(461): 44,
 np.int64(470): 45,
 np.int64(498): 46,
 np.int64(505): 47,
 np.int64(509): 48,
 np.int64(526): 49,
 np.int64(538): 50

In [9]:
from lightfm import LightFM

model = LightFM(no_components=100, loss="bpr", random_state=123)
# 学習時間は約2分
model.fit(
    interactions=interactions,
    sample_weight=weights,
    item_features=item_features,
    user_features=user_features,
    epochs=100,
    num_threads=2,
    verbose=True,
)

Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29
Epoch 30
Epoch 31
Epoch 32
Epoch 33
Epoch 34
Epoch 35
Epoch 36
Epoch 37
Epoch 38
Epoch 39
Epoch 40
Epoch 41
Epoch 42
Epoch 43
Epoch 44
Epoch 45
Epoch 46
Epoch 47
Epoch 48
Epoch 49
Epoch 50
Epoch 51
Epoch 52
Epoch 53
Epoch 54
Epoch 55
Epoch 56
Epoch 57
Epoch 58
Epoch 59
Epoch 60
Epoch 61
Epoch 62
Epoch 63
Epoch 64
Epoch 65
Epoch 66
Epoch 67
Epoch 68
Epoch 69
Epoch 70
Epoch 71
Epoch 72
Epoch 73
Epoch 74
Epoch 75
Epoch 76
Epoch 77
Epoch 78
Epoch 79
Epoch 80
Epoch 81
Epoch 82
Epoch 83
Epoch 84
Epoch 85
Epoch 86
Epoch 87
Epoch 88
Epoch 89
Epoch 90
Epoch 91
Epoch 92
Epoch 93
Epoch 94
Epoch 95
Epoch 96
Epoch 97
Epoch 98
Epoch 99


<lightfm.lightfm.LightFM at 0x7f0a5a0d6440>

In [10]:
predictions = model.predict(user_ids=0, item_ids=[0, 1, 2])
print(predictions)
# [101.37569   48.311745  21.090853 -35.42724 ]

[-6.1013274 -6.678758  -6.7594676]


In [11]:
import itertools

# 予測したいユーザーインデックス:[アイテムインデックス]
users_items = {1: [0, 1], 2: [3, 4, 5]}

input_users = list(users_items.keys())
input_item_n = [len(item) for item in users_items.values()]

input_user_ids = np.repeat(input_users, input_item_n)
input_item_ids = np.array(
    list(itertools.chain.from_iterable(list(users_items.values())))
)

print(input_user_ids)
# [1 1 2 2 2]
print(input_item_ids)
# [0 1 3 4 5]

[1 1 2 2 2]
[0 1 3 4 5]


In [12]:
predictions = model.predict(user_ids = input_user_ids, item_ids=input_item_ids)
print(predictions)
# [338.07364  158.04309  -26.784586  10.424532  24.438726]

[-14.801998  -15.494138   -7.3008375  -7.323372   -7.118072 ]


In [13]:
predictions = np.split(predictions, list(itertools.accumulate(input_item_n))[:-1])
print(predictions)
# [array([338.07364, 158.04309], dtype=float32), 
# array([-26.784586,  10.424532,  24.438726], dtype=float32)]

[array([-14.801998, -15.494138], dtype=float32), array([-7.3008375, -7.323372 , -7.118072 ], dtype=float32)]


In [14]:
# ref : https://nnkkmto.hatenablog.com/entry/2020/12/21/193616
from scipy.sparse import csr_matrix


def convert_csr_features(features, features_map):
    row = np.repeat(range(len(features)), [len(value) for value in features])
    # 連続値
    if type(features[0]) == dict:
        col = []
        data = []
        for dict_value in features:
            col += [features_map[key] for key in dict_value.keys()]
            data += dict_value.values()
    # 2値
    else:
        col = np.array(
            [features_map[key] for key in itertools.chain.from_iterable(features)]
        )
        data = np.repeat(1, len(col))

    csr_features = csr_matrix(
        (data, (row, col)), shape=(len(features), len(features_map))
    )
    return csr_features


# 予測したいユーザー特徴量
pridict_user_features = [["25-34", "M", "programmer"], ["18-24", "F"], ["writer"]] # scientistが存在しなかったためwriterへと変更。
# 予測したいアイテムインデックス
pridict_item_list = [[0, 1, 2], [4, 5], [7]]

input_item_n = [len(item) for item in pridict_item_list]
# csr_matrixに変換する
new_user_features_csr = convert_csr_features(pridict_user_features, user_feature_map)

input_user_ids = np.repeat(range(new_user_features_csr.shape[0]), input_item_n)
input_item_ids = np.array(list(itertools.chain.from_iterable(list(pridict_item_list))))

print(input_user_ids)
# [0 0 0 1 1 2]
print(input_item_ids)
# [0 1 2 4 5 7]
new_user_features_csr
# <3x634 sparse matrix of type '<class 'numpy.int64'>'
# 	with 6 stored elements in Compressed Sparse Row format>

[0 0 0 1 1 2]
[0 1 2 4 5 7]


<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 6 stored elements and shape (3, 634)>

In [15]:
predictions = model.predict(
    user_ids=input_user_ids,
    item_ids=input_item_ids,
    user_features=new_user_features_csr,
)
print(predictions)
# [22397.225    11281.854     5412.0024    3601.626     4415.794     -97.393456]

[-507.99185 -520.1732  -520.05084 -265.1837  -269.09048  -70.58545]


In [None]:
ranking_predictions = model.predict(
    user_ids=input_user_ids,
    item_ids=input_item_ids,
    user_features=new_user_features_csr,
)
print(ranking_predictions)

In [16]:
# 予測したいユーザーインデックス
pridict_item_list = [1, 10, 10, 100]
# 予測するアイテム特徴量
pridict_item_features = [
    {"Animation": 1, "Comedy": 1, "Action": 1},
    {"Horror": 1},
    {"Action": 1},
    {"release": 1995},
]

# csr_matrixに変換する
new_item_features_csr = convert_csr_features(pridict_item_features, item_feature_map)

input_item_n = [len(item) for item in pridict_item_features]

input_user_ids = pridict_item_list
input_item_ids = list(input_item_ids)

predictions = model.predict(
    user_ids=input_user_ids,
    item_ids=input_item_ids,
    item_features=new_item_features_csr,
)
predictions
# array([  681.6143 ,   121.10423,   452.80695, 10841.821  ], dtype=float32)

ValueError: Expected the number of user IDs (4) to equal the number of item IDs (6)

In [None]:
# ランキング
