dataフォルダがない場合に作成する処理を追加

In [1]:
import os
from urllib.request import urlopen

# MovieLensのサイトから、Zipファイルを取得し、ローカルに保存します.

# dataフォルダがない場合に作成する処理を追加
file_name = "data/ml-100k.zip"
if not os.path.exists(os.path.dirname(file_name)):
    os.makedirs(os.path.dirname(file_name))

if not os.path.exists("data/ml-100k.zip"):
    url = "http://files.grouplens.org/datasets/movielens/ml-100k.zip"
    with urlopen(url) as res:
        with open("data/ml-100k.zip", "wb") as f:
            f.write(res.read())
    # Zipファイルを解凍します.
    from shutil import unpack_archive
    unpack_archive("data/ml-100k.zip", "data/", "zip")

In [2]:
import numpy as np
import pandas as pd
udata = pd.read_csv("data/ml-100k/u1.base", delimiter="\t", names=("user", "movie", "rating", "timestamp"))
udata.tail()

Unnamed: 0,user,movie,rating,timestamp
79995,943,1067,2,875501756
79996,943,1074,4,888640250
79997,943,1188,3,888640250
79998,943,1228,3,888640275
79999,943,1330,3,888692465


pivotメソッドを使用

In [3]:
high_rate = udata.loc[udata["rating"] >= 3]
# movieを行, columnsを列にした後、欠損部分を0埋め
raw = high_rate.pivot(index="movie", columns="user", values="rating")
df = raw.fillna(0)
# whereメソッドはわかりにくいですが、以下で3未満以外(つまり3以上)を1で埋めて返します
df = df.where(df < 3, 1)

In [4]:
df.head()

user,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
movie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# 上記の雰囲気で、総当たりで全アイテムの距離を計算する.
from scipy.spatial.distance import pdist
d = pdist(df, "cosine")
# 類似度 = 1 - コサイン距離
d = 1 - d

# 結果を行列に変換します（上記だとベクトルで見辛い！！）
from scipy.spatial.distance import squareform
d = squareform(d)
# nan ができるので、0に補正します.
d[np.isnan(d)] = 0

# ここでちょっとしたトリックで、自分自身は「-1」に補正して、類似度を最低にします.
d = d - np.eye(d.shape[0])

# 表示してみる.
print(d)

[[-1.          0.32520786  0.27172635 ...,  0.          0.05322463
   0.05322463]
 [ 0.32520786 -1.          0.20689728 ...,  0.          0.10910895
   0.10910895]
 [ 0.27172635  0.20689728 -1.         ...,  0.          0.          0.14586499]
 ..., 
 [ 0.          0.          0.         ..., -1.          0.          0.        ]
 [ 0.05322463  0.10910895  0.         ...,  0.         -1.          0.        ]
 [ 0.05322463  0.10910895  0.14586499 ...,  0.          0.         -1.        ]]


例えば、映画ID=1に類似する映画を、類似度の高い順に並べてみます。

メモリ効率や速度を考え、Numpyを使います。

In [6]:
movie_id = 0

# 評価の良い順に並べます.
# ソート後のインデックスを収納
id = d[movie_id].argsort()[::-1]

# 最初の10件を表示してみます.
for i in id[:10]:
    print("{i:0>3d}: {v: .3f}".format(i=i, v=d[movie_id, i]))

049:  0.628
180:  0.602
120:  0.564
116:  0.559
221:  0.545
403:  0.539
256:  0.530
236:  0.530
006:  0.528
150:  0.519


レコメンドの関数

In [7]:
# 指定したユーザーへレコメンドするアイテムを10個出力する関数
def get_recommend_items(user_id):
    # 高く評価した映画のリストを取得
    favorite = df.loc[:, user_id].nonzero()
    # 評価済み映画のリストを取得
    reviewed = raw[raw.loc[:, user_id].notnull()].index.tolist()
    # 評価テーブルから、高評価の行を取り出す
    table = d[favorite]
    # 列ごとに類似度を合計
    indicator = table.sum(axis=0)
    # ソート
    sorted_id = indicator.argsort()[::-1]
    # 評価済みを削除
    recommend_id = [i for i in sorted_id if i not in reviewed]
    # 10件だけ返す
    return recommend_id[:10]

# 試しにUser_ID=100の人
recommends = get_recommend_items(100)
print(recommends)

[312, 301, 306, 331, 314, 287, 742, 299, 740, 329]


テスト

In [8]:
utest = pd.read_csv("data/ml-100k/u1.test", delimiter="\t", names=("user", "movie", "rating", "timestamp"))
utest.head()

Unnamed: 0,user,movie,rating,timestamp
0,1,6,5,887431973
1,1,10,3,875693118
2,1,12,5,878542960
3,1,14,5,874965706
4,1,17,3,875073198


In [9]:
high_rate_test = utest.loc[udata["rating"] >= 3]
raw_test = high_rate_test.pivot(index="movie", columns="user", values="rating")
df_test = raw_test.fillna(0)
df_test = df_test.where(df_test < 3, 1)

In [10]:
# 試しに、userId=1の人でテスト.
user_id = 1
real = set(df_test.loc[:, user_id].nonzero()[0])
recommends = set(get_recommend_items(user_id))
real & recommends

{81, 97}

In [11]:
users = df_test.columns
all = len(users)
good = 0

for user_id in users:
    real = set(df_test.loc[:, user_id].nonzero()[0])
    recommends = set(get_recommend_items(user_id))
    matches = real & recommends
    good += 1 if matches else 0

print("全件={0}, 成功数={1}, 成功率={2}%".format(all, good, good * 100 // all))

全件=458, 成功数=338, 成功率=73%
