In [None]:
%matplotlib inline
from __future__ import print_function

# Python 2/3 compatibility shim: when the built-in `xrange` does not exist
# (the lookup raises NameError), bind the name to `range` instead.
try:
    xrange
except NameError:
    xrange = range

import pandas as pd
import numpy as np
import scipy.spatial as sp
import scipy.sparse as sparse
import matplotlib.pyplot as plt
import scipy.sparse as sparse
from sklearn.decomposition import NMF, TruncatedSVD

<h2>レコメンデーション</h2>

レコメンデーションの最も一般的なアルゴリズムである協調フィルタリングを紹介します。
まずは簡単なダミーデータで説明していきます。

In [None]:
# Dummy 8x8 rating matrix: each row is one user, each column one item.
_dummy_rows = [
    [2, 5, 1, 1, 0, 1, 2, 4],
    [1, 5, 2, 1, 4, 0, 0, 3],
    [0, 3, 3, 0, 1, 1, 1, 1],
    [5, 2, 2, 3, 1, 0, 0, 4],
    [5, 3, 3, 4, 1, 0, 0, 5],
    [1, 4, 3, 2, 5, 1, 0, 1],
    [0, 0, 0, 0, 0, 0, 0, 2],
    [0, 4, 0, 0, 0, 0, 0, 0],
]
rating_matrix = np.array(_dummy_rows)

1行目のユーザーと似ているユーザーを探しましょう。

In [None]:
# Cosine similarity (= 1 - cosine distance) between the first user's row
# and every row of the rating matrix, including the first row itself.
first_user_row = rating_matrix[0]
user_similarity = np.array([
    1 - sp.distance.cosine(first_user_row, other_row)
    for other_row in rating_matrix
])
user_similarity

topNで指定した類似度の高いユーザーを探しましょう。

In [None]:
topN = 4
# Order the users from most to least similar, then skip the very first entry
# (the highest similarity, expected to be the target user compared with
# itself) and keep the next topN users.
most_similar_first = np.argsort(user_similarity)[::-1]
idx = most_similar_first[1:topN + 1]
selected_rating = rating_matrix[idx]
selected_user_similarity = user_similarity[idx]

どの商品をオススメするか？ここでは平均類似度を使ってみましょう。

In [None]:
# Similarity-weighted average score of every item over the selected users.
# The denominator (sum of the positive similarities) is the same for every
# column, so it is computed once up front.
similarity_sum = sum(selected_user_similarity[selected_user_similarity > 0])
avg_score = [
    sum(selected_rating[:, item_col] * selected_user_similarity) / similarity_sum
    for item_col in range(selected_rating.shape[1])
]
for item_no, score in enumerate(avg_score):
    print(str(item_no) + "th item  ", score)

上記だと具体的な商品名がないので、わかりにくいですよね（でも雰囲気は伝わりましたか？）。もう少し具体的なデータでやってみましょう。

<h2>某ニュースアプリのテーマのフォロー状況を模したデータで協調フィルタリング</h2>

datasetフォルダにあるuser_topic_follow_dummy.csvを読み込みましょう。

In [None]:
# Load the user/topic follow data from the dataset folder (UTF-8 encoded CSV).
data = pd.read_csv("dataset/user_topic_follow_dummy.csv", encoding="utf8")
# Drop fully duplicated rows in place, keeping the last occurrence of each.
# The frame is pivoted later in the notebook, and DataFrame.pivot raises on
# duplicate (index, column) pairs.
data.drop_duplicates(keep="last", inplace=True)
print(data.shape)
# Display the first rows as the cell output.
data.head()

この後、Pandasのpivotを使って、User x Itemの行列を作りますので、ratingの列を新たに作成し、1.0を格納しておきます。

In [None]:
# Every row gets a constant rating of 1.0; this column supplies the cell
# values when the frame is pivoted into the user x topic matrix.
data["rating"] = 1.0

実際に User x Itemの行列を作成します。

In [None]:
# Pivot the long-format data into a user x topic table (one row per user_id,
# one column per topic_name) and put 0 where the pivot left a cell empty.
rating_matrix = data.pivot(
    index="user_id",
    columns="topic_name",
    values="rating",
).fillna(0)

# Plain NumPy copies of the row labels, the column labels and the values.
user_list = np.array(rating_matrix.index)
topic_list = np.array(rating_matrix.columns)
rating_matrix_ar = np.array(rating_matrix)

さて、user_id=1の方に対するオススメトピックを探してみましょう。

In [None]:
# Rows of the first user in user_list; their topic names are the topics this
# user already follows. The filtered frame is shown as the cell output.
target_user_rows = data[data["user_id"] == user_list[0]]
already_followed_topic = np.array(target_user_rows["topic_name"])
target_user_rows

実際にユーザーごとの類似度を計算しましょう。

In [None]:
def get_cosine_similarity(x, y):
    """Return the cosine similarity (1 - cosine distance) of two 1-D vectors.

    Parameters
    ----------
    x, y : array-like
        1-D numeric vectors of equal length.

    Returns
    -------
    float
        The cosine similarity of ``x`` and ``y``. When either vector is all
        zeros the similarity is undefined (0/0); 0.0 is returned in that case
        instead of NaN, so the result can be sorted and used as a weight.
    """
    # A NaN similarity would sort to the front of `argsort()[::-1]` and then
    # poison every weighted average, so treat "no information" as 0.
    if not (np.any(x) and np.any(y)):
        return 0.0
    return 1 - sp.distance.cosine(x, y)

In [None]:
# Similarity of the target user (row 0 of the user x topic array) to every
# user, the target user included.
target_user_row = rating_matrix_ar[0]
user_similarity = np.array(
    [get_cosine_similarity(target_user_row, other_row) for other_row in rating_matrix_ar]
)

類似度の高いユーザーTopNを抽出します。

In [None]:
topN = 20
# Rank every user from most to least similar to the target user.
ranked_users = user_similarity.argsort()[::-1]
# The target user is row 0 and has similarity 1 with itself. Drop it by
# position (rather than assuming it sorts first, which ties could break) so
# that all topN slots go to *other* users, as in the dummy-data example.
idx = ranked_users[ranked_users != 0][:topN]
selected_user_similarity = user_similarity[idx]
selected_rating = rating_matrix_ar[idx]

平均類似度を計算しましょう。

In [None]:
# Similarity-weighted average score of every topic over the selected users.
# The denominator (sum of the positive similarities) does not depend on the
# column, so it is computed once.
similarity_sum = sum(selected_user_similarity[selected_user_similarity > 0])
avg_score = np.array([
    sum(selected_rating[:, topic_col] * selected_user_similarity) / similarity_sum
    for topic_col in range(selected_rating.shape[1])
])

そして、平均類似度の高い上位10テーマをオススメとして表示させます。

In [None]:
recommend_num = 10
# Topics ordered from the highest to the lowest average score.
topics_by_score = topic_list[avg_score.argsort()[::-1]]
counter = 0
for recommended_topic in topics_by_score:
    # Topics the target user already follows are not recommended again.
    if recommended_topic in already_followed_topic:
        continue
    print(recommended_topic)
    counter += 1
    if counter >= recommend_num:
        break
        

どうでしょうか？これを気に入ってくれそうですか？

<h2>次元削減を行ってみましょう</h2>

ここでは、以下2つの手法を紹介します。
* SVD
* Non Negative Matrix Factorization

<h3>特異値分解(Singular Value Decomposition)による次元圧縮を使ったレコメンデーション</h3>

まず、SVDを初期化します。

In [None]:
# Reduce to 10 latent components. TruncatedSVD uses a randomized solver by
# default, so the seed is fixed to make the factorization (and therefore the
# recommendations below) reproducible across runs.
svd = TruncatedSVD(n_components=10, random_state=0)

Scipy Sparse行列にしておきます。

In [None]:
# Wrap the dense user x topic array in a SciPy LIL-format sparse matrix.
rating_matrix_sparse = sparse.lil_matrix(rating_matrix_ar)

データを適用させます。

In [None]:
# Fit the truncated SVD on the sparse matrix and project every row (user)
# onto the fitted components, giving one reduced vector per user.
rating_matrix_svd = svd.fit_transform(rating_matrix_sparse)

あとは、前回と同じです。似ているユーザーを探して平均類似度の高いトピックをお勧めしましょう。

In [None]:
# Same similarity computation as before, but on the SVD-reduced vectors:
# compare the target user (row 0) with every user's reduced vector.
target_user_svd = rating_matrix_svd[0]
user_similarity = np.array(
    [get_cosine_similarity(target_user_svd, other_row) for other_row in rating_matrix_svd]
)

In [None]:
topN = 50
# Rank every user from most to least similar in the SVD space. The target
# user is row 0 and is trivially most similar to itself, so it is removed by
# position to leave all topN slots for *other* users.
ranked_users = user_similarity.argsort()[::-1]
idx = ranked_users[ranked_users != 0][:topN]
selected_user_similarity = user_similarity[idx]
# Scores are still computed on the original (unreduced) follow matrix.
selected_rating = rating_matrix_ar[idx]

# Similarity-weighted average score per topic. The denominator (sum of the
# positive similarities) is the same for every column, so compute it once.
similarity_sum = sum(selected_user_similarity[selected_user_similarity > 0])
avg_score = []
for col_idx in range(selected_rating.shape[1]):
    weight_score = sum(selected_rating[:, col_idx] * selected_user_similarity)
    avg_score.append(weight_score/similarity_sum)
avg_score = np.array(avg_score)

# Print the best-scoring topics the target user does not follow yet.
recommend_num = 10
counter = 0
for recommended_topic in topic_list[avg_score.argsort()[::-1]]:
    if recommended_topic not in already_followed_topic:
        print(recommended_topic)
        counter +=1
        if recommend_num <= counter:
            break

<h3>非負値行列分解(Non Negative Matrix Factorization)による次元圧縮を使ったレコメンデーション</h3>

In [None]:
# Factorize into 10 non-negative components. The seed is fixed so that the
# initialization, and therefore the resulting factors and recommendations,
# are reproducible across runs.
nmf = NMF(n_components=10, random_state=0)

In [None]:
# Fit the NMF model on the sparse matrix and return the transformed data:
# one non-negative reduced vector per row (user).
rating_matrix_nmf = nmf.fit_transform(rating_matrix_sparse)

In [None]:
# Similarity of the target user (row 0) to every user in the NMF space.
# Both sides of the comparison must come from rating_matrix_nmf: comparing an
# NMF vector against SVD vectors mixes two unrelated latent spaces and gives
# meaningless similarities.
user_similarity = []
target_user_nmf = rating_matrix_nmf[0]
for row in rating_matrix_nmf:
    sim = get_cosine_similarity(target_user_nmf, row)
    user_similarity.append(sim)
user_similarity = np.array(user_similarity)

In [None]:
topN = 50
# Rank every user from most to least similar in the NMF space. The target
# user is row 0 and is trivially most similar to itself, so it is removed by
# position to leave all topN slots for *other* users.
ranked_users = user_similarity.argsort()[::-1]
idx = ranked_users[ranked_users != 0][:topN]
selected_user_similarity = user_similarity[idx]
# Scores are still computed on the original (unreduced) follow matrix.
selected_rating = rating_matrix_ar[idx]

# Similarity-weighted average score per topic. The denominator (sum of the
# positive similarities) is the same for every column, so compute it once.
similarity_sum = sum(selected_user_similarity[selected_user_similarity > 0])
avg_score = []
for col_idx in range(selected_rating.shape[1]):
    weight_score = sum(selected_rating[:, col_idx] * selected_user_similarity)
    avg_score.append(weight_score/similarity_sum)
avg_score = np.array(avg_score)

# Print the best-scoring topics the target user does not follow yet.
recommend_num = 10
counter = 0
for recommended_topic in topic_list[avg_score.argsort()[::-1]]:
    if recommended_topic not in already_followed_topic:
        print(recommended_topic)
        counter +=1
        if recommend_num <= counter:
            break