# 使用するライブラリのインポート

In [19]:
import pandas as pd
import numpy as np 
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors


import scipy.sparse
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import LabelEncoder
import joblib
from tqdm import tqdm

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

# データの読み込み

In [20]:
# Load the movie table and the review table from local pickle files.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
movie = pd.read_pickle('./data/movie.pickle')
ratings = pd.read_pickle('./data/review.pickle')

# 前処理

In [21]:
# Keep only movies whose review count (`number_of_revier`) is greater than 10.
# Rows with a missing count fail this comparison and are dropped here as well.
movie = movie[movie['number_of_revier'] > 10]
# Remove rows with a missing mean score or count first, then renumber the rows
# so the resulting index is contiguous.
movie = movie.dropna(subset=['mean_review_point', 'number_of_revier']).reset_index(drop=True)

# Recode a point of 0 as 10. Assign the result back instead of calling
# `replace(..., inplace=True)` on an attribute-accessed column, which may act on
# a temporary copy and silently do nothing under pandas Copy-on-Write.
# NOTE(review): presumably 0 encodes a "10" on the source site -- confirm.
ratings['point'] = ratings['point'].replace({0: 10})

print(len(movie))
print(len(ratings))

7974
452459


# マージ

In [22]:
# Join every review with the metadata of the movie it refers to.
# Columns that exist in both frames get the `_user` suffix on the ratings side.
mergeddf = pd.merge(
    ratings,
    movie,
    on='movie_id',
    suffixes=['_user', ''],
)
# Preview the first five rows of the merged frame.
mergeddf.head()

Unnamed: 0,user_id,movie_id,movie_tile,movie_comment,point,mean_review_point,number_of_revier,screening_time,genre,movie_title
0,20001,25942,シン・エヴァンゲリオン劇場版：||,劇場で見てさらにAmazonPrimeで再確認しました。点数は劇場版で見た際のメモの点数のま...,8,6.84,44.0,155.0,アクション|ＳＦ|アニメ|シリーズもの|ＴＶの映画化,シン・エヴァンゲリオン劇場版：||
1,20202,25942,シン・エヴァンゲリオン劇場版：||,子供たちに誘われて一家総出で見に行ったんですけどね。こんなの見終わってから家族でどういう会話...,7,6.84,44.0,155.0,アクション|ＳＦ|アニメ|シリーズもの|ＴＶの映画化,シン・エヴァンゲリオン劇場版：||
2,20434,25942,シン・エヴァンゲリオン劇場版：||,Ｑを復習せず鑑賞。前半はシンジの鬱にイライラ。ひたすらウザい。それに対し周りの人間が優しすぎ...,6,6.84,44.0,155.0,アクション|ＳＦ|アニメ|シリーズもの|ＴＶの映画化,シン・エヴァンゲリオン劇場版：||
3,20857,25942,シン・エヴァンゲリオン劇場版：||,ほんとに終わったのね。序盤の鬱シンジ、もう笑えたわ、なんか。あと中身28歳見た目14歳アスカ...,8,6.84,44.0,155.0,アクション|ＳＦ|アニメ|シリーズもの|ＴＶの映画化,シン・エヴァンゲリオン劇場版：||
4,20925,25942,シン・エヴァンゲリオン劇場版：||,概ね旧劇場版からさらに踏み込んだ補完版という感じでしたが、こういう落とし方が無難でしたかね。...,8,6.84,44.0,155.0,アクション|ＳＦ|アニメ|シリーズもの|ＴＶの映画化,シン・エヴァンゲリオン劇場版：||


In [23]:
# Reduce the frame to the (user, movie, point) triple and keep only the first
# review for each (user, movie) pair.
mergeddf = (
    mergeddf
    .loc[:, ['user_id', 'movie_id', 'point']]
    .drop_duplicates(subset=['user_id', 'movie_id'])
)

# Preview the first five rows.
mergeddf.head()

Unnamed: 0,user_id,movie_id,point
0,20001,25942,8
1,20202,25942,7
2,20434,25942,6
3,20857,25942,8
4,20925,25942,8


# ラベルエンコーディング

In [24]:
# Fit one encoder per id column, then swap the raw ids for their integer codes.
user_enc = LabelEncoder().fit(mergeddf["user_id"])
movie_enc = LabelEncoder().fit(mergeddf["movie_id"])
mergeddf = mergeddf.assign(
    user_id=user_enc.transform(mergeddf["user_id"]),
    movie_id=movie_enc.transform(mergeddf["movie_id"]),
)

In [25]:
# Preview the first five rows after label encoding.
mergeddf.head()

Unnamed: 0,user_id,movie_id,point
0,0,7961,8
1,111,7961,7
2,244,7961,6
3,508,7961,8
4,552,7961,8


In [26]:
# Persist the fitted movie-id encoder to disk. Only `movie_enc` is saved here; `user_enc` is not.
joblib.dump(movie_enc, "./encoder/movie_encoder.pkl")

['./encoder/movie_encoder.pkl']

# データフレームのピボット

## Movie × User

In [27]:
# Movie x user table of points; a (movie, user) pair without a review becomes 0.
movie_pivot = (
    mergeddf
    .pivot(index='movie_id', columns='user_id', values='point')
    .fillna(0)
)
# Same data as a CSR sparse matrix, the format handed to the scikit-learn models below.
movie_pivot_sparse = csr_matrix(movie_pivot.to_numpy())

In [28]:
# Show the first 10 rows of movie_pivot.
movie_pivot.head(10)

user_id,0,1,2,3,4,5,6,7,8,9,...,4064,4065,4066,4067,4068,4069,4070,4071,4072,4073
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,7.0,7.0,...,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,10.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## User × Movie

In [29]:
# User x movie table of points (the transposed orientation of movie_pivot);
# a (user, movie) pair without a review becomes 0.
user_pivot = (
    mergeddf
    .pivot(index='user_id', columns='movie_id', values='point')
    .fillna(0)
)
user_pivot_sparse = csr_matrix(user_pivot.to_numpy())

In [30]:
# Show the first 10 rows of user_pivot.
user_pivot.head(10)

movie_id,0,1,2,3,4,5,6,7,8,9,...,7964,7965,7966,7967,7968,7969,7970,7971,7972,7973
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,5.0,6.0,0.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,6.0,7.0,6.0,8.0,8.0,6.0,0.0,9.0,9.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,...,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0


## matrix

In [31]:
# Build the user x movie review matrix directly in sparse form:
# entry (user code, movie code) holds that user's point for that movie.
n_users, n_movies = (mergeddf[col].nunique() for col in ("user_id", "movie_id"))
matrix = scipy.sparse.csr_matrix(
    (mergeddf["point"], (mergeddf["user_id"], mergeddf["movie_id"])),
    shape=(n_users, n_movies),
)

In [32]:
# Replace the raw movie ids with the encoder's integer codes so that `movie`
# uses the same ids as `mergeddf` / `movie_pivot`.
# NOTE: this overwrites `movie["movie_id"]`, so the cell must not be re-run
# without re-running the cells that load and filter `movie`.
movie["movie_id"] = movie_enc.transform(movie.movie_id)
# Ids that appear in only one of the two tables (movie metadata vs. pivot rows).
l1_l2_sym_diff = set(movie.movie_id.values) ^ set(movie_pivot.index.values)

# Keep the metadata rows whose id is present in both tables, drop exact
# duplicate rows, and order the rows exactly like the rows of `movie_pivot`.
# A single vectorised filter replaces the former row-by-row `DataFrame.append`
# loop (`append` was removed in pandas 2.0 and the loop was quadratic).
movie_new = (
    movie[~movie["movie_id"].isin(l1_l2_sym_diff)]
    .drop_duplicates()
    .set_index("movie_id")
    .reindex(movie_pivot.index, axis=0)
)

In [33]:
# Preview the metadata table that is row-aligned with movie_pivot.
movie_new.head()

Unnamed: 0_level_0,mean_review_point,number_of_revier,screening_time,genre,movie_title
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,5.6,57.0,161.0,ドラマ|伝記もの|ロマンス|小説の映画化,愛と哀しみの果て
1,6.78,169.0,124.0,ドラマ|青春もの|ロマンス,愛と青春の旅だち
2,6.56,80.0,132.0,ドラマ|シリーズもの|ロマンス|小説の映画化,愛と追憶の日々
3,6.94,34.0,129.0,ドラマ|伝記もの,愛は霧のかなたに
4,7.06,35.0,119.0,ドラマ|ラブストーリー|戯曲（舞台劇）の映画化,愛は静けさの中に


In [34]:
# movie_new.to_pickle('./data/movie_drop_duplicates.pickle')

# Recommendationモデルの構築

knnスコア、主成分分析スコア、k-meansクラスタを統合的に判断する

## K-meansクラスタリング

In [37]:
from sklearn.cluster import KMeans

# Cluster the movies by their rating vectors (rows of the movie x user matrix).
n_clusters = 3
clusterer = KMeans(n_clusters=n_clusters, random_state=30)
clusterer.fit(movie_pivot_sparse)
# cluster_centers_ holds the centroid coordinates of each cluster.
centers = clusterer.cluster_centers_
# Cluster label assigned to each movie row.
c_preds = clusterer.predict(movie_pivot_sparse)

In [39]:
# Persist the fitted k-means model; the file name carries the number of clusters.
joblib.dump(clusterer, f"./model_kmeans/kmeans_{n_clusters}.pkl")

['./model_kmeans/kmeans_3.pkl']

In [40]:
# u_size, counts = np.unique(c_preds, return_counts=True)

In [41]:
# Attach each movie's k-means cluster label to a copy of the metadata table.
movie_recommend_addcluster = movie_new.assign(cluster=c_preds)
movie_recommend_addcluster.head(10)

Unnamed: 0_level_0,mean_review_point,number_of_revier,screening_time,genre,movie_title,cluster
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,5.6,57.0,161.0,ドラマ|伝記もの|ロマンス|小説の映画化,愛と哀しみの果て,1
1,6.78,169.0,124.0,ドラマ|青春もの|ロマンス,愛と青春の旅だち,0
2,6.56,80.0,132.0,ドラマ|シリーズもの|ロマンス|小説の映画化,愛と追憶の日々,0
3,6.94,34.0,129.0,ドラマ|伝記もの,愛は霧のかなたに,1
4,7.06,35.0,119.0,ドラマ|ラブストーリー|戯曲（舞台劇）の映画化,愛は静けさの中に,1
5,5.84,115.0,108.0,アクション|ドラマ|サスペンス|犯罪もの|リメイク,アサシン(1993),1
6,5.78,148.0,100.0,ホラー|コメディ|ファンタジー|シリーズもの|ＴＶの映画化|漫画の映画化,アダムス・ファミリー(1991),0
7,6.32,94.0,100.0,ホラー|コメディ|ファンタジー|シリーズもの|ＴＶの映画化|漫画の映画化,アダムス・ファミリー２,0
8,6.45,87.0,101.0,ドラマ|コメディ|実話もの|ロマンス,あなたに降る夢,0
9,6.61,130.0,140.0,ドラマ|サスペンス|ＳＦ|アドベンチャー|ファンタジー,アビス,0


In [42]:
# movie_pivot_add_cluster = movie_pivot.copy()
# movie_pivot_add_cluster['cluster'] = c_preds
# movie_pivot_add_cluster.head(10)

## 主成分分析によるスコア

In [52]:
# Train one TruncatedSVD model per rank on the user x movie matrix and save each.
# `random_state` is fixed because the default solver is randomized; without it
# the saved models (and every score derived from them) change between runs.
# After the loop `model_SVD` is the last model trained (n_components=2000),
# which is the one the following cells use.
for n_components in tqdm([10, 20, 30, 100, 200, 500, 512, 1000, 2000]):
    model_SVD = TruncatedSVD(n_components=n_components, random_state=30)
    model_SVD.fit(matrix)
    joblib.dump(model_SVD, f"./model_svd/svd_{n_components}.pkl")

100%|█████████████████████████████████████████████| 9/9 [00:23<00:00,  2.66s/it]


In [44]:
# Report how much variance the current SVD model retains and how many components it has.
cumulative_ratio = sum(model_SVD.explained_variance_ratio_)
print('保たれている情報:累積寄与率: {0}'.format(cumulative_ratio))
n_kept_components = model_SVD.components_.shape[0]
print('主成分の数: ', n_kept_components)

保たれている情報:累積寄与率: 0.9876305985091879
主成分の数:  2000


In [45]:
# Title of the movie to base recommendations on, and its encoded id
# (the first index label in movie_new whose title matches).
Movie = '言の葉の庭'
Movie_ID = movie_new.index[movie_new["movie_title"] == Movie][0]

In [49]:
# One-row preference vector over all movie columns of the SVD model:
# zero everywhere except a point of 10 on the selected movie.
n_movie_features = model_SVD.components_.shape[1]
pref = np.zeros((1, n_movie_features))
pref[0, Movie_ID] = 10

In [50]:
# Project the preference vector into the SVD space and back (matrix product
# with the components) to obtain one reconstructed score per movie.
score = (model_SVD.transform(pref) @ model_SVD.components_).ravel()
movie_recommend_addcluster_svd = movie_recommend_addcluster.assign(svd_score=score)

In [51]:
# Display the table with the cluster label and the SVD score.
movie_recommend_addcluster_svd

Unnamed: 0_level_0,mean_review_point,number_of_revier,screening_time,genre,movie_title,cluster,svd_score
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,5.60,57.0,161.0,ドラマ|伝記もの|ロマンス|小説の映画化,愛と哀しみの果て,1,0.033755
1,6.78,169.0,124.0,ドラマ|青春もの|ロマンス,愛と青春の旅だち,0,-0.025457
2,6.56,80.0,132.0,ドラマ|シリーズもの|ロマンス|小説の映画化,愛と追憶の日々,0,-0.011978
3,6.94,34.0,129.0,ドラマ|伝記もの,愛は霧のかなたに,1,-0.010375
4,7.06,35.0,119.0,ドラマ|ラブストーリー|戯曲（舞台劇）の映画化,愛は静けさの中に,1,0.094153
...,...,...,...,...,...,...,...
7969,6.60,20.0,151.0,アクション|ＳＦ|ファンタジー|シリーズもの|漫画の映画化,ワンダーウーマン1984,1,0.036388
7970,7.77,13.0,126.0,ドラマ|実話もの|小説の映画化,すばらしき世界,1,-0.016833
7971,7.09,11.0,124.0,ドラマ|ラブストーリー|青春もの,花束みたいな恋をした,1,0.062804
7972,6.89,28.0,113.0,ＳＦ|ファンタジー|シリーズもの|特撮もの|モンスター映画,ゴジラvsコング,1,0.019579


## k近傍法（k-nearest neighbors）による近さスコア

In [59]:
range_list = list(range(2, 11))

# Fit and save one cosine-distance, brute-force nearest-neighbour model per
# n_neighbors value. After the loop `model_knn` is the last one (n_neighbors=10).
for n_neighbors in tqdm(range_list):
    model_knn = NearestNeighbors(
        n_neighbors=n_neighbors,
        algorithm='brute',
        metric='cosine',
    ).fit(movie_pivot_sparse)
    joblib.dump(model_knn, f"./model_knn/knn_{n_neighbors}.pkl")

100%|████████████████████████████████████████████| 9/9 [00:00<00:00, 247.57it/s]


In [63]:
# Rating vector of the selected movie, shaped as a single query row.
query_row = movie_pivot.loc[movie_pivot.index == Movie_ID].values.reshape(1, -1)
# Ask for as many neighbours as there are movies, i.e. a distance to every movie.
distance, indice = model_knn.kneighbors(query_row, n_neighbors=len(movie_new))
distance_list, indice_list = distance.tolist()[0], indice.tolist()[0]

In [64]:
# Work on a copy so the SVD-stage table stays unchanged.
movie_recommend_addcluster_svd_knn = movie_recommend_addcluster_svd.copy()

In [65]:
# Store each movie's cosine distance to the selected movie.
# `indice_list` holds row positions of movie_pivot, so they are translated to
# index labels before assigning. A single `.loc[rows, column]` assignment
# replaces the chained `df[col].loc[...] = ...`, which may write to a temporary
# copy (and does nothing under pandas Copy-on-Write).
movie_recommend_addcluster_svd_knn['knn_distance'] = 0.0
movie_recommend_addcluster_svd_knn.loc[
    movie_pivot.index[indice_list], 'knn_distance'
] = distance_list
# Put the columns into presentation order.
movie_recommend_addcluster_svd_knn = movie_recommend_addcluster_svd_knn.reindex(columns=['movie_title', 'genre', 'mean_review_point', 'screening_time', 'number_of_revier', 'svd_score', 'knn_distance', 'cluster'])

In [66]:
# Preview the table with the kNN distance column added.
movie_recommend_addcluster_svd_knn.head()

Unnamed: 0_level_0,movie_title,genre,mean_review_point,screening_time,number_of_revier,svd_score,knn_distance,cluster
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,愛と哀しみの果て,ドラマ|伝記もの|ロマンス|小説の映画化,5.6,161.0,57.0,0.033755,0.942883,1
1,愛と青春の旅だち,ドラマ|青春もの|ロマンス,6.78,124.0,169.0,-0.025457,0.890492,0
2,愛と追憶の日々,ドラマ|シリーズもの|ロマンス|小説の映画化,6.56,132.0,80.0,-0.011978,0.882578,0
3,愛は霧のかなたに,ドラマ|伝記もの,6.94,129.0,34.0,-0.010375,0.969217,1
4,愛は静けさの中に,ドラマ|ラブストーリー|戯曲（舞台劇）の映画化,7.06,119.0,35.0,0.094153,0.879091,1


## TFIDFスコア

In [67]:
# Working copy for the TF-IDF stage; give_rec() below writes its rank column into this frame.
movie_test = movie_recommend_addcluster_svd_knn.copy()

In [68]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level TF-IDF over the genre tags, using 1- to 3-grams that occur in at
# least three movies.
# NOTE(review): strip_accents='unicode' normalises text and removes combining
# marks, which presumably also alters Japanese kana with dakuten and
# full-width letters; stop_words='english' looks irrelevant for Japanese tags
# -- confirm both are intended.
tfv = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    min_df=3,
    max_features=None,
    strip_accents='unicode',
    stop_words='english',
)

# Missing genres become empty strings.
movie_test['genre'] = movie_test['genre'].fillna('')
# Each movie's '|'-separated tags are split into a list and that list is
# stringified; the vectorizer then tokenises this string form.
genres_str = movie_test['genre'].str.split('|').astype(str)
tfv_matrix = tfv.fit_transform(genres_str)

In [69]:
# Persist the fitted TF-IDF vectorizer.
joblib.dump(tfv, f"./model_tfv/tfv.pkl")

['./model_tfv/tfv.pkl']

In [226]:
from sklearn.metrics.pairwise import sigmoid_kernel

# Pairwise sigmoid-kernel similarity between the TF-IDF genre vectors of all movies.
sig = sigmoid_kernel(tfv_matrix, tfv_matrix)
# Lookup from movie title to movie id.
indices = pd.Series(movie_test.index, index=movie_test['movie_title'])
# Keep the first entry for each title. `Series.drop_duplicates()` compares the
# *values* (the ids), so it never removed a repeated title, and a lookup by a
# repeated title then returned several ids instead of one.
indices = indices[~indices.index.duplicated(keep='first')]

In [227]:
def give_rec(title, sig=sig):
    """Rank all movies by genre similarity to ``title`` and store the rank in ``movie_test``.

    Reads the notebook globals ``indices`` (title -> movie id) and
    ``movie_test``; the ``TFIDF_rank`` column of ``movie_test`` is overwritten
    in place.

    Parameters
    ----------
    title : str
        Title of the reference movie. A value that is not a key of ``indices``
        is used directly as the row number of ``sig``.
    sig : array-like, optional
        Square similarity matrix. The default is the global ``sig`` as it was
        when this function was defined.

    Returns
    -------
    pandas.DataFrame
        ``movie_test`` itself, with ``TFIDF_rank`` set to 1 for the most
        similar movie, 2 for the next, and so on.
    """
    # Resolve the reference row. A title shared by several movies may come back
    # as several ids; the first one is used.
    if title in indices.index:
        idx = np.atleast_1d(indices[title])[0]
    else:
        idx = title

    # Row numbers of `sig` ordered from most to least similar. The stable sort
    # keeps tied movies in their original order, like sorted(..., reverse=True).
    movie_indices = np.argsort(-np.asarray(sig[idx]), kind='stable')
    juni_list = np.arange(1, len(movie_indices) + 1)

    # Single .loc[rows, column] assignment instead of the chained
    # `df[col].loc[...] = ...`, which may write to a temporary copy.
    # As before, the row numbers of `sig` are used as index labels.
    movie_test['TFIDF_rank'] = 0
    movie_test.loc[movie_indices, 'TFIDF_rank'] = juni_list

    return movie_test

In [228]:
# Add the TF-IDF rank for the selected movie. give_rec returns movie_test itself,
# so this name and movie_test refer to the same DataFrame object.
movie_recommend_addcluster_svd_knn_TFIDF = give_rec(Movie)

In [229]:
# Display the table with the TF-IDF rank column added.
movie_recommend_addcluster_svd_knn_TFIDF

Unnamed: 0_level_0,movie_title,genre,mean_review_point,screening_time,number_of_revier,svd_score,knn_distance,cluster,TFIDF_rank
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,愛と哀しみの果て,ドラマ|伝記もの|ロマンス|小説の映画化,5.60,161.0,57.0,0.002852,0.942883,1,471
1,愛と青春の旅だち,ドラマ|青春もの|ロマンス,6.78,124.0,169.0,0.002604,0.890492,0,472
2,愛と追憶の日々,ドラマ|シリーズもの|ロマンス|小説の映画化,6.56,132.0,80.0,0.004030,0.882578,0,473
3,愛は霧のかなたに,ドラマ|伝記もの,6.94,129.0,34.0,-0.001955,0.969217,1,474
4,愛は静けさの中に,ドラマ|ラブストーリー|戯曲（舞台劇）の映画化,7.06,119.0,35.0,-0.003579,0.879091,1,475
...,...,...,...,...,...,...,...,...,...
7969,ワンダーウーマン1984,アクション|ＳＦ|ファンタジー|シリーズもの|漫画の映画化,6.60,151.0,20.0,0.003036,0.882533,1,7971
7970,すばらしき世界,ドラマ|実話もの|小説の映画化,7.77,126.0,13.0,0.001892,0.815220,1,7972
7971,花束みたいな恋をした,ドラマ|ラブストーリー|青春もの,7.09,124.0,11.0,0.002683,0.839019,1,7973
7972,ゴジラvsコング,ＳＦ|ファンタジー|シリーズもの|特撮もの|モンスター映画,6.89,113.0,28.0,0.004537,0.819808,1,7974


# 正規化

In [230]:
# Inspect the raw SVD scores before scaling.
movie_recommend_addcluster_svd_knn_TFIDF['svd_score'].values

array([0.00285249, 0.00260357, 0.00403045, ..., 0.00268262, 0.00453695,
       0.00287307])

In [231]:
from sklearn import preprocessing

# Rescale each score column to the [0, 1] range; the scaler is re-fitted for
# every column and the result is written to a matching `*_mm` column.
mm = preprocessing.MinMaxScaler()
for raw_col, scaled_col in {
    'svd_score': 'svd_score_mm',
    'knn_distance': 'knn_distance_mm',
    'TFIDF_rank': 'TFIDF_rank_mm',
}.items():
    column_2d = movie_recommend_addcluster_svd_knn_TFIDF[raw_col].values.reshape(-1, 1)
    movie_recommend_addcluster_svd_knn_TFIDF[scaled_col] = mm.fit_transform(column_2d)


In [232]:
# Combined score: a high SVD score raises it; a large kNN distance and a
# large (worse) TF-IDF rank lower it.
movie_recommend_addcluster_svd_knn_TFIDF['rec_score'] = (
    movie_recommend_addcluster_svd_knn_TFIDF['svd_score_mm']
    - movie_recommend_addcluster_svd_knn_TFIDF['knn_distance_mm']
    - movie_recommend_addcluster_svd_knn_TFIDF['TFIDF_rank_mm']
)

In [234]:
# Order the table from best to worst combined score (in place), then display
# it without the row of the selected movie itself.
movie_recommend_addcluster_svd_knn_TFIDF.sort_values("rec_score", ascending=False, inplace=True)
movie_recommend_addcluster_svd_knn_TFIDF[movie_recommend_addcluster_svd_knn_TFIDF.index != Movie_ID]

Unnamed: 0_level_0,movie_title,genre,mean_review_point,screening_time,number_of_revier,svd_score,knn_distance,cluster,TFIDF_rank,svd_score_mm,knn_distance_mm,TFIDF_rank_mm,rec_score
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
7603,君の名は。(2016),ＳＦ|ラブストーリー|アニメ|青春もの,7.01,106.0,176.0,0.015658,0.623891,0,135,0.922719,0.623891,0.016807,0.282022
7606,映画聲の形,ドラマ|ラブストーリー|アニメ|青春もの|学園もの|漫画の映画化,6.49,129.0,59.0,0.010935,0.591038,1,312,0.777167,0.591038,0.039007,0.147122
7625,この世界の片隅に(2016),ドラマ|コメディ|戦争もの|アニメ|漫画の映画化,8.26,129.0,146.0,0.010533,0.645171,0,187,0.764772,0.645171,0.023329,0.096272
101,ゴッドファーザー,ドラマ|シリーズもの|犯罪もの|ヤクザ・マフィア|小説の映画化,8.56,175.0,491.0,0.018166,0.870194,2,571,1.000000,0.870194,0.071491,0.058314
7907,天気の子,ドラマ|ＳＦ|ラブストーリー|ファンタジー|アニメ|青春もの,6.47,114.0,76.0,0.009535,0.645214,0,289,0.734025,0.645214,0.036122,0.052689
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7432,特捜部Ｑ　檻の中の女,サスペンス|犯罪もの|ミステリー|刑事もの|小説の映画化,6.25,97.0,12.0,-0.001599,1.000000,1,7466,0.390882,1.000000,0.936285,-1.545403
7654,クリミナル2人の記憶を持つ男,アクション|ドラマ|犯罪もの|スパイもの,6.00,113.0,12.0,-0.001506,0.977030,1,7678,0.393763,0.977030,0.962875,-1.546141
7328,誰よりも狙われた男,サスペンス|小説の映画化,6.94,122.0,17.0,-0.002022,1.000000,1,7371,0.377866,1.000000,0.924370,-1.546504
7780,モリーズ・ゲーム,ドラマ|犯罪もの|実話もの,6.42,140.0,12.0,-0.000446,1.000000,1,7797,0.426417,1.000000,0.977800,-1.551383


# 関数化

In [254]:
# Objects used by get_recommend_movie_df().
# `model_SVD`, `model_knn` and `tfv` are reused under the names they already
# have from the training cells, so the former no-op self-assignments
# (`model_SVD = model_SVD`, `model_knn = model_knn`) and the bare `tfv`
# expression are dropped. `movie_new` and `movie_pivot_sparse` are likewise
# reused as they are.
model_clusterer = clusterer

movie_recommend = movie_new.copy()

from sklearn import preprocessing
mm = preprocessing.MinMaxScaler()

In [259]:
def give_rec(title, df, sig=sig):
    """Rank all movies by genre similarity to ``title`` and store the rank in ``df``.

    Reads the notebook global ``indices`` (title -> movie id). ``df`` is
    modified in place: its ``TFIDF_rank`` column is overwritten.

    Parameters
    ----------
    title : str or int
        Title of the reference movie. A value that is not a key of ``indices``
        (for example an integer movie id) is used directly as the row number
        of ``sig``, so the lookup no longer depends on pandas' deprecated
        positional fallback for integer keys.
    df : pandas.DataFrame
        Table that receives the ``TFIDF_rank`` column. Its index labels must
        contain the row numbers of ``sig``.
    sig : array-like, optional
        Square similarity matrix. The default is the global ``sig`` as it was
        when this function was defined.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with ``TFIDF_rank`` set to 1 for the most similar
        movie, 2 for the next, and so on.
    """
    # Resolve the reference row. A title shared by several movies may come back
    # as several ids; the first one is used.
    if title in indices.index:
        idx = np.atleast_1d(indices[title])[0]
    else:
        idx = title

    # Row numbers of `sig` ordered from most to least similar. The stable sort
    # keeps tied movies in their original order, like sorted(..., reverse=True).
    movie_indices = np.argsort(-np.asarray(sig[idx]), kind='stable')
    juni_list = np.arange(1, len(movie_indices) + 1)

    # Single .loc[rows, column] assignment instead of the chained
    # `df[col].loc[...] = ...`, which may write to a temporary copy.
    df['TFIDF_rank'] = 0
    df.loc[movie_indices, 'TFIDF_rank'] = juni_list

    return df

In [290]:
def get_recommend_movie_df(Movie_ID):
    """Score every movie against the selected movie(s) and return the scored table.

    Relies on notebook globals that must already exist: ``movie_new``,
    ``movie_pivot``, ``movie_pivot_sparse``, ``model_clusterer``,
    ``model_SVD``, ``model_knn``, ``tfv``, ``mm`` and ``sigmoid_kernel``.
    ``tfv`` and ``mm`` are re-fitted as a side effect.

    Parameters
    ----------
    Movie_ID : int or sequence of int
        Encoded movie id(s), i.e. labels of ``movie_new.index``. Every id
        contributes to the SVD score; only the first id is used for the kNN
        distance and the TF-IDF rank.

    Returns
    -------
    pandas.DataFrame
        Copy of ``movie_new`` with the columns ``svd_score``,
        ``knn_distance``, ``cluster``, ``TFIDF_rank``, their min-max scaled
        ``*_mm`` versions, and
        ``rec_score = 2 * svd_score_mm - knn_distance_mm - TFIDF_rank_mm``.
    """
    # Accept a bare id as well as a list of ids (`len()` failed on a scalar).
    selected_ids = np.atleast_1d(Movie_ID).tolist()
    primary_id = selected_ids[0]

    movie_recommend = movie_new.copy()

    # k-means cluster label of every movie
    movie_recommend['cluster'] = model_clusterer.predict(movie_pivot_sparse)

    # SVD score: a point of 10 on every selected movie, projected into the SVD
    # space and back. (The original line misspelled `pref` as `påref`.)
    pref = np.zeros((1, model_SVD.components_.shape[1]))
    pref[:, selected_ids] = 10
    movie_recommend["svd_score"] = model_SVD.transform(pref).dot(model_SVD.components_).ravel()

    if len(selected_ids) >= 2:
        print('更新')

    # kNN distance from the first selected movie to every movie.
    query_row = movie_pivot.loc[[primary_id]].values
    distance, indice = model_knn.kneighbors(query_row, n_neighbors=len(movie_new))
    # `indice` holds row positions of movie_pivot; translate them to index
    # labels and assign with one .loc call instead of a chained assignment.
    movie_recommend['knn_distance'] = 0.0
    movie_recommend.loc[movie_pivot.index[indice[0]], 'knn_distance'] = distance[0]
    movie_recommend = movie_recommend.reindex(columns=['movie_title', 'genre', 'mean_review_point', 'screening_time', 'number_of_revier', 'svd_score', 'knn_distance', 'cluster'])

    # TF-IDF genre similarity to the first selected movie.
    movie_recommend['genre'] = movie_recommend['genre'].fillna('')
    genres_str = movie_recommend['genre'].str.split('|').astype(str)
    tfv_matrix = tfv.fit_transform(genres_str)

    # Only the reference movie's row of the sigmoid kernel is needed, so just
    # that row is computed rather than the full n x n matrix. The rank is
    # derived here from this freshly computed row; previously the local
    # `sig`/`indices` were never handed to give_rec(), which received the
    # movie id in place of a title.
    ref_pos = movie_recommend.index.get_loc(primary_id)
    sig_row = sigmoid_kernel(tfv_matrix[ref_pos], tfv_matrix).ravel()
    # Stable sort keeps tied movies in their original order.
    order = np.argsort(-sig_row, kind='stable')
    ranks = np.empty(len(order), dtype=int)
    ranks[order] = np.arange(1, len(order) + 1)
    movie_recommend['TFIDF_rank'] = ranks

    # Min-max scale every score to [0, 1] (scaler re-fitted per column).
    movie_recommend['svd_score_mm'] = mm.fit_transform(movie_recommend['svd_score'].values.reshape(-1, 1))
    movie_recommend['knn_distance_mm'] = mm.fit_transform(movie_recommend['knn_distance'].values.reshape(-1, 1))
    movie_recommend['TFIDF_rank_mm'] = mm.fit_transform(movie_recommend['TFIDF_rank'].values.reshape(-1, 1))

    # The SVD score counts twice; distance and rank are penalties.
    movie_recommend['rec_score'] = (
        2 * movie_recommend['svd_score_mm']
        - movie_recommend['knn_distance_mm']
        - movie_recommend['TFIDF_rank_mm']
    )

    return movie_recommend

In [291]:
# Two reference titles and their encoded ids.
Movie, Movie2 = '言の葉の庭', 'ゴジラvsコング'
Movie_ID2 = movie_new.index[movie_new["movie_title"] == Movie2][0]
Movie_ID_list = [movie_new.index[movie_new["movie_title"] == Movie][0], Movie_ID2]
# From here on Movie_ID is the list of selected ids, not a single id.
Movie_ID = Movie_ID_list
Movie_ID  # (selections)

[7108, 7972]

In [292]:
# Build the scored recommendation table for the selected movie ids.
movie_recommend_test = get_recommend_movie_df(Movie_ID)

更新


In [297]:
# Scores of the first selected movie itself.
movie_recommend_test[movie_recommend_test.movie_title == Movie]

Unnamed: 0_level_0,movie_title,genre,mean_review_point,screening_time,number_of_revier,svd_score,knn_distance,cluster,TFIDF_rank,svd_score_mm,knn_distance_mm,TFIDF_rank_mm,rec_score
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
7108,言の葉の庭,アニメ,5.71,46.0,55.0,0.057289,0.0,1,5,0.719364,0.0,0.000502,1.438226


In [295]:
# Scores of the second selected movie.
movie_recommend_test[movie_recommend_test.movie_title == Movie2]

Unnamed: 0_level_0,movie_title,genre,mean_review_point,screening_time,number_of_revier,svd_score,knn_distance,cluster,TFIDF_rank,svd_score_mm,knn_distance_mm,TFIDF_rank_mm,rec_score
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
7972,ゴジラvsコング,ＳＦ|ファンタジー|シリーズもの|特撮もの|モンスター映画,6.89,113.0,28.0,0.035289,0.819808,1,7974,0.5785,0.819808,1.0,-0.662807


In [298]:
# Scores of one specific title, looked up for comparison.
movie_recommend_test[movie_recommend_test.movie_title == 'ダウト～あるカトリック学校で～']

Unnamed: 0_level_0,movie_title,genre,mean_review_point,screening_time,number_of_revier,svd_score,knn_distance,cluster,TFIDF_rank,svd_score_mm,knn_distance_mm,TFIDF_rank_mm,rec_score
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
6258,ダウト～あるカトリック学校で～,ドラマ|ミステリー|学園もの|戯曲（舞台劇）の映画化,7.15,105.0,54.0,-0.028361,0.965297,0,6374,0.170942,0.965297,0.799323,-1.422736


In [294]:
# Order the table from best to worst combined score (in place) and display it.
movie_recommend_test.sort_values("rec_score", ascending=False, inplace=True)
movie_recommend_test
# movie_recommend_addcluster_svd_knn_TFIDF[~(movie_recommend_addcluster_svd_knn_TFIDF.index == Movie_ID)]

Unnamed: 0_level_0,movie_title,genre,mean_review_point,screening_time,number_of_revier,svd_score,knn_distance,cluster,TFIDF_rank,svd_score_mm,knn_distance_mm,TFIDF_rank_mm,rec_score
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
7108,言の葉の庭,アニメ,5.71,46.0,55.0,0.057289,0.000000,1,5,0.719364,0.000000,0.000502,1.438226
7603,君の名は。(2016),ＳＦ|ラブストーリー|アニメ|青春もの,7.01,106.0,176.0,0.101117,0.623891,0,135,1.000000,0.623891,0.016807,1.359302
7625,この世界の片隅に(2016),ドラマ|コメディ|戦争もの|アニメ|漫画の映画化,8.26,129.0,146.0,0.094925,0.645171,0,187,0.960356,0.645171,0.023329,1.252212
7112,風立ちぬ(2013),ドラマ|戦争もの|アニメ|伝記もの|ロマンス|漫画の映画化,6.54,126.0,181.0,0.071191,0.680416,0,124,0.808386,0.680416,0.015427,0.920929
7606,映画聲の形,ドラマ|ラブストーリー|アニメ|青春もの|学園もの|漫画の映画化,6.49,129.0,59.0,0.061506,0.591038,1,312,0.746372,0.591038,0.039007,0.862699
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7064,キャビン,ホラー|サスペンス,6.37,96.0,70.0,-0.022934,0.921201,0,7122,0.205692,0.921201,0.893139,-1.402957
6107,ウォンテッド(2008),アクション|ドラマ|サスペンス|漫画の映画化,5.52,110.0,128.0,-0.031620,0.938205,0,6227,0.150076,0.938205,0.780885,-1.418939
6258,ダウト～あるカトリック学校で～,ドラマ|ミステリー|学園もの|戯曲（舞台劇）の映画化,7.15,105.0,54.0,-0.028361,0.965297,0,6374,0.170942,0.965297,0.799323,-1.422736
7328,誰よりも狙われた男,サスペンス|小説の映画化,6.94,122.0,17.0,-0.017548,1.000000,1,7371,0.240182,1.000000,0.924370,-1.444006
