# インポート

In [3]:
import pandas as pd

import scipy.sparse
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import LabelEncoder
import joblib
from tqdm import tqdm

# データの読み込み

In [4]:
# Load the review data from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
review = pd.read_pickle("./data/review.pickle")

In [5]:
# Keep only the columns used below to build the rating matrix
# (row index -> user_id, column index -> movie_id, value -> point).
review = review.loc[:, ["user_id", "movie_id", "point"]]
review.head()

Unnamed: 0,user_id,movie_id,point
0,20001,25942,8
1,20001,26788,8
2,20001,25605,8
3,20001,24933,6
4,20001,23973,6


In [6]:
# Summarize dtypes, non-null counts and memory usage of the trimmed review frame.
review.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 452459 entries, 0 to 452458
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype
---  ------    --------------   -----
 0   user_id   452459 non-null  int64
 1   movie_id  452459 non-null  int64
 2   point     452459 non-null  int64
dtypes: int64(3)
memory usage: 10.4 MB


In [7]:
# NOTE(review): this cell repeats the `review.info()` call from the previous cell;
# nothing modifies `review` in between, so the output is identical.
review.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 452459 entries, 0 to 452458
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype
---  ------    --------------   -----
 0   user_id   452459 non-null  int64
 1   movie_id  452459 non-null  int64
 2   point     452459 non-null  int64
dtypes: int64(3)
memory usage: 10.4 MB


In [8]:
# Fit one encoder per id column, then overwrite each column with its encoded
# index (0 .. number_of_distinct_ids - 1) so the values can be used directly
# as row/column positions.
# NOTE(review): this cell is not idempotent; running it a second time would
# re-fit both encoders on the already-encoded values instead of the raw ids.
user_enc = LabelEncoder().fit(review["user_id"])
movie_enc = LabelEncoder().fit(review["movie_id"])
review["user_id"] = user_enc.transform(review["user_id"])
review["movie_id"] = movie_enc.transform(review["movie_id"])

In [9]:
# Preview the frame after encoding: user_id and movie_id now hold encoder indices.
review.head()

Unnamed: 0,user_id,movie_id,point
0,0,24158,8
1,0,24919,8
2,0,23833,8
3,0,23207,6
4,0,22317,6


In [10]:
# Persist the fitted movie encoder to disk with joblib.
# NOTE(review): presumably saved so encoded movie indices can be mapped back to
# the original movie_id values later (confirm); user_enc is not saved here.
# Assumes the ./model directory already exists (TODO confirm).
joblib.dump(movie_enc, "./model/movie_encoder.pkl")

['./model/movie_encoder.pkl']

In [11]:
# Build the sparse user x movie rating matrix from the encoded review rows.
# The encoded ids serve as row (user) and column (movie) positions, and the
# matrix is sized by the number of distinct users and movies.
# NOTE(review): if the data contains repeated (user_id, movie_id) pairs, the
# sparse constructor is expected to sum their points; confirm there are none.
n_users = review["user_id"].nunique()
n_movies = review["movie_id"].nunique()
row_idx = review["user_id"]
col_idx = review["movie_id"]
matrix = scipy.sparse.csr_matrix(
    (review["point"], (row_idx, col_idx)),
    shape=(n_users, n_movies),
)

In [80]:
# Train one TruncatedSVD model per latent dimensionality and save each to disk.
# TruncatedSVD uses a randomized solver by default, so a fixed random_state is
# passed to make the fitted (and saved) models reproducible across re-runs.
SVD_RANDOM_STATE = 42
for n_components in tqdm([10, 20, 30, 100, 200, 500, 1000, 2000]):
    model = TruncatedSVD(n_components=n_components, random_state=SVD_RANDOM_STATE)
    model.fit(matrix)
    # One file per size, e.g. ./model/svd_10.pkl
    joblib.dump(model, f"./model/svd_{n_components}.pkl")

100%|█████████████████████████████████████████████| 8/8 [00:37<00:00,  4.75s/it]


# データ確認

In [76]:
# Load the movie table from a local pickle file and preview its first rows.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
movie_df = pd.read_pickle("./data/movie.pickle")
movie_df.head()

Unnamed: 0,mean_review_point,number_of_revier,screening_time,genre,movie_id,movie_title
0,6.84,44.0,155.0,アクション|ＳＦ|アニメ|シリーズもの|ＴＶの映画化,25942,シン・エヴァンゲリオン劇場版：||
1,8.0,1.0,104.0,アクション|ＳＦ|アニメ|シリーズもの|青春もの|学園もの|ＴＶの映画化|漫画の映画化,26788,僕のヒーローアカデミアTHEMOVIE　ワールドヒーローズミッション
2,8.27,11.0,114.0,ドラマ|サスペンス|コメディ,25605,メランコリック
3,5.93,14.0,100.0,アクション|ＳＦ|ファンタジー|アニメ|シリーズもの|ＴＶの映画化|漫画の映画化|３Ｄ映画,24933,ドラゴンボール超／ブロリー
4,5.8,5.0,120.0,ドラマ|ラブストーリー|青春もの|学園もの|漫画の映画化,23973,ReLIFEリライフ


In [77]:
# Summarize dtypes, non-null counts and memory usage of the movie table.
movie_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25002 entries, 0 to 25001
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   mean_review_point  24988 non-null  float64
 1   number_of_revier   24988 non-null  float64
 2   screening_time     23499 non-null  float64
 3   genre              25002 non-null  object 
 4   movie_id           25002 non-null  int64  
 5   movie_title        25002 non-null  object 
dtypes: float64(3), int64(1), object(2)
memory usage: 1.1+ MB


In [78]:
# Count distinct movie_id values; comparing this with the row count reported by
# movie_df.info() shows whether movie_id is unique per row.
movie_df[["movie_id"]].nunique()

movie_id    25002
dtype: int64