In [1]:
from sklearn.model_selection import train_test_split
import seaborn as sns
import numpy as np
import random
import scipy

In [2]:
def split_train_test(data):
    train_data, test_data = train_test_split(data, test_size=0.2, random_state=10)
    return train_data, test_data

def make_train_matrix(train_data, test_data):
    user_item_mtx = train_data.pivot_table(values="rating", index="userId", columns="movieId")
    # empty entry는 평균 평점으로 fill
    user_item_mtx = user_item_mtx.apply(lambda x: x.fillna(x.mean()), axis=1)
    return user_item_mtx

def calculate_SVD(matrix, k=50):
    # numpy SVD
    u, sig, i = np.linalg.svd(matrix)
    # 유저 matrix 중 k개 latent factor만 사용
    u_hat = u[:, :k]
    # 아이템 matrix 중 k개 latent factor만 사용
    i_hat = i[:k, :]
    # latent factor 대각 행렬
    # sig : singular matrix로 가장 큰 값부터 내림차순
    sig_hat = sig[:k] * np.identity(k, np.float)
    user_factors = u_hat
    item_factors = np.matmul(sig_hat, i_hat)
    return user_factors, item_factors

In [3]:
from pathlib import Path
import pandas as pd
import os

path = os.path.join(Path(os.getcwd()).parent, "data")
ratings = os.path.join(path, "ml-latest-small", "ratings.csv")

In [4]:
ratings_df = pd.read_csv(ratings, encoding='utf-8')

In [5]:
train_data, test_data = split_train_test(ratings_df)
user_item_matrix = make_train_matrix(train_data, test_data)
user_factors, item_factors = calculate_SVD(user_item_matrix)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  sig_hat = sig[:k] * np.identity(k, np.float)


### latent factor = 50에 대한 user factors, item_factors shape

In [6]:
print(user_factors.shape, item_factors.shape)

(610, 50) (50, 9001)


### user factors, item_factors로 평점 예측치 산출

In [7]:
prediction_result = pd.DataFrame(np.matmul(user_factors, item_factors),
                                 columns=user_item_matrix.columns.values, index=user_item_matrix.index.values)
prediction_result

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,191005,193565,193567,193571,193573,193579,193581,193583,193587,193609
1,4.455760,4.376862,4.445517,4.378512,4.336953,4.230930,4.287235,4.378368,4.379953,4.338540,...,4.374534,4.372675,4.371745,4.373604,4.373604,4.372675,4.373604,4.372675,4.372675,4.373269
2,4.116021,4.093123,4.059365,4.075956,4.089813,4.076023,4.054649,4.081869,4.076706,4.093391,...,4.085951,4.082815,4.081246,4.084383,4.084383,4.082815,4.084383,4.082815,4.082815,4.083071
3,2.489142,2.493289,2.435234,2.471230,2.497086,2.406091,2.466641,2.491103,2.462545,2.481086,...,2.467970,2.465050,2.463590,2.466510,2.466510,2.465050,2.466510,2.465050,2.465050,2.465735
4,3.438885,3.538087,3.404722,3.516883,3.387723,3.323818,3.443567,3.498151,3.554042,3.884841,...,3.524558,3.522719,3.521799,3.523639,3.523639,3.522719,3.523639,3.522719,3.522719,3.526527
5,3.535751,3.476395,3.438381,3.478689,3.454173,3.565401,3.468226,3.491675,3.488108,3.454820,...,3.500828,3.499840,3.499346,3.500334,3.500334,3.499840,3.500334,3.499840,3.499840,3.500740
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.080360,3.688198,3.524394,3.673937,3.584164,3.757912,2.530674,3.511241,3.703077,3.715367,...,3.661908,3.665996,3.668041,3.663952,3.663952,3.665996,3.663952,3.665996,3.665996,3.657973
607,3.794047,3.738736,3.911573,3.795850,3.789813,3.785160,3.717314,3.807169,3.807893,3.695064,...,3.819084,3.816495,3.815200,3.817790,3.817790,3.816495,3.817790,3.816495,3.816495,3.816851
608,2.350255,3.199218,1.928100,3.104161,3.148591,3.270782,3.159779,3.074696,3.115715,3.212388,...,3.126966,3.129054,3.130098,3.128010,3.128010,3.129054,3.128010,3.129054,3.129054,3.120215
609,3.311869,3.274965,3.261605,3.256527,3.258811,3.307523,3.266666,3.262454,3.261982,3.301365,...,3.270620,3.268960,3.268130,3.269790,3.269790,3.268960,3.269790,3.268960,3.268960,3.269617


### test 데이터에 대한 예측

In [10]:
from tqdm import tqdm

# test 데이터에서 새로 등장하는 유저, 아이템에 대입하기 위한 global rating
global_rating = train_data['rating'].mean()
result=[]
for _, row in tqdm(test_data.iterrows()):
    user_id, movie_id, = row['userId'], row['movieId']
    true_rating = row['rating']
    if user_id in prediction_result.index.values and movie_id in prediction_result.columns.values:
        pred_rating = prediction_result.loc[user_id][movie_id]
    else:
        pred_rating = global_rating
    result.append([user_id, movie_id, true_rating, pred_rating])
pred_result = pd.DataFrame(result, columns=['user_id', 'movie_id', 'true_rating', 'pred_rating'])
pred_result

20168it [00:02, 6824.42it/s]


Unnamed: 0,user_id,movie_id,true_rating,pred_rating
0,239.0,924.0,4.0,4.106564
1,384.0,2763.0,3.0,2.957313
2,52.0,58559.0,5.0,4.640744
3,600.0,719.0,2.5,3.038116
4,318.0,117364.0,4.0,3.503954
...,...,...,...,...
20163,20.0,5015.0,4.0,3.472314
20164,177.0,6787.0,3.0,3.400911
20165,103.0,969.0,4.0,3.926839
20166,42.0,2875.0,3.0,3.609600


### RMSE 계산

In [11]:
from sklearn.metrics import mean_squared_error

def get_rmse(X, X_hat):
    return np.sqrt(mean_squared_error(X, X_hat))

In [12]:
get_rmse(pred_result['true_rating'].values, pred_result['pred_rating'].values)

0.9342638634779826