In [None]:
from sklearn.model_selection import train_test_split
import seaborn as sns
import numpy as np
import random
import scipy

In [None]:
def split_train_test(data, test_size=0.2, random_state=10):
    """Randomly split ``data`` into a training part and a held-out test part.

    Parameters
    ----------
    data : array-like or DataFrame
        Anything accepted by ``sklearn.model_selection.train_test_split``.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``; the default keeps the original
        hard-coded value.
    random_state : int, default 10
        Seed forwarded to ``train_test_split`` so the split is reproducible;
        the default keeps the original hard-coded value.

    Returns
    -------
    tuple
        ``(train_data, test_data)``.
    """
    train_data, test_data = train_test_split(
        data, test_size=test_size, random_state=random_state
    )
    return train_data, test_data

def make_train_matrix(train_data, test_data):
    """Build a dense user x movie rating matrix from the training ratings.

    ``train_data`` is pivoted on ``userId`` (rows) and ``movieId`` (columns)
    using the ``rating`` column. Every missing entry of a row is then replaced
    by the mean of the ratings present in that same row.

    ``test_data`` is accepted but not used by this function.
    """
    def _fill_with_row_mean(row_ratings):
        # The mean ignores NaN, so only the ratings that exist contribute.
        return row_ratings.fillna(row_ratings.mean())

    ratings_wide = train_data.pivot_table(
        index="userId",
        columns="movieId",
        values="rating",
    )
    # axis=1 hands the helper one row (one userId) at a time.
    return ratings_wide.apply(_fill_with_row_mean, axis=1)

def calculate_SVD(matrix, k=50):
    """Factorize ``matrix`` with a truncated SVD keeping ``k`` latent factors.

    Parameters
    ----------
    matrix : array-like of shape (n_rows, n_cols)
        Dense 2-D matrix passed straight to ``np.linalg.svd``. In this
        notebook it is the filled user x movie rating matrix.
    k : int, default 50
        Number of latent factors to keep. Values larger than the number of
        singular values are clamped instead of failing.

    Returns
    -------
    user_factors : ndarray of shape (n_rows, k)
        The first ``k`` left singular vectors.
    item_factors : ndarray of shape (k, n_cols)
        The first ``k`` right singular vectors, each scaled by its singular
        value, so ``user_factors @ item_factors`` is the rank-``k``
        approximation of ``matrix``.
    """
    # Thin SVD: only the leading min(n_rows, n_cols) vectors are ever sliced
    # below, so the full square factor matrices are never needed.
    u, sig, vt = np.linalg.svd(matrix, full_matrices=False)

    # Singular values come back sorted in descending order, so the first k
    # components are the k strongest ones.
    k = min(k, sig.shape[0])

    user_factors = u[:, :k]
    # Scaling row i of vt by sig[i] equals diag(sig[:k]) @ vt[:k] without
    # building the diagonal matrix (the old code used the removed np.float).
    item_factors = sig[:k, np.newaxis] * vt[:k, :]
    return user_factors, item_factors

In [None]:
from pathlib import Path
import pandas as pd
import os

# The data directory sits next to the current working directory: <cwd>/../data
path = os.fspath(Path.cwd().parent / "data")
# MovieLens "latest-small" ratings file inside that data directory.
ratings = os.path.join(path, "ml-latest-small", "ratings.csv")

In [None]:
# Load the ratings CSV located at `ratings` into a DataFrame.
ratings_df = pd.read_csv(ratings, encoding='utf-8')

In [None]:
# 1) Hold out part of the ratings for evaluation.
train_data, test_data = split_train_test(data=ratings_df)
# 2) Pivot the training ratings into a filled user x movie matrix.
user_item_matrix = make_train_matrix(train_data=train_data, test_data=test_data)
# 3) Factorize it, keeping 50 latent factors (the function's default).
user_factors, item_factors = calculate_SVD(matrix=user_item_matrix, k=50)

### latent factor = 50에 대한 user_factors, item_factors shape

In [None]:
# Show the shapes of the two factor arrays returned by calculate_SVD.
print(user_factors.shape, item_factors.shape)

### user_factors, item_factors로 평점 예측치 산출

In [None]:
# Multiply the factors back together and label the result with the same
# row/column ids as user_item_matrix.
prediction_result = pd.DataFrame(
    data=user_factors @ item_factors,
    index=user_item_matrix.index.values,
    columns=user_item_matrix.columns.values,
)
prediction_result

### test 데이터에 대한 예측

In [None]:
# Global mean training rating: the fallback prediction for any user or movie
# that appears in the test data but not in the prediction matrix.
global_rating = train_data['rating'].mean()

# Find the row/column position of every test pair in one pass. get_indexer
# returns -1 for labels that are absent from the prediction matrix.
# NOTE: get_indexer needs unique labels, which a pivot_table result has.
user_pos = prediction_result.index.get_indexer(test_data['userId'])
movie_pos = prediction_result.columns.get_indexer(test_data['movieId'])
seen_in_train = (user_pos >= 0) & (movie_pos >= 0)

# Start from the fallback everywhere, then overwrite the pairs that can be
# looked up in the prediction matrix.
pred_rating = np.full(len(test_data), global_rating, dtype=float)
pred_rating[seen_in_train] = prediction_result.to_numpy()[
    user_pos[seen_in_train], movie_pos[seen_in_train]
]

# Ids keep the dtype they have in test_data (iterating rows used to turn
# integer ids into floats).
pred_result = pd.DataFrame({
    'user_id': test_data['userId'].to_numpy(),
    'movie_id': test_data['movieId'].to_numpy(),
    'true_rating': test_data['rating'].to_numpy(),
    'pred_rating': pred_rating,
})
# Row-wise list form, kept because this cell has always left `result` in the
# notebook namespace.
result = pred_result.values.tolist()
print(pred_result)

### RMSE 계산

In [None]:
from sklearn.metrics import mean_squared_error

def get_rmse(X, X_hat):
    """Return the root mean squared error between ``X`` and ``X_hat``.

    Both arguments are handed to ``mean_squared_error`` unchanged, with ``X``
    as the true values and ``X_hat`` as the predictions.
    """
    mse = mean_squared_error(X, X_hat)
    return np.sqrt(mse)

In [None]:
# RMSE of the predicted ratings against the true test ratings.
get_rmse(
    pred_result['true_rating'].to_numpy(),
    pred_result['pred_rating'].to_numpy(),
)