## SGD 기반 행렬 분해

### RMSE 계산 함수

In [1]:
from sklearn.metrics import mean_squared_error

def get_rmse(R, P, Q, non_zeros):
    """Return the RMSE between R and np.dot(P, Q.T) over the listed cells.

    Args:
        R: 2-D array of actual values.
        P: 2-D factor array; np.dot(P, Q.T) must line up with R.
        Q: 2-D factor array; np.dot(P, Q.T) must line up with R.
        non_zeros: Sequence of items whose element 0 is a row index and
            element 1 is a column index of R. Any further elements are
            ignored here.

    Returns:
        Square root of the mean squared error between the actual and the
        predicted values at the listed cells.
    """
    # Predicted R matrix
    full_pred_matrix = np.dot(P, Q.T)

    # Only the listed (row, col) cells are scored; pull their indices apart
    # so they can be used for paired fancy indexing on both matrices.
    x_non_zero_ind = [non_zero[0] for non_zero in non_zeros]
    y_non_zero_ind = [non_zero[1] for non_zero in non_zeros]
    R_non_zeros = R[x_non_zero_ind, y_non_zero_ind]
    full_pred_matrix_non_zeros = full_pred_matrix[x_non_zero_ind, y_non_zero_ind]

    mse = mean_squared_error(R_non_zeros, full_pred_matrix_non_zeros)
    return np.sqrt(mse)

### P, Q update 함수

In [2]:
def matrix_factorization(R, K, steps=200, learning_rate=0.01, r_lambda=0.01):
    """Factorize R into P and Q with SGD over the cells where R > 0.

    Args:
        R: 2-D array of shape (num_users, num_items). Only cells with a
            value greater than 0 take part in training; NaN cells fail the
            ``> 0`` test and are skipped as well.
        K: Number of latent factors (columns of P and Q).
        steps: Number of full passes over the training cells.
        learning_rate: Step size applied to every SGD update.
        r_lambda: L2 regularization strength.

    Returns:
        Tuple ``(P, Q)`` with shapes (num_users, K) and (num_items, K).

    Side effects:
        Reseeds NumPy's global random generator with 1 and prints the
        training RMSE on every 50th step (starting with step 0).
    """
    num_users, num_items = R.shape
    # Fixed seed so every call starts from the same random P and Q.
    np.random.seed(1)
    P = np.random.normal(scale=1./K, size=(num_users, K))
    Q = np.random.normal(scale=1./K, size=(num_items, K))

    # (row, col, value) for every cell with R > 0. np.nonzero walks the
    # array in row-major order, the same order a nested row/column loop
    # would produce, so the SGD visiting order is unaffected.
    rows, cols = np.nonzero(R > 0)
    non_zeros = [(i, j, R[i, j]) for i, j in zip(rows, cols)]

    # Update P and Q with SGD
    for step in range(steps):
        for i, j, r in non_zeros:
            # Error between the actual and the predicted value
            eij = r - np.dot(P[i, :], Q[j, :].T)
            # SGD update with L2 regularization.
            # NOTE: the Q update reads the P[i, :] that was just updated on
            # the line above (sequential, not simultaneous, update).
            P[i, :] += learning_rate * (eij * Q[j, :] - r_lambda*P[i, :])
            Q[j, :] += learning_rate * (eij * P[i, :] - r_lambda*Q[j, :])
        # The RMSE needs a full np.dot(P, Q.T), so compute it only on the
        # steps that actually report it.
        if step % 50 == 0:
            rmse = get_rmse(R, P, Q, non_zeros)
            print('### iteration step :', step, 'rmse :', rmse)

    return P, Q

### pivot table 생성

In [3]:
import pandas as pd
import numpy as np

# Load the movie and rating CSV files; the rating timestamp column is
# dropped right after loading.
movies = pd.read_csv('../data/ml-latest-small/movies.csv')
ratings = pd.read_csv('../data/ml-latest-small/ratings.csv').drop(columns='timestamp')

# Attach the movie columns to every rating row through the shared movieId key
rating_movies = ratings.merge(movies, on='movieId')

# Reshape into a userId x title table holding the rating values
ratings_matrix = rating_movies.pivot_table(index='userId', columns='title', values='rating')

### 학습

In [4]:
%%time

# Factorize the user x title rating matrix, then rebuild the full matrix of
# predicted ratings from the two factor matrices.
P, Q = matrix_factorization(
    ratings_matrix.values,
    K=50,
    steps=1000,
    learning_rate=0.01,
    r_lambda=0.01,
)
pred_matrix = P.dot(Q.T)

### iteration step : 0 rmse : 2.9023619751336867
### iteration step : 50 rmse : 0.2520353192341642
### iteration step : 100 rmse : 0.17742927527209104
### iteration step : 150 rmse : 0.1569598699945732
### iteration step : 200 rmse : 0.14741397048716315
### iteration step : 250 rmse : 0.14185543178111187
### iteration step : 300 rmse : 0.13818177442906426
### iteration step : 350 rmse : 0.13554866973229762
### iteration step : 400 rmse : 0.13355264488720703
### iteration step : 450 rmse : 0.1319754416328639
### iteration step : 500 rmse : 0.13068781889676734
### iteration step : 550 rmse : 0.12960809574864302
### iteration step : 600 rmse : 0.1286821658071667
### iteration step : 650 rmse : 0.12787293648334966
### iteration step : 700 rmse : 0.12715432564668785
### iteration step : 750 rmse : 0.12650764212206722
### iteration step : 800 rmse : 0.1259192945121282
### iteration step : 850 rmse : 0.12537928066816156
### iteration step : 900 rmse : 0.12488015793051062
### iteration step : 

### 출력

In [5]:
# Wrap the predicted ratings in a DataFrame that carries the same row and
# column labels as ratings_matrix, then preview the first three users.
ratings_pred_matrix = pd.DataFrame(
    pred_matrix,
    index=ratings_matrix.index,
    columns=ratings_matrix.columns,
)
ratings_pred_matrix.head(3)

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.032402,3.763177,3.273126,4.407304,3.902972,1.121797,3.430295,2.316132,4.301166,3.850254,...,1.370699,4.06057,3.402056,2.59919,2.780331,3.7503,2.703799,2.080365,3.996262,0.84128
2,3.1134,3.130846,2.855477,3.765964,3.748858,1.210122,4.212414,1.614208,2.880316,3.060063,...,0.921541,3.618902,3.106185,2.500448,2.393956,3.791819,2.507229,1.444825,3.777343,0.627442
3,1.831385,1.039056,1.00832,1.729376,1.770943,0.494093,0.951114,0.550314,2.113869,1.662878,...,0.426308,1.36371,1.85457,1.378624,1.281677,0.732316,2.439503,0.705142,1.146918,0.307928


### 관람하지 않은 영화의 예측 평점 출력

In [6]:
def get_unseen_movies(ratings_matrix, userId):
    """Return the movies the given user has no positive rating for.

    Args:
        ratings_matrix: DataFrame with one row per user and one column per
            movie, holding that user's rating for the movie.
        userId: Row label of the user to look up.

    Returns:
        List of column labels, in column order, whose rating for this user
        is not greater than 0 (NaN and 0 both count as unseen).
    """
    # Ratings of the requested user across every movie
    user_rating = ratings_matrix.loc[userId, :]

    # A rating above 0 means the movie was already seen. Kept as a set so
    # the membership test below is O(1) per movie instead of a list scan.
    already_seen = set(user_rating[user_rating > 0].index)

    # Every movie, minus the ones already seen, in the original column order
    return [
        movie
        for movie in ratings_matrix.columns.tolist()
        if movie not in already_seen
    ]

In [7]:
def recomm_movie_by_userid(pred_df, userId, unseen_list, top_n=10):
    """Return the user's highest predicted ratings among the unseen movies.

    Args:
        pred_df: DataFrame of predicted ratings, one row per user and one
            column per movie.
        userId: Row label of the user to recommend for.
        unseen_list: Column labels to choose from.
        top_n: Number of entries to keep.

    Returns:
        Series of predicted ratings sorted from highest to lowest, cut to
        the first ``top_n`` entries.
    """
    # Predicted ratings of this user, restricted to the unseen movies
    candidate_scores = pred_df.loc[userId, unseen_list]
    # Highest predicted rating first
    ranked_scores = candidate_scores.sort_values(ascending=False)
    return ranked_scores[:top_n]

In [8]:
# Movies that user 9 has no positive rating for
unseen_list = get_unseen_movies(ratings_matrix, 9)

# Recommend movies with latent factor collaborative filtering: the ten
# unseen movies with the highest predicted rating for user 9.
recomm_movies = recomm_movie_by_userid(ratings_pred_matrix, 9, unseen_list, top_n=10)

# Turn the result into a one-column DataFrame for display
recomm_movies = recomm_movies.to_frame(name='pred_score')
recomm_movies

Unnamed: 0_level_0,pred_score
title,Unnamed: 1_level_1
Gattaca (1997),5.113103
Rear Window (1954),5.055717
Star Wars: Episode V - The Empire Strikes Back (1980),5.009944
Rounders (1998),4.9598
Monty Python and the Holy Grail (1975),4.949565
Roger & Me (1989),4.899791
"Remains of the Day, The (1993)",4.878511
Blade Runner (1982),4.874711
Gandhi (1982),4.869262
There's Something About Mary (1998),4.837735
