In [17]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

In [18]:
def get_rmse(R, P, Q, non_zeros):
    error = 0
    
    # 2개의 분해된 행렬 P와 Q.T의 내적으로 예측 R 행렬 생성
    full_pred_matrix = np.dot(P, Q.T)
    
    # 실제 R 행렬에서 널이 아닌 값의 위치 index 추출해서 실제 R행렬과 예측 행렬의 RMSE(오차율) 추출
    x_non_zero_i = [non_zero[0] for non_zero in non_zeros]
    y_non_zero_i = [non_zero[1] for non_zero in non_zeros]
    R_non_zeros = R[x_non_zero_i, y_non_zero_i]
    full_pred_matrix_non_zeros = full_pred_matrix[x_non_zero_i, y_non_zero_i]
    mse = mean_squared_error(R_non_zeros, full_pred_matrix_non_zeros)
    rmse = np.sqrt(mse)
    
    return rmse

In [19]:
def matrix_factorization(R, K, steps=200, learning_rate=0.01, r_lambda=0.01):
    R = np.array(R,dtype = float)
    num_users, num_items = R.shape
    # P와 Q 행렬의 크기를 지정하고 정규 분포를 가진 랜덤한 값으로 입력
    np.random.seed(1)
    P = np.random.normal(scale=1./K, size=(num_users, K))
    Q = np.random.normal(scale=1./K, size=(num_items, K))
    
    prev_rmse = 10000
    break_count = 0
    
    # R>0인 행 위치, 열 위치, 값을 non_zeros list 객체에 저장
    str_non_zeros = [(i, j, R[i,j]) for i in range(num_users) for j in range(num_items) if R[i,j]==1 or R[i,j] == -1]
    for i in range(num_users):
        for j in range(num_items):
            if R[i,j] == 1:
                R[i,j] = 1
            elif R[i,j] == -1:
                R[i,j] = 0
    non_zeros = []
    for i, j, r in str_non_zeros:
        if r == 1:
            non_zeros.append((i,j,1))
        elif r == -1:
            non_zeros.append((i,j,0))
    # SGD 기법으로 P와 Q 행렬을 반복 업데이트
    for step in range(steps):
        for i, j, r in non_zeros:
            # 실제 값과 예측 값의 차이인 오류 값 구하기
            eij = r - np.dot(P[i,:], Q[j,:].T)
            
            # 정규화를 반영한 SGD 업데이트 공식 적용
            P[i,:] = P[i,:] + learning_rate * (eij*Q[j,:] - r_lambda*P[i,:])
            Q[j,:] = Q[j,:] + learning_rate * (eij*P[i,:] - r_lambda*Q[j,:])
            
        rmse = get_rmse(R, P, Q, non_zeros)
        
        print("### iteration step:", step, " rmse:", rmse)
        
    return P, Q

In [20]:
# 데이터 프레임 작업

review = pd.read_csv('./escapable/review.csv',encoding='CP949' ,sep = ';', names=['review_index', 'cafe_name', 'thema_name', 'user_left_time', 'user_difficulty','user_escape','user_rate','user_nickname']) # 방탈출 리뷰 파일
print(type(review))
#review = review[['user_nickname', 'thema_name', 'user_escape']] # 유저 닉네임, 테마 이름, 평점

review_matrix = review.pivot_table('user_escape', index='user_nickname', columns='thema_name', aggfunc='first') # 사용자-테마 행렬

In [21]:
review

Unnamed: 0,review_index,cafe_name,thema_name,user_left_time,user_difficulty,user_escape,user_rate,user_nickname
0,1,비트포비아 미션브레이크 CGV 용산점,신서유기 : 신묘한 실종사건,300,쉬움,1,3.0,방린냥
1,2,비트포비아 미션브레이크 CGV 용산점,신서유기 : 신묘한 실종사건,2541,매우 쉬움,1,2.5,미옥
2,3,비트포비아 미션브레이크 CGV 용산점,신서유기 : 신묘한 실종사건,80,쉬움,1,2.5,진누
3,4,비트포비아 미션브레이크 CGV 용산점,신서유기 : 신묘한 실종사건,2629,매우 쉬움,1,2.0,빵수니
4,5,비트포비아 미션브레이크 CGV 용산점,신서유기 : 신묘한 실종사건,-3600,어려움,-1,3.0,튠
...,...,...,...,...,...,...,...,...
159263,161943,코드케이 홍대점,꼬레아 우라,1652,보통,1,5.0,Colory
159264,161944,코드케이 홍대점,꼬레아 우라,1650,어려움,1,4.5,깡우
159265,161945,코드케이 홍대점,꼬레아 우라,1470,보통,1,5.0,엔데
159266,161946,코드케이 홍대점,꼬레아 우라,688,보통,1,4.5,하핑


In [22]:
review_matrix

thema_name,(나쁜) 아기돼지 삼형제,13 여고괴담,170cm즈음에,1945 SPY GAME,23번째 실험연구 : 알비노,36.5 마온술사의 정원,3인의 저주,5010,501동사람들,ALBA (알바),...,화생설화 : Blooming,화이트데이,화이트룸,환생,황금 감옥 : 와캄,후레쉬망고 호스텔,휴가중,흐린날,흥보와 놀보,히로인
user_nickname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000,,,,,,,,,,,...,,,,,,,,,,
002,,,,,,,,,,,...,,,,,,,,,,
100,,,,,,,,,,,...,,,,,,,,,,
11,,,,,,,,,,,...,,,,,,,,,,
123456789,,,,,,,,,,,...,1.0,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
힌터,,,,,,,1.0,,,,...,,,,,,,,,,
힌트5개쓰고탈출,,,,,,,,,,,...,,,,,,,,,,
힌트는쓰라고있는거야,,,,,,,,,,,...,,,,,,1.0,,,,
힙스터,,,,,,,,,,,...,,,,,,,,,,


In [23]:
P, Q = matrix_factorization(review_matrix.values, K=50, steps=50, learning_rate=0.01, r_lambda=0.01)
pred_matrix = np.dot(P, Q.T)

### iteration step: 0  rmse: 0.905192137719839
### iteration step: 1  rmse: 0.7478891817751769
### iteration step: 2  rmse: 0.4628964982148888
### iteration step: 3  rmse: 0.3857074025145577
### iteration step: 4  rmse: 0.35413025217422
### iteration step: 5  rmse: 0.33761673401283177
### iteration step: 6  rmse: 0.32770383344092835
### iteration step: 7  rmse: 0.3212036524975483
### iteration step: 8  rmse: 0.31665740111157953
### iteration step: 9  rmse: 0.31330542037155434
### iteration step: 10  rmse: 0.31071378221200413
### iteration step: 11  rmse: 0.308616484519755
### iteration step: 12  rmse: 0.3068411214447338
### iteration step: 13  rmse: 0.30527107733471576
### iteration step: 14  rmse: 0.3038252943781932
### iteration step: 15  rmse: 0.30244708974633594
### iteration step: 16  rmse: 0.3010977321914619
### iteration step: 17  rmse: 0.2997523401590616
### iteration step: 18  rmse: 0.2983966130855924
### iteration step: 19  rmse: 0.2970235946504221
### iteration step: 20  rms

In [24]:
ratings_pred_matrix = pd.DataFrame(data=pred_matrix, index=review_matrix.index, columns=review_matrix.columns)
ratings_pred_matrix.head(5)

thema_name,(나쁜) 아기돼지 삼형제,13 여고괴담,170cm즈음에,1945 SPY GAME,23번째 실험연구 : 알비노,36.5 마온술사의 정원,3인의 저주,5010,501동사람들,ALBA (알바),...,화생설화 : Blooming,화이트데이,화이트룸,환생,황금 감옥 : 와캄,후레쉬망고 호스텔,휴가중,흐린날,흥보와 놀보,히로인
user_nickname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.47142,0.695251,0.595974,0.539744,0.677036,0.680068,0.699662,0.291903,0.582885,0.521831,...,0.658871,0.649121,0.283835,0.628942,0.563509,0.802088,0.534587,0.622931,0.660203,0.686208
2,0.848521,0.999162,0.920924,0.912667,0.962248,0.944025,1.003399,0.45401,0.913186,0.878834,...,1.007993,0.983815,0.315292,0.933342,0.912473,1.09602,0.855704,0.955745,0.9744,0.986891
100,0.907654,0.987425,0.935511,0.912078,0.986404,0.979187,1.001896,0.618705,0.927537,0.886925,...,1.022695,0.997403,0.26846,0.941997,0.921848,1.071184,0.910225,0.961271,0.963748,0.987061
11,0.289299,0.420372,0.359368,0.325328,0.390871,0.372751,0.40989,0.125531,0.343024,0.30623,...,0.456759,0.387539,0.092055,0.37997,0.354015,0.556068,0.288262,0.394297,0.39314,0.404712
123456789,0.807063,0.895151,0.804748,0.835498,0.895084,0.870357,0.921347,0.487875,0.858898,0.816599,...,0.964847,0.893538,0.246199,0.848644,0.840503,1.032817,0.821954,0.883265,0.890075,0.896075


In [25]:
ratings_pred_matrix[ratings_pred_matrix.index=='멍유']
for value in ratings_pred_matrix[ratings_pred_matrix.index=='멍유'].values:
    print(value[0])

0.8719165889571563


In [26]:
cnt = 0
al = 0
for real_value in review_matrix[review_matrix.index=='멍유'].values:
    pred_value = ratings_pred_matrix[ratings_pred_matrix.index=='멍유'].values[0]
    #print(pred_value)
    for i in range(len(real_value)):
        if real_value[i] == 1.0 or real_value[i] == -1:
            if real_value[i] * (pred_value[i]-0.5) > 0:
                cnt += 1
            al += 1
            
print("학습 성공률  : ",float(cnt/al))

학습 성공률  :  0.9502262443438914


In [27]:
for real_value in review_matrix[review_matrix.index=='티노'].values:
    print(real_value)

[ 1.  1.  1.  1.  1.  1. nan  1. nan nan nan  1.  1.  1. nan nan  1.  1.
  1. nan  1.  1.  1.  1.  1.  1. nan  1.  1. nan  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1. nan  1.  1. nan nan nan -1. nan -1.
  1.  1.  1. nan  1. nan nan nan nan  1. -1. -1. nan nan nan  1. nan nan
 nan nan -1. nan nan  1. nan  1.  1. nan nan  1. nan  1.  1.  1. nan  1.
  1.  1.  1.  1. nan  1.  1.  1.  1. nan  1.  1.  1.  1.  1.  1.  1.  1.
 nan  1.  1. nan  1. nan  1.  1.  1. -1.  1.  1.  1.  1.  1. nan nan  1.
  1. nan  1.  1. nan  1. nan nan  1.  1. nan  1. nan  1. nan  1.  1.  1.
 nan nan  1. nan  1. nan  1.  1.  1.  1. nan  1.  1. nan nan nan nan nan
  1. nan  1.  1. nan  1.  1. nan nan nan  1.  1.  1.  1.  1.  1.  1.  1.
 nan nan  1.  1.  1.  1. nan nan nan  1.  1. nan nan  1.  1.  1. nan  1.
  1. nan nan  1. nan nan nan  1.  1. nan  1.  1.  1. nan nan  1.  1.  1.
  1. -1. nan  1. nan -1. nan  1. nan nan nan  1.  1. nan nan nan  1.  1.
 nan nan  1. nan nan nan nan nan nan  1. nan nan na

In [28]:
R = review_matrix.values
R = np.array(R,dtype = float)
num_users, num_items = review_matrix.values.shape
al = 0
cnt = 0
np.nan_to_num(R,copy=False)
str_non_zeros = [(i, j, R[i,j]) for i in range(num_users) for j in range(num_items) if R[i,j] != 0]
for i,j,r in str_non_zeros:
    if (ratings_pred_matrix.values[i,j] - 0.5) * r >= 0:
        cnt += 1
    al += 1
print("학습 성공률 : " + str(float(cnt/al)))

학습 성공률 : 0.934695651626152
