# 경사하강을 이용한 행렬 분해 

- 첫 번째로, 원본 행렬 R을 분해할 P, Q 행렬을 정규분포를 따르는 랜덤 값으로 초기화한다.

- np.random.normal에서 size인자로 shape을 넣어주면 해당 size로 행렬을 만들어줌

In [4]:
import numpy as np

# Build the original ratings matrix R; np.nan marks entries with no rating.
# np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed.
R = np.array([[4, np.nan, np.nan, 2, np.nan],
              [np.nan, 5, np.nan, 3, 1],
              [np.nan, np.nan, 3, 4, 4],
              [5, 2, 1, 2, np.nan]])

# Unpack the shape (rows, columns) into the user and item counts at once.
num_users, num_items = R.shape
print(num_users, num_items)
# Number of latent factors.
K = 3

# Fix the seed so the random initialization below is reproducible.
np.random.seed(1)
# P: user x latent-factor matrix, drawn from a normal distribution with
# standard deviation 1/K.
P = np.random.normal(scale=1./K, size=(num_users, K))
# Q: item x latent-factor matrix; it is transposed when multiplied with P.
Q = np.random.normal(scale=1./K, size=(num_items, K))
print('P:', P)
print('Q:', Q)

4 5
P: [[ 0.54144845 -0.2039188  -0.17605725]
 [-0.35765621  0.28846921 -0.76717957]
 [ 0.58160392 -0.25373563  0.10634637]
 [-0.08312346  0.48736931 -0.68671357]]
Q: [[-0.1074724  -0.12801812  0.37792315]
 [-0.36663042 -0.05747607 -0.29261947]
 [ 0.01407125  0.19427174 -0.36687306]
 [ 0.38157457  0.30053024  0.16749811]
 [ 0.30028532 -0.22790929 -0.04096341]]


- Cost function을 생성
- 분해된 행렬 P와 Q의 T행렬을 내적해 예측 행렬 생성
- 실제 행렬 R에서 NaN이 아닌 값(실제 평점)만 가져온다. 그리고 정규분포로 랜덤하게 초기화한 P 행렬과 Q의 전치행렬을 내적해 만든 예측 행렬 R'에서, R의 NaN이 아닌 위치에 해당하는 값들만 뽑아 실제 값과의 오차(cost, 여기서는 RMSE)를 계산한다.

In [3]:
from sklearn.metrics import mean_squared_error

def get_rmse(R, P, Q, non_zeros):
    """Return the RMSE between R and its reconstruction over observed entries.

    The prediction matrix is ``np.dot(P, Q.T)``. Only the positions listed in
    ``non_zeros`` are compared, so unobserved entries of ``R`` are never read.

    Args:
        R: 2-D array of actual ratings.
        P: user x latent-factor matrix.
        Q: item x latent-factor matrix.
        non_zeros: sequence of ``(row, col, value)`` tuples naming the
            observed entries of ``R``; only the row and column are used.

    Returns:
        The root mean squared error over the observed entries.

    Raises:
        ValueError: if ``non_zeros`` is empty (there is nothing to average).
    """
    if len(non_zeros) == 0:
        raise ValueError("non_zeros is empty: no observed entries to compare")

    # Full prediction matrix R' from the two factor matrices.
    predicted = np.dot(P, Q.T)

    # Row and column positions of the observed entries.
    row_idx = [entry[0] for entry in non_zeros]
    col_idx = [entry[1] for entry in non_zeros]

    # Pull the same positions out of the actual and predicted matrices; both
    # results are 1-D arrays aligned element by element.
    actual_values = R[row_idx, col_idx]
    predicted_values = predicted[row_idx, col_idx]

    # Mean squared error computed directly with NumPy, then its square root.
    mse = np.mean((actual_values - predicted_values) ** 2)
    return np.sqrt(mse)

- 경사하강법으로 P, Q 행렬의 값을 반복적으로 업데이트해서, 예측 행렬 R'(P와 Q 전치행렬의 내적)이 실제 값에 가까워지도록 하자

In [None]:
# Illustrative only: the string literal below is never executed. It spells out,
# as plain nested for-loops, how a list of (row, col, value) tuples is built
# for the entries of R that satisfy `v > 0` (a NaN entry fails that
# comparison, so missing ratings are skipped).
'''
<일반 for문을 돌리면 이런식으로>
non_zeros = []
for i in range(num_users):
    for j in range(num_items):
        v = R[i, j]
        if v > 0:
            non_zeros.append((i, j, v))
'''

In [29]:
# Collect every observed rating as a (row, col, value) tuple. A NaN entry
# fails the `> 0` comparison, so missing ratings are left out.
non_zeros = [
    (i, j, R[i, j])
    for i, j in np.ndindex(num_users, num_items)
    if R[i, j] > 0
]

steps = 1000
learning_rate = 0.01
r_lambda = 0.01

# Stochastic gradient descent: visit each observed rating and nudge the
# matching row of P and row of Q toward a smaller prediction error.
for step in range(steps):
    for i, j, v in non_zeros:
        # Error between the actual rating and the current prediction.
        eij = v - P[i, :].dot(Q[j, :])
        # Regularized SGD update. The order matters: Q's update reads the
        # row of P that was just updated.
        P[i, :] += learning_rate * (eij * Q[j, :] - r_lambda * P[i, :])
        Q[j, :] += learning_rate * (eij * P[i, :] - r_lambda * Q[j, :])
    # RMSE is measured only at the observed positions of R, although every
    # element of the prediction matrix P . Q^T changes as P and Q are updated.
    rmse = get_rmse(R, P, Q, non_zeros)
    # Report progress every 50 steps.
    if step % 50 == 0:
        print('### iteration step: ', step, "RMSE: ", rmse)

### iteration step:  0 RMSE:  3.2388050277987723
### iteration step:  50 RMSE:  0.4876723101369648
### iteration step:  100 RMSE:  0.1564340384819247
### iteration step:  150 RMSE:  0.07455141311978046
### iteration step:  200 RMSE:  0.04325226798579314
### iteration step:  250 RMSE:  0.029248328780878973
### iteration step:  300 RMSE:  0.022621116143829466
### iteration step:  350 RMSE:  0.019493636196525135
### iteration step:  400 RMSE:  0.018022719092132704
### iteration step:  450 RMSE:  0.01731968595344266
### iteration step:  500 RMSE:  0.016973657887570753
### iteration step:  550 RMSE:  0.016796804595895633
### iteration step:  600 RMSE:  0.01670132290188466
### iteration step:  650 RMSE:  0.01664473691247669
### iteration step:  700 RMSE:  0.016605910068210026
### iteration step:  750 RMSE:  0.016574200475705
### iteration step:  800 RMSE:  0.01654431582921597
### iteration step:  850 RMSE:  0.01651375177473524
### iteration step:  900 RMSE:  0.01648146573819501
### iteration

In [30]:
# Multiply the trained factor matrices to rebuild the full prediction matrix R'.
final_R_ = P.dot(Q.T)
print('예측 행렬 :\n', final_R_.round(3))

예측 행렬 :
 [[3.991 0.897 1.306 2.002 1.663]
 [6.696 4.978 0.979 2.981 1.003]
 [6.677 0.391 2.987 3.977 3.986]
 [4.968 2.005 1.006 2.017 1.14 ]]


# 잠재요인 기반 협업필터링 실습

In [43]:
def matrix_factorization(R, K, steps=200, learning_rate=0.01,
                         r_lambda=0.01, verbose=True):
    """Factorize a rating matrix into P and Q with regularized SGD.

    Only entries of ``R`` that satisfy ``R[i, j] > 0`` are used for training;
    NaN entries fail that comparison and are therefore skipped (as are zero
    or negative values).

    Args:
        R: 2-D array of ratings (users x items).
        K: number of latent factors.
        steps: number of passes over the observed ratings.
        learning_rate: SGD step size.
        r_lambda: L2 regularization strength.
        verbose: when True (default), print the training RMSE every 10 steps.

    Returns:
        A tuple ``(P, Q)`` where ``P`` has shape (num_users, K), ``Q`` has
        shape (num_items, K), and ``np.dot(P, Q.T)`` approximates ``R``.

    Note:
        The global NumPy random seed is reset to 1 on every call so the
        initialization is reproducible.
    """
    num_users, num_items = R.shape
    np.random.seed(1)
    P = np.random.normal(scale=1./K, size=(num_users, K))
    Q = np.random.normal(scale=1./K, size=(num_items, K))

    # Observed ratings as (row, col, value) tuples.
    non_zeros = [(i, j, R[i, j])
                 for i in range(num_users)
                 for j in range(num_items)
                 if R[i, j] > 0]

    for step in range(steps):
        for i, j, v in non_zeros:
            # Error between the actual rating and the current prediction.
            eij = v - np.dot(P[i, :], Q[j, :].T)
            # Regularized SGD update. The order matters: Q's update reads
            # the row of P that was just updated.
            P[i, :] = P[i, :] + learning_rate*(eij*Q[j, :] - r_lambda*P[i, :])
            Q[j, :] = Q[j, :] + learning_rate*(eij*P[i, :] - r_lambda*Q[j, :])

        # The RMSE needs a full P . Q^T product, so compute it only on the
        # steps where it is actually reported.
        if verbose and (step % 10) == 0:
            rmse = get_rmse(R, P, Q, non_zeros)
            print('##### iteration step :', step, 'RMSE :', rmse)

    # Factor matrices after all SGD passes.
    return P, Q

In [39]:
import pandas as pd
import os
# NOTE(review): hard-coded, machine-specific absolute path; changing the
# working directory also affects every later relative path in the session.
DATA_DIR = '/Users/younghun/Desktop/gitrepo/data/ml-latest-small/'
os.chdir(DATA_DIR)

# Load the movie metadata and the ratings, keeping only the three rating
# columns used afterwards.
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')[['userId', 'movieId', 'rating']]

In [41]:
# Attach each rating's movie title via movieId, then pivot into a
# userId x title matrix of ratings. User/title pairs without a rating end up
# as NaN; pivot_table aggregates duplicate pairs with its default (mean).
ratings_movies = ratings.merge(movies, on='movieId')
ratings_matrix = pd.pivot_table(
    ratings_movies,
    index='userId',
    columns='title',
    values='rating',
)
ratings_matrix.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [44]:
# Factorize the userId x title rating matrix with 50 latent factors, then
# multiply the factors back together to get a dense predicted-rating matrix.
P, Q = matrix_factorization(ratings_matrix.to_numpy(), K=50)
pred_matrix = P.dot(Q.T)

##### iteration step : 0 RMSE : 2.9023619751336867
##### iteration step : 10 RMSE : 0.7335768591017927
##### iteration step : 20 RMSE : 0.5115539026853442
##### iteration step : 30 RMSE : 0.37261628282537446
##### iteration step : 40 RMSE : 0.2960818299181014
##### iteration step : 50 RMSE : 0.2520353192341642
##### iteration step : 60 RMSE : 0.22487503275269854
##### iteration step : 70 RMSE : 0.20685455302331537
##### iteration step : 80 RMSE : 0.19413418783028685
##### iteration step : 90 RMSE : 0.18470082002720403
##### iteration step : 100 RMSE : 0.17742927527209104
##### iteration step : 110 RMSE : 0.1716522696470749
##### iteration step : 120 RMSE : 0.1669518194687172
##### iteration step : 130 RMSE : 0.1630529219199754
##### iteration step : 140 RMSE : 0.15976691929679643
##### iteration step : 150 RMSE : 0.1569598699945732
##### iteration step : 160 RMSE : 0.1545339818671543
##### iteration step : 170 RMSE : 0.15241618551077643
##### iteration step : 180 RMSE : 0.1505508073962

In [46]:
# Wrap the predicted ratings in a DataFrame that reuses the userId index and
# title columns of the original matrix, so values can be looked up by label.
ratings_pred_matrix = pd.DataFrame(
    pred_matrix,
    index=ratings_matrix.index,
    columns=ratings_matrix.columns,
)
ratings_pred_matrix.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.055084,4.092018,3.56413,4.502167,3.981215,1.271694,3.603274,2.333266,5.091749,3.972454,...,1.402608,4.208382,3.705957,2.720514,2.787331,3.475076,3.253458,2.161087,4.010495,0.859474
2,3.170119,3.657992,3.308707,4.166521,4.31189,1.275469,4.237972,1.900366,3.392859,3.647421,...,0.973811,3.528264,3.361532,2.672535,2.404456,4.232789,2.911602,1.634576,4.135735,0.725684
3,2.307073,1.658853,1.443538,2.208859,2.229486,0.78076,1.997043,0.924908,2.9707,2.551446,...,0.520354,1.709494,2.281596,1.782833,1.635173,1.323276,2.88758,1.042618,2.29389,0.396941
4,2.628629,3.03555,2.575746,3.706912,3.430636,0.706441,3.33028,1.978826,4.560368,2.77571,...,1.046116,2.912178,2.479592,2.231915,1.888629,2.211364,0.645603,1.585734,3.542892,0.59154
5,2.116148,3.084761,2.747679,3.78349,3.94699,0.883259,1.958953,1.757317,2.054312,2.775258,...,0.956159,3.893975,2.717024,2.002443,2.053337,3.983639,2.099626,1.423718,2.490428,0.531403


In [61]:
# Replace the missing ratings (NaN) in the original rating matrix with 0.
ratings_matrix = ratings_matrix.fillna(value=0)

In [67]:
def get_unseen_movies(ratings_matrix, userId):
    """Return the titles that the given user has not rated.

    A title counts as unseen when the user's rating is 0 or NaN, so the
    function works both on a zero-filled matrix and on one that still holds
    NaN for missing ratings.

    Args:
        ratings_matrix: DataFrame of ratings indexed by user id, with one
            column per movie title.
        userId: index label of the user to look up.

    Returns:
        A list of column labels (titles), in column order, that the user has
        not rated.

    Raises:
        KeyError: if ``userId`` is not in the index of ``ratings_matrix``.
    """
    user_rating = ratings_matrix.loc[userId, :]
    # 0 marks "unrated" after fillna(0); NaN marks it before.
    unseen_mask = (user_rating == 0) | user_rating.isna()
    return user_rating[unseen_mask].index.tolist()

In [68]:
def recomm_movie_by_userid(pred_df, userId, unseen_movies, top_n=10):
    """Return the user's highest predicted ratings among unseen movies.

    Args:
        pred_df: DataFrame of predicted ratings (rows are users, columns are
            movie titles).
        userId: index label of the user in ``pred_df``.
        unseen_movies: column labels to choose from.
        top_n: number of entries to keep.

    Returns:
        A Series of predicted ratings for the selected titles, sorted from
        highest to lowest and cut to the first ``top_n`` entries.
    """
    # Predicted scores of this user, restricted to the unseen titles.
    candidate_scores = pred_df.loc[userId, unseen_movies]
    ranked_scores = candidate_scores.sort_values(ascending=False)
    return ranked_scores[:top_n]

In [71]:
# Titles that user 5 has not rated yet.
unseen_list = get_unseen_movies(ratings_matrix, 5)
# Top predicted ratings for user 5 among those titles, as a Series.
recomm_movies = recomm_movie_by_userid(ratings_pred_matrix, 5, unseen_list)
# Turn the Series into a one-column DataFrame named 'pred_score'.
recomm_movies_df = recomm_movies.to_frame(name='pred_score')
recomm_movies_df

Unnamed: 0_level_0,pred_score
title,Unnamed: 1_level_1
Psycho (1960),5.551539
12 Angry Men (1957),5.482291
Ed Wood (1994),5.369367
Wallace & Gromit: The Best of Aardman Animation (1996),5.350756
No Country for Old Men (2007),5.320315
Jaws (1975),5.302713
"Fisher King, The (1991)",5.293706
Dancer in the Dark (2000),5.18448
"Talented Mr. Ripley, The (1999)",5.117104
Amadeus (1984),5.004169
