In [1]:
import pandas as pd
import numpy as np
import os
os.chdir('/Users/younghun/Desktop/gitrepo/data/ml-latest-small')
movies = pd.read_csv('movies.csv', encoding='utf-8')
ratings = pd.read_csv('ratings.csv', encoding='utf-8')
print(movies.shape)
print(ratings.shape)

(9742, 3)
(100836, 4)


In [2]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
merge_df = pd.merge(movies, ratings, on='movieId')
merge_df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483


# user-item 행렬 만들기

In [10]:
user_item = merge_df.pivot_table(index='userId',
                                columns='title',
                                values='rating')
# 결측치는 0으로 대체
user_item = user_item.fillna(0)
user_item.head(2)

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# 정규분포로 초기화해서 user_item 사이즈와 동일하게 결합한 행렬
# 분해할 사이즈 K 하이퍼파라미터 설정
K = 256
num_user = user_item.shape[0]
num_item = user_item.shape[1]

np.random.seed(1)
P = np.random.normal(scale=1./K, size=(num_user, K))
# Q는 잇다가 Transpose할 것임!
Q = np.random.normal(scale=1./K, size=(num_item, K))
print("A shape of 'P'matrix: ", P.shape)
print("A shape of 'Q'matrix: ", Q.shape)

A shape of 'P'matrix:  (610, 256)
A shape of 'Q'matrix:  (9719, 256)


In [18]:
# 원본 행렬의 0이 아닌 값(결측치가 아닌 값)의 인덱스를 골라 놓기
R = np.asarray(user_item)
non_zeros = [(i, j, R[i, j]) for i in range(num_user) for j in range(num_item) if R[i, j] > 0]

In [27]:
# 원본 행렬과 SVD로 결합한 행렬 0이 아닌 값들 끼리의 Cost function
from sklearn.metrics import mean_squared_error

def get_cost(R, P, Q, non_zeros):
    # SVD 결합한 새로운 행렬
    R_ = np.dot(P, Q.T)
    
    # 원본 행렬에서 0이 아닌 값들의 인덱스에 해당하는 새로운 행렬의 인덱스
    i_non_zero_idx = [non_zero[0] for non_zero in non_zeros]
    j_non_zero_idx = [non_zero[1] for non_zero in non_zeros]
    # 원본행렬과 만든행렬에서 0이 아닌 값들 추출
    Rnon_zeros = R[i_non_zero_idx, j_non_zero_idx]
    R_non_zeros = R_[i_non_zero_idx, j_non_zero_idx]
    # Cost 정의
    mse = mean_squared_error(Rnon_zeros, R_non_zeros)
    return np.sqrt(mse)

In [21]:
a = np.array([[1,2,3],
            [4,5,6]])
a

array([[1, 2, 3],
       [4, 5, 6]])

In [24]:
# a 행렬에서 (0,0)에 있는 값과 (1,1)에 있는 값 가져옴!
a[[0,1],[0,2]]

array([1, 6])

In [28]:
steps = 100
learning_rate = 0.01
r_lambda = 0.01

for step in range(steps):
    for i, j, v in non_zeros:
        # 0이 아닌 값들의 모든 i,j 위치에서의 각 R, R_ 값의 차이값 구하기
        error_ij = v - np.dot(P[i,:], Q[j,:].T)
        # 정규화(오버피팅예방) 적용된 파라미터 업데이트 공식 이용해서 P, Q행렬의 업데이트
        P[i, :] = P[i, :] + learning_rate*(error_ij*Q[j, :] - r_lambda*P[i, :])
        Q[j, :] = Q[j, :] + learning_rate*(error_ij*P[i, :] - r_lambda*Q[j, :])
        
    # 1번 step 돌았을 때 Cost function
    rmse = get_cost(R, P, Q, non_zeros)
    if step % 10 == 0:
        print(f"### Step: {step}, RMSE: {rmse: .4f}")

### Step: 0, RMSE:  1.6361
### Step: 10, RMSE:  0.7521
### Step: 20, RMSE:  0.5415
### Step: 30, RMSE:  0.3538
### Step: 40, RMSE:  0.2363
### Step: 50, RMSE:  0.1699
### Step: 60, RMSE:  0.1330
### Step: 70, RMSE:  0.1118
### Step: 80, RMSE:  0.0992
### Step: 90, RMSE:  0.0913


In [29]:
re_user_item = np.dot(P, Q.T)
re_user_item.shape

(610, 9719)

In [30]:
re_user_item = pd.DataFrame(re_user_item,
                           index=user_item.index,
                           columns=user_item.columns)
re_user_item.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.228077,4.216531,3.690624,4.948584,4.627499,1.395215,4.27668,2.40007,4.340007,4.01452,...,1.384066,4.202935,3.978444,2.955318,2.90078,4.367448,3.535411,2.258632,3.994355,0.909788
2,3.23475,3.893944,3.397626,4.534528,4.382139,1.341394,3.979365,2.089005,3.508625,3.680712,...,1.159304,3.95183,3.437129,2.732256,2.601988,4.082229,2.592683,1.789106,4.106172,0.83969
3,2.556898,2.279141,1.971096,2.936664,2.903239,0.898846,1.568912,1.656953,1.779822,2.63916,...,0.765771,2.758038,2.355026,2.125628,1.836326,3.338309,2.493763,1.006935,2.059108,0.543391
4,2.481544,3.13068,2.737064,3.665478,3.619511,0.966972,3.252921,2.003946,3.48617,2.860795,...,1.112399,3.430471,2.908917,2.260507,2.206313,2.656968,1.234947,1.537194,3.335598,0.630081
5,2.625715,3.473906,3.020612,3.900146,3.761487,1.029028,2.471812,1.882,2.248516,3.016761,...,1.096317,3.807394,2.974509,2.302322,2.289491,3.522031,2.076815,1.707321,3.32517,0.713698


In [32]:
def recommend_similar_movies(user_item, user_id, top_n=10):
    score_idx = user_item.iloc[user_id].values.argsort()[::-1]
    top_score_idx = score_idx[:top_n]
    score = user_item.iloc[user_id].values[top_score_idx]
    top_score_movies = user_item.T.index[top_score_idx].values
    
    top_movies_df = pd.DataFrame({'영화 제목': top_score_movies,
                                 '점수': score})
    return top_movies_df

In [33]:
user_1 = recommend_similar_movies(re_user_item,
                                 1, top_n=10)

In [34]:
user_1

Unnamed: 0,영화 제목,점수
0,Bossa Nova (2000),5.506609
1,"Man Who Would Be King, The (1975)",5.256015
2,Christmas Vacation (National Lampoon's Christm...,5.193871
3,Dune (2000),5.181763
4,Deathgasm (2015),5.175075
5,Arrival (2016),5.100004
6,"Seventh Seal, The (Sjunde inseglet, Det) (1957)",5.093048
7,"Thomas Crown Affair, The (1999)",5.083659
8,Phantom of the Paradise (1974),5.082042
9,Anchorman: The Legend of Ron Burgundy (2004),5.073694
