In [1]:
import random
import numpy as np 
import pandas as pd
import operator
from scipy.sparse import coo_matrix
from numpy.linalg import norm
from sklearn.metrics import mean_squared_error

In [2]:
ls

[0m[01;34mdrive[0m/  [01;34msample_data[0m/


In [3]:
import os, sys 
from google.colab import drive 

### Running this cell connects /content/drive/My Drive (inside the Colab working folder) to Google Drive.

# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
#### Inspect the layout of the movie dataset ####
SUBMIT = False

if SUBMIT:
    # Submission mode: read through the Drive path relative to the current directory.
    movies = pd.read_csv("drive/MyDrive/data/others/movies.csv")
else:
    # Working mode: switch into the project folder first, then use project-relative paths.
    os.chdir("/content/drive/MyDrive/WorkColab/AI_Boostcamp/week5_graph")
    print(os.getcwd())
    movies = pd.read_csv('data/others/movies.csv')

print("### Movie Dataset Format ###", end='\n\n')
print(movies.head())

/content/drive/MyDrive/WorkColab/AI_Boostcamp/week5_graph
### Movie Dataset Format ###

   movieId  ...                                       genres
0        1  ...  Adventure|Animation|Children|Comedy|Fantasy
1        2  ...                   Adventure|Children|Fantasy
2        3  ...                               Comedy|Romance
3        4  ...                         Comedy|Drama|Romance
4        5  ...                                       Comedy

[5 rows x 3 columns]


In [5]:
################### Pre-processing for the synthetic users: collect the movie ids of each genre #################

# These per-genre lists are used later to create synthetic users who favour a
# particular genre, so that the recommendations produced for them can be inspected.

movie_dict = {}                     # {movie_id : (movie_title, movie_genre)}
musical_list = []                   # ids of movies tagged Musical
horror_list = []                    # ids of movies tagged Horror
documentary_list = []               # ids of movies tagged Documentary
comedy_list = []                    # ids of movies tagged Comedy
animation_list = []                 # ids of movies tagged Animation

# Genre tag -> the list that collects ids of movies carrying that tag.
genre_buckets = {
    'Musical': musical_list,
    'Horror': horror_list,
    'Documentary': documentary_list,
    'Comedy': comedy_list,
    'Animation': animation_list,
}

for movie_id, movie_title, movie_genre in movies.itertuples(index=False, name=None):
    movie_dict[movie_id] = (movie_title, movie_genre)
    # Substring test against the genre string; one movie may land in several lists.
    for genre_tag, bucket in genre_buckets.items():
        if genre_tag in movie_genre:
            bucket.append(movie_id)

In [6]:
# Pick the ratings file that matches the current run mode.
file_path = "drive/MyDrive/data/others/ratings.csv" if SUBMIT else 'data/others/ratings.csv'

ratings = pd.read_csv(file_path)

# Show the raw layout, then the layout once the timestamp column is discarded.
print("### Rating Dataset Format ###", end='\n\n')
print(ratings.head(), end='\n\n\n')
ratings = ratings.drop(columns=['timestamp'])
print(ratings.head())

### Rating Dataset Format ###

   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


   userId  movieId  rating
0       1        1     4.0
1       1        3     4.0
2       1        6     4.0
3       1       47     5.0
4       1       50     5.0


In [7]:
# Design notes for the synthetic users:
#   * Real users rated roughly 130 movies on average, so each synthetic user
#     receives a comparable number of ratings.
#   * The movie order is shuffled before ratings are assigned, and a liked
#     movie gets a random 4 or 5 instead of always 5.
#   * Low scores are randomised in the same way.
#
# Add a user with uid == 800 who is a big fan of musicals.

random.shuffle(musical_list)
random.shuffle(horror_list)
random.shuffle(documentary_list)
random.shuffle(comedy_list)

print(ratings)
new_uid = 800
rows = list()
for movie_id in musical_list[:100]:        # loved genre: 4-5
    rows.append([new_uid, movie_id, random.randint(4,5)])
for movie_id in horror_list[:50]:          # disliked genre: 1-2
    rows.append([new_uid, movie_id, random.randint(1,2)])
for movie_id in documentary_list[:20]:     # lukewarm genre: 2-3
    rows.append([new_uid, movie_id, random.randint(2,3)])

# DataFrame.append no longer exists in pandas 2.x, and appending row by row
# copies the whole frame each time. Build the new rows once and concatenate.
if rows:
    ratings = pd.concat(
        [ratings, pd.DataFrame(rows, columns=ratings.columns)],
        ignore_index=True,
    )
print(ratings)


        userId  movieId  rating
0            1        1     4.0
1            1        3     4.0
2            1        6     4.0
3            1       47     5.0
4            1       50     5.0
...        ...      ...     ...
100831     610   166534     4.0
100832     610   168248     5.0
100833     610   168250     5.0
100834     610   168252     5.0
100835     610   170875     3.0

[100836 rows x 3 columns]
        userId  movieId  rating
0            1        1     4.0
1            1        3     4.0
2            1        6     4.0
3            1       47     5.0
4            1       50     5.0
...        ...      ...     ...
101001     800   165843     2.0
101002     800     6331     2.0
101003     800   102742     3.0
101004     800     6380     2.0
101005     800    48322     3.0

[101006 rows x 3 columns]


In [8]:
# Add a user with uid == 850 who is a big fan of documentaries.

random.shuffle(musical_list)
random.shuffle(horror_list)
random.shuffle(documentary_list)
random.shuffle(comedy_list)
random.shuffle(animation_list)


print(ratings)
new_uid = 850
rows = list()
for movie_id in documentary_list[:100]:    # loved genre: 4-5
    rows.append([new_uid, movie_id, random.randint(4,5)])
for movie_id in horror_list[:50]:          # disliked genre: 1-2
    rows.append([new_uid, movie_id, random.randint(1,2)])
for movie_id in animation_list[:10]:       # disliked genre: 1-2
    rows.append([new_uid, movie_id, random.randint(1,2)])

# DataFrame.append no longer exists in pandas 2.x, and appending row by row
# copies the whole frame each time. Build the new rows once and concatenate.
if rows:
    ratings = pd.concat(
        [ratings, pd.DataFrame(rows, columns=ratings.columns)],
        ignore_index=True,
    )
print(ratings)

        userId  movieId  rating
0            1        1     4.0
1            1        3     4.0
2            1        6     4.0
3            1       47     5.0
4            1       50     5.0
...        ...      ...     ...
101001     800   165843     2.0
101002     800     6331     2.0
101003     800   102742     3.0
101004     800     6380     2.0
101005     800    48322     3.0

[101006 rows x 3 columns]
        userId  movieId  rating
0            1        1     4.0
1            1        3     4.0
2            1        6     4.0
3            1       47     5.0
4            1       50     5.0
...        ...      ...     ...
101161     850     1148     1.0
101162     850   136556     1.0
101163     850   177765     1.0
101164     850    32456     2.0
101165     850   112006     2.0

[101166 rows x 3 columns]


In [9]:
# Add a user with uid == 900 who is a big fan of horror.

random.shuffle(musical_list)
random.shuffle(horror_list)
random.shuffle(documentary_list)
random.shuffle(comedy_list)
random.shuffle(animation_list)

new_uid = 900
rows = list()
for movie_id in horror_list[:120]:         # loved genre: 4-5
    rows.append([new_uid, movie_id, random.randint(4,5)])
for movie_id in documentary_list[:10]:     # disliked genre: 1-2
    rows.append([new_uid, movie_id, random.randint(1,2)])
for movie_id in comedy_list[:30]:          # disliked genre: 1-2
    rows.append([new_uid, movie_id, random.randint(1,2)])
for movie_id in animation_list[:20]:       # disliked genre: 1-2
    rows.append([new_uid, movie_id, random.randint(1,2)])

# DataFrame.append no longer exists in pandas 2.x, and appending row by row
# copies the whole frame each time. Build the new rows once and concatenate.
if rows:
    ratings = pd.concat(
        [ratings, pd.DataFrame(rows, columns=ratings.columns)],
        ignore_index=True,
    )
print(ratings)

        userId  movieId  rating
0            1        1     4.0
1            1        3     4.0
2            1        6     4.0
3            1       47     5.0
4            1       50     5.0
...        ...      ...     ...
101341     900    79008     2.0
101342     900    27186     1.0
101343     900    26662     1.0
101344     900   120130     2.0
101345     900    85736     2.0

[101346 rows x 3 columns]


In [10]:
# Add a user with uid == 950 who is a big fan of animation.

random.shuffle(musical_list)
random.shuffle(horror_list)
random.shuffle(documentary_list)
random.shuffle(comedy_list)
random.shuffle(animation_list)

new_uid = 950
rows = list()
for movie_id in horror_list[:20]:          # disliked genre: 1-2
    rows.append([new_uid, movie_id, random.randint(1,2)])
for movie_id in documentary_list[:10]:     # mostly low: 1-3
    rows.append([new_uid, movie_id, random.randint(1,3)])
for movie_id in comedy_list[:30]:          # middling: 2-4
    rows.append([new_uid, movie_id, random.randint(2,4)])
for movie_id in animation_list[:150]:      # favourite genre: 3-5
    rows.append([new_uid, movie_id, random.randint(3,5)])

# DataFrame.append no longer exists in pandas 2.x, and appending row by row
# copies the whole frame each time. Build the new rows once and concatenate.
if rows:
    ratings = pd.concat(
        [ratings, pd.DataFrame(rows, columns=ratings.columns)],
        ignore_index=True,
    )
print(ratings)

        userId  movieId  rating
0            1        1     4.0
1            1        3     4.0
2            1        6     4.0
3            1       47     5.0
4            1       50     5.0
...        ...      ...     ...
101551     950   128968     5.0
101552     950    95165     5.0
101553     950   106873     4.0
101554     950    92348     5.0
101555     950   115664     3.0

[101556 rows x 3 columns]


In [11]:
########################################################################
######### Mapping user id, movie id to user index, movie index #########
########################################################################

uid_2_idx = {}     # user id --> user idx
mid_2_idx = {}     # movie id --> movie idx

for user_id, movie_id, r in ratings.itertuples(index=False):
    # setdefault leaves ids that are already mapped untouched; an unseen id
    # receives the next free index, which is the current size of the mapping.
    uid_2_idx.setdefault(user_id, len(uid_2_idx))
    mid_2_idx.setdefault(movie_id, len(mid_2_idx))

# The next unused index of each mapping equals the number of distinct ids seen.
u_idx = num_user = len(uid_2_idx)
m_idx = num_movie = len(mid_2_idx)
print(num_user) # number of users
print(num_movie) # number of movies

# Inverse lookups, built by swapping each (id, idx) pair.
uidx_2_id = dict(zip(uid_2_idx.values(), uid_2_idx.keys()))    # user idx --> user id
midx_2_id = dict(zip(mid_2_idx.values(), mid_2_idx.keys()))    # movie idx --> movie id


614
9726


In [12]:
# Initialise the rating matrix: a num_user x num_movie numpy array in which each
# element is the rating a user gave a movie (0 where there is no rating).
rating_matrix = np.zeros(shape=(num_user, num_movie))

for user_id, movie_id, r in ratings.itertuples(index=False):
    # Translate the raw ids into row / column positions before storing the rating.
    u_idx, m_idx = uid_2_idx[user_id], mid_2_idx[movie_id]
    rating_matrix[u_idx, m_idx] = r

rating_matrix

array([[4., 4., 4., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 5.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 3.]])

In [13]:
######################################################################################################################################
# Split a rating matrix into a training set and a test set.
# For every row (user), one fifth of that row's nonzero ratings (rounded down) is drawn at random and moved to the test set.
######################################################################################################################################
def train_test_split(ratings):
    """Hold out a random fifth of each row's nonzero ratings.

    Args:
        ratings: 2-D numpy array in which 0 marks "no rating".

    Returns:
        (train, test): two arrays shaped like ``ratings``. Each nonzero entry
        of ``ratings`` appears in exactly one of them, and the other holds 0
        at that position. ``ratings`` itself is not modified.

    The draw uses numpy's global random state (``np.random.choice``).
    """
    train = ratings.copy()
    test = np.zeros_like(ratings)

    for row, row_ratings in enumerate(ratings):
        rated_cols = np.flatnonzero(row_ratings)
        held_out = np.random.choice(rated_cols,
                                    size=len(rated_cols) // 5,
                                    replace=False)
        test[row, held_out] = row_ratings[held_out]
        train[row, held_out] = 0.

    # The two sets must not share any rated position.
    assert not np.any(train * test)

    return train, test

######################################################################################
###################### Subtract each user's mean from their ratings ##################
######################################################################################
####### The mean is taken over existing (nonzero) ratings only, and is subtracted only at those positions
def subtract_mean_ratings(ratings):
    """Centre every row of a rating matrix on the mean of its nonzero entries.

    Args:
        ratings: 2-D numpy array in which 0 marks "no rating".

    Returns:
        (mean_subtracted_ratings, avg_ratings):
        ``mean_subtracted_ratings`` is shaped like ``ratings``. Rated
        positions hold ``rating - row mean`` and unrated positions stay 0.
        ``avg_ratings`` is a 1-D array holding the mean of each row, or 0 for
        a row without any rating.
    """
    mean_subtracted_ratings = np.zeros_like(ratings)
    avg_ratings = np.zeros(ratings.shape[0])
    for i in range(ratings.shape[0]):
        nonzero_idx = ratings[i].nonzero()[0]                       # positions that hold a rating
        num_nonzero = len(nonzero_idx)
        if num_nonzero == 0:
            # Nothing rated in this row: both outputs already hold 0 here.
            # Skipping also avoids the 0/0 division and its RuntimeWarning.
            continue
        avg_rating = np.sum(ratings[i]) / num_nonzero               # unrated cells add 0 to the sum
        avg_ratings[i] = avg_rating
        # Subtract the mean at rated positions only.
        mean_subtracted_ratings[i, nonzero_idx] = ratings[i, nonzero_idx] - avg_rating
    return mean_subtracted_ratings, avg_ratings


In [14]:
# Hold out part of every user's ratings for evaluation.
train_ratings, test_ratings = train_test_split(rating_matrix)
# Centre the training ratings on each user's mean; avg_ratings keeps those means.
mean_subtracted_ratings, avg_ratings = subtract_mean_ratings(train_ratings)

In [15]:
def calculate_rmse(R, U, V, lambda_u, lambda_v):
    """Square root of the regularised squared error averaged over stored ratings.

    Args:
        R: sparse rating matrix exposing ``row``, ``col`` and ``data``.
        U: array whose rows are indexed by the user positions in ``R.row``.
        V: array whose columns are indexed by the item positions in ``R.col``.
        lambda_u: weight applied to the squared entries of U.
        lambda_v: weight applied to the squared entries of V.

    Returns:
        sqrt((squared prediction errors + lambda_u * sum(U**2)
              + lambda_v * sum(V**2)) / number of stored ratings).
        Note that the regularisation terms are part of the reported value.
    """
    total = 0

    # Squared prediction error on every stored rating.
    for row, col, rating in zip(R.row, R.col, R.data):
        residual = rating - (U[row] * V[:, col]).sum()
        total += residual ** 2

    # Penalty on every user vector (row of U) ...
    for user_vec in U:
        total += lambda_u * ((user_vec ** 2).sum())

    # ... and on every item vector (column of V).
    for item_vec in V.T:
        total += lambda_v * ((item_vec ** 2).sum())

    return np.sqrt(total / len(R.data))

def SGD(U, V, R, lr, lambda_u, lambda_v):
    """Run one pass of stochastic gradient descent over the given ratings.

    Args:
        U: array with one row per user; updated in place.
        V: array with one column per item; updated in place.
        R: iterable of (user index, item index, rating) triples, visited in order.
        lr: learning rate.
        lambda_u: regularisation weight for the rows of U.
        lambda_v: regularisation weight for the columns of V.

    Returns:
        The same U and V objects that were passed in, after the updates.
    """
    for user, item, rating in R:
        user_vec = U[user]
        item_vec = V[:, item]
        residual = rating - (user_vec * item_vec).sum()

        # Both steps are computed from the pre-update vectors before either is written back.
        step_user = lr * (2 * residual * (-item_vec) + 2 * lambda_u * user_vec)
        step_item = lr * (2 * residual * (-user_vec) + 2 * lambda_v * item_vec)

        U[user] = user_vec - step_user
        V[:, item] = item_vec - step_item
    return U, V


def train(ratings, dim=10, max_epoch=50, lambda_u=0.1, lambda_v=0.1, lr=0.01):
    """Factorise a rating matrix into U (users x dim) and V (dim x items) with SGD.

    Args:
        ratings: 2-D array; it is converted with ``coo_matrix`` and the stored
            entries of that sparse matrix are the training samples.
        dim: number of latent dimensions.
        max_epoch: number of passes over the samples.
        lambda_u: regularization coefficient of the U matrix.
        lambda_v: regularization coefficient of the V matrix.
        lr: initial learning rate; it is halved at epochs 10, 20, 30, ...

    Returns:
        (U, V) after the final epoch. The value from ``calculate_rmse`` is
        printed before training and after every epoch.
    """
    n_users, n_items = ratings.shape

    # Random starting factors, drawn from numpy's global random state.
    U = np.random.rand(n_users, dim)
    V = np.random.rand(dim, n_items)
    R = coo_matrix(ratings)
    print("Initial RMSE: " + str(calculate_rmse(R, U, V, lambda_u, lambda_v)))

    for epoch in range(max_epoch):
        if epoch > 0 and epoch % 10 == 0:
            lr /= 2
        # Visit the samples in a fresh random order every epoch.
        samples = list(zip(R.row, R.col, R.data))
        random.shuffle(samples)
        U, V = SGD(U, V, samples, lr, lambda_u, lambda_v)
        rmse = calculate_rmse(R, U, V, lambda_u, lambda_v)
        print('Epoch: {:5}, RMSE: {:15}, Learning Rate:{}'.format(epoch, rmse, lr))

    return U, V

def predict(U, V, user_id=None):
    """Return predicted ratings from the factor matrices.

    Without ``user_id`` the full predicted rating matrix ``U . V`` is returned.
    With ``user_id`` only that user's predictions are returned; the id is
    translated to a row of U through the notebook-level ``uid_2_idx`` mapping.
    """
    if user_id is not None:
        row = uid_2_idx[user_id]
        return np.dot(U[row], V)
    return np.dot(U, V)


#######################################################################################
##### Recommend N movies to the user with user id = uid, skipping movies they have seen
#######################################################################################
def recommend_for_uid(uid, ori_rating_matrix, U, V, top_N):
    """Print the ``top_N`` highest-scored movies the user has not rated yet.

    Args:
        uid: raw user id; translated to a matrix row through ``uid_2_idx``.
        ori_rating_matrix: rating matrix whose nonzero entries in the user's
            row mark movies that are already rated and must be skipped.
        U, V: factor matrices handed to ``predict``.
        top_N: maximum number of lines to print; nothing is printed when it
            is zero or negative.

    Each printed line is the ``movie_dict`` entry of the movie, four spaces,
    and the predicted score. Nothing is returned.
    """
    u_predicted_rating = np.asarray(predict(U, V, user_id=uid))

    uidx = uid_2_idx[uid]
    # A set gives constant-time "already seen" checks; testing membership
    # against the ndarray would rescan it for every candidate.
    already_seen_movie_idxs = set(np.nonzero(ori_rating_matrix[uidx])[0].tolist())

    # Movie indices ordered by descending score. The stable sort keeps the
    # lower index first among equal scores.
    ranked_movie_idxs = np.argsort(-u_predicted_rating, kind="stable")

    print_cnt = 0
    for idx in ranked_movie_idxs:
        if print_cnt >= top_N:
            break
        idx = int(idx)
        if idx in already_seen_movie_idxs:
            continue
        print(str(movie_dict[midx_2_id[idx]])+"    "+str(u_predicted_rating[idx]))
        print_cnt = print_cnt+1


In [16]:
# Sanity check: length of one column of the mean-centred matrix (one entry per row).
len(mean_subtracted_ratings[:,2])

614

In [17]:
# Fit the factor matrices on the mean-centred training ratings using train()'s default hyperparameters.
U, V = train(mean_subtracted_ratings)

Initial RMSE: 2.845998651981894
Epoch:     0, RMSE: 0.9759062468009729, Learning Rate:0.01
Epoch:     1, RMSE: 0.9496803680037682, Learning Rate:0.01
Epoch:     2, RMSE: 0.9368007219906423, Learning Rate:0.01
Epoch:     3, RMSE: 0.9234778207778798, Learning Rate:0.01
Epoch:     4, RMSE: 0.9060005957112599, Learning Rate:0.01
Epoch:     5, RMSE: 0.8882721979388667, Learning Rate:0.01
Epoch:     6, RMSE: 0.8714444221318348, Learning Rate:0.01
Epoch:     7, RMSE: 0.8555971152506251, Learning Rate:0.01
Epoch:     8, RMSE: 0.8416625118619026, Learning Rate:0.01
Epoch:     9, RMSE: 0.8301022659811268, Learning Rate:0.01
Epoch:    10, RMSE: 0.8221945053311571, Learning Rate:0.005
Epoch:    11, RMSE: 0.8161822021411695, Learning Rate:0.005
Epoch:    12, RMSE: 0.811270997780156, Learning Rate:0.005
Epoch:    13, RMSE: 0.8060139648717874, Learning Rate:0.005
Epoch:    14, RMSE: 0.8011261937744375, Learning Rate:0.005
Epoch:    15, RMSE: 0.7964593786261868, Learning Rate:0.005
Epoch:    16, RMSE:

In [18]:
# Put the predictions back on the original rating scale by adding every
# user's mean rating to that user's row.
predicted_ratings = predict(U, V) + avg_ratings[:, np.newaxis]
print(predicted_ratings)

[[4.59088796 4.49771238 4.54746744 ... 3.9252905  4.34674559 4.44482772]
 [3.75746847 3.89313326 3.66410808 ... 4.02804724 3.80650844 3.92388755]
 [3.02998439 2.29250888 2.90652702 ... 3.67085944 2.00706692 2.59464143]
 ...
 [3.18757046 4.00772581 3.03647736 ... 3.25047106 3.19153085 4.39593821]
 [2.8953203  2.95017261 2.98967759 ... 4.34921869 3.23119586 2.82450872]
 [3.64750843 3.45573874 3.12604733 ... 3.73344408 3.94096528 3.68334524]]


In [19]:
# Pick a target user and summarise the genres of the movies that user has rated.
# The function exists to help judge whether the recommendations look sensible;
# print its output next to the recommendation list.
def print_user_preference(ori_rating_matrix, u_idx, type):
    '''
        Print per-genre statistics of the movies rated in row ``u_idx``.

        type : ['avg', 'sum', 'cnt']
        'avg' ranks genres by average rating, 'sum' by the sum of ratings,
        'cnt' by the number of ratings. Any other value ranks by average
        rating and prints the average, the count and the sum for each genre.
    '''
    def ranked(mapping):
        # (genre, value) pairs ordered from the largest value to the smallest.
        return sorted(mapping.items(), key=operator.itemgetter(1), reverse=True)

    user_row = ori_rating_matrix[u_idx]
    genre_count = {}      # genre -> number of rated movies carrying it
    genre_total = {}      # genre -> sum of the ratings of those movies

    for movie_idx in np.nonzero(user_row)[0]:
        _, genre = movie_dict[midx_2_id[movie_idx]]
        for g in genre.split("|"):
            genre_count[g] = genre_count.get(g, 0) + 1
            genre_total[g] = genre_total.get(g, 0) + ori_rating_matrix[u_idx][movie_idx]

    # Ranked by how many movies were rated.
    if type == 'cnt':
        for k, v in ranked(genre_count):
            print(k, v)
        return

    # Ranked by the sum of the ratings.
    if type == 'sum':
        for k, v in ranked(genre_total):
            print(k, v)
        return

    # Everything else is ranked by the average rating per genre.
    genre_avg = {k: v / genre_count[k] for k, v in genre_total.items()}
    for k, v in ranked(genre_avg):
        if type == "avg":
            print(k, v)
        else:
            print(k, v, genre_count[k], genre_total[k])

In [20]:
##### Check the RMSE of the test set #######
# Positions (row indices, column indices) of the held-out ratings.
held_out_positions = test_ratings.nonzero()
pred = predicted_ratings[held_out_positions]
actual = test_ratings[held_out_positions]

print("### Test RMSE ###")
print(np.sqrt(mean_squared_error(pred, actual)))

### Test RMSE ###
0.8726564397714941


In [21]:
# Raw user ids of the four synthetic users added to the ratings earlier in the notebook.
musical_fan = 800       # musical fan
docu_fan = 850          # documentary fan
horror_fan = 900        # horror fan
animation_fan =  950    # animation fan

In [22]:
# To see the genre distribution of the movies this user rated, run print_user_preference.
# It indexes the rating matrix by row, so pass the user's row index rather than the raw id:
# print_user_preference(rating_matrix, uid_2_idx[musical_fan], 'sum')
recommend_for_uid(musical_fan, rating_matrix, U, V, top_N=30)

('Ruby & Quentin (Tais-toi!) (2003)', 'Comedy|Crime')    1.8189354166082565
('Dumb & Dumber (Dumb and Dumber) (1994)', 'Adventure|Comedy')    1.5247554096433884
('9to5: Days in Porn (a.k.a. 9 to 5: Days in Porn) (2008)', 'Documentary')    1.5093687897420134
('Doors, The (1991)', 'Drama')    1.4997915551268273
('Mr. Holmes (2015)', 'Drama|Mystery')    1.4991471062039199
('Atomic Cafe, The (1982)', 'Documentary|War')    1.481559312575267
('Redemption (Hummingbird) (2013)', 'Action|Crime|Thriller')    1.476320124130568
('Green Card (1990)', 'Comedy|Drama|Romance')    1.4424608633420335
('On Golden Pond (1981)', 'Drama')    1.4251485873327392
('Suspect Zero (2004)', 'Crime|Thriller')    1.3920214134592797
('Stage Beauty (2004)', 'Drama')    1.378285567471807
('Garm Wars: The Last Druid (2014)', 'Action|Sci-Fi|Thriller')    1.3693680990350119
('He Loves Me... He Loves Me Not (À la folie... pas du tout) (2002)', 'Romance|Thriller')    1.3676659514734497
('Serpent and the Rainbow, The (1988)'

In [23]:
# To see the genre distribution of the movies this user rated, run print_user_preference.
# It indexes the rating matrix by row, so pass the user's row index rather than the raw id:
# print_user_preference(rating_matrix, uid_2_idx[docu_fan], 'sum')
recommend_for_uid(docu_fan, rating_matrix, U, V, top_N=30)

('Pokémon Heroes (2003)', 'Animation|Children')    2.123483681447321
('Alice Through the Looking Glass (2016)', 'Adventure|Children|Fantasy')    1.99570056043352
('My Afternoons with Margueritte (La tête en friche) (2010)', 'Comedy')    1.9867025254382535
('Did You Hear About the Morgans? (2009)', 'Comedy|Crime|Drama|Romance')    1.9833904869510532
('Paisan (Paisà) (1946)', 'Drama|War')    1.9581046615716444
('Indochine (1992)', 'Drama|Romance')    1.9004325627584782
('Last Waltz, The (1978)', 'Documentary')    1.8845564606463143
('Into the Woods (1991)', 'Adventure|Comedy|Fantasy|Musical')    1.8432766748729568
('Before Sunset (2004)', 'Drama|Romance')    1.830397628557711
("Joe Gould's Secret (2000)", 'Drama')    1.829069043320867
('Good Time (2017)', 'Crime|Drama')    1.82584330004597
('High Heels and Low Lifes (2001)', 'Action|Comedy|Crime|Drama')    1.8090575761518073
('Mind Game (2004)', 'Adventure|Animation|Comedy|Fantasy|Romance|Sci-Fi')    1.7972036547028836
('How to Steal a M

In [24]:
# To see the genre distribution of the movies this user rated, run print_user_preference.
# It indexes the rating matrix by row, so pass the user's row index rather than the raw id:
# print_user_preference(rating_matrix, uid_2_idx[horror_fan], 'sum')
recommend_for_uid(horror_fan, rating_matrix, U, V, top_N=30)

('Speed 2: Cruise Control (1997)', 'Action|Romance|Thriller')    2.260904398242882
('I Still Know What You Did Last Summer (1998)', 'Horror|Mystery|Thriller')    2.246856186416724
('Showgirls (1995)', 'Drama')    2.2245906654209917
('From Justin to Kelly (2003)', 'Musical|Romance')    2.2216635872519896
('Star Trek V: The Final Frontier (1989)', 'Action|Sci-Fi')    2.1767807827244736
('Spice World (1997)', 'Comedy')    2.1200215608137425
('Mortal Kombat (1995)', 'Action|Adventure|Fantasy')    2.0752409583776696
('Flintstones, The (1994)', 'Children|Comedy|Fantasy')    2.0679386573135448
('Uncle Nino (2003)', 'Comedy')    2.0300607384416165
('Dumb & Dumber (Dumb and Dumber) (1994)', 'Adventure|Comedy')    2.006241136938356
('Dumb and Dumberer: When Harry Met Lloyd (2003)', 'Comedy')    1.9991206624771154
('Volcano (1997)', 'Action|Drama|Thriller')    1.9925739151117232
('Epic Movie (2007)', 'Adventure|Comedy')    1.9636529231400703
("Big Momma's House (2000)", 'Comedy')    1.92898217195

In [25]:
# To see the genre distribution of the movies this user rated, run print_user_preference.
# It indexes the rating matrix by row, so pass the user's row index rather than the raw id:
# print_user_preference(rating_matrix, uid_2_idx[animation_fan], 'sum')
recommend_for_uid(animation_fan, rating_matrix, U, V, top_N=30)

('Hours, The (2002)', 'Drama|Romance')    1.5532715553608127
('Alive (1993)', 'Drama')    1.5523805263929202
('Disconnect (2012)', 'Drama|Thriller')    1.531570513088059
('Happy People: A Year in the Taiga (2010)', 'Documentary')    1.5211318601288997
('Prefontaine (1997)', 'Drama')    1.4577349422309442
('Vagabond (Sans toit ni loi) (1985)', 'Drama')    1.4426217952303941
('Fifty Shades Darker (2017)', 'Drama|Romance')    1.4305911771865587
('As I Was Moving Ahead Occasionally I Saw Brief Glimpses of Beauty (2000)', 'Documentary')    1.340396414178136
('Raven, The (2012)', 'Mystery|Thriller')    1.289696358058086
('Mr. Holmes (2015)', 'Drama|Mystery')    1.2875690470675574
('Super, The (1991)', 'Comedy')    1.2767051963201872
('Jimmy Carr: Making People Laugh (2010)', 'Comedy')    1.2750693888651807
('X-Men: First Class (2011)', 'Action|Adventure|Sci-Fi|Thriller|War')    1.2697041637434914
('Circle of Friends (1995)', 'Drama|Romance')    1.251527667706691
('Love Actually (2003)', 'Com

# Test Code

In [26]:
################ Generating Synthetic Data #######################
# Small 5 x 10 rating matrix that serves as a fixed fixture for test_code below.
synthetic_rating = np.zeros((5,10))

for i in range(10):
    # Re-seeding with the loop index makes every (row, column, rating) draw reproducible.
    random.seed(i)
    u_idx = random.randint(0,4)
    i_idx = random.randint(0,9)
    r_ui = random.randint(1,5)
    # A repeated (u_idx, i_idx) pair overwrites the earlier value, so the matrix may end up with fewer than 10 ratings.
    synthetic_rating[u_idx ,i_idx] = r_ui

# Sparse view of the fixture and its (row, col, value) triples, the format SGD iterates over.
synthetic_R = coo_matrix(synthetic_rating)
synthetic_R_zipped = list(zip(synthetic_R.row, synthetic_R.col, synthetic_R.data))
# Fixed seed so the starting factor matrices are the same on every run.
np.random.seed(7)
synthetic_U = np.random.rand(5, 3)
synthetic_V = np.random.rand(3, 10)

In [27]:
# Reference results for the synthetic fixture; the location depends on the run mode.
answer_U = np.load('./drive/MyDrive/data/others/answer_U.npy' if SUBMIT else 'data/others/answer_U.npy')
answer_V = np.load('./drive/MyDrive/data/others/answer_V.npy' if SUBMIT else 'data/others/answer_V.npy')
answer_rmse = np.load('./drive/MyDrive/data/others/answer_rmse.npy' if SUBMIT else 'data/others/answer_rmse.npy')

In [28]:
def test_code(sgd, rmse):
    """Check an SGD step function and an RMSE function against stored answers.

    Args:
        sgd: callable with the signature of ``SGD``; run once on the synthetic
            fixture with lr=0.01, lambda_u=0.1, lambda_v=0.1.
        rmse: callable with the signature of ``calculate_rmse``; evaluated on
            the factors returned by ``sgd``.

    Raises:
        AssertionError: if the returned U or V differs from the stored answer
            (mean squared error of 1e-2 or more), or if the RMSE differs from
            the stored answer by 1e-5 or more.
    """
    # Hand over copies: an sgd that updates its arguments in place would
    # otherwise alter the shared fixtures and make a second run of this check
    # start from different factors.
    U, V = sgd(synthetic_U.copy(), synthetic_V.copy(), synthetic_R_zipped,
               lr=0.01, lambda_u=0.1, lambda_v=0.1)
    mse_u = mean_squared_error(U, answer_U)
    mse_v = mean_squared_error(V, answer_V)
    # Both factor matrices have to match; one matching matrix is not enough.
    assert (mse_u < 1e-2 and mse_v < 1e-2), 'calculated U, V is different with the answer : SGD 함수 오류'

    # Evaluate the function under test (the argument), not a hard-coded one.
    computed_rmse = rmse(synthetic_R, U, V, lambda_u=0.1, lambda_v=0.1)
    assert (abs(answer_rmse - computed_rmse) < 1e-05), 'calculated rmse is diferent with the answer : RMSE 함수 오류'

    print("모든 함수 알맞게 구현됨")


In [29]:
# Validate the notebook's SGD and calculate_rmse implementations against the stored answers.
test_code(SGD, calculate_rmse)

모든 함수 알맞게 구현됨
