### 1. Dataset

In [1]:
import pandas as pd
import numpy as np
from scipy import spatial

In [2]:
rating_df = pd.read_csv("ratings_small.csv")
rating_df.drop("timestamp", axis=1, inplace=True)
rating_df.tail(2)

Unnamed: 0,userId,movieId,rating
100002,671,6385,2.5
100003,671,6565,3.5


##### 2. Check Dataset

In [3]:
unique_user = rating_df["userId"].unique()
len(unique_user)

671

In [4]:
unique_movie = rating_df["movieId"].unique()
len(unique_movie)

9066

In [5]:
# rating 분포
rating_df.groupby("rating").size().reset_index(name="rating_counts")

Unnamed: 0,rating,rating_counts
0,0.5,1101
1,1.0,3326
2,1.5,1687
3,2.0,7271
4,2.5,4449
5,3.0,20064
6,3.5,10538
7,4.0,28750
8,4.5,7723
9,5.0,15095


In [6]:
# user 분포
user_counts_df = rating_df.groupby("userId").size().reset_index(name="user_rating_count")
user_counts_df = user_counts_df.sort_values("user_rating_count", ascending=False)
user_counts_df.head()

Unnamed: 0,userId,user_rating_count
546,547,2391
563,564,1868
623,624,1735
14,15,1700
72,73,1610


In [7]:
# movie 분포
movie_counts_df = rating_df.groupby("movieId").size().reset_index(name="movie_rating_count")
movie_counts_df = movie_counts_df.sort_values("movie_rating_count", ascending=False)
movie_counts_df.head()

Unnamed: 0,movieId,movie_rating_count
321,356,341
266,296,324
284,318,311
525,593,304
232,260,291


### 3. Preprocessing

In [8]:
# user 최소 평가수, movie 최소 평가수
user_limit, movie_limit = 365, 100

In [9]:
filtered_userId = user_counts_df[user_counts_df["user_rating_count"] > user_limit]
filtered_userId = list(filtered_userId["userId"])
len(filtered_userId), filtered_userId[:5]

(59, [547, 564, 624, 15, 73])

In [10]:
filtered_movieId = movie_counts_df[movie_counts_df["movie_rating_count"] > movie_limit]
filtered_movieId = list(filtered_movieId["movieId"])
len(filtered_movieId), filtered_movieId[:5]

(149, [356, 296, 318, 593, 260])

In [13]:
# 10004 -> 5570
filtered_df = rating_df[rating_df["userId"].isin(filtered_userId)]
len(filtered_df)

filtered_df = filtered_df[filtered_df["movieId"].isin(filtered_movieId)]
len(filtered_df)

5570

### 4. pivot

In [14]:
user_df = filtered_df.pivot_table(values="rating",\
                                  index="userId", columns="movieId",\
                                  aggfunc=np.average, fill_value=0,\
                                  dropna=False
                                 )
user_df.tail()

movieId,1,2,6,10,25,32,34,36,39,47,...,6377,6539,6874,7153,7361,7438,8961,33794,58559,79132
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
615,4.0,0.0,3.5,0.0,0.0,3.0,0.0,0.0,0.0,4.5,...,4.5,3.5,4.5,4.0,4.0,4.0,4.0,4.0,4.5,4.0
624,5.0,3.0,0.0,4.0,0.0,2.0,0.0,0.0,0.0,4.0,...,3.5,3.5,4.0,4.5,0.0,3.5,4.5,3.5,3.5,4.0
654,5.0,3.0,0.0,4.0,0.0,5.0,4.5,4.5,0.0,4.5,...,5.0,4.5,4.5,5.0,4.0,4.0,5.0,4.5,0.0,0.0
664,3.5,0.0,4.0,0.0,0.0,5.0,0.0,0.0,0.0,4.5,...,0.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.5,5.0
665,0.0,3.0,0.0,0.0,0.0,4.0,2.0,0.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


##### 6. function 

In [19]:
def cosine_similarity(vec1, vec2):
    idx = vec1.nonzero()[0]
    if len(idx) == 0:
        return -1
    vec1, vec2 = np.array(vec1)[idx], np.array(vec2)[idx]
    
    idx = vec2.nonzero()[0]
    if len(idx) == 0:
        return -1
    vec1, vec2 = np.array(vec1)[idx], np.array(vec2)[idx]

    
    return 1-spatial.distance.cosine(vec1, vec2)

In [21]:
# test code - cosine_similarity
vec1 = np.array([1, 0, 3, 0, 5])
vec2 = np.array([5, 3, 0, 1, 5])
cosine_similarity(vec1, vec2)

0.83205029433784372

In [22]:
def similarity_matrix(user_df, similarity_func):
    users = user_df.index
    df = user_df.T
    
    matrix=[]
    for idx_1, value_1 in df.items():
        row = []
        for idx_2, value_2 in df.items():
            row.append(similarity_func(value_1, value_2))
        matrix.append(row)
    
    return pd.DataFrame(matrix, index = users, columns = users)

In [24]:
# test code 
sm_df = similarity_matrix(user_df, cosine_similarity)

In [25]:
def mean_score(user_df, sm_df, target, closer_count):
    sms_df = sm_df.drop(target)
    sms_df = sms_df.sort_values(target, ascending = False)
    sms_df = sms_df[target][:closer_count]
    
    smsw_df = user_df.loc[sms_df.index]
    
    ms_df = pd.DataFrame(columns = user_df.columns)
    ms_df.loc["user"] = user_df.loc[target]
    ms_df.loc["mean"] = smsw_df.mean()
    
    return ms_df
    

In [29]:
# test code - mean_score
ms_df = mean_score(user_df, sm_df, 15, 10)
ms_df

movieId,1,2,6,10,25,32,34,36,39,47,...,6377,6539,6874,7153,7361,7438,8961,33794,58559,79132
user,2.0,2.0,4.0,3.0,3.0,4.0,3.0,1.0,2.5,5.0,...,1.0,3.5,1.0,1.5,5.0,0.5,2.0,4.5,4.5,5.0
mean,1.7,1.0,2.15,0.75,3.2,2.35,2.35,2.3,1.4,4.35,...,0.0,0.75,1.15,1.15,1.25,0.95,0.0,0.85,0.85,0.4


In [32]:
def recommend(ms_df) :
    recommend_df = ms_df.T
    recommend_df = recommend_df[recommend_df["user"] == 0]
    recommend_df = recommend_df.sort_values("mean", ascending = False)
    
    return recommend_df, list(recommend_df.index)

In [38]:
recommend_df, recommend_list = recommend(ms_df)
print("recommended movies : {}".format(recommend_list))

recommended movies : [595]


##### metric

In [46]:
def mae(value, pre):
    idx = value.nonzero()[0]
    if len(idx) == 0:
        return -1
    value = np.array(value)[idx]
    pre = np.array(pre)[idx]
    
    idx = pre.nonzero()[0]
    if len(idx) == 0:
        return -1
    value = np.array(value)[idx]
    pre = np.array(pre)[idx]
    
    return sum(np.abs(value - pre)/len(idx))

In [47]:
mae(ms_df.loc["user"], ms_df.loc["mean"])

1.3551724137931034

In [48]:
def evaluate(user_df, sm_df, closer_count, algorithm):
    users = user_df.index
    evaluate_list = []
    
    for target in users:
        pred_df = mean_score(user_df, sm_df, target, closer_count)
        evaluate_list.append(algorithm(pred_df.loc["user"], pred_df.loc["mean"]))
        
    return np.average(evaluate_list)

In [49]:
# test code - evaluate
evaluate(user_df, sm_df, 10, mae)

1.4807110633380403

In [50]:
start, end = 2, 10

for closer_count in range(start, end +1):
    print(closer_count, evaluate(user_df, sm_df, closer_count, mae))

2 1.30487123248
3 1.48916180216
4 1.55688220475
5 1.54849659935
6 1.53756469167
7 1.53165024331
8 1.51446414786
9 1.49956952568
10 1.48071106334
