In [1]:
import pandas as pd
import csv
from collections import defaultdict
from datetime import datetime
import matplotlib.patches as mpatches
import matplotlib
import time
import math
from operator import itemgetter
from scipy.spatial import distance
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [2]:
def distance_cosine(a, b):
    """Cosine similarity between two equally long numeric sequences.

    Despite the ``distance_`` prefix this is a *similarity*: it is one minus
    SciPy's cosine distance, so larger values mean the vectors point in more
    similar directions.
    """
    cosine_gap = distance.cosine(a, b)
    return 1 - cosine_gap

In [14]:
def distance_euclidean(a, b):
    """Similarity score derived from the Euclidean distance of two sequences.

    The distance ``d`` is mapped to ``1 / (1 + d)``: identical vectors score
    1 and the score shrinks towards 0 as the vectors move apart.
    """
    gap = distance.euclidean(a, b)
    return 1 / (1 + gap)

In [15]:
def distance_correlation(a, b):
    """Correlation-based similarity between two equally long sequences.

    Computed as one minus SciPy's correlation distance, so larger values mean
    the two sequences vary together more closely.
    """
    correlation_gap = distance.correlation(a, b)
    return 1 - correlation_gap

In [16]:
def nearest_neighbor_user(user, topN, simFunc, matrix=None, min_common=3):
    """Return the ``topN`` users most similar to ``user``.

    Similarity is measured only on the movies both users have rated.

    Parameters
    ----------
    user : label
        Row label of the target user in the rating matrix.
    topN : int
        Maximum number of neighbours to return.
    simFunc : callable
        ``simFunc(ratings_a, ratings_b) -> float``; receives two equally long
        lists holding the ratings of the co-rated movies (target user first).
        Larger return values are treated as "more similar".
    matrix : pandas.DataFrame, optional
        User x movie rating matrix with NaN for "not rated". Defaults to the
        notebook-level ``UM_matrix_ds`` so three-argument calls keep working.
    min_common : int, optional
        Minimum number of co-rated movies a candidate needs to be scored
        (default 3, the value that used to be hard-coded).

    Returns
    -------
    list of (user_label, similarity) tuples, most similar first. Candidates
    whose similarity evaluates to NaN are left out.

    Raises
    ------
    KeyError
        If ``user`` is not a row label of the matrix.
    """
    if matrix is None:
        # Resolved at call time, so the matrix cell only has to run before the first call.
        matrix = UM_matrix_ds

    own_ratings = matrix.loc[user].dropna()
    own_values = own_ratings.values
    # Narrow every candidate to the target user's rated movies once, instead of
    # probing the matrix one cell at a time inside the loop.
    candidates = matrix.loc[:, own_ratings.index]

    scores = {}
    for uid, row in candidates.iterrows():
        if uid == user:
            continue

        other_values = row.values
        shared = row.notna().values  # True where the candidate rated the movie too
        if shared.sum() < min_common:
            continue

        sim = simFunc(own_values[shared].tolist(), other_values[shared].tolist())
        if not math.isnan(sim):
            scores[uid] = sim

    # Ascending stable sort followed by a reversal: highest similarity first,
    # with ties appearing in reverse matrix-row order.
    ranked = sorted(scores.items(), key=itemgetter(1))
    ranked.reverse()
    return ranked[:topN]

In [17]:
def predict_rating(userid, nn, simFunc):
    """Predict ratings for ``userid`` from the ratings of its nearest neighbours.

    Parameters
    ----------
    userid : label
        Row label of the target user in ``UM_matrix_ds``.
    nn : int
        Number of nearest neighbours to request from ``nearest_neighbor_user``.
    simFunc : callable
        Similarity function forwarded to ``nearest_neighbor_user``.

    Returns
    -------
    list of ``[movieId, predicted_rating]`` pairs, one per movie rated by at
    least one neighbour. Each prediction is the similarity-weighted mean of the
    neighbours' ratings. Movies whose similarity weights sum to zero are
    skipped because no weighted mean is defined for them.
    """
    neighbor = nearest_neighbor_user(userid, nn, simFunc)
    neighbor_dict = dict(neighbor)
    neighbor_id = [uid for uid, _ in neighbor]

    # Keep only the movies rated by at least one neighbour. how='all' alone
    # expresses that; current pandas rejects how= combined with thresh=, and
    # requires axis to be passed by keyword.
    neighbor_movie = UM_matrix_ds.loc[neighbor_id].dropna(axis=1, how='all')

    ret = []
    # .items() replaces .iteritems(), which was removed in pandas 2.0.
    for movieId, column in neighbor_movie.items():
        jsum, wsum = 0, 0
        for uid, rating in column.dropna().items():
            sim = neighbor_dict.get(uid, 0)
            jsum += sim
            wsum += rating * sim
        if jsum == 0:
            # Zero total weight (e.g. similarities that cancel out): no prediction.
            continue
        ret.append([movieId, wsum / jsum])

    return ret

In [18]:
# Load the per-review rating records and preview the last rows
# (DataFrame.tail shows five rows by default).
ratings = pd.read_csv('./data/final_rating.csv')
ratings.tail()

Unnamed: 0,userId,reviewNo,rating,movieId
5116,zxcv,5308791,8,39576
5117,zxcv,5308792,10,33082
5118,zxcv,5308796,5,66487
5119,zxcv,5492271,7,52462
5120,zxcv,5375651,10,17521


In [19]:
# Load the reviewNo/userId table and preview the first rows
# (DataFrame.head shows five rows by default).
userlist = pd.read_csv('./data/naver_user.csv')
userlist.head()

Unnamed: 0,reviewNo,userId
0,15772038,airf
1,15772037,nanw
2,15772036,zxcv
3,15772035,sdh1
4,15772032,guan


In [31]:
# The ten users with the most ratings, each joined to the review listed for
# them in `userlist` and to that review's rating record.
selected_user = (
    ratings.groupby(["userId"])
    .size()
    .nlargest(10)
    .reset_index(name="Count")
    .merge(userlist, on='userId')
    .merge(ratings, on='reviewNo')
    # Both sides of the second merge carry a userId column; keep the left one.
    .drop(columns=['userId_y'])
    .rename(columns={'userId_x': 'userId'})
)
selected_user.head(10)

Unnamed: 0,userId,Count,reviewNo,rating,movieId
0,ykm3,700,15771936,1,137938
1,sang,691,15771961,6,95327
2,tsp0,677,15771934,5,52757
3,hosu,564,15771998,10,94170
4,zxcv,356,15772036,7,113351
5,zard,276,15772012,9,152170
6,artn,192,15771948,10,120165
7,suha,108,15771976,8,95327
8,ldsl,105,15771977,10,163788
9,imag,102,15771940,10,161967


In [32]:
# User x movie rating matrix: one row per userId, one column per movieId.
# Cells without a rating come out as NaN.
UM_matrix_ds = ratings.pivot(
    index='userId',
    columns='movieId',
    values='rating',
)
UM_matrix_ds.head()

movieId,10002,10003,10004,10005,10006,10008,10009,10012,10016,10018,...,181409,181410,181411,181414,181419,181711,182348,182360,183132,183877
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0424,,,,,,,,,,,...,,,,,,,,,,
2pop,,,,,,,,,,,...,,,,,,,,,,
airf,,,,,,,,,,,...,,,,,,,,,,
akh3,,,,,,,,,,,...,,,,,,,,,,
artn,,,,,,,,,,,...,,,1.0,,,,,,,


In [22]:
# Three nearest neighbours of every selected user, scored by cosine similarity.
for user in selected_user['userId']:
    top_neighbors = nearest_neighbor_user(user, 3, distance_cosine)
    print('User {0} neighbors : {1}'.format(user, top_neighbors))

User ykm3 neighbors : [('kchm', 0.9984038297885895), ('mumu', 0.9715365547505717), ('kktw', 0.9409649470363858)]
User sang neighbors : [('sugo', 0.997040538050167), ('akh3', 0.9965457582448796), ('zero', 0.9959100033104786)]
User tsp0 neighbors : [('lucy', 0.9999195737406962), ('scw6', 0.9979402655783317), ('akh3', 0.9945054529214061)]
User hosu neighbors : [('sugo', 1.0), ('leer', 1.0), ('akh3', 1.0)]
User zxcv neighbors : [('sssk', 1.0), ('toyc', 0.9814954576223638), ('azra', 0.9775856785291243)]
User zard neighbors : [('kktw', 0.9975093361076331), ('nege', 0.9539061044454907), ('mony', 0.9240168324218909)]
User artn neighbors : [('azra', 0.9988130559615214), ('grea', 0.9973753280839895), ('toyc', 0.9778315327487492)]
User suha neighbors : [('azra', 0.9985942544945725), ('sugo', 0.9985239844353392), ('myil', 0.9961867268641653)]
User ldsl neighbors : [('asdz', 1.0), ('sssk', 0.9984776046952725), ('bs07', 0.99028229600855)]
User imag neighbors : [('sugo', 1.0), ('mumu', 1.0), ('sohy',

In [23]:
# Three nearest neighbours of every selected user, scored by correlation similarity.
for user in selected_user['userId']:
    top_neighbors = nearest_neighbor_user(user, 3, distance_correlation)
    print('User {0} neighbors : {1}'.format(user, top_neighbors))

User ykm3 neighbors : [('mumu', 0.9999999999999999), ('k2sn', 0.6887568256738226), ('wjda', 0.5)]
User sang neighbors : [('kchm', 1.0), ('frie', 0.9374056926690407), ('hanu', 0.9318878949926048)]
User tsp0 neighbors : [('lucy', 1.0000000000000002), ('xmj0', 0.9819805060619656), ('sugo', 0.9563650695950074)]
User hosu neighbors : [('jhl5', 0.8898206684033634), ('huya', 0.8784585919193317), ('ldsl', 0.6602463508292717)]
User zxcv neighbors : [('azra', 0.9707253433941507), ('toyc', 0.9449111825230684), ('imag', 0.5967499875721346)]
User zard neighbors : [('nege', 0.944911182523068), ('wato', 0.5773502691896257), ('hanu', 0.5773502691896257)]
User artn neighbors : [('toyc', 0.9415130835240084), ('bs07', 0.7276068751089989), ('myil', 0.5921865681580842)]
User suha neighbors : [('imag', 0.852823034590905), ('hanu', 0.5773502691896258), ('geon', 0.5320868955829371)]
User ldsl neighbors : [('asdz', 1.0), ('sssk', 0.9999999999999999), ('hosu', 0.6602463508292717)]
User imag neighbors : [('sohy'

In [24]:
# Three nearest neighbours of every selected user, scored by the Euclidean-based similarity.
for user in selected_user['userId']:
    top_neighbors = nearest_neighbor_user(user, 3, distance_euclidean)
    print('User {0} neighbors : {1}'.format(user, top_neighbors))

User ykm3 neighbors : [('kchm', 0.25), ('mumu', 0.2), ('zct2', 0.11978243827074593)]
User sang neighbors : [('sugo', 0.3333333333333333), ('kalk', 0.3333333333333333), ('gopa', 0.3333333333333333)]
User tsp0 neighbors : [('xmj0', 0.4142135623730951), ('kktw', 0.3090169943749474), ('ehw2', 0.28989794855663564)]
User hosu neighbors : [('huya', 0.16666666666666666), ('xmj0', 0.1639607805437114), ('jhl5', 0.15438708879488486)]
User zxcv neighbors : [('sssk', 1.0), ('toyc', 0.2402530733520421), ('k2sn', 0.18660549686337075)]
User zard neighbors : [('nege', 0.21712927295533244), ('kktw', 0.1463924816619788), ('hanu', 0.12973190755680383)]
User artn neighbors : [('grea', 0.4142135623730951), ('toyc', 0.23166247903554), ('bs07', 0.1463924816619788)]
User suha neighbors : [('myil', 0.3090169943749474), ('hanu', 0.3090169943749474), ('sugo', 0.28989794855663564)]
User ldsl neighbors : [('asdz', 1.0), ('sssk', 0.5), ('bs07', 0.25)]
User imag neighbors : [('sugo', 1.0), ('mumu', 1.0), ('sohy', 0.3

In [25]:
# Predict, for each selected user, the rating of that user's movie in
# `selected_user`, using up to 100 neighbours and cosine similarity.
result = []
# head(10) keeps the "first ten rows" scope but also copes with a shorter
# frame, where positional iloc[i] over range(10) would raise IndexError.
for row in selected_user.head(10).itertuples(index=False):
    userId = row.userId
    movieId = int(row.movieId)
    # predict_rating yields one [movieId, rating] pair per matrix column, so
    # the pairs can be turned into a dict for a direct lookup.
    predicted = dict(predict_rating(userId, 100, distance_cosine))
    if movieId in predicted:
        # Users whose movie was rated by no neighbour get no row.
        result.append([userId, movieId, predicted[movieId]])

resultdf = pd.DataFrame(result, columns=['userId', 'movieId', 'rating'])
resultdf

Unnamed: 0,userId,movieId,rating
0,ykm3,137938,3.0
1,sang,95327,8.991661
2,tsp0,52757,7.152302
3,hosu,94170,5.390642
4,zard,152170,6.482358
5,artn,120165,9.0
6,suha,95327,8.354529
7,ldsl,163788,8.53487
8,imag,161967,9.004265


In [26]:
# Pair every prediction with the rating the user actually gave to the same
# movie. Joining on (userId, movieId) avoids float(<Series>), which recent
# pandas deprecates and which fails when a user matches more than one row.
actual = selected_user[['userId', 'movieId', 'rating']].drop_duplicates(['userId', 'movieId'])
paired = resultdf.merge(actual, on=['userId', 'movieId'], how='inner',
                        suffixes=('_pred', '_real'))

realdata_rating = paired['rating_real'].astype(float).tolist()
resultdata_rating = paired['rating_pred'].tolist()

# Mean absolute error and mean squared error of the predictions.
error_rate_absol = mean_absolute_error(realdata_rating, resultdata_rating)
error_rate_squared = mean_squared_error(realdata_rating, resultdata_rating)
print("\nError Rate(Absolute) : ", error_rate_absol)
print("Error Rate(Squared):" , error_rate_squared)


Error Rate(Absolute) :  2.009595182239067
Error Rate(Squared): 5.492324982636706


In [29]:
# Predict, for each selected user, the rating of that user's movie in
# `selected_user`, using up to 100 neighbours and correlation similarity.
result = []
# head(10) keeps the "first ten rows" scope but also copes with a shorter
# frame, where positional iloc[i] over range(10) would raise IndexError.
for row in selected_user.head(10).itertuples(index=False):
    userId = row.userId
    movieId = int(row.movieId)
    # predict_rating yields one [movieId, rating] pair per matrix column, so
    # the pairs can be turned into a dict for a direct lookup.
    predicted = dict(predict_rating(userId, 100, distance_correlation))
    if movieId in predicted:
        # Users whose movie was rated by no neighbour get no row.
        result.append([userId, movieId, predicted[movieId]])

resultdf = pd.DataFrame(result, columns=['userId', 'movieId', 'rating'])
resultdf

  dist = 1.0 - uv / np.sqrt(uu * vv)
  from ipykernel import kernelapp as app


Unnamed: 0,userId,movieId,rating
0,ykm3,137938,3.0
1,sang,95327,9.280078
2,tsp0,52757,8.094246
3,hosu,94170,4.976418
4,zard,152170,7.54491
5,artn,120165,9.0
6,suha,95327,7.703229
7,ldsl,163788,7.0
8,imag,161967,8.341019


In [30]:
# Pair every prediction with the rating the user actually gave to the same
# movie. Joining on (userId, movieId) avoids float(<Series>), which recent
# pandas deprecates and which fails when a user matches more than one row.
actual = selected_user[['userId', 'movieId', 'rating']].drop_duplicates(['userId', 'movieId'])
paired = resultdf.merge(actual, on=['userId', 'movieId'], how='inner',
                        suffixes=('_pred', '_real'))

realdata_rating = paired['rating_real'].astype(float).tolist()
resultdata_rating = paired['rating_pred'].tolist()

# Mean absolute error and mean squared error of the predictions.
error_rate_absol = mean_absolute_error(realdata_rating, resultdata_rating)
error_rate_squared = mean_squared_error(realdata_rating, resultdata_rating)
print("\nError Rate(Absolute) : ", error_rate_absol)
print("Error Rate(Squared):" , error_rate_squared)


Error Rate(Absolute) :  2.3120832609045414
Error Rate(Squared): 7.169692679721367


In [33]:
# Predict, for each selected user, the rating of that user's movie in
# `selected_user`, using up to 100 neighbours and the Euclidean-based similarity.
result = []
# head(10) keeps the "first ten rows" scope but also copes with a shorter
# frame, where positional iloc[i] over range(10) would raise IndexError.
for row in selected_user.head(10).itertuples(index=False):
    userId = row.userId
    movieId = int(row.movieId)
    # predict_rating yields one [movieId, rating] pair per matrix column, so
    # the pairs can be turned into a dict for a direct lookup.
    predicted = dict(predict_rating(userId, 100, distance_euclidean))
    if movieId in predicted:
        # Users whose movie was rated by no neighbour get no row.
        result.append([userId, movieId, predicted[movieId]])

resultdf = pd.DataFrame(result, columns=['userId', 'movieId', 'rating'])
resultdf

Unnamed: 0,userId,movieId,rating
0,ykm3,137938,3.0
1,sang,95327,9.081624
2,tsp0,52757,7.267217
3,hosu,94170,4.697674
4,zard,152170,5.252888
5,artn,120165,9.0
6,suha,95327,8.976081
7,ldsl,163788,9.534171
8,imag,161967,9.509293


In [34]:
# Pair every prediction with the rating the user actually gave to the same
# movie. Joining on (userId, movieId) avoids float(<Series>), which recent
# pandas deprecates and which fails when a user matches more than one row.
actual = selected_user[['userId', 'movieId', 'rating']].drop_duplicates(['userId', 'movieId'])
paired = resultdf.merge(actual, on=['userId', 'movieId'], how='inner',
                        suffixes=('_pred', '_real'))

realdata_rating = paired['rating_real'].astype(float).tolist()
resultdata_rating = paired['rating_pred'].tolist()

# Mean absolute error and mean squared error of the predictions.
error_rate_absol = mean_absolute_error(realdata_rating, resultdata_rating)
error_rate_squared = mean_squared_error(realdata_rating, resultdata_rating)
print("\nError Rate(Absolute) : ", error_rate_absol)
print("Error Rate(Squared):" , error_rate_squared)


Error Rate(Absolute) :  2.1478771038301177
Error Rate(Squared): 7.022522536688424
