In [1]:
import os
import pandas as pd
import numpy as np
from math import sqrt
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

### Load Dataset

In [2]:
path = '../data'
ratings_df = pd.read_csv(os.path.join(path, 'ratings.csv'), encoding='utf-8')

print(ratings_df.shape)
ratings_df.head(2)

(100836, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247


In [3]:
train_df, test_df = train_test_split(ratings_df,
                                     test_size=0.2,
                                     random_state=1234)
print(train_df.shape)
print(test_df.shape)

(80668, 4)
(20168, 4)


### Sparse Matrix 만들기

In [4]:
sparse_matrix = train_df.groupby('movieId').apply(lambda x : pd.Series(x['rating'].values, index=x['userId']))
sparse_matrix = sparse_matrix.unstack()

sparse_matrix.head(2)

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,4.0,,4.5,,,,...,4.0,,4.0,3.0,4.0,2.5,,2.5,,5.0
2,,,,,,,,4.0,,,...,,4.0,,,3.5,,,2.0,,


### Cosine Similarity

In [5]:
from sklearn.metrics.pairwise import cosine_similarity

def cos_sim_matrix(a, b):
    cos_sim_values = cosine_similarity(a.values, b.values)
    
    return pd.DataFrame(data=cos_sim_values,
                         columns=a.index,
                         index=a.index)

### Neighborhood-based Collaborative Filtering

### 1) Item-based

In [6]:
item_sparse_matrix = sparse_matrix.fillna(0)

item_sparse_matrix.shape

(8938, 610)

In [276]:
item_cossim_df = cos_sim_matrix(item_sparse_matrix, item_sparse_matrix)

print(item_cossim_df.shape)
item_cossim_df.head(2)

(8938, 8938)


movieId,1,2,3,4,5,6,7,8,9,10,...,190219,191005,193565,193567,193571,193573,193579,193581,193587,193609
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.304336,0.267816,0.040259,0.221228,0.266544,0.149392,0.132943,0.182044,0.296838,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.304336,1.0,0.226138,0.052482,0.154783,0.209716,0.18942,0.068012,0.027945,0.303157,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [271]:
# movie id : 8938
# user id : 610
item_prediction_result_df = pd.DataFrame(index=item_sparse_matrix.columns,
                                         columns=item_sparse_matrix.index)

item_prediction_result_df

movieId,1,2,3,4,5,6,7,8,9,10,...,190219,191005,193565,193567,193571,193573,193579,193581,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,,,,,,,,,,,...,,,,,,,,,,
607,,,,,,,,,,,...,,,,,,,,,,
608,,,,,,,,,,,...,,,,,,,,,,
609,,,,,,,,,,,...,,,,,,,,,,


In [272]:
def func(data):
    # (1 x n) x (n x 8938) => (1 x 8938)
    # (유저가 본 영화 평점 * 유저가 본 영화와의 유사도) / 유사도 총 합
    user_data = train_df[train_df['userId'] == data.name]
    user_sim = item_cossim_df.loc[user_data['movieId'].values].values
    
    item_prediction_result_df.loc[data.name] = np.matmul(user_data['rating'].values.reshape(1,-1), user_sim).squeeze() / (user_sim.sum(0)+1)
    
item_prediction_result_df.apply(func, axis=1)

item_prediction_result_df.head(10)

movieId,1,2,3,4,5,6,7,8,9,10,...,190219,191005,193565,193567,193571,193573,193579,193581,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.20279,4.18783,4.17609,3.03272,4.04561,4.24019,3.9933,3.90547,3.70279,4.18855,...,0.979003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.38273
2,3.19261,3.15207,2.46138,0.280903,2.61916,2.9754,1.85877,1.68768,1.09268,3.02235,...,0.387743,1.70885,1.70885,1.70885,1.70885,1.70885,1.70885,1.70885,1.70885,2.32666
3,1.32315,1.20446,1.39635,0.217016,0.819716,1.59889,0.802514,0.755008,0.86708,1.55365,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3.48994,3.48805,3.38864,3.15017,3.30488,3.50267,3.31794,2.90629,3.03413,3.41697,...,1.0335,0.505535,0.505535,0.505535,0.505535,0.505535,0.505535,0.505535,0.505535,1.963
5,3.27916,3.1278,2.99947,2.56582,2.94179,3.16869,3.00793,2.67809,2.27668,3.01557,...,0.391638,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.562705
6,3.58597,3.57233,3.53615,3.33866,3.51444,3.5477,3.53826,3.42057,3.3406,3.55275,...,0.446497,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.672442
7,3.36009,3.25338,3.24516,2.22581,3.2114,3.33808,3.1832,2.8918,2.89194,3.3515,...,0.872628,0.3554,0.3554,0.3554,0.3554,0.3554,0.3554,0.3554,0.3554,2.50752
8,3.26123,3.22385,3.03702,2.58129,2.94548,3.21505,3.06126,2.6713,2.37,3.17426,...,0.597653,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.730709
9,2.80797,2.62156,2.28209,0.767416,2.25043,2.64179,2.04582,1.59945,1.07072,2.63652,...,0.458811,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,3.07042,3.04355,2.85983,0.908598,2.97965,2.905,2.75228,2.24064,1.58879,3.05624,...,0.63253,1.21136,1.21136,1.21136,1.21136,1.21136,1.21136,1.21136,1.21136,2.33335


### 2) User-based

In [275]:
user_sparse_matrix = sparse_matrix.fillna(0).T

user_sparse_matrix.shape

(610, 8938)

In [277]:
user_cossim_df = cos_sim_matrix(user_sparse_matrix, user_sparse_matrix)

print(user_cossim_df.shape)
user_cossim_df.head(2)

(610, 610)


userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.016665,0.07057,0.160438,0.07541,0.087404,0.123664,0.069225,0.037416,0.0099,...,0.056163,0.124896,0.162324,0.050852,0.101583,0.128198,0.240652,0.225897,0.063984,0.095228
2,0.016665,1.0,0.0,0.004295,0.02056,0.030996,0.027726,0.0,0.0,0.057112,...,0.151666,0.019379,0.006645,0.0,0.0,0.02818,0.0,0.046286,0.033522,0.090288


In [280]:
user_prediction_result_df = pd.DataFrame(index=user_sparse_matrix.columns,
                                         columns=user_sparse_matrix.index)
user_prediction_result_df.shape

(8938, 610)

In [310]:
def func(data):
    
    movie_data = train_df[train_df['movieId'] == data.name]
    movie_sim = user_cossim_df.loc[movie_data['userId'].values]
    
    user_prediction_result_df.loc[data.name] = np.matmul(movie_data.rating.values,
                                                         movie_sim.values) / (movie_sim.sum(0)+1)

user_prediction_result_df.apply(func, axis=1)
user_prediction_result_df = user_prediction_result_df.T

user_prediction_result_df.head(10)

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,190219,191005,193565,193567,193571,193573,193579,193581,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.67043,3.01898,3.02703,0.636151,2.33427,3.65344,2.54601,1.01293,1.59897,3.2986,...,0.0654522,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.140421
2,3.34895,2.57281,1.7197,0.146062,1.55944,2.9715,1.49602,0.289957,0.408521,2.88504,...,0.0463527,0.427493,0.332495,0.284995,0.379994,0.379994,0.332495,0.379994,0.332495,0.592074
3,2.121,1.27956,1.26626,0.0367179,0.466202,2.1174,0.780114,0.104695,0.218952,1.8424,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3.61963,2.93753,2.73966,0.72914,2.1024,3.50549,2.50204,0.613417,1.2701,3.09378,...,0.0574532,0.052407,0.040761,0.034938,0.046584,0.046584,0.040761,0.046584,0.040761,0.251308
5,3.76122,3.21044,2.92431,0.957915,2.54943,3.45657,2.85021,1.20054,1.5878,3.13192,...,0.0586848,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142512
6,3.83087,3.32198,3.32667,1.21591,3.03136,3.60413,3.11031,1.63385,2.15392,3.25954,...,0.0290969,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0660016
7,3.77535,3.10961,2.83467,0.572114,2.35059,3.64751,2.62267,0.940899,1.55949,3.36406,...,0.0674776,0.0409324,0.0318363,0.0272883,0.0363844,0.0363844,0.0318363,0.0363844,0.0318363,0.58529
8,3.79756,3.3381,3.12298,1.16575,2.71027,3.53906,3.00015,1.44235,1.66749,3.22467,...,0.0958472,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.202119
9,3.17211,2.35781,1.68523,0.0990461,1.19739,2.80897,1.40665,0.275126,0.327804,2.82822,...,0.0636902,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,3.53956,2.8281,2.16925,0.172367,1.72931,3.11516,1.83001,0.387772,0.468644,3.10031,...,0.0432531,0.213547,0.166092,0.142364,0.189819,0.189819,0.166092,0.189819,0.166092,0.562335


In [325]:
item_prediction_result_df.shape, user_prediction_result_df.shape

((610, 8938), (610, 8938))

### Evaluation (RMSE)

In [326]:
test_df.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
99731,610,3527,5.0,1479545223
97583,606,1250,3.5,1171376891


In [327]:
set(test_df.userId.unique()) - set(train_df.userId.unique())

set()

In [324]:
set(test_df.movieId.unique()) - set(train_df.movieId.unique())

{49,
 117,
 137,
 178,
 241,
 320,
 359,
 478,
 488,
 495,
 511,
 563,
 632,
 645,
 679,
 722,
 773,
 790,
 896,
 1119,
 1310,
 1335,
 1336,
 1337,
 1349,
 1412,
 1427,
 1428,
 1507,
 1519,
 1659,
 1671,
 1685,
 1798,
 1807,
 1891,
 1902,
 1932,
 2008,
 2154,
 2164,
 2172,
 2175,
 2260,
 2281,
 2298,
 2370,
 2388,
 2419,
 2462,
 2503,
 2577,
 2652,
 2659,
 2665,
 2727,
 2738,
 2765,
 2800,
 2813,
 2848,
 2876,
 2885,
 2887,
 2893,
 2896,
 2979,
 2983,
 3013,
 3057,
 3086,
 3106,
 3125,
 3162,
 3276,
 3315,
 3345,
 3353,
 3379,
 3428,
 3434,
 3459,
 3622,
 3674,
 3694,
 3700,
 3737,
 3807,
 3813,
 3834,
 3855,
 3857,
 3939,
 3941,
 3945,
 4074,
 4139,
 4174,
 4181,
 4202,
 4242,
 4243,
 4297,
 4353,
 4374,
 4384,
 4390,
 4426,
 4440,
 4454,
 4458,
 4524,
 4557,
 4581,
 4615,
 4635,
 4646,
 4653,
 4660,
 4695,
 4710,
 4769,
 4777,
 4786,
 4810,
 4828,
 4879,
 4998,
 5033,
 5051,
 5053,
 5063,
 5071,
 5095,
 5109,
 5197,
 5209,
 5212,
 5221,
 5224,
 5240,
 5241,
 5268,
 5278,
 5300,
 5328

In [320]:
def func(data):
    
    item_prediction_result_df.loc[data.userId, data.movieId]

test_df.apply(func, axis = 1)

KeyError: 5550.0

In [319]:
item_prediction_result_df#.loc[1]

movieId,1,2,3,4,5,6,7,8,9,10,...,190219,191005,193565,193567,193571,193573,193579,193581,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.20279,4.18783,4.17609,3.03272,4.04561,4.24019,3.9933,3.90547,3.70279,4.18855,...,0.979003,0,0,0,0,0,0,0,0,1.38273
2,3.19261,3.15207,2.46138,0.280903,2.61916,2.9754,1.85877,1.68768,1.09268,3.02235,...,0.387743,1.70885,1.70885,1.70885,1.70885,1.70885,1.70885,1.70885,1.70885,2.32666
3,1.32315,1.20446,1.39635,0.217016,0.819716,1.59889,0.802514,0.755008,0.86708,1.55365,...,0,0,0,0,0,0,0,0,0,0
4,3.48994,3.48805,3.38864,3.15017,3.30488,3.50267,3.31794,2.90629,3.03413,3.41697,...,1.0335,0.505535,0.505535,0.505535,0.505535,0.505535,0.505535,0.505535,0.505535,1.963
5,3.27916,3.1278,2.99947,2.56582,2.94179,3.16869,3.00793,2.67809,2.27668,3.01557,...,0.391638,0,0,0,0,0,0,0,0,0.562705
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,3.6229,3.54834,3.4441,3.50806,3.47302,3.63002,3.625,3.27543,3.26219,3.51926,...,1.83383,1.39051,1.39051,1.39051,1.39051,1.39051,1.39051,1.39051,1.39051,2.99518
607,3.66544,3.65456,3.65215,3.0027,3.50511,3.70112,3.53403,3.35029,3.31835,3.66258,...,0.777542,0,0,0,0,0,0,0,0,0.68735
608,3.19053,3.14796,2.95741,2.55002,3.01801,3.25907,2.99704,2.79422,2.78949,3.20234,...,1.95922,0.44408,0.44408,0.44408,0.44408,0.44408,0.44408,0.44408,0.44408,3.46138
609,2.90968,2.85971,2.7548,1.95234,2.63154,2.84516,2.64514,2.41542,2.22803,2.90974,...,0.246588,0,0,0,0,0,0,0,0,0.272122
