In [1]:
# Collaborative Filtering : User similarity based recommendation
## 1) Compute the similarity between every pair of users (using a similarity measure)
## 2) Look up the similarities between the current user and all other users
## 3) Estimate ratings for the items the current user has not rated yet
## 4) Recommend the N items with the highest estimated ratings

# Similarity measure
# 1) Correlations
# 2) Cosine similarity : simil(x,y) = \frac{\sum_i r_{x,i} r_{y,i}}{\sqrt{\sum_i r_{x,i}^2} \sqrt{\sum_i r_{y,i}^2}}
# 3) Tanimoto coefficient (Binary data) : simil(x,y) = c/(a+b-c), where a and b are the numbers of items chosen by x and by y, and c is the number chosen by both
# 4) Jaccard coefficient (Binary data)
# 5) Pearson correlation coefficient

In [2]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [3]:
# Data directory, given relative to the notebook's working directory
data_dir = '../../../../data/'

# Column names for the three header-less input files
# (presumably the MovieLens 100K layout - verify against the data set's README)
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
i_cols = [
    'movie_id', 'title', 'release date', 'video release date', 'IMDB URL',
    'unknown', 'Action', 'Adventure', 'Animation', 'Childeren\'s', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']

# User table: pipe-separated
users = pd.read_csv(data_dir + 'u.user', sep='|', names=u_cols, encoding='latin-1')

# Movie table: pipe-separated; keep only the movie ID and title, dropping every other column
movies = pd.read_csv(
    data_dir + 'u.item', sep='|', names=i_cols, encoding='latin-1'
)[['movie_id', 'title']]

# Rating table: tab-separated; the timestamp column is removed
ratings = pd.read_csv(
    data_dir + 'u.data', sep='\t', names=r_cols, encoding='latin-1'
).drop(columns='timestamp')

### User-Item Matrix

In [5]:
# Fixed seed so the split (and every RMSE computed from it) is reproducible
# under Restart Kernel -> Run All.
RANDOM_STATE = 42

# Hold out 25% of the ratings for evaluation. Stratifying on user_id keeps each
# user's ratings split in the same proportion between the train and test sets.
x = ratings.copy()
y = ratings['user_id']
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, stratify=y, random_state=RANDOM_STATE
)

def RMSE(y_true, y_pred):
    """Return the root-mean-square error between two sequences.

    Parameters
    ----------
    y_true, y_pred : array-like
        Observed and predicted values; both are converted to NumPy arrays
        and subtracted element-wise.

    Returns
    -------
    The square root of the mean squared difference.
    """
    errors = np.asarray(y_true) - np.asarray(y_pred)
    mean_squared_error = np.mean(np.square(errors))
    return np.sqrt(mean_squared_error)

def score(model):
    """Evaluate a rating predictor on the held-out test set.

    Parameters
    ----------
    model : callable
        Called as ``model(user_id, movie_id)`` for every row of ``x_test``;
        must return the predicted rating for that pair.

    Returns
    -------
    The RMSE between the predictions and the ``rating`` column of ``x_test``.
    """
    predictions = np.array([
        model(uid, mid)
        for uid, mid in zip(x_test['user_id'], x_test['movie_id'])
    ])
    actual = np.array(x_test['rating'])
    return RMSE(actual, predictions)

# User-item matrix built from the training ratings only: one row per user_id,
# one column per movie_id, NaN where the pair has no training rating.
rating_matrix = x_train.pivot(values='rating', index='user_id', columns='movie_id')
rating_matrix

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,5.0,,...,,,,,,,,,,
940,,,,2.0,,,,5.0,3.0,,...,,,,,,,,,,
941,5.0,,,,,,4.0,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


### User-User Similarity Matrix 

In [6]:
# Cosine similarity between every pair of users in the training set.
# Missing ratings are replaced by 0 first; fillna returns a new frame,
# so rating_matrix itself keeps its NaNs.
dummy_matrix = rating_matrix.fillna(0)

# Wrap the raw similarity array so that both axes are labelled by user_id.
user_similarity = pd.DataFrame(
    cosine_similarity(dummy_matrix, dummy_matrix),
    index=rating_matrix.index,
    columns=rating_matrix.index,
)
print('User similirarity matrix')
user_similarity

User similirarity matrix


user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.172823,0.014915,0.011442,0.248115,0.342545,0.325373,0.236226,0.071352,0.277756,...,0.297696,0.097647,0.225789,0.157429,0.211078,0.117602,0.210699,0.121136,0.147133,0.327089
2,0.172823,1.000000,0.081583,0.146540,0.065135,0.179882,0.067089,0.078834,0.139252,0.142387,...,0.087555,0.217577,0.283508,0.330042,0.276425,0.213303,0.091183,0.182386,0.094923,0.085006
3,0.014915,0.081583,1.000000,0.215194,0.027876,0.027177,0.028713,0.075722,0.056803,0.034532,...,0.032991,0.057279,0.125059,0.078330,0.136701,0.015153,0.139934,0.048218,0.160619,0.034909
4,0.011442,0.146540,0.215194,1.000000,0.000000,0.038916,0.065344,0.061564,0.058100,0.020777,...,0.024103,0.048823,0.147494,0.173591,0.123564,0.038747,0.131613,0.197275,0.086055,0.035706
5,0.248115,0.065135,0.027876,0.000000,1.000000,0.167418,0.278878,0.210463,0.067059,0.153149,...,0.263021,0.074281,0.103402,0.049039,0.141594,0.029476,0.172623,0.105115,0.088654,0.290359
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.117602,0.213303,0.015153,0.038747,0.029476,0.127627,0.052345,0.070793,0.051778,0.062954,...,0.012888,0.363743,0.165642,0.251329,0.363969,1.000000,0.070374,0.235143,0.040158,0.115464
940,0.210699,0.091183,0.139934,0.131613,0.172623,0.222816,0.228030,0.121435,0.070350,0.249058,...,0.269959,0.070940,0.114583,0.150367,0.092526,0.070374,1.000000,0.138096,0.160088,0.178343
941,0.121136,0.182386,0.048218,0.197275,0.105115,0.076937,0.033313,0.085603,0.135105,0.086022,...,0.000000,0.163375,0.267988,0.236292,0.287703,0.235143,0.138096,1.000000,0.099390,0.050629
942,0.147133,0.094923,0.160619,0.086055,0.088654,0.214471,0.180090,0.150939,0.102033,0.157912,...,0.229683,0.022489,0.075907,0.143008,0.075828,0.040158,0.160088,0.099390,1.000000,0.150523


In [8]:
def simple_cf(user_id, movie_id, *, default_rating=3.0):
    """Predict a rating as the similarity-weighted mean of other users' ratings.

    The prediction uses every user who rated ``movie_id`` in ``rating_matrix``,
    weighting each rating by that user's similarity to ``user_id`` taken from
    ``user_similarity``.

    Parameters
    ----------
    user_id
        Label of the user to predict for (a column of ``user_similarity``).
    movie_id
        Label of the movie to predict (a column of ``rating_matrix``).
    default_rating : float, keyword-only
        Value returned when no weighted mean can be computed: the movie or
        the user is absent from the training matrices, or the similarities
        to all raters sum to zero.

    Returns
    -------
    The weighted-mean rating, or ``default_rating`` in the fallback cases.
    """
    # Cold start: nothing is known about this movie or this user.
    if movie_id not in rating_matrix or user_id not in user_similarity:
        return default_rating

    # Ratings of the movie by the users who actually rated it ...
    movie_ratings = rating_matrix[movie_id].dropna()
    # ... and this user's similarity to exactly those raters, in the same order.
    sim_scores = user_similarity[user_id].loc[movie_ratings.index]

    sim_total = sim_scores.sum()
    # A zero weight sum would make the division below yield NaN/inf, and a
    # single NaN prediction turns the overall RMSE into NaN.
    if sim_total == 0:
        return default_rating

    return np.dot(sim_scores, movie_ratings) / sim_total

# Evaluate the baseline model: RMSE of simple_cf's predictions on the held-out test ratings
score(simple_cf)

1.0168108797156636

In [None]:
# practice 3-1 : using Pearson correlation coefficient

In [None]:
# RMSE with neighbor
def score(model, neighbor_size=0):
    