In [2]:
import pandas as pd
import numpy as np

In [4]:
df = pd.read_csv('Recommend.csv',names=['user_id', 'movie_id', 'rating', 'timestamp'])
df.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [5]:
from sklearn.model_selection import train_test_split
n_users = df.user_id.unique().shape[0] 
n_movies = df.movie_id.unique().shape[0]
train_data, test_data = train_test_split(df, test_size=0.25)

In [7]:
axis = np.newaxis
axis

In [8]:
train_data_matrix = np.zeros((n_users, n_movies))
for line in train_data.itertuples():
    #[user_id index, movie_id index] = given rating.
    train_data_matrix[line[1]-1, line[2]-1] = line[3] 
train_data_matrix

array([[5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [9]:
test_data_matrix = np.zeros((n_users, n_movies))
for line in test_data.itertuples():
    #[user_id index, movie_id index] = given rating.
    test_data_matrix[line[1]-1, line[2]-1] = line[3]
test_data_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]])

In [10]:
from sklearn.metrics import pairwise_distances
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
movie_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')
mean_user_rating = train_data_matrix.mean(axis=1)[:, np.newaxis] 
ratings_diff = (train_data_matrix - mean_user_rating) 
user_pred = mean_user_rating + user_similarity.dot(ratings_diff) / np.array([np.abs(user_similarity).sum(axis=1)]).T
user_pred

array([[ 1.52230695,  0.5415636 ,  0.46370038, ...,  0.29145256,
         0.2913536 ,  0.29111565],
       [ 1.27095364,  0.25644547,  0.1432306 , ..., -0.05615223,
        -0.05471476, -0.05465459],
       [ 1.25174831,  0.19016622,  0.0846035 , ..., -0.11859025,
        -0.11718063, -0.11703066],
       ...,
       [ 1.14472216,  0.17978317,  0.06831918, ..., -0.1205954 ,
        -0.11971666, -0.11965975],
       [ 1.29215352,  0.2655818 ,  0.18142624, ..., -0.01722874,
        -0.0165043 , -0.01627576],
       [ 1.33594763,  0.34392243,  0.26551484, ...,  0.08882104,
         0.08883017,  0.08882008]])

In [11]:
mean_user_rating + user_similarity.dot(ratings_diff) / np.array([np.abs(user_similarity).sum(axis=1)]).T

array([[ 1.52230695,  0.5415636 ,  0.46370038, ...,  0.29145256,
         0.2913536 ,  0.29111565],
       [ 1.27095364,  0.25644547,  0.1432306 , ..., -0.05615223,
        -0.05471476, -0.05465459],
       [ 1.25174831,  0.19016622,  0.0846035 , ..., -0.11859025,
        -0.11718063, -0.11703066],
       ...,
       [ 1.14472216,  0.17978317,  0.06831918, ..., -0.1205954 ,
        -0.11971666, -0.11965975],
       [ 1.29215352,  0.2655818 ,  0.18142624, ..., -0.01722874,
        -0.0165043 , -0.01627576],
       [ 1.33594763,  0.34392243,  0.26551484, ...,  0.08882104,
         0.08883017,  0.08882008]])

In [12]:
mean_user_rating 

array([[0.43995244],
       [0.11117717],
       [0.04875149],
       [0.0469679 ],
       [0.21105826],
       [0.35969084],
       [0.7098692 ],
       [0.10820452],
       [0.03507729],
       [0.32639715],
       [0.27586207],
       [0.08917955],
       [0.92211653],
       [0.18727705],
       [0.14447087],
       [0.24732461],
       [0.03745541],
       [0.47859691],
       [0.02378121],
       [0.06064209],
       [0.22294887],
       [0.18370987],
       [0.25862069],
       [0.13495838],
       [0.14149822],
       [0.14744352],
       [0.039239  ],
       [0.12604043],
       [0.0588585 ],
       [0.07074911],
       [0.06242568],
       [0.05648038],
       [0.03864447],
       [0.03032105],
       [0.03151011],
       [0.03269917],
       [0.08680143],
       [0.18906064],
       [0.04399524],
       [0.03745541],
       [0.08977408],
       [0.3097503 ],
       [0.34601665],
       [0.25326992],
       [0.07372176],
       [0.05707491],
       [0.03567182],
       [0.123

In [13]:
user_similarity

array([[0.        , 0.88103104, 0.97078998, ..., 0.86073093, 0.84538022,
        0.68486349],
       [0.88103104, 0.        , 0.91242785, ..., 0.83948924, 0.89995262,
        0.91736575],
       [0.97078998, 0.91242785, 0.        , ..., 0.90329868, 0.96031926,
        0.95630086],
       ...,
       [0.86073093, 0.83948924, 0.90329868, ..., 0.        , 0.96580472,
        0.90538384],
       [0.84538022, 0.89995262, 0.96031926, ..., 0.96580472, 0.        ,
        0.85010803],
       [0.68486349, 0.91736575, 0.95630086, ..., 0.90538384, 0.85010803,
        0.        ]])

In [14]:
ratings_diff

array([[ 4.56004756,  2.56004756,  3.56004756, ..., -0.43995244,
        -0.43995244, -0.43995244],
       [ 3.88882283, -0.11117717, -0.11117717, ..., -0.11117717,
        -0.11117717, -0.11117717],
       [-0.04875149, -0.04875149, -0.04875149, ..., -0.04875149,
        -0.04875149, -0.04875149],
       ...,
       [ 4.95600476, -0.04399524, -0.04399524, ..., -0.04399524,
        -0.04399524, -0.04399524],
       [-0.14090369, -0.14090369, -0.14090369, ..., -0.14090369,
        -0.14090369, -0.14090369],
       [-0.23959572, -0.23959572, -0.23959572, ..., -0.23959572,
        -0.23959572, -0.23959572]])

In [15]:
user_similarity

array([[0.        , 0.88103104, 0.97078998, ..., 0.86073093, 0.84538022,
        0.68486349],
       [0.88103104, 0.        , 0.91242785, ..., 0.83948924, 0.89995262,
        0.91736575],
       [0.97078998, 0.91242785, 0.        , ..., 0.90329868, 0.96031926,
        0.95630086],
       ...,
       [0.86073093, 0.83948924, 0.90329868, ..., 0.        , 0.96580472,
        0.90538384],
       [0.84538022, 0.89995262, 0.96031926, ..., 0.96580472, 0.        ,
        0.85010803],
       [0.68486349, 0.91736575, 0.95630086, ..., 0.90538384, 0.85010803,
        0.        ]])

In [16]:
movie_pred = train_data_matrix.dot(movie_similarity) / np.array([np.abs(movie_similarity).sum(axis=1)])
movie_pred

array([[0.36176994, 0.38366008, 0.39266571, ..., 0.4432017 , 0.4326203 ,
        0.42821342],
       [0.09441736, 0.10828642, 0.10523846, ..., 0.10967169, 0.11169044,
        0.11141362],
       [0.04710408, 0.04914378, 0.04786567, ..., 0.0469866 , 0.04843063,
        0.04927277],
       ...,
       [0.0301919 , 0.0398214 , 0.0375038 , ..., 0.04407853, 0.04337073,
        0.04333391],
       [0.11870632, 0.12794114, 0.13584232, ..., 0.14017288, 0.1389391 ,
        0.14003433],
       [0.19021445, 0.20165478, 0.20961064, ..., 0.24184031, 0.23370001,
        0.23356466]])

In [17]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(pred, test):
    pred = pred[test.nonzero()].flatten() 
    test = test[test.nonzero()].flatten()
    return sqrt(mean_squared_error(pred, test))

In [18]:
rmse(user_pred, test_data_matrix)

3.127243665648637

In [19]:
rmse(movie_pred, test_data_matrix)

3.4541961024248926