In [1]:
# !wget 'http://files.grouplens.org/datasets/movielens/ml-1m.zip'
# !unzip 'ml-1m.zip'

In [2]:
import pandas as pd
import numpy as np

In [3]:
ratings = pd.read_csv('ml-1m/ratings.dat', 
                    sep='::', 
                    engine='python', 
                    encoding='latin-1',
                    names=['userId', 'movieId', 'rating', 'timestamp']).drop(['timestamp'], axis=1)
movies = pd.read_csv('ml-1m/movies.dat', 
                    sep='::', 
                    engine='python', 
                    encoding='latin-1',
                    names=['movieId', 'title', 'genres'])

ratings.shape, movies.shape

((1000209, 3), (3883, 3))

In [4]:
ratings.head(2)

Unnamed: 0,userId,movieId,rating
0,1,1193,5
1,1,661,3


In [5]:
movies.head(2)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy


In [6]:
num_movies = ratings['movieId'].unique().shape[0]
num_users = ratings['userId'].unique().shape[0]
print('Number of movies: ',num_movies)
print('Number of users: ',num_users)

Number of movies:  3706
Number of users:  6040


In [7]:
# users have at least rated 20 movies
ratings.groupby(['userId']).size().reset_index().sort_values(0)[:5]


Unnamed: 0,userId,0
946,947,20
4067,4068,20
2529,2530,20
340,341,20
5257,5258,20


In [8]:
def train_test_split(ratings):
    test = np.zeros(ratings.shape)
    train = ratings.copy()
    for user in range(ratings.shape[0]):
        test_ratings = np.random.choice(ratings[user, :].nonzero()[0], 
                                        size=5, 
                                        replace=False)
        train[user, test_ratings] = 0.
        test[user, test_ratings] = ratings[user, test_ratings]

    return train, test

In [9]:
sparse_ratings = ratings.pivot(index = 'userId', columns ='movieId', values = 'rating').fillna(0).values

train, test = train_test_split(sparse_ratings)

train.shape, test.shape

((6040, 3706), (6040, 3706))

In [10]:
from sklearn.metrics.pairwise import pairwise_distances 
user_similarity = 1-pairwise_distances(train, metric='cosine')
item_similarity = 1-pairwise_distances(train.T, metric='cosine')
train.shape, user_similarity.shape, item_similarity.shape

((6040, 3706), (6040, 6040), (3706, 3706))

In [11]:
def predict(ratings, similarity, type='user'):
    sum_abs_similarity = np.abs(similarity).sum(axis=1)
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1).reshape(-1,1) # (:,1)
        # removing bias associated with user, some user tend to give high or low ratings
        ratings_diff = (ratings - mean_user_rating)
        pred = mean_user_rating + similarity.dot(ratings_diff) / sum_abs_similarity.reshape(-1,1) # (:,1)
    elif type == 'item':
        pred = ratings.dot(similarity) / sum_abs_similarity.reshape(1,-1)
    return pred

In [12]:
user_pred = predict(train, user_similarity, type='user')
item_pred = predict(train, item_similarity, type='item')

user_pred.shape, item_pred.shape

((6040, 3706), (6040, 3706))

In [13]:
from sklearn.metrics import mean_squared_error
from math import sqrt

def get_rmse(pred, actual):
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return sqrt(mean_squared_error(pred, actual))

In [14]:
print('user-based RMSE: ' + str(get_rmse(user_pred, test)))
print('item-based RMSE: ' + str(get_rmse(item_pred, test)))

user-based RMSE: 3.133744042016701
item-based RMSE: 3.5981890905373124


In [15]:
def recommend_movies(userId, ratings, predictions, n=5):
    userId = userId-1
    to_watch_movies = np.where(ratings[userId] == 0)[0]
    
    pred_sort = to_watch_movies[predictions[userId, to_watch_movies].argsort()[:-n-1:-1]]
    pred_values = predictions[userId][pred_sort]
    
    pred_movies = [movies.loc[x+1]['title'] for x in pred_sort]
    
    return pd.DataFrame({'movies': pred_movies, 'predictions': pred_values})


In [16]:
recommend_movies(1, train, user_pred, 10)

Unnamed: 0,movies,predictions
0,Trick (1999),2.396623
1,"Perfect Candidate, A (1996)",2.359701
2,"Return of the Pink Panther, The (1974)",2.090124
3,I Don't Want to Talk About It (De eso no se ha...,2.03154
4,Lethal Weapon 4 (1998),2.000907
5,Hustler White (1996),2.000178
6,24 7: Twenty Four Seven (1997),1.95497
7,"Swan Princess, The (1994)",1.953543
8,Princess Caraboo (1994),1.791114
9,Rush Hour (1998),1.783176


# Test

In [17]:
rats = np.array([
    [5,4,5,0,0,0,0,0,0],
    [5,5,4,0,0,0,0,0,0],
    [0,5,4,0,0,0,0,0,0],
    [0,0,0,3,4,5,0,0,0],
    [0,0,0,5,4,5,0,0,0],
    [0,0,0,0,0,0,5,3,4],
    [0,0,0,0,0,0,5,5,3],
    [0,0,0,0,0,0,5,0,3],
])
rats_mean_user = np.mean(rats, axis=1).reshape(-1,1)
rats_mean_item = np.mean(rats, axis=0).reshape(1,-1)
rats_mean_user.shape, rats_mean_item.shape

((8, 1), (1, 9))

In [18]:
sims_user = 1-pairwise_distances(rats, metric='cosine')
sims_item = 1-pairwise_distances(rats.T, metric='cosine')

In [19]:
p_u = predict(rats, sims_user, type='user')
p_i = predict(rats, sims_item, type='item')

In [20]:
p_u

array([[ 3.75897086,  4.7919934 ,  4.51826389,  0.15512864,  0.15512864,
         0.15512864,  0.15512864,  0.15512864,  0.15512864],
       [ 3.73676351,  4.80275062,  4.5130585 ,  0.15790456,  0.15790456,
         0.15790456,  0.15790456,  0.15790456,  0.15790456],
       [ 2.70637567,  4.36099463,  3.96241145, -0.33829696, -0.33829696,
        -0.33829696, -0.33829696, -0.33829696, -0.33829696],
       [-0.10969523, -0.10969523, -0.10969523,  3.87756187,  3.89030477,
         4.89030477, -0.10969523, -0.10969523, -0.10969523],
       [ 0.10969523,  0.10969523,  0.10969523,  4.12243813,  4.10969523,
         5.10969523,  0.10969523,  0.10969523,  0.10969523],
       [ 0.10244534,  0.10244534,  0.10244534,  0.10244534,  0.10244534,
         0.10244534,  5.10244534,  2.83014751,  3.4527351 ],
       [ 0.194408  ,  0.194408  ,  0.194408  ,  0.194408  ,  0.194408  ,
         0.194408  ,  5.194408  ,  3.09230106,  3.54684295],
       [-0.30889156, -0.30889156, -0.30889156, -0.30889156, -0

In [21]:
p_i

array([[4.70172623, 4.63788864, 4.65325176, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ],
       [4.67904096, 4.64577109, 4.64553578, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ],
       [2.775205  , 3.22747242, 3.15159806, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 3.98984537, 4.01005255,
        4.01005255, 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 4.67005154, 4.66331582,
        4.66331582, 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 4.07471679, 3.91961987, 4.07040633],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 4.28824369, 4.38588919, 4.28167762],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 2.86473106, 2.45257737, 2.85559642]])