# Movie ratings prediction (recommender system for movie-lens data set) 

### Read and process data

In [1]:
# python 2.7 with anaconda

import numpy as np
from sklearn import datasets
from scipy.stats import mode
import scipy as sp
import pandas as pd

In [2]:
# Data set: Movie Lens 20M (http://grouplens.org/datasets/movielens/20m/)

genome_scores = pd.read_csv('ml-20m/genome-scores.csv')
genome_tags = pd.read_csv('ml-20m/genome-tags.csv')
links = pd.read_csv('ml-20m/links.csv')
movies = pd.read_csv('ml-20m/movies.csv')
ratings = pd.read_csv('ml-20m/ratings.csv')
tags = pd.read_csv('ml-20m/tags.csv')

In [3]:
# display (movies.info())

In [4]:
for ds in (genome_scores, genome_tags, links, movies, ratings, tags):
    display(ds.head())

Unnamed: 0,movieId,tagId,relevance
0,1,1,0.025
1,1,2,0.025
2,1,3,0.05775
3,1,4,0.09675
4,1,5,0.14675


Unnamed: 0,tagId,tag
0,1,007
1,2,007 (series)
2,3,18th century
3,4,1920s
4,5,1930s


Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


Unnamed: 0,userId,movieId,tag,timestamp
0,18,4141,Mark Waters,1240597180
1,65,208,dark hero,1368150078
2,65,353,dark hero,1368150079
3,65,521,noir thriller,1368149983
4,65,592,dark hero,1368150078


### Helper methods for making prediction

In [5]:
def add_genres(movies):
    new_movies = movies.copy()
    
    for ix in range(0, len(movies)):
        genres = movies['genres'][ix]
        new_movies.at[ix, 'genres'] = [] if genres == "(no genres listed)" \
                                         else genres.split('|')

    return new_movies

movies_raw = movies
movies_with_genres = add_genres(movies_raw.head())
display(movies_with_genres)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),[Comedy]


In [6]:
movies = movies_with_genres
# check min rating
display (ratings['rating'].min()) # 0.5 star - 5.0 stars

0.5

#### Define user / rating matrix

In [7]:
user_rating = ratings.head(1000000)\
              .pivot(index = 'userId', columns ='movieId', values = 'rating')\
              .fillna(0)
        
user_rating_mx = user_rating.as_matrix()

In [8]:
from sklearn.model_selection import train_test_split

# generate test data
def gen_test_data(user_rating_mx, test_size=0.25):
    data = []

    for i in xrange(user_rating_mx.shape[0]):
        for j in xrange(user_rating_mx.shape[1]):
            if user_rating_mx[i, j] > 0.0:
                data.append(((i, j), user_rating_mx[i, j]))
                
    _, test_data = train_test_split(data, test_size=test_size, random_state=2018)
    return zip(*test_data)

Test data is 25% of all ratings

In [9]:
# test_x -> indices (in provided matrix) of a movie rating from a given user
# test_y -> rating
test_x, test_y = gen_test_data(user_rating_mx, test_size=0.25)

# clear test ratings
for i, j in test_x:
    user_rating_mx[i, j] = 0.0

In [10]:
# display (user_rating.head())
display (user_rating_mx)
display (test_x[:5], test_y[:5])
display (user_rating_mx[0, :5])

array([[ 0. ,  3.5,  0. , ...,  0. ,  0. ,  0. ],
       [ 0. ,  0. ,  4. , ...,  0. ,  0. ,  0. ],
       [ 4. ,  0. ,  0. , ...,  0. ,  0. ,  0. ],
       ..., 
       [ 4. ,  0. ,  0. , ...,  0. ,  0. ,  0. ],
       [ 0. ,  0. ,  0. , ...,  0. ,  0. ,  0. ],
       [ 0. ,  3. ,  3. , ...,  0. ,  0. ,  0. ]])

((5765, 2349), (3108, 1245), (4361, 10680), (6738, 1747), (455, 2723))

(4.0, 4.0, 5.0, 1.0, 1.0)

array([ 0. ,  3.5,  0. ,  0. ,  0. ])

In [11]:
display(user_rating.info())
display(user_rating.as_matrix()[0])

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6743 entries, 1 to 6743
Columns: 13950 entries, 1 to 130642
dtypes: float64(13950)
memory usage: 717.7 MB


None

array([ 0. ,  3.5,  0. , ...,  0. ,  0. ,  0. ])

In [12]:
def pearson(x, y):
    if len(x.shape) == 1:
        x = x.reshape((1, -1))
    if len(y.shape) == 1:
        y = y.reshape((1, -1))
        
    prsn = np.zeros((x.shape[0], y.shape[0]))
    for xi in np.arange(x.shape[0]):
        for yi in np.arange(y.shape[0]):
            prsn[xi, yi] = sp.stats.pearsonr(x[xi,:], y[yi,:])[0]
    
    return prsn

In [13]:
x = user_rating.head(10)
a = pearson(np.array(x[:2]), np.array(x[:5]))
print(a)

[[ 1.          0.09289165  0.23426417  0.02094374  0.09839049]
 [ 0.09289165  1.          0.15400786  0.03758312  0.15327809]]


### First try: user-user collaborative filtering using cosine distance

In [14]:
from sklearn.metrics.pairwise import pairwise_distances

user_cos_dist = pairwise_distances(user_rating, metric='cosine', n_jobs=-1)
item_cos_dist = pairwise_distances(user_rating.T, metric='cosine', n_jobs=-1)

In [15]:
display(user_cos_dist)
display(item_cos_dist)

array([[  0.00000000e+00,   9.02292165e-01,   7.58668270e-01, ...,
          9.18014452e-01,   9.77525990e-01,   9.01573332e-01],
       [  9.02292165e-01,   0.00000000e+00,   8.41312539e-01, ...,
          9.89489631e-01,   1.00000000e+00,   8.90775273e-01],
       [  7.58668270e-01,   8.41312539e-01,   3.33066907e-16, ...,
          9.53333382e-01,   9.76800798e-01,   8.18687381e-01],
       ..., 
       [  9.18014452e-01,   9.89489631e-01,   9.53333382e-01, ...,
          0.00000000e+00,   9.54671595e-01,   9.48529897e-01],
       [  9.77525990e-01,   1.00000000e+00,   9.76800798e-01, ...,
          9.54671595e-01,   0.00000000e+00,   9.33911411e-01],
       [  9.01573332e-01,   8.90775273e-01,   8.18687381e-01, ...,
          9.48529897e-01,   9.33911411e-01,   0.00000000e+00]])

array([[  3.33066907e-16,   6.90454172e-01,   7.54098403e-01, ...,
          1.00000000e+00,   9.79696494e-01,   9.76795993e-01],
       [  6.90454172e-01,   1.11022302e-16,   8.16372160e-01, ...,
          1.00000000e+00,   1.00000000e+00,   9.84710386e-01],
       [  7.54098403e-01,   8.16372160e-01,   0.00000000e+00, ...,
          1.00000000e+00,   1.00000000e+00,   1.00000000e+00],
       ..., 
       [  1.00000000e+00,   1.00000000e+00,   1.00000000e+00, ...,
          1.00000000e+00,   1.00000000e+00,   1.00000000e+00],
       [  9.79696494e-01,   1.00000000e+00,   1.00000000e+00, ...,
          1.00000000e+00,   0.00000000e+00,   1.00000000e+00],
       [  9.76795993e-01,   9.84710386e-01,   1.00000000e+00, ...,
          1.00000000e+00,   1.00000000e+00,   0.00000000e+00]])

In [16]:
import gc 
gc.collect()

61

In [17]:
from sklearn.metrics import mean_squared_error
from math import sqrt

def rmse(test_y, pred_y):
    return sqrt(mean_squared_error(test_y, pred_y))

In [18]:
def simple_mean_predict(movie_sum_rating, movie_ratings):
    return movie_sum_rating / float(np.count_nonzero(movie_ratings)) 

In [51]:
def user_based_predict(similarity, movie_ratings):
    """ predict single movie rating for a single user
        
        similarity    : vector of similariy of a given user to other users
        movie_ratings : each user movie rating (vector)
    """
    cnt = np.count_nonzero(movie_ratings)
    mr = movie_ratings.copy()
    for i in xrange(len(mr)):
        if mr[i] > 0.0: mr[i] = 1.0
    #print 'cnt:', cnt
    # TODO:
    #  - normalize ratings
    #  - 
    #display (mr[:10])
    #display (similarity[:10])
    return (1.0 - similarity).dot(movie_ratings) / ((1.0-similarity).dot(mr))

In [42]:
def user_collab_predict(similarity, ratings):
    mean_user_rating = ratings.mean(axis=1)
    ratings_diff = ratings - mean_user_rating[:, np.newaxis]
    pred = mean_user_rating[:, np.newaxis] + \
             similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
        
    return pred

In [43]:
def get_non_nans(preds, test_y):
    return zip(*(filter(lambda x: np.isfinite(x[0]), zip(preds, test_y))))

In [53]:
# sample prediction
display(user_based_predict(user_cos_dist[2], user_rating_mx[:,0]))
display(user_rating)

3.9396705543637789

movieId,1,2,3,4,5,6,7,8,9,10,...,129350,129354,129428,129707,130052,130073,130219,130462,130490,130642
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,5.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,3.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,4.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
display (user_rating_mx.shape)
display (user_rating_mx.mean(axis=1))

(6743, 13950)

array([ 0.03301075,  0.01433692,  0.04229391, ...,  0.01197133,
        0.00451613,  0.03562724])

In [54]:
pred_with_user_based = map(lambda x: user_based_predict(user_cos_dist[x[0]], user_rating_mx[:, x[1]]), test_x)



In [55]:
display (pred_with_user_based[:5])

[3.2630160227674825,
 4.1491441100277164,
 3.3972315366959855,
 2.6662558978306805,
 2.5132289282898768]

In [56]:
pred_with_user_based, test_y_filtered = get_non_nans(pred_with_user_based, test_y)
# display (pred_with_user_based[:10])
print ('removed %d data points (which where nan)' % (len(test_y) - len(test_y_filtered),))

removed 857 data points (which where nan)


In [57]:
display (rmse(test_y_filtered, pred_with_user_based))

0.9422011814071392

In [58]:
display (test_y_filtered[:10], pred_with_user_based[:10])

(4.0, 4.0, 5.0, 1.0, 1.0, 1.0, 2.0, 4.0, 3.5, 4.0)

(3.2630160227674825,
 4.1491441100277164,
 3.3972315366959855,
 2.6662558978306805,
 2.5132289282898768,
 2.708920343439372,
 2.8100747314844252,
 3.8250394878201646,
 3.7879295364607404,
 3.8693825369381463)

Result is RMSE of around 0.942

In [59]:
movie_ratings_sum = user_rating_mx.sum(axis=0)

pred_with_simple_mean = \
    map(lambda x: simple_mean_predict(movie_ratings_sum[x[1]], user_rating_mx[:, x[1]]), test_x)

  


In [60]:
pred_with_simple_mean, test_y_filtered = get_non_nans(pred_with_simple_mean, test_y)
display (rmse(test_y_filtered, pred_with_simple_mean))

0.9502853719187646

In [61]:
display (test_y_filtered[:10], pred_with_simple_mean[:10])

(4.0, 4.0, 5.0, 1.0, 1.0, 1.0, 2.0, 4.0, 3.5, 4.0)

(3.2894736842105261,
 4.1635220125786168,
 3.3333333333333335,
 2.7681818181818181,
 2.3529411764705883,
 2.7086330935251799,
 2.7647058823529411,
 3.7749999999999999,
 3.768729641693811,
 3.8836443468715696)

In [62]:
pred_with_user_collab = user_collab_predict(user_cos_dist, user_rating_mx)

In [63]:
display (pred_with_user_collab)

array([[ 1.05003818,  0.39348991,  0.24658002, ...,  0.00615759,
         0.00666124,  0.00657758],
       [ 1.03474534,  0.38010867,  0.21916912, ..., -0.01317821,
        -0.01266035, -0.01273032],
       [ 1.04787261,  0.40317641,  0.25326638, ...,  0.01550136,
         0.01599866,  0.01593159],
       ..., 
       [ 1.02178092,  0.38065502,  0.22347203, ..., -0.0156229 ,
        -0.01512821, -0.01519942],
       [ 1.04422959,  0.37268127,  0.21691084, ..., -0.02370962,
        -0.02317393, -0.0232547 ],
       [ 1.03333748,  0.37500544,  0.2292211 , ...,  0.00843482,
         0.00892996,  0.00887527]])

In [34]:
user_cos_dist[0]

array([ 0.        ,  0.90229216,  0.75866827, ...,  0.91801445,
        0.97752599,  0.90157333])

In [35]:
display (test_x[:4], test_y[:4])
pred_with_user_collab.T[3108, 1245]

((5765, 2349), (3108, 1245), (4361, 10680), (6738, 1747))

(4.0, 4.0, 5.0, 1.0)

-0.0096814456008282625