# Movie ratings prediction (recommender system for movie-lens data set) 

### Read and process data

In [1]:
# python 2.7 with anaconda

import numpy as np
from sklearn import datasets
from scipy.stats import mode
import scipy as sp
import pandas as pd

In [2]:
# Data set: Movie Lens 20M (http://grouplens.org/datasets/movielens/20m/)
# Data is in ml-20m directory

genome_scores = pd.read_csv('ml-20m/genome-scores.csv')
genome_tags = pd.read_csv('ml-20m/genome-tags.csv')
links = pd.read_csv('ml-20m/links.csv')
movies = pd.read_csv('ml-20m/movies.csv')
ratings = pd.read_csv('ml-20m/ratings.csv')
tags = pd.read_csv('ml-20m/tags.csv')

In [3]:
# display (movies.info())

In [4]:
for ds in (genome_scores, genome_tags, links, movies, ratings, tags):
    display(ds.head())

Unnamed: 0,movieId,tagId,relevance
0,1,1,0.025
1,1,2,0.025
2,1,3,0.05775
3,1,4,0.09675
4,1,5,0.14675


Unnamed: 0,tagId,tag
0,1,007
1,2,007 (series)
2,3,18th century
3,4,1920s
4,5,1930s


Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


Unnamed: 0,userId,movieId,tag,timestamp
0,18,4141,Mark Waters,1240597180
1,65,208,dark hero,1368150078
2,65,353,dark hero,1368150079
3,65,521,noir thriller,1368149983
4,65,592,dark hero,1368150078


### Helpers

In [5]:
def add_genres(movies):
    new_movies = movies.copy()
    
    for ix in range(0, len(movies)):
        genres = movies['genres'][ix]
        new_movies.at[ix, 'genres'] = [] if genres == "(no genres listed)" \
                                         else genres.split('|')

    return new_movies

movies_raw = movies
movies_with_genres = add_genres(movies_raw)
display(movies_with_genres)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),[Comedy]
5,6,Heat (1995),"[Action, Crime, Thriller]"
6,7,Sabrina (1995),"[Comedy, Romance]"
7,8,Tom and Huck (1995),"[Adventure, Children]"
8,9,Sudden Death (1995),[Action]
9,10,GoldenEye (1995),"[Action, Adventure, Thriller]"


In [8]:
movies = movies_with_genres
# check min rating
display (ratings['rating'].min()) # 0.5 star - 5.0 stars

0.5

#### Define user / rating matrix

In [9]:
user_rating = ratings.head(1000000)\
              .pivot(index = 'userId', columns ='movieId', values = 'rating')\
              .fillna(0)
        
display(user_rating.head())
user_rating_mx = user_rating.as_matrix()

movieId,1,2,3,4,5,6,7,8,9,10,...,129350,129354,129428,129707,130052,130073,130219,130462,130490,130642
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
from sklearn.model_selection import train_test_split

# generate test data
def gen_test_data(user_rating_mx, test_size=0.25):
    data = []

    for i in xrange(user_rating_mx.shape[0]):
        for j in xrange(user_rating_mx.shape[1]):
            if user_rating_mx[i, j] > 0.0:
                data.append(((i, j), user_rating_mx[i, j]))
                
    _, test_data = train_test_split(data, test_size=test_size, random_state=2018)
    return zip(*test_data)

Test data is 25% of all ratings

In [11]:
# test_x -> indices (in provided matrix) of a movie rating from a given user
# test_y -> rating
test_x, test_y = gen_test_data(user_rating_mx, test_size=0.25)

# clear test ratings
for i, j in test_x:
    user_rating_mx[i, j] = 0.0

In [12]:
# display (user_rating.head())
display (user_rating_mx)
display (test_x[:5], test_y[:5])
display (user_rating_mx[0, :5])

array([[ 0. ,  3.5,  0. , ...,  0. ,  0. ,  0. ],
       [ 0. ,  0. ,  4. , ...,  0. ,  0. ,  0. ],
       [ 4. ,  0. ,  0. , ...,  0. ,  0. ,  0. ],
       ..., 
       [ 4. ,  0. ,  0. , ...,  0. ,  0. ,  0. ],
       [ 0. ,  0. ,  0. , ...,  0. ,  0. ,  0. ],
       [ 0. ,  3. ,  3. , ...,  0. ,  0. ,  0. ]])

((5765, 2349), (3108, 1245), (4361, 10680), (6738, 1747), (455, 2723))

(4.0, 4.0, 5.0, 1.0, 1.0)

array([ 0. ,  3.5,  0. ,  0. ,  0. ])

In [13]:
display(user_rating.info())
display(user_rating.as_matrix()[0])

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6743 entries, 1 to 6743
Columns: 13950 entries, 1 to 130642
dtypes: float64(13950)
memory usage: 717.7 MB


None

array([ 0. ,  3.5,  0. , ...,  0. ,  0. ,  0. ])

In [14]:
def pearson(x, y):
    if len(x.shape) == 1:
        x = x.reshape((1, -1))
    if len(y.shape) == 1:
        y = y.reshape((1, -1))
        
    prsn = np.zeros((x.shape[0], y.shape[0]))
    for xi in np.arange(x.shape[0]):
        for yi in np.arange(y.shape[0]):
            prsn[xi, yi] = sp.stats.pearsonr(x[xi,:], y[yi,:])[0]
    
    return prsn

In [15]:
x = user_rating.head(10)
a = pearson(np.array(x[:2]), np.array(x[:5]))
print(a)

[[ 1.          0.09289165  0.23426417  0.02094374  0.09839049]
 [ 0.09289165  1.          0.15400786  0.03758312  0.15327809]]


### First try: user-user collaborative filtering using cosine distance

In [16]:
from sklearn.metrics.pairwise import pairwise_distances

user_cos_dist = pairwise_distances(user_rating, metric='cosine', n_jobs=-1)
item_cos_dist = pairwise_distances(user_rating.T, metric='cosine', n_jobs=-1)

In [17]:
display(user_cos_dist)
display(item_cos_dist)

array([[  0.00000000e+00,   9.02292165e-01,   7.58668270e-01, ...,
          9.18014452e-01,   9.77525990e-01,   9.01573332e-01],
       [  9.02292165e-01,   0.00000000e+00,   8.41312539e-01, ...,
          9.89489631e-01,   1.00000000e+00,   8.90775273e-01],
       [  7.58668270e-01,   8.41312539e-01,   3.33066907e-16, ...,
          9.53333382e-01,   9.76800798e-01,   8.18687381e-01],
       ..., 
       [  9.18014452e-01,   9.89489631e-01,   9.53333382e-01, ...,
          0.00000000e+00,   9.54671595e-01,   9.48529897e-01],
       [  9.77525990e-01,   1.00000000e+00,   9.76800798e-01, ...,
          9.54671595e-01,   0.00000000e+00,   9.33911411e-01],
       [  9.01573332e-01,   8.90775273e-01,   8.18687381e-01, ...,
          9.48529897e-01,   9.33911411e-01,   0.00000000e+00]])

array([[  3.33066907e-16,   6.90454172e-01,   7.54098403e-01, ...,
          1.00000000e+00,   9.79696494e-01,   9.76795993e-01],
       [  6.90454172e-01,   1.11022302e-16,   8.16372160e-01, ...,
          1.00000000e+00,   1.00000000e+00,   9.84710386e-01],
       [  7.54098403e-01,   8.16372160e-01,   0.00000000e+00, ...,
          1.00000000e+00,   1.00000000e+00,   1.00000000e+00],
       ..., 
       [  1.00000000e+00,   1.00000000e+00,   1.00000000e+00, ...,
          1.00000000e+00,   1.00000000e+00,   1.00000000e+00],
       [  9.79696494e-01,   1.00000000e+00,   1.00000000e+00, ...,
          1.00000000e+00,   0.00000000e+00,   1.00000000e+00],
       [  9.76795993e-01,   9.84710386e-01,   1.00000000e+00, ...,
          1.00000000e+00,   1.00000000e+00,   0.00000000e+00]])

In [18]:
import gc 
gc.collect()

61

In [19]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# root-mean-square error
def rmse(test_y, pred_y):
    return sqrt(mean_squared_error(test_y, pred_y))

In [20]:
# simple prediction method, used as a benchmark
def simple_mean_predict(movie_sum_rating, movie_ratings):
    return movie_sum_rating / float(np.count_nonzero(movie_ratings)) 

In [21]:
# user-user Collaborative Filtering
def user_based_predict(similarity, movie_ratings):
    """ predict single movie rating for a single user
        
        similarity    : vector of similariy of a given user to other users
        movie_ratings : each user movie rating (vector)
    """
    mr = movie_ratings.copy()
    mr[mr > 0.0] = 1.0
    
    return (1.0 - similarity).dot(movie_ratings) / ((1.0 - similarity).dot(mr))

In [22]:
def zero_mean(mx, axis=1): 
    return np.true_divide(mx.sum(axis), (mx != 0.0).sum(axis))

In [23]:
mean_user_rating = zero_mean(user_rating_mx)
user_ratings_diff = user_rating_mx - mean_user_rating[:, np.newaxis]
user_ratings_diff[user_rating_mx == 0.0] = 0.0

display(mean_user_rating[:5])
display(user_ratings_diff[:5])

array([ 3.74390244,  4.08163265,  4.15492958,  3.47826087,  4.36      ])

array([[ 0.        , -0.24390244,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        , -0.08163265, ...,  0.        ,
         0.        ,  0.        ],
       [-0.15492958,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        , -1.36      ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [24]:
# item-item Collaborative Filtering
def item_based_predict(similarity, (user_ix, movie_ix)):        
    mr = user_rating_mx[user_ix].copy()
    mr[mr > 0.0] = 1.0
    
    pred = (1.0 - similarity).dot(user_rating_mx[user_ix]) / ((1.0 - similarity).dot(mr))
    return pred

In [25]:
movie_genres = map(lambda x: set(x), movies_with_genres.as_matrix()[:, 2])[:user_rating_mx.shape[1]]
display (movie_genres[:10])

print len(movie_genres)
user_rating_mx.shape

[{'Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy'},
 {'Adventure', 'Children', 'Fantasy'},
 {'Comedy', 'Romance'},
 {'Comedy', 'Drama', 'Romance'},
 {'Comedy'},
 {'Action', 'Crime', 'Thriller'},
 {'Comedy', 'Romance'},
 {'Adventure', 'Children'},
 {'Action'},
 {'Action', 'Adventure', 'Thriller'}]

13950


(6743, 13950)

In [26]:
genres_sim = np.array([[len(movie_genres[j] & movie_genres[i]) for i in xrange(len(movie_genres))]
                                                               for j in xrange(len(movie_genres))])

In [27]:
genres_sim

array([[5, 3, 1, ..., 0, 1, 0],
       [3, 3, 0, ..., 0, 0, 0],
       [1, 0, 2, ..., 1, 1, 0],
       ..., 
       [0, 0, 1, ..., 3, 0, 1],
       [1, 0, 1, ..., 0, 1, 0],
       [0, 0, 0, ..., 1, 0, 1]])

In [28]:
def item_based_with_genres_predict(similarity, (user_ix, movie_ix)):        
    mr = user_rating_mx[user_ix].copy()
    mr[mr > 0.0] = 1.0

    pred = (1.0 - similarity).dot(user_rating_mx[user_ix]) / ((1.0 - similarity).dot(mr))
    pred2 = genres_sim[movie_ix].dot(user_rating_mx[user_ix]) / (genres_sim[movie_ix].dot(mr))
    
    return (pred2 * 0.5 + pred * 0.5)

In [29]:
def user_relative_predict(similarity, (user_ix, movie_ix)):        
    mr = user_rating_mx[:, movie_ix].copy()
    mr[mr > 0.0] = 1.0
    
    movie_wmean = (1.0 - similarity).dot(user_ratings_diff[:,movie_ix]) / ((1.0 - similarity).dot(mr))
    pred = mean_user_rating[user_ix] + movie_wmean
        
    return pred

In [30]:
# remove nan values from predictions and testY
def get_non_nans(preds, test_y):
    return zip(*(filter(lambda x: np.isfinite(x[0]), zip(preds, test_y))))

In [31]:
_pred_with_item_collab2 = map(lambda x: item_based_with_genres_predict(item_cos_dist[x[0]], x), test_x)

  """
  


In [32]:
display (user_rating_mx.shape)
display (user_rating_mx.mean(axis=1))

(6743, 13950)

array([ 0.03301075,  0.01433692,  0.04229391, ...,  0.01197133,
        0.00451613,  0.03562724])

In [33]:
_pred_with_user_based = map(lambda x: user_based_predict(user_cos_dist[x[0]], user_rating_mx[:, x[1]]), test_x)

  # This is added back by InteractiveShellApp.init_path()


In [34]:
print 'sample predictions:'
display (_pred_with_user_based[:5])

sample predictions:


[3.2630160227674825,
 4.1491441100277164,
 3.3972315366959855,
 2.6662558978306805,
 2.5132289282898768]

In [35]:
pred_with_user_based, test_y_filtered = get_non_nans(_pred_with_user_based, test_y)
print ('removed %d data points (which where nan)' % (len(test_y) - len(test_y_filtered),))

removed 857 data points (which where nan)


In [36]:
print 'user-user CF (cos dist):', rmse(test_y_filtered, pred_with_user_based)

user-user CF (cos dist): 0.942201181407


In [37]:
movie_ratings_sum = user_rating_mx.sum(axis=0)

_pred_with_simple_mean = \
    map(lambda x: simple_mean_predict(movie_ratings_sum[x[1]], user_rating_mx[:, x[1]]), test_x)

  This is separate from the ipykernel package so we can avoid doing imports until


In [38]:
pred_with_simple_mean, test_y_filtered = get_non_nans(_pred_with_simple_mean, test_y)
print 'simple mean of user ratings prediction:', rmse(test_y_filtered, pred_with_simple_mean)

simple mean of user ratings prediction: 0.950285371919


In [39]:
_pred_with_user_relative = map(lambda x: user_relative_predict(user_cos_dist[x[0]], x), test_x)

  """


In [40]:
pred_with_user_relative, test_y_filtered = get_non_nans(_pred_with_user_relative, test_y)
print 'user-user (relative version):', rmse(test_y_filtered, pred_with_user_relative)

user-user (relative version): 0.87340621559


In [41]:
_pred_with_item_collab = map(lambda x: item_based_predict(item_cos_dist[x[0]], x), test_x)

  


In [42]:
pred_with_item_collab, test_y_filtered = get_non_nans(_pred_with_item_collab, test_y)
print 'item-item (cos dists) prediction:', rmse(test_y_filtered, pred_with_item_collab)

item-item (cos dists) prediction: 0.978951911295


In [43]:
item_pearson_dist = np.corrcoef(user_rating_mx.T)
item_pearson_dist[item_pearson_dist < 0.0] = 0.0
item_pearson_dist = 1.0 - item_pearson_dist
np.nan_to_num(item_pearson_dist, copy=False)

  c /= stddev[:, None]
  c /= stddev[None, :]
  


array([[ 0.        ,  0.82952891,  0.8636499 , ...,  0.        ,
         0.9836136 ,  0.98024263],
       [ 0.82952891,  0.        ,  0.8943687 , ...,  0.        ,
         1.        ,  0.98811392],
       [ 0.8636499 ,  0.8943687 ,  0.        , ...,  0.        ,
         1.        ,  1.        ],
       ..., 
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.9836136 ,  1.        ,  1.        , ...,  0.        ,
         0.        ,  1.        ],
       [ 0.98024263,  0.98811392,  1.        , ...,  0.        ,
         1.        ,  0.        ]])

In [44]:
_pred_with_item_collab2 = map(lambda x: item_based_predict(item_pearson_dist[x[0]], x), test_x)

  


In [45]:
pred_with_item_collab2, test_y_filtered = get_non_nans(_pred_with_item_collab2, test_y)
print 'item-item (pearson dists) prediction:', rmse(test_y_filtered, pred_with_item_collab2)

item-item (pearson dists) prediction: 0.984140819237


In [46]:
_pred_with_item_collab2 = map(lambda x: item_based_with_genres_predict(item_cos_dist[x[0]], x), test_x)
pred_with_item_collab2, test_y_filtered = get_non_nans(_pred_with_item_collab2, test_y)
print 'item-item (cos dists with genres) prediction:', rmse(test_y_filtered, pred_with_item_collab2)

  """
  


item-item (cos dists with genres) prediction: 0.970763552313


In [47]:
user_pearson_dist = np.corrcoef(user_rating_mx)
user_pearson_dist[user_pearson_dist < 0.0] = 0.0
user_pearson_dist = 1.0 - user_pearson_dist

In [48]:
_pred_with_user_pear_collab = map(lambda x: user_relative_predict(user_pearson_dist[x[0]], x), test_x)

  """


In [49]:
pred_with_user_pear_collab, test_y_filtered = get_non_nans(_pred_with_user_pear_collab, test_y)
print 'user-user CF (relative - pearson dist):', rmse(test_y_filtered, pred_with_user_pear_collab)

user-user CF (relative - pearson dist): 0.872653033835


In [50]:
_pred_with_user_pear_based = map(lambda x: user_based_predict(user_pearson_dist[x[0]], user_rating_mx[:, x[1]]), test_x)

  # This is added back by InteractiveShellApp.init_path()


In [51]:
pred_with_user_pear_based, test_y_filtered = get_non_nans(_pred_with_user_pear_based, test_y)
print 'user-user CF (pearson dist):', rmse(test_y_filtered, pred_with_user_pear_based)

user-user CF (pearson dist): 0.941467176308


Conclusion: Pearson distance measure is slightly better then cosine distance