# Movie ratings prediction (recommender system for movie-lens data set) 

### Read and process data

In [1]:
# python 2.7 with anaconda

import numpy as np
from sklearn import datasets
from scipy.stats import mode
import scipy as sp
import pandas as pd

In [2]:
# Data set: Movie Lens 20M (http://grouplens.org/datasets/movielens/20m/)

genome_scores = pd.read_csv('ml-20m/genome-scores.csv')
genome_tags = pd.read_csv('ml-20m/genome-tags.csv')
links = pd.read_csv('ml-20m/links.csv')
movies = pd.read_csv('ml-20m/movies.csv')
ratings = pd.read_csv('ml-20m/ratings.csv')
tags = pd.read_csv('ml-20m/tags.csv')

In [3]:
# display (movies.info())

In [4]:
for ds in (genome_scores, genome_tags, links, movies, ratings, tags):
    display(ds.head())

Unnamed: 0,movieId,tagId,relevance
0,1,1,0.025
1,1,2,0.025
2,1,3,0.05775
3,1,4,0.09675
4,1,5,0.14675


Unnamed: 0,tagId,tag
0,1,007
1,2,007 (series)
2,3,18th century
3,4,1920s
4,5,1930s


Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


Unnamed: 0,userId,movieId,tag,timestamp
0,18,4141,Mark Waters,1240597180
1,65,208,dark hero,1368150078
2,65,353,dark hero,1368150079
3,65,521,noir thriller,1368149983
4,65,592,dark hero,1368150078


### Helper methods for making prediction

In [5]:
def add_genres(movies):
    new_movies = movies.copy()
    
    for ix in range(0, len(movies)):
        genres = movies['genres'][ix]
        new_movies.at[ix, 'genres'] = [] if genres == "(no genres listed)" \
                                         else genres.split('|')

    return new_movies

movies_raw = movies
movies_with_genres = add_genres(movies_raw.head())
display(movies_with_genres)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),[Comedy]


In [6]:
movies = movies_with_genres
# check min rating
display (ratings['rating'].min()) # 0.5 star - 5.0 stars

0.5

#### Define user / rating matrix

In [7]:
user_rating = ratings.head(1000000)\
              .pivot(index = 'userId', columns ='movieId', values = 'rating')\
              .fillna(0)
        
user_rating_mx = user_rating.as_matrix()

In [8]:
from sklearn.model_selection import train_test_split

# generate test data
def gen_test_data(user_rating_mx, test_size=0.25):
    data = []

    for i in xrange(user_rating_mx.shape[0]):
        for j in xrange(user_rating_mx.shape[1]):
            if user_rating_mx[i, j] > 0.0:
                data.append(((i, j), user_rating_mx[i, j]))
                
    _, test_data = train_test_split(data, test_size=test_size, random_state=2018)
    return zip(*test_data)

Test data is 25% of all ratings

In [9]:
# test_x -> indices (in provided matrix) of a movie rating from a given user
# test_y -> rating
test_x, test_y = gen_test_data(user_rating_mx, test_size=0.25)

# clear test ratings
for i, j in test_x:
    user_rating_mx[i, j] = 0.0

In [10]:
# display (user_rating.head())
display (user_rating_mx)
display (test_x[:5], test_y[:5])
display (user_rating_mx[0, :5])

array([[ 0. ,  3.5,  0. , ...,  0. ,  0. ,  0. ],
       [ 0. ,  0. ,  4. , ...,  0. ,  0. ,  0. ],
       [ 4. ,  0. ,  0. , ...,  0. ,  0. ,  0. ],
       ..., 
       [ 4. ,  0. ,  0. , ...,  0. ,  0. ,  0. ],
       [ 0. ,  0. ,  0. , ...,  0. ,  0. ,  0. ],
       [ 0. ,  3. ,  3. , ...,  0. ,  0. ,  0. ]])

((5765, 2349), (3108, 1245), (4361, 10680), (6738, 1747), (455, 2723))

(4.0, 4.0, 5.0, 1.0, 1.0)

array([ 0. ,  3.5,  0. ,  0. ,  0. ])

In [11]:
display(user_rating.info())
display(user_rating.as_matrix()[0])

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6743 entries, 1 to 6743
Columns: 13950 entries, 1 to 130642
dtypes: float64(13950)
memory usage: 717.7 MB


None

array([ 0. ,  3.5,  0. , ...,  0. ,  0. ,  0. ])

In [12]:
def pearson(x, y):
    if len(x.shape) == 1:
        x = x.reshape((1, -1))
    if len(y.shape) == 1:
        y = y.reshape((1, -1))
        
    prsn = np.zeros((x.shape[0], y.shape[0]))
    for xi in np.arange(x.shape[0]):
        for yi in np.arange(y.shape[0]):
            prsn[xi, yi] = sp.stats.pearsonr(x[xi,:], y[yi,:])[0]
    
    return prsn

In [13]:
x = user_rating.head(10)
a = pearson(np.array(x[:2]), np.array(x[:5]))
print(a)

[[ 1.          0.09289165  0.23426417  0.02094374  0.09839049]
 [ 0.09289165  1.          0.15400786  0.03758312  0.15327809]]


### First try: user-user collaborative filtering using cosine distance

In [14]:
from sklearn.metrics.pairwise import pairwise_distances

user_cos_dist = pairwise_distances(user_rating, metric='cosine', n_jobs=-1)
item_cos_dist = pairwise_distances(user_rating.T, metric='cosine', n_jobs=-1)

In [15]:
display(user_cos_dist)
display(item_cos_dist)

array([[  0.00000000e+00,   9.02292165e-01,   7.58668270e-01, ...,
          9.18014452e-01,   9.77525990e-01,   9.01573332e-01],
       [  9.02292165e-01,   0.00000000e+00,   8.41312539e-01, ...,
          9.89489631e-01,   1.00000000e+00,   8.90775273e-01],
       [  7.58668270e-01,   8.41312539e-01,   3.33066907e-16, ...,
          9.53333382e-01,   9.76800798e-01,   8.18687381e-01],
       ..., 
       [  9.18014452e-01,   9.89489631e-01,   9.53333382e-01, ...,
          0.00000000e+00,   9.54671595e-01,   9.48529897e-01],
       [  9.77525990e-01,   1.00000000e+00,   9.76800798e-01, ...,
          9.54671595e-01,   0.00000000e+00,   9.33911411e-01],
       [  9.01573332e-01,   8.90775273e-01,   8.18687381e-01, ...,
          9.48529897e-01,   9.33911411e-01,   0.00000000e+00]])

array([[  3.33066907e-16,   6.90454172e-01,   7.54098403e-01, ...,
          1.00000000e+00,   9.79696494e-01,   9.76795993e-01],
       [  6.90454172e-01,   1.11022302e-16,   8.16372160e-01, ...,
          1.00000000e+00,   1.00000000e+00,   9.84710386e-01],
       [  7.54098403e-01,   8.16372160e-01,   0.00000000e+00, ...,
          1.00000000e+00,   1.00000000e+00,   1.00000000e+00],
       ..., 
       [  1.00000000e+00,   1.00000000e+00,   1.00000000e+00, ...,
          1.00000000e+00,   1.00000000e+00,   1.00000000e+00],
       [  9.79696494e-01,   1.00000000e+00,   1.00000000e+00, ...,
          1.00000000e+00,   0.00000000e+00,   1.00000000e+00],
       [  9.76795993e-01,   9.84710386e-01,   1.00000000e+00, ...,
          1.00000000e+00,   1.00000000e+00,   0.00000000e+00]])

In [16]:
import gc 
gc.collect()

61

In [17]:
from sklearn.metrics import mean_squared_error
from math import sqrt

def rmse(test_y, pred_y):
    return sqrt(mean_squared_error(test_y, pred_y))

In [18]:
def simple_mean_predict(movie_sum_rating, movie_ratings):
    return movie_sum_rating / float(np.count_nonzero(movie_ratings)) 

In [25]:
def user_based_predict(similarity, movie_ratings):
    """ predict single movie rating for a single user
        
        similarity    : vector of similariy of a given user to other users
        movie_ratings : each user movie rating (vector)
    """
    mr = movie_ratings.copy()
    mr[mr > 0.0] = 1.0
    
    return (1.0 - similarity).dot(movie_ratings) / ((1.0 - similarity).dot(mr))

In [20]:
def zero_mean(mx, axis=1): 
    return np.true_divide(mx.sum(axis), (mx != 0.0).sum(axis))

mean_user_rating = zero_mean(user_rating_mx)
user_ratings_diff = user_rating_mx - mean_user_rating[:, np.newaxis]
user_ratings_diff[user_rating_mx == 0.0] = 0.0

display(mean_user_rating[:5])
display(user_ratings_diff[:5])

array([ 3.74390244,  4.08163265,  4.15492958,  3.47826087,  4.36      ])

array([[ 0.        , -0.24390244,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        , -0.08163265, ...,  0.        ,
         0.        ,  0.        ],
       [-0.15492958,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        , -1.36      ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [21]:
def user_collab_predict(similarity, (user_ix, movie_ix)):        
    mr = user_rating_mx[:, movie_ix].copy()
    mr[mr > 0.0] = 1.0
    
    movie_wmean = (1.0 - similarity).dot(user_ratings_diff[:,movie_ix]) / ((1.0 - similarity).dot(mr))
    pred = mean_user_rating[user_ix] + 1.2 * movie_wmean
        
    return pred

def item_collab_predict(similarity, (user_ix, movie_ix)):        
    mr = user_rating_mx[user_ix].copy()
    mr[mr > 0.0] = 1.0
    
    pred = (1.0 - similarity).dot(user_rating_mx[user_ix]) / ((1.0 - similarity).dot(mr))
    return pred

In [22]:
def get_non_nans(preds, test_y):
    return zip(*(filter(lambda x: np.isfinite(x[0]), zip(preds, test_y))))

In [23]:
display(user_collab_predict(user_cos_dist[0], (0, 0)))

4.1060482685052264

In [26]:
# sample prediction
display(user_based_predict(user_cos_dist[0], user_rating_mx[:,0]))
#display(user_rating_mx[0,:])
# display(user_rating)

3.8903240903730274

In [27]:
display (user_rating_mx.shape)
display (user_rating_mx.mean(axis=1))

(6743, 13950)

array([ 0.03301075,  0.01433692,  0.04229391, ...,  0.01197133,
        0.00451613,  0.03562724])

In [28]:
pred_with_user_based = map(lambda x: user_based_predict(user_cos_dist[x[0]], user_rating_mx[:, x[1]]), test_x)

  # Remove the CWD from sys.path while we load stuff.


In [29]:
display (pred_with_user_based[:5])

[3.2630160227674825,
 4.1491441100277164,
 3.3972315366959855,
 2.6662558978306805,
 2.5132289282898768]

In [30]:
pred_with_user_based, test_y_filtered = get_non_nans(pred_with_user_based, test_y)
# display (pred_with_user_based[:10])
print ('removed %d data points (which where nan)' % (len(test_y) - len(test_y_filtered),))

removed 857 data points (which where nan)


In [31]:
display (rmse(test_y_filtered, pred_with_user_based))

0.9422011814071392

In [32]:
display (test_y_filtered[:5], pred_with_user_based[:5])

(4.0, 4.0, 5.0, 1.0, 1.0)

(3.2630160227674825,
 4.1491441100277164,
 3.3972315366959855,
 2.6662558978306805,
 2.5132289282898768)

Result of cos-dist weithed prediction is RMSE of around 0.942

In [33]:
movie_ratings_sum = user_rating_mx.sum(axis=0)

pred_with_simple_mean = \
    map(lambda x: simple_mean_predict(movie_ratings_sum[x[1]], user_rating_mx[:, x[1]]), test_x)

  


In [34]:
pred_with_simple_mean, test_y_filtered = get_non_nans(pred_with_simple_mean, test_y)
display (rmse(test_y_filtered, pred_with_simple_mean))

0.9502853719187646

Result of simple mean prediction is 0.9502

In [35]:
display (test_y_filtered[:5], pred_with_simple_mean[:5])

(4.0, 4.0, 5.0, 1.0, 1.0)

(3.2894736842105261,
 4.1635220125786168,
 3.3333333333333335,
 2.7681818181818181,
 2.3529411764705883)

In [36]:
pred_with_user_collab = map(lambda x: user_collab_predict(user_cos_dist[x[0]], x), test_x)

  """


In [37]:
display (pred_with_user_collab[:5])

[3.3910175275073962,
 4.0719214874606156,
 3.8597021959457738,
 1.6051475958712356,
 2.3240709786201554]

In [38]:
pred_with_user_collab, test_y_filtered = get_non_nans(pred_with_user_collab, test_y)
display (rmse(test_y_filtered, pred_with_user_collab))

0.8799264248156585

Result of relative cos mean prediction is 0.8799

In [39]:
pred_with_item_collab = map(lambda x: item_collab_predict(item_cos_dist[x[0]], x), test_x)

  


In [40]:
display (pred_with_item_collab[:5])
pred_with_item_collab, test_y_filtered = get_non_nans(pred_with_item_collab, test_y)
display (rmse(test_y_filtered, pred_with_item_collab))

[3.3524286392204394,
 3.5352753046873833,
 4.1380028660672608,
 2.4410680451918485,
 3.1632057990383342]

0.978951911295399

Result of item-item (cos dists) prediction is 0.97

In [69]:
user_pearson_dist = np.corrcoef(user_rating_mx)
user_pearson_dist[user_pearson_dist < 0.0] = 0.0

In [71]:
user_pearson_dist = 1.0 - user_pearson_dist
display (user_pearson_dist)
#display (user_pearson_dist.min())

array([[  1.11022302e-16,   9.07108354e-01,   7.65735829e-01, ...,
          9.22668121e-01,   9.80351662e-01,   9.10153167e-01],
       [  9.07108354e-01,   0.00000000e+00,   8.45992135e-01, ...,
          9.92611937e-01,   1.00000000e+00,   8.95879915e-01],
       [  7.65735829e-01,   8.45992135e-01,   0.00000000e+00, ...,
          9.58471664e-01,   9.79782685e-01,   8.26937332e-01],
       ..., 
       [  9.22668121e-01,   9.92611937e-01,   9.58471664e-01, ...,
          0.00000000e+00,   9.56294222e-01,   9.53742576e-01],
       [  9.80351662e-01,   1.00000000e+00,   9.79782685e-01, ...,
          9.56294222e-01,   0.00000000e+00,   9.36713521e-01],
       [  9.10153167e-01,   8.95879915e-01,   8.26937332e-01, ...,
          9.53742576e-01,   9.36713521e-01,   0.00000000e+00]])

In [72]:
pred_with_user_pear_collab = map(lambda x: user_collab_predict(user_pearson_dist[x[0]], x), test_x)

  """


In [73]:
pred_with_user_pear_collab, test_y_filtered = get_non_nans(pred_with_user_pear_collab, test_y)
display (rmse(test_y_filtered, pred_with_user_pear_collab))

0.8790586035422778

Result of relative pearson mean prediction is 0.87905

In [74]:
pred_with_user_pear_based = map(lambda x: user_based_predict(user_pearson_dist[x[0]], user_rating_mx[:, x[1]]), test_x)

  # Remove the CWD from sys.path while we load stuff.


In [75]:
pred_with_user_pear_based, test_y_filtered = get_non_nans(pred_with_user_pear_based, test_y)
display (rmse(test_y_filtered, pred_with_user_pear_based))

0.9414671763082018

Conclusion: Pearson distance measure is slightly better then cosine distance