In [26]:
% matplotlib inline

import numpy as np
import pandas as pd
import pydotplus

from matplotlib import pyplot as plt
from scipy.sparse.linalg import svds
from sklearn.linear_model import ElasticNet
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier, RandomForestClassifier

# Exercise: Build Your Own Recommendation System

Since the GetGlue dataset has not been available since january 2015. I decided to use the infamous MovieLens dataset of movie ratings.

## EDA

### Ratings Data File Structure

> UserID::MovieID::Rating::Timestamp

Ratings are made on a 5-star scale, with half-star increments.

### Movies Data File Structure

> MovieID::Title::Genres

Movie titles, by policy, should be entered identically to those found in IMDB, including year of release. However, they are entered manually, so errors and inconsistencies may exist.

Genres are a pipe-separated list, and are selected from the following:

* Action
* Adventure
* Animation
* Children's
* Comedy
* Crime
* Documentary
* Drama
* Fantasy
* Film-Noir
* Horror
* Musical
* Mystery
* Romance
* Sci-Fi
* Thriller
* War
* Western

In [2]:
movielens_path = 'movielens/ml-1m/'
movielens_ratings = '{}ratings.dat'.format(movielens_path)
movielens_movies = '{}movies.dat'.format(movielens_path)

In [3]:
movies_df = pd.read_csv(movielens_movies, sep='::', names=['movie_id','title','genres'])
ratings_df = pd.read_csv(movielens_ratings, sep='::', names=['user_id','movie_id','ratings','timestamp'])

  if __name__ == '__main__':
  from ipykernel import kernelapp as app


In [4]:
movies_df.head(n=2)

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy


In [5]:
ratings_df.head(n=2)

Unnamed: 0,user_id,movie_id,ratings,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109


In [6]:
nb_users = ratings_df['user_id'].unique().shape[0]
nb_movies = ratings_df['movie_id'].unique().shape[0]
print('Number of unique users = {}\nNumber of unique movies = {}'.format(nb_users, nb_movies))

Number of unique users = 6040
Number of unique movies = 3706


In [7]:
features_df = ratings_df[['user_id', 'movie_id']]
target_df = ratings_df[['ratings']]

In [8]:
train_data, test_data = train_test_split(ratings_df, test_size=0.2)

In [9]:
train_data.head()
print(max(train_data.movie_id), max(test_data.movie_id))

3952 3952


## Memory-based collaborative filtering

In [10]:
#Create two user-item matrices, one for training and another for testing
train_matrix = np.zeros((max(train_data.user_id), max(train_data.movie_id)))
for line in train_data.itertuples():
    train_matrix[line.user_id-1, line.movie_id-1] = line.ratings
    
train_transpose = np.zeros((max(train_data.movie_id), max(train_data.user_id)))
for line in train_data.itertuples():
    train_transpose[line.movie_id-1, line.user_id-1] = line.ratings

test_matrix = np.zeros((max(train_data.user_id), max(train_data.movie_id)))
for line in test_data.itertuples():
    test_matrix[line.user_id-1, line.movie_id-1] = line.ratings

In [11]:
from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(train_matrix, metric='cosine')
item_similarity = pairwise_distances(train_transpose, metric='cosine')

In [12]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis]) 
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])     
    return pred

In [13]:
item_prediction = predict(train_matrix, item_similarity, type='item')
user_prediction = predict(train_matrix, user_similarity, type='user')

In [14]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten() 
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [15]:
print('User-based CF RMSE: ' + str(rmse(user_prediction, test_matrix)))
print('Item-based CF RMSE: ' + str(rmse(item_prediction, test_matrix)))

User-based CF RMSE: 3.2299851620693385
Item-based CF RMSE: 3.528739526251054


## Model-based collaborative filtering 

Using SVD for matrix factorization (MF).

In [25]:
sparsity=round(1.0-len(ratings_df)/float(nb_users*nb_movies),3)
print('MovieLens sparsity level: ' +  str(sparsity*100) + '%')

MovieLens sparsity level: 95.5%


In [27]:
u, s, vt = svds(train_matrix)

In [28]:
s_diag_matrix = np.diag(s)
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)
print('User-based CF MSE: ' + str(rmse(X_pred, test_matrix)))

User-based CF MSE: 2.8466296824921047
