In [1]:
import os
os.chdir ("D:\MIT Big Data\Module 4\Graded Case study\MovieLens 100K dataset\ml-100k")

In [2]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import graphlab as gl

In [3]:
# renaming column names
col_names = ["user_id", "item_id", "rating", "timestamp"]
# reading data
data = pd.read_table('u.data', names=col_names)

In [4]:
# deleting the last column contain timestamps
data = data.drop('timestamp', 1)

In [5]:
# check the data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 3 columns):
user_id    100000 non-null int64
item_id    100000 non-null int64
rating     100000 non-null int64
dtypes: int64(3)
memory usage: 2.3 MB


In [6]:
# plotting histogram from ratings
%matplotlib notebook
plt.hist(data['rating'])
plt.title("Histogram For Ratings")
plt.xlabel("Level of Ratings")
plt.ylabel("Number of Movies Rated")
plt.show()

<IPython.core.display.Javascript object>

#### The data sparsity can be calculated as follow
##### """ Sparsity = (# of ratings / # of movies * # of Users) * 100% """

In [7]:
# Now, we can recalculate the sparsity as we did earlier.
num_ratings = float(len(data))
num_movies = float(len(np.unique(data["item_id"])))
num_users = float(len(np.unique(data["user_id"])))
sparsity = (num_ratings / (num_movies * num_users)) * 100.0
print "Sparsity of Dataset is", sparsity, "Percent"

Sparsity of Dataset is 6.30466936422 Percent


In [8]:
# Get the column containing the users
users = data.user_id
# Create a dictionary from users to their number of ratings
ratings_count = dict()
for user in users:
    # If we already have the user, just add 1 to their rating count
    if user in ratings_count:
        ratings_count[user] += 1
    # Otherwise, set their rating count to 1
    else:
        ratings_count[user] = 1

In [9]:
# We want our users to have at least 50 ratings to be considred
RATINGS_CUTOFF = 50
remove_users = []
for user, num_ratings in ratings_count.iteritems():
    if num_ratings < RATINGS_CUTOFF:
        remove_users.append(user)
data = data.loc[~data.user_id.isin(remove_users)]

In [10]:
# Now, we can recalculate the sparsity as we did earlier.
num_ratings = float(len(data))
num_movies = float(len(np.unique(data["item_id"])))
num_users = float(len(np.unique(data["user_id"])))
sparsity = (num_ratings / (num_movies * num_users)) * 100.0
print "UPDATED Sparsity of Dataset is", sparsity, "Percent"


UPDATED Sparsity of Dataset is 9.26584192843 Percent


####   Integrating a Popularity Recommender ####

In [12]:
# convert our data to a graphlab SFrame object
sf = gl.SFrame(data)

#### Train/Test/Validation Split

In [13]:
# splitting into train and test set
sf_train, sf_test = sf.random_split(.70)

In [14]:
# splitting further the 70% train set into 75% train set and 25% validate set
sf_train, sf_validate = sf_train.random_split(.75)

#### Training data on trainset and examine RMSE on Test set
(using graphlab popularity reccomender)

In [15]:
import graphlab as gl
popularity_recommender = gl.recommender.popularity_recommender.create(sf_train, target='rating')
popularity_recommender.evaluate_rmse(sf_test, 'rating')

{'rmse_by_item': Columns:
 	item_id	int
 	count	int
 	rmse	float
 
 Rows: 1480
 
 Data:
 +---------+-------+----------------+
 | item_id | count |      rmse      |
 +---------+-------+----------------+
 |   118   |   76  | 1.11399712174  |
 |   660   |   45  | 1.00982156014  |
 |   1236  |   1   | 0.522490459243 |
 |   1379  |   3   | 1.37436854187  |
 |   699   |   33  | 1.13021740734  |
 |   567   |   8   | 1.32064892133  |
 |   773   |   6   | 1.01379375505  |
 |   1029  |   4   | 1.57797338381  |
 |   1504  |   2   | 0.707106781187 |
 |   435   |   59  | 0.812435324826 |
 +---------+-------+----------------+
 [1480 rows x 3 columns]
 Note: Only the head of the SFrame is printed.
 You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.,
 'rmse_by_user': Columns:
 	user_id	int
 	count	int
 	rmse	float
 
 Rows: 568
 
 Data:
 +---------+-------+----------------+
 | user_id | count |      rmse      |
 +---------+-------+----------------+
 |   118   |   23  | 1.

### Integrating a Collaborative Filtering Recommender ######

Using graphlab factorization recommender model


In [16]:
# We will try regularization terms in the range [10**-5, 10**-1]
regularization_terms = [10**-5, 10**-4, 10**-3, 10**-2, 10**-1]

In [17]:
# We want to keep track of our best results as we go
best_regularization_term = 0
best_RMSE = np.inf
for regularization_term in regularization_terms:
    # Train with this reg term
    factorization_recommender = gl.recommender.factorization_recommender.create(sf_train,
                                                                                target='rating',
                                                                                regularization=regularization_term)

In [18]:
# Evaluate on our validation set (NOT test set yet)
evaluation = factorization_recommender.evaluate_rmse(sf_validate, 'rating')
    # See if we found the best score yet
if evaluation['rmse_overall'] < best_RMSE:
    best_RMSE = evaluation['rmse_overall']
    best_regularization_term = regularization_term

print "Best Regularization Term", best_regularization_term
print "Best Validation RMSE Achieved", best_RMSE

Best Regularization Term 0.1
Best Validation RMSE Achieved 1.03376984179


In [19]:
# Now that we know the best parameter, we can use it to evaluate on our TEST set
factorization_recommender = gl.recommender.factorization_recommender.create(sf_train,
                                                                            target='rating',
                                                                            regularization=best_regularization_term)
print "Test RMSE on best model", factorization_recommender.evaluate_rmse(sf_test, 'rating')['rmse_overall']


Test RMSE on best model

 1.0409264676


##### Integrating an Item-Item Similarity Recommender #####

In [20]:
# graphlab's Item-Item Similarity Recommender
item_similarity_recommender = gl.recommender.item_similarity_recommender.create(sf_train, target='rating')
test_rmse = item_similarity_recommender.evaluate_rmse(sf_test, 'rating')['rmse_overall']
print "Test RMSE on model", test_rmse

Test RMSE on model 3.67383571044


#### Getting Top k Recommendations ###############

In [21]:
# calculate the top k recommendations for each user
k=5
popularity_top_k = popularity_recommender.recommend(k=k)
factorization_top_k = factorization_recommender.recommend(k=k)
item_similarity_top_k = item_similarity_recommender.recommend(k=k)
print factorization_top_k

+---------+---------+---------------+------+
| user_id | item_id |     score     | rank |
+---------+---------+---------------+------+
|   244   |    50   | 4.04519540698 |  1   |
|   244   |   174   | 3.96801305444 |  2   |
|   244   |   127   | 3.96160466106 |  3   |
|   244   |    98   | 3.95758024485 |  4   |
|   244   |    64   | 3.93204925091 |  5   |
|   298   |    50   | 4.14142725126 |  1   |
|   298   |   174   | 4.06424489872 |  2   |
|   298   |    64   | 4.02828109519 |  3   |
|   298   |   172   |  4.0024915661 |  4   |
|   298   |   100   | 3.99847102419 |  5   |
+---------+---------+---------------+------+
[2840 rows x 4 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.


###### Evaluation: Confusion Matrix, Precision and Recall ######

In [22]:
# calculate the precision/recall matrices between the three different models
models = [popularity_recommender, factorization_recommender, item_similarity_recommender]
model_names = ['popularity_recommender', 'factorization_recommender', 'item_similarity_recommender']
precision_recall = gl.recommender.util.compare_models(sf_test,
                                                      models,
                                                      metric='precision_recall',
                                                      model_names=model_names)

PROGRESS: Evaluate model popularity_recommender

Precision and recall summary statistics by cutoff
+--------+-------------------+-------------------+
| cutoff |   mean_precision  |    mean_recall    |
+--------+-------------------+-------------------+
|   1    |  0.00176056338028 | 2.70855904659e-05 |
|   2    |  0.00176056338028 | 5.64283134706e-05 |
|   3    |  0.00117370892019 | 5.64283134706e-05 |
|   4    | 0.000880281690141 | 5.64283134706e-05 |
|   5    |  0.00105633802817 | 8.27053788479e-05 |
|   6    |  0.00146713615023 | 0.000105335623216 |
|   7    |  0.00176056338028 | 0.000140993292173 |
|   8    |  0.00242077464789 | 0.000292208163967 |
|   9    |  0.00254303599374 | 0.000331067842109 |
|   10   |  0.00228873239437 | 0.000331067842109 |
+--------+-------------------+-------------------+
[10 rows x 3 columns]

PROGRESS: Evaluate model factorization_recommender

Precision and recall summary statistics by cutoff
+--------+----------------+------------------+
| cutoff | mean