In [25]:
import graphlab as gl
# Send GraphLab Canvas output to the "ipynb" target so that visualizations
# (e.g. the `.show()` call below) are displayed in the notebook.
gl.canvas.set_target("ipynb")

In [26]:
# Load the four saved SFrames, in this order, from paths relative to the
# current working directory.
# NOTE(review): `explicit` is not referenced in the cells visible here --
# confirm whether it is still needed.
implicit, explicit, items, ratings = map(
    gl.SFrame, ('implicit', 'explicit', 'items', 'ratings'))

In [5]:
# Visual overview of the raw `ratings` table, rendered through GraphLab Canvas.
ratings.show()

## Split the data into a training set and a validation set

This allows us to evaluate generalization ability.

In [27]:
# Split the `implicit` SFrame into a training part and a held-out part
# (`valid`) that is used for evaluation further down.
# NOTE(review): every split option (how many users are sampled, what share of
# their items is held out, the random seed) is left at the library default --
# confirm these match what the analysis intends.
train, valid = gl.recommender.util.random_split_by_user(implicit)

## Feature engineering

Compute the number of times each item has been rated.

In [28]:
# Count the training rows per item; the count is stored as `num_users`.
num_ratings_per_item = train.groupby(
    key_columns='item_id',
    operations={'num_users': gl.aggregate.COUNT})

# Attach the count to the item table. The join is an inner join (the default),
# so items with no row in `train` do not appear in the result.
# NOTE(review): this overwrites `items`; re-running the cell joins the count a
# second time -- reload `items` before re-executing.
items = items.join(num_ratings_per_item, on='item_id', how='inner')

Transform the count into a categorical variable using the `feature_engineering` module.

In [29]:
# Bin the per-item count into 5 bins using the 'logarithmic' strategy. After
# the transform the `num_users` column holds bin labels such as 'num_users_4'
# rather than the raw count.
# NOTE(review): `items` is overwritten, so re-running this cell would try to
# bin the already-binned column -- presumably requires reloading `items` first.
binner = gl.feature_engineering.FeatureBinner(features=['num_users'], strategy='logarithmic', num_bins=5)
items = binner.fit_transform(items)

Convert each item's genres into a dictionary that maps every genre to 1, and cast each year to an integer.

In [30]:
# Turn each item's collection of genre names into a {genre: 1} dictionary
# (this column later serves as the input to the Jaccard distance and as
# side data for the factorization model).
items['genres'] = items['genres'].apply(
    lambda genre_names: dict.fromkeys(genre_names, 1))

# Store the year as an integer column.
items['year'] = items['year'].astype(int)

In [31]:
# Preview the item table after the feature-engineering steps.
items

item_id,genres,title,year,num_users
1,"{""Children's"": 1, 'Comedy': 1, 'Animati ...",Toy Story,1995,num_users_4
2,"{""Children's"": 1, 'Adventure': 1, ...",Jumanji,1995,num_users_3
3,"{'Romance': 1, 'Comedy': 1} ...",Grumpier Old Men,1995,num_users_3
4,"{'Drama': 1, 'Comedy': 1}",Waiting to Exhale,1995,num_users_2
5,{'Comedy': 1},Father of the Bride Part II ...,1995,num_users_2
6,"{'Action': 1, 'Thriller': 1, 'Crime': 1} ...",Heat,1995,num_users_3
7,"{'Romance': 1, 'Comedy': 1} ...",Sabrina,1995,num_users_3
8,"{""Children's"": 1, 'Adventure': 1} ...",Tom and Huck,1995,num_users_2
9,{'Action': 1},Sudden Death,1995,num_users_2
10,"{'Action': 1, 'Adventure': 1, ...",GoldenEye,1995,num_users_3


## Train models

#### Collaborative filtering approach that uses the Jaccard similarity of two items' user lists

In [32]:
# m0: item-similarity recommender fitted on the training interactions with
# every option left at its default.
m0 = gl.item_similarity_recommender.create(train)

#### Collaborative filtering approach that learns latent factors for each user and each item

In [33]:
# m1: ranking factorization recommender trained on the interactions only
# (no side data), capped at 10 training iterations.
m1 = gl.ranking_factorization_recommender.create(train, max_iterations=10)

#### Collaborative filtering approach that learns latent factors for users, items, and side data

In [34]:
# m2: ranking factorization recommender with each item's `year` supplied as
# item side data, capped at 10 training iterations.
m2 = gl.ranking_factorization_recommender.create(
    train,
    item_data=items.select_columns(['item_id', 'year']),
    max_iterations=10)

In [35]:
# m3: ranking factorization recommender with each item's `year` and `genres`
# supplied as item side data, capped at 10 training iterations.
m3 = gl.ranking_factorization_recommender.create(
    train,
    item_data=items.select_columns(['item_id', 'year', 'genres']),
    max_iterations=10)

#### Train a recommender that leverages the similarity between items

Create a nearest neighbor model that uses the genres in common and the year of the movie.

In [36]:
# Composite distance made of two equally weighted (1.0) components:
# Jaccard distance on the `genres` dictionary and Euclidean distance on `year`.
# NOTE(review): the components are on different scales (a Jaccard distance is
# at most 1, a gap in years is not), so `year` presumably dominates for movies
# from different years -- confirm this weighting is intended.
dist = [
    [['genres'], 'jaccard', 1.0],
    [['year'], 'euclidean', 1.0],
]

# Nearest-neighbour model over the item table, with rows labelled by `item_id`.
nn_model = gl.nearest_neighbors.create(items, label='item_id', distance=dist)

Defaulting to brute force instead of ball tree because there are multiple distance components.


In [37]:
# Interactive documentation lookup removed from the finished notebook: the
# IPython-only `?` suffix is not valid Python and breaks script export. Run
# `help(gl.nearest_neighbors.create)` in a scratch cell for parameter details.

Compute a nearest neighbor graph.

In [38]:
# For every item, look up its 100 nearest neighbours, rename the query output
# to item_id / similar / score, and attach the titles of both items (the second
# title column shows up as `title.1`) so the table is readable.
similar = (
    nn_model.query(items, label='item_id', k=100)
    .rename({'query_label': 'item_id',
             'reference_label': 'similar',
             'distance': 'score'})
    .join(items[['item_id', 'title']], on='item_id')
    .join(items[['item_id', 'title']], on={'similar': 'item_id'})
)

# Turn the distance into a similarity score.
# NOTE(review): each item's closest match is itself (the printed rows pair
# item 1 with item 1 at score 1.0), and a distance above 1 presumably yields a
# negative score -- confirm both are acceptable for the downstream recommender.
similar['score'] = 1 - similar['score']
similar.print_rows(100, max_row_width=200)

+---------+---------+----------------+------+-----------+--------------------------------+
| item_id | similar |     score      | rank |   title   |            title.1             |
+---------+---------+----------------+------+-----------+--------------------------------+
|    1    |    1    |      1.0       |  1   | Toy Story |           Toy Story            |
|    1    |   239   |      0.75      |  2   | Toy Story |         Goofy Movie, A         |
|    1    |    13   | 0.666666666667 |  3   | Toy Story |             Balto              |
|    1    |    54   | 0.666666666667 |  4   | Toy Story |         Big Green, The         |
|    1    |   888   | 0.666666666667 |  5   | Toy Story | Land Before Time III: The ...  |
|    1    |    34   |      0.5       |  6   | Toy Story |              Babe              |
|    1    |   745   |      0.5       |  7   | Toy Story |         Close Shave, A         |
|    1    |    48   |      0.4       |  8   | Toy Story |           Pocahontas           |

Use this similarity data as the basis for a recommender.

In [39]:
# m5: item-similarity recommender that is handed the precomputed `similar`
# table through `nearest_items` (presumably used in place of similarities
# derived from `train` -- confirm against the API documentation).
m5 = gl.item_similarity_recommender.create(train, nearest_items=similar)

## Evaluation

Create a precision/recall plot to compare the recommendation quality of the above models on our held-out data.

In [40]:
# Evaluate the five models on the held-out `valid` data. `user_sample=.3`
# restricts the evaluation to a sample of the users (the log below reports
# 297 users).
# NOTE(review): the sampled users may differ between runs unless the sampling
# is seeded -- confirm if exact reproducibility of these numbers matters.
model_comparison = gl.compare(valid, [m0, m1, m2, m3, m5], user_sample=.3)

compare_models: using 297 users to estimate model performance
PROGRESS: Evaluate model M0

Precision and recall summary statistics by cutoff
+--------+----------------+-----------------+
| cutoff | mean_precision |   mean_recall   |
+--------+----------------+-----------------+
|   1    | 0.340067340067 | 0.0273701812558 |
|   2    | 0.308080808081 | 0.0478083726971 |
|   3    | 0.288439955107 | 0.0644063022978 |
|   4    | 0.273569023569 | 0.0837581789951 |
|   5    | 0.259259259259 |  0.097804796748 |
|   6    | 0.246913580247 |  0.110896121437 |
|   7    | 0.239057239057 |  0.120171306579 |
|   8    | 0.231902356902 |  0.133021390364 |
|   9    | 0.21922933034  |  0.140607202562 |
|   10   | 0.211111111111 |  0.150910548487 |
+--------+----------------+-----------------+
[10 rows x 3 columns]

PROGRESS: Evaluate model M1

Precision and recall summary statistics by cutoff
+--------+----------------+-----------------+
| cutoff | mean_precision |   mean_recall   |
+--------+-----------

In [24]:
# Display the comparison results; the model list is the same, in the same
# order, as the one passed to `gl.compare` when `model_comparison` was built.
gl.show_comparison(model_comparison, [m0, m1, m2, m3, m5])