In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [4]:
import pandas
import os
import zipfile
import requests
import tqdm
from sklearn import dummy, metrics, cross_validation, ensemble

import keras.models as kmodels
import keras.layers as klayers
import keras.backend as K
import keras

In [3]:
# Download the dataset. It's small, only about 6 MB.
if not os.path.exists('./ml-1m'):
    url = 'http://files.grouplens.org/datasets/movielens/ml-1m.zip'
    response = requests.get(url, stream=True)
    total_length = response.headers.get('content-length')
    bar = tqdm.tqdm_notebook(total=int(total_length))
    with open('./ml-1m.zip', 'wb') as f:
        for data in response.iter_content(chunk_size=4096):
            f.write(data)
            bar.update(4096)
    zip_ref = zipfile.ZipFile('./ml-1m.zip', 'r')
    zip_ref.extractall('.')
    zip_ref.close()

In [5]:
# Read in the dataset, and do a little preprocessing,
# mostly to set the column datatypes.
users = pandas.read_csv('./ml-1m/users.dat', sep='::', 
                        engine='python', 
                        names=['userid', 'gender', 'age', 'occupation', 'zip']).set_index('userid')
ratings = pandas.read_csv('./ml-1m/ratings.dat', engine='python', 
                          sep='::', names=['userid', 'movieid', 'rating', 'timestamp'])
movies = pandas.read_csv('./ml-1m/movies.dat', engine='python',
                         sep='::', names=['movieid', 'title', 'genre']).set_index('movieid')
movies['genre'] = movies.genre.str.split('|')

users.age = users.age.astype('category')
users.gender = users.gender.astype('category')
users.occupation = users.occupation.astype('category')
ratings.movieid = ratings.movieid.astype('category')
ratings.userid = ratings.userid.astype('category')

In [31]:
# Count the movies and users
n_movies = movies.shape[0]
n_users = users.shape[0]

# Also, make vectors of all the movie ids and user ids. These are
# pandas categorical data, so they range from 1 to n_movies and 1 to n_users, respectively.
movieid = ratings.movieid.cat.codes.values
userid = ratings.userid.cat.codes.values
print (ratings.movieid)
print (ratings.shape)
print ("____________")
print (userid)
print ("____________")
print (movieid)

0          1193
1           661
2           914
3          3408
4          2355
5          1197
6          1287
7          2804
8           594
9           919
10          595
11          938
12         2398
13         2918
14         1035
15         2791
16         2687
17         2018
18         3105
19         2797
20         2321
21          720
22         1270
23          527
24         2340
25           48
26         1097
27         1721
28         1545
29          745
           ... 
1000179    2762
1000180    1036
1000181     508
1000182    1041
1000183    3735
1000184    2791
1000185    2794
1000186     527
1000187    2003
1000188     535
1000189    2010
1000190    2011
1000191    3751
1000192    2019
1000193     541
1000194    1077
1000195    1079
1000196     549
1000197    2020
1000198    2021
1000199    2022
1000200    2028
1000201    1080
1000202    1089
1000203    1090
1000204    1091
1000205    1094
1000206     562
1000207    1096
1000208    1097
Name: movieid, Length: 1

In [32]:
# And finally, set up a y variable with the rating,
# as a one-hot encoded matrix.
#
# note the '- 1' for the rating. That's because ratings
# go from 1 to 5, while the matrix columns go from 0 to 4

y = np.zeros((ratings.shape[0], 5))
y[np.arange(ratings.shape[0]), ratings.rating - 1] = 1
print (y)

[[ 0.  0.  0.  0.  1.]
 [ 0.  0.  1.  0.  0.]
 [ 0.  0.  1.  0.  0.]
 ..., 
 [ 0.  0.  0.  0.  1.]
 [ 0.  0.  0.  1.  0.]
 [ 0.  0.  0.  1.  0.]]


In [45]:
# Dummy classifier! Just see how well stupid can do.
pred = dummy.DummyClassifier(strategy='prior')
pred.fit(ratings[['userid', 'movieid']], ratings.rating)
print (ratings[['userid', 'movieid','rating']])
print(metrics.mean_absolute_error(ratings.rating, pred.predict(ratings[['userid', 'movieid']])))


        userid movieid  rating
0            1    1193       5
1            1     661       3
2            1     914       3
3            1    3408       4
4            1    2355       5
5            1    1197       3
6            1    1287       5
7            1    2804       5
8            1     594       4
9            1     919       4
10           1     595       5
11           1     938       4
12           1    2398       4
13           1    2918       4
14           1    1035       5
15           1    2791       4
16           1    2687       3
17           1    2018       4
18           1    3105       5
19           1    2797       4
20           1    2321       3
21           1     720       3
22           1    1270       5
23           1     527       5
24           1    2340       3
25           1      48       5
26           1    1097       4
27           1    1721       4
28           1    1545       4
29           1     745       3
...        ...     ...     ...
1000179 

In [35]:
# Now, the deep learning classifier

# First, we take the movie and vectorize it.
# The embedding layer is normally used for sequences (think, sequences of words)
# so we need to flatten it out.
# The dropout layer is also important in preventing overfitting
movie_input = keras.layers.Input(shape=[1])
movie_vec = keras.layers.Flatten()(keras.layers.Embedding(n_movies + 1, 32)(movie_input))
movie_vec = keras.layers.Dropout(0.5)(movie_vec)


# Same thing for the users
user_input = keras.layers.Input(shape=[1])
user_vec = keras.layers.Flatten()(keras.layers.Embedding(n_users + 1, 32)(user_input))
user_vec = keras.layers.Dropout(0.5)(user_vec)

# Next, we join them all together and put them
# through a pretty standard deep learning architecture
input_vecs = keras.layers.merge([movie_vec, user_vec], mode='concat')
nn = keras.layers.Dropout(0.5)(keras.layers.Dense(128, activation='relu')(input_vecs))
nn = keras.layers.normalization.BatchNormalization()(nn)
nn = keras.layers.Dropout(0.5)(keras.layers.Dense(128, activation='relu')(nn))
nn = keras.layers.normalization.BatchNormalization()(nn)
nn = keras.layers.Dense(128, activation='relu')(nn)

# Finally, we pull out the result!
result = keras.layers.Dense(5, activation='softmax')(nn)

# And make a model from it that we can actually run.
model = kmodels.Model([movie_input, user_input], result)
model.compile('adam', 'categorical_crossentropy')

# If we wanted to inspect part of the model, for example, to look
# at the movie vectors, here's how to do it. You don't need to 
# compile these models unless you're going to train them.
final_layer = kmodels.Model([movie_input, user_input], nn)
movie_vec = kmodels.Model(movie_input, movie_vec)

  name=name)


In [20]:
# Split the data into train and test sets...
a_movieid, b_movieid, a_userid, b_userid, a_y, b_y = cross_validation.train_test_split(movieid, userid, y)
print( a_y)
print("----------")
print( b_y)
print("----------")
print(a_movieid, b_movieid, a_userid, b_userid)

[[ 0.  0.  0.  0.  1.]
 [ 0.  0.  0.  1.  0.]
 [ 0.  0.  0.  0.  1.]
 ..., 
 [ 0.  0.  0.  1.  0.]
 [ 0.  0.  1.  0.  0.]
 [ 0.  0.  0.  0.  1.]]
----------
[[ 0.  0.  1.  0.  0.]
 [ 0.  0.  0.  0.  1.]
 [ 0.  0.  0.  0.  1.]
 ..., 
 [ 0.  0.  1.  0.  0.]
 [ 0.  0.  0.  0.  1.]
 [ 0.  0.  0.  0.  1.]]
----------
[1201 1743 2505 ...,  263 1824 1935] [1420  954 1899 ..., 2634 1110 1089] [5348 1957 4131 ..., 5642 5963  628] [1065  537 5921 ..., 5910 1973 5515]


In [37]:
# And of _course_ we need to make sure we're improving, so we find the MAE before
# training at all.
metrics.mean_absolute_error(np.argmax(b_y, 1)+1, np.argmax(model.predict([b_movieid, b_userid]), 1)+1)

1.7980268183145174

In [38]:
try:
    history = model.fit([a_movieid, a_userid], a_y, 
                         nb_epoch=20, 
                         validation_data=([b_movieid, b_userid], b_y))
    plot(history.history['loss'])
    plot(history.history['val_loss'])
except KeyboardInterrupt:
    pass

  after removing the cwd from sys.path.


Train on 750156 samples, validate on 250053 samples
Epoch 1/20
  1024/750156 [..............................] - ETA: 506732s - loss: 1.83 - ETA: 513437s - loss: 1.80 - ETA: 511306s - loss: 1.90 - ETA: 514319s - loss: 1.87 - ETA: 514609s - loss: 1.87 - ETA: 512198s - loss: 1.88 - ETA: 511048s - loss: 1.85 - ETA: 510101s - loss: 1.83 - ETA: 508993s - loss: 1.81 - ETA: 508976s - loss: 1.81 - ETA: 508880s - loss: 1.78 - ETA: 508983s - loss: 1.77 - ETA: 509785s - loss: 1.76 - ETA: 508820s - loss: 1.76 - ETA: 508010s - loss: 1.76 - ETA: 507895s - loss: 1.76 - ETA: 507185s - loss: 1.76 - ETA: 506684s - loss: 1.75 - ETA: 506646s - loss: 1.74 - ETA: 506202s - loss: 1.73 - ETA: 505756s - loss: 1.73 - ETA: 507130s - loss: 1.72 - ETA: 506767s - loss: 1.72 - ETA: 506441s - loss: 1.71 - ETA: 506383s - loss: 1.71 - ETA: 505957s - loss: 1.71 - ETA: 505781s - loss: 1.70 - ETA: 505666s - loss: 1.69 - ETA: 505515s - loss: 1.69 - ETA: 505502s - loss: 1.68 - ETA: 505652s - loss: 1.68 - ETA: 505492s - loss:

In [23]:
# This is the number that matters. It's the held out 
# test set score. Note the + 1, because np.argmax will
# go from 0 to 4, while our ratings go 1 to 5.
metrics.mean_absolute_error(
    np.argmax(b_y, 1)+1, 
    np.argmax(model.predict([b_movieid, b_userid]), 1)+1)

0.67782430124813542

In [24]:
# For comparison's sake, here's the score on the training set.
metrics.mean_absolute_error(
    np.argmax(a_y, 1)+1, 
    np.argmax(model.predict([a_movieid, a_userid]), 1)+1)

0.65481579831395065