# Working With Movielens 20 Million

## Loading Data
We'll use pandas to read it in from the csv files

In [1]:
import pandas as pd

data_dir = "~/Code/dl/datasets/movielens-20m/"
(ratings, movies) = (pd.read_csv(data_dir + fileName) for fileName in ["ratings.csv", "movies.csv"])

In [2]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [3]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580
5,1,112,3.5,1094785740
6,1,151,4.0,1094785734
7,1,223,4.0,1112485573
8,1,253,4.0,1112484940
9,1,260,4.0,1112484826


### Creating Train/Test Sets

In [14]:
import numpy as np
np.random.seed = 42

normalize = lambda x: (x - 0.5) / 4.5
denormalize = lambda x: (x * 4.5) + 0.5

full_X = [np.array(ratings.movieId), np.array(ratings.userId)]
full_Y = normalize(np.array(ratings.rating))

mask = np.random.rand(len(ratings)) < 0.8

train_X = [full_X[0][mask], full_X[1][mask]]
train_Y = full_Y[mask]

test_X = [full_X[0][~mask], full_X[1][~mask]]
test_Y = full_Y[~mask]

print("Number of training samples: ", len(train_Y))
print("Number of test samples: ", len(test_Y))

Number of training samples:  16000822
Number of test samples:  3999441


## Defining Model

In [34]:
from keras.layers import Input, Embedding, Flatten, add

embedding_dim = 50

# Movie embedding and bias
n_movies = len(set(np.squeeze(train_X[0])))

movie_in = Input(shape=(1,))

movie_embedding = Embedding(n_movies, embedding_dim, name='movie_embed')(movie_in)
movie_embedding = Flatten()(movie_embedding)

movie_bias = Embedding(n_movies, 1, name='movie_bias')(movie_in)
movie_bias = Flatten()(movie_bias)

movie_model = add([movie_embedding, movie_bias])

# User embedding and bias
n_users = len(set(np.squeeze(train_X[1])))

user_in = Input(shape=(1,))
user_embedding = Embedding(n_users, embedding_dim, name='user_embed')(user_in)
user_embedding = Flatten()(user_embedding)

user_bias = Embedding(n_users, 1, name='user_bias')(user_in)
user_bias = Flatten()(user_bias)

user_model = add([user_embedding, user_bias])

In [35]:
from keras.models import Sequential
from keras.layers import concatenate, Dense, Dropout, BatchNormalization

out = concatenate([movie_model, user_model])
out = Dropout(0.7)(out)
out = Dense(32, activation='relu')(out)
out = Dropout(0.6)(out)
out = Dense(1, activation='sigmoid')(out)

In [36]:
from keras.models import Model

model = Model(inputs=[movie_in, user_in], outputs=out)
model.compile(optimizer='adam', loss='mse')
print(model.summary())

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_7 (InputLayer)             (None, 1)             0                                            
____________________________________________________________________________________________________
input_8 (InputLayer)             (None, 1)             0                                            
____________________________________________________________________________________________________
movie_embed (Embedding)          (None, 1, 50)         1292700     input_7[0][0]                    
____________________________________________________________________________________________________
movie_bias (Embedding)           (None, 1, 1)          25854       input_7[0][0]                    
___________________________________________________________________________________________

## Training

In [38]:
history = model.fit(train_X, train_Y, batch_size=1024, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [39]:
def eval_accuracy(current_model, truth_X, truth_Y):
    truth = denormalize(np.expand_dims(truth_Y, axis=1))
    
    global preds = current_model.predict(truth_X, batch_size=1024) # Global for debugging purposes
    preds = np.clip(denormalize(preds), 0.5, 5.0)
    
    print(np.mean(np.square(preds - truth)))

In [40]:
eval_accuracy(model, test_X, test_Y)

Predictions (for debug) [[ 3.93534541]
 [ 3.79731107]
 [ 3.8624599 ]
 ..., 
 [ 3.38506079]
 [ 3.38506079]
 [ 3.38506079]]
0.740683187092


In [41]:
model.save('trained-0.74-accuracy.h5')

## Fun With Bias Embeddings

In [202]:
from keras import backend as K

unique_movie_ids = np.array(list(set(movies.movieId)))
biases_variable = model.get_layer('movie_bias').call(unique_movie_ids)
biases = np.squeeze(K.get_value(biases_variable))

In [203]:
sorted_bias_idxs = biases.argsort()

In [204]:
sorted_movie_ids = unique_movie_ids[sorted_bias_idxs]
sorted_movie_names = [movies[movies.movieId == idx].title.values[0] for idx in sorted_movie_ids]

In [205]:
print_list = lambda l: print('\n'.join([str(i + 1) + '\t' + title for (i, title) in enumerate(l)]))

### The Best 25 Movies of All Time

In [206]:
print_list(sorted_movie_names[-25:][::-1])

1	Blue Angel, The (Blaue Engel, Der) (1930)
2	Shawshank Redemption, The (1994)
3	Godfather, The (1972)
4	Usual Suspects, The (1995)
5	Iron & Silk (1990)
6	Schindler's List (1993)
7	Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964)
8	Godfather: Part II, The (1974)
9	Rear Window (1954)
10	Seven Samurai (Shichinin no samurai) (1954)
11	Casablanca (1942)
12	North by Northwest (1959)
13	Third Man, The (1949)
14	Fight Club (1999)
15	One Flew Over the Cuckoo's Nest (1975)
16	Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)
17	Vertigo (1958)
18	To Kill a Mockingbird (1962)
19	Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)
20	Double Indemnity (1944)
21	Chinatown (1974)
22	Black Pirate, The (1926)
23	Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)
24	Grand Illusion (La grande illusion) (1937)
25	12 Angry Men (1957)


### The Worst 25 Movies of All Time

In [207]:
print_list(sorted_movie_names[:25])

1	From Justin to Kelly (2003)
2	SuperBabies: Baby Geniuses 2 (2004)
3	Timerider: The Adventure of Lyle Swann (1982)
4	Glitter (2001)
5	Barney's Great Adventure (1998)
6	Pokémon Heroes (2003)
7	Turbo: A Power Rangers Movie (1997)
8	House of the Dead, The (2003)
9	Faces of Death 6 (1996)
10	Gigli (2003)
11	Ernest Goes to Africa (1997)
12	Carnosaur 3: Primal Species (1996)
13	Yu-Gi-Oh! (2004)
14	Pokemon 4 Ever (a.k.a. Pokémon 4: The Movie) (2002)
15	Faces of Death 5 (1996)
16	Hip Hop Witch, Da (2000)
17	Faces of Death 4 (1990)
18	Faces of Death 3 (1985)
19	Wash, The (2001)
20	Iron Eagle IV (1995)
21	Ernest in the Army (1998)
22	3 Ninjas: High Noon On Mega Mountain (1998)
23	Faces of Death: Fact or Fiction? (1999)
24	Bones (2001)
25	Carnosaur 2 (1995)
