In [295]:
from __future__ import print_function
from sklearn import preprocessing
import pandas as pd
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Conv2D, MaxPooling2D, Flatten
import json
import urllib
import cStringIO
from keras.optimizers import SGD
from keras.optimizers import Adadelta

from keras import backend as K
import ast

import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('white')
import numpy as np
from sklearn.model_selection import train_test_split


In [273]:
# Load the poster data set.
#
# Two versions of the data exist: one with 128 * 128 posters and one with
# 32 * 32 posters. The 128 * 128 version is run on AWS because it takes a
# long time; this notebook shows how to run the model and uses the 32 * 32
# posters for faster performance.
# poster_pickle = 'imgs_20000_128.pkl'  # 128 * 128 variant
poster_pickle = 'imgs.pkl'
data = pd.read_pickle(poster_pickle)
data.head(5)

Unnamed: 0,RGB,genre_ids
0,"[[[15, 36, 71], [13, 33, 68], [14, 34, 70], [1...","[14, 10402, 10749]"
1,"[[[8, 9, 8], [10, 10, 10], [11, 11, 11], [13, ...","[28, 18, 878]"
2,"[[[147, 122, 120], [170, 140, 132], [129, 100,...","[16, 35, 18, 10751, 10402]"
3,"[[[138, 47, 13], [150, 58, 16], [167, 74, 26],...","[28, 12, 14]"
4,"[[[255, 255, 255], [255, 255, 255], [254, 254,...","[28, 80, 53]"


In [274]:
# this saves the DataFrame back to disk as a pickle (disabled; uncomment to re-export)
# data.to_pickle('imgs.pkl')

In [275]:
# Combine the per-movie RGB arrays into a single array with a leading
# sample axis, then show the resulting shape.
poster_arrays = list(data['RGB'])
new_RGB = np.stack(poster_arrays, axis=0)
new_RGB.shape

(9991, 32, 32, 3)

In [217]:
# Drop bad values
#
# A row is "bad" when its poster array does not have the same shape as the
# others, which is what makes np.stack fail. The rows are found by shape
# rather than by a hard-coded list of positions: such a list only matches one
# particular export of the data and indexes past the end of a shorter
# DataFrame (IndexError). Filtering by shape also makes the cell safe to
# re-run, because a second run simply finds nothing left to drop.
poster_shapes = [np.shape(poster) for poster in data.RGB]
if poster_shapes:
    # the most frequent shape is taken to be the correct one
    expected_shape = max(set(poster_shapes), key=poster_shapes.count)
    has_expected_shape = np.array(
        [shape == expected_shape for shape in poster_shapes], dtype=bool)
    data = data[has_expected_shape]
data.RGB.shape

(19785,)

In [216]:
# Find the bad values: rows whose poster cannot be stacked with the first one.
#
# The printed numbers are row *positions*, which is what positional indexing
# such as `data.index[[...]]` expects. Every row after the reference row is
# checked, including the last one.
reference_poster = data.RGB.iloc[0]
for position in range(1, len(data.RGB)):
    try:
        np.stack((reference_poster, data.RGB.iloc[position]), axis=0)
    except ValueError:
        # np.stack raises ValueError when the two arrays differ in shape;
        # any other exception is a real error and should not be hidden
        print(position)


686
1784
2731
3311
5121
5653
8056
8063
9401
11334
12760
13628
14071
16186
17271
18552
18997
19659
19690


In [276]:
# get genre list -> for getting the correct Y values
#
# NOTE(review): the API key is hard-coded in the URL below. It is kept so the
# cell behaves as before, but it should be rotated and read from the
# environment instead of living in the notebook.
genre_list = urllib.urlopen("https://api.themoviedb.org/3/genre/movie/list?api_key=2dc6c9f1d17bd39dcbaef83321e1b5a3&language=en-US")
try:
    genre_list_json = json.loads(genre_list.read())
finally:
    # release the connection even if the response is not valid JSON
    genre_list.close()

# genre id -> genre name
genre_lst = {}
for genre in genre_list_json['genres']:
    genre_lst[genre['id']] = str(genre['name'])

# genre id -> column of the label vector. The column order is the iteration
# order of genre_lst, fixed once here, so every row uses the same layout and
# each lookup is a dict access instead of a linear `keys().index()` scan.
genre_columns = {genre_id: column for column, genre_id in enumerate(genre_lst)}

# one multi-hot vector per movie; ids missing from the genre list are ignored
labels = []
for raw_genre_ids in data.genre_ids:
    label_matrix = np.zeros(len(genre_columns), dtype=int)
    # genre_ids are stored as the text of a Python list, e.g. "[28, 18, 878]"
    for genre_id in ast.literal_eval(raw_genre_ids):
        column = genre_columns.get(genre_id)
        if column is not None:
            label_matrix[column] = 1
    labels.append(label_matrix)
data['labels'] = labels

In [283]:
# input image dimensions - 128 * 128
# img_rows, img_cols = 128, 128
img_rows, img_cols = 32, 32

# smaller batch size means noisier gradient, but more updates per epoch
batch_size = 512
# number of iterations over the complete training data
epochs = 20
# fixed seed so the train/test split (and every score derived from it)
# is the same on every run
split_seed = 42

# features are the stacked posters, targets are the multi-hot genre vectors
X = new_RGB
new_labels = np.stack(data['labels'], axis = 0)
Y = new_labels

# one output per genre column; taken from the labels so it always matches
# the label vectors (19 genres in our data set)
num_classes = Y.shape[1]

# the data, shuffled and split between train and test sets
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size = 0.3, random_state = split_seed)

input_shape = (img_rows, img_cols, 3)

# normalize image values to [0,1]
# interestingly the keras example code does not center the data
# (astype returns a copy, so the in-place division leaves X untouched)
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255
x_test /= 255
print('x_train shape:', x_train.shape)
print(x_train.shape[0], 'train samples')
print(x_test.shape[0], 'test samples')
print (y_test.shape, 'y test samples')

x_train shape: (6993, 32, 32, 3)
6993 train samples
2998 test samples
(2998, 19) y test samples


In [298]:
def build_poster_cnn(input_shape, num_classes):
    """Build the small CNN used for multi-label genre prediction.

    Architecture: 5x5 conv (16 filters) -> 2x2 max pool -> 3x3 conv
    (32 filters) -> 2x2 max pool -> flatten -> dense (64) -> dense output.

    Args:
        input_shape: shape of one input image, e.g. (rows, cols, channels).
        num_classes: number of output units, one per genre.

    Returns:
        An uncompiled keras Sequential model.
    """
    network = Sequential()

    # --- input layer ---
    network.add(Conv2D(16, kernel_size=(5, 5), activation='relu', input_shape=input_shape))
    # --- max pool ---
    network.add(MaxPooling2D(pool_size=(2, 2)))

    # --- next layer ---
    # we could double the number of filters as max pool made the
    # feature maps much smaller
    # just not doing this to improve runtime
    network.add(Conv2D(32, kernel_size=(3, 3), activation='relu'))
    # --- max pool ---
    network.add(MaxPooling2D(pool_size=(2, 2)))

    # flatten for fully connected classification layer
    network.add(Flatten())
    # --- fully connected layer ---
    network.add(Dense(64, activation='relu'))
    # --- classification ---
    # the classes are not mutually exclusive so softmax is not a good
    # choice -> we use one sigmoid output per class
    network.add(Dense(num_classes, activation='sigmoid'))
    return network


# create the network; the output width comes from num_classes instead of a
# literal so it follows the configuration
model = build_poster_cnn(input_shape, num_classes)

# prints out a summary of the model architecture
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_31 (Conv2D)           (None, 28, 28, 16)        1216      
_________________________________________________________________
max_pooling2d_31 (MaxPooling (None, 14, 14, 16)        0         
_________________________________________________________________
conv2d_32 (Conv2D)           (None, 12, 12, 32)        4640      
_________________________________________________________________
max_pooling2d_32 (MaxPooling (None, 6, 6, 32)          0         
_________________________________________________________________
flatten_16 (Flatten)         (None, 1152)              0         
_________________________________________________________________
dense_31 (Dense)             (None, 64)                73792     
_________________________________________________________________
dense_32 (Dense)             (None, 19)                1235      
Total para

In [280]:
# new metrics function

## all these somehow don't work
from keras import metrics
import keras.backend as K

def precision(y_true, y_pred):
    """Batch-wise precision for multi-label classification.

    Answers "how many of the selected items are relevant?". Values are
    clipped to [0, 1] and rounded before counting, and only a batch-wise
    average is computed. K.epsilon() in the denominator avoids a division
    by zero when nothing is selected.
    """
    def _count_ones(tensor):
        # clip to [0, 1], round to 0/1 and count the ones
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    hits = _count_ones(y_true * y_pred)
    selected = _count_ones(y_pred)
    return hits / (selected + K.epsilon())


def recall(y_true, y_pred):
    """Batch-wise recall for multi-label classification.

    Answers "how many of the relevant items are selected?". Values are
    clipped to [0, 1] and rounded before counting, and only a batch-wise
    average is computed. K.epsilon() in the denominator avoids a division
    by zero when there are no relevant items.
    """
    def _count_ones(tensor):
        # clip to [0, 1], round to 0/1 and count the ones
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    hits = _count_ones(y_true * y_pred)
    relevant = _count_ones(y_true)
    return hits / (relevant + K.epsilon())

def f1_score(y_true, y_pred):
    """Batch-wise F1 score for multi-label classification.

    F1 is the harmonic mean of precision and recall. Values are clipped to
    [0, 1] and rounded before counting, and only a batch-wise average is
    computed.

    Every denominator is padded with K.epsilon(), so the score is 0 instead
    of NaN when a batch has no true labels, no predicted labels or no hits.
    A Python `if count == 0` test cannot be used for this: the counts are
    backend tensors whose values are not known while the graph is built.

    Args:
        y_true: tensor of ground-truth labels (0 or 1).
        y_pred: tensor of predicted scores, same shape as y_true.

    Returns:
        A scalar tensor holding the F1 score of the batch.
    """
    # Count positive samples.
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))

    # How many selected items are relevant?
    precision_value = true_positives / (predicted_positives + K.epsilon())

    # How many relevant items are selected?
    recall_value = true_positives / (possible_positives + K.epsilon())

    # Harmonic mean of the two
    return 2 * (precision_value * recall_value) / (precision_value + recall_value + K.epsilon())

In [301]:
# Compile the model (in tensorflow this is much quicker than in theano).
# Loss: binary crossentropy, i.e. one independent yes/no decision per genre.
# Optimizer: stochastic gradient descent with momentum.
# Besides accuracy we also track the custom precision, recall and F1 metrics.
sgd = SGD(momentum=0.9, lr=0.1)
tracked_metrics = ['accuracy', precision, recall, f1_score]
model.compile(optimizer=sgd,
              loss='binary_crossentropy',
              metrics=tracked_metrics)

In [302]:
# this is now the actual training
# in addition to the training data we provide validation data
# this data is used to calculate the performance of the model over all the epochs
# this is useful to determine when training should stop
# in our case we just use it to monitor the evolution of the model over the training epochs
# if we use the validation data to determine when to stop the training or which model to save, we
# should not use the test data, but a separate validation set.
history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    # use the configured number of epochs instead of a literal
                    epochs=epochs,
                    verbose=1,
                    validation_data=(x_test, y_test))

# once training is complete, let's see how well we have done
# score holds the loss followed by one value per compiled metric
score = model.evaluate(x_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
# also report the remaining compiled metrics, which would otherwise be
# computed and then discarded
for metric_name, metric_value in zip(model.metrics_names[2:], score[2:]):
    print('Test %s:' % metric_name, metric_value)

Train on 6993 samples, validate on 2998 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test loss: 0.313871270522
Test accuracy: 0.871194845124


In [290]:
# # here is a visualization of the training process
# # typically we gain a lot in the beginning and then
# # training slows down
# plt.plot(history.history['acc'])
# plt.xlabel("epoch")
# plt.ylabel("accuracy")

### Trying a different setting for the model

In [303]:
# Second network with the same layer stack, declared as a list of layers:
#   5x5 conv (16 filters) -> 2x2 max pool
#   3x3 conv (32 filters) -> 2x2 max pool
#     (the filter count could be doubled again since pooling shrinks the
#      feature maps, but it is kept small to improve runtime)
#   flatten -> dense (64) -> dense (19, sigmoid)
# 19 is the number of classes; they are not mutually exclusive, so softmax
# is not a good choice and each class gets its own sigmoid output.
model2 = Sequential([
    Conv2D(16, kernel_size=(5, 5), activation='relu', input_shape=input_shape),
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(32, kernel_size=(3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(19, activation='sigmoid'),
])

# prints out a summary of the model architecture
model2.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_33 (Conv2D)           (None, 28, 28, 16)        1216      
_________________________________________________________________
max_pooling2d_33 (MaxPooling (None, 14, 14, 16)        0         
_________________________________________________________________
conv2d_34 (Conv2D)           (None, 12, 12, 32)        4640      
_________________________________________________________________
max_pooling2d_34 (MaxPooling (None, 6, 6, 32)          0         
_________________________________________________________________
flatten_17 (Flatten)         (None, 1152)              0         
_________________________________________________________________
dense_33 (Dense)             (None, 64)                73792     
_________________________________________________________________
dense_34 (Dense)             (None, 19)                1235      
Total para

In [304]:
# Compile the second model with Adadelta, an optimizer with an adaptive
# learning rate. Loss and tracked metrics are the same as for the first model.
ada = Adadelta(
    lr=1.0,
    rho=0.95,
    epsilon=1e-08,
    decay=0.0,
)
model2.compile(
    optimizer=ada,
    loss='binary_crossentropy',
    metrics=['accuracy', precision, recall, f1_score],
)

In [305]:
# this is now the actual training
# in addition to the training data we provide validation data
# this data is used to calculate the performance of the model over all the epochs
# this is useful to determine when training should stop
# in our case we just use it to monitor the evolution of the model over the training epochs
# if we use the validation data to determine when to stop the training or which model to save, we
# should not use the test data, but a separate validation set.
history2 = model2.fit(x_train, y_train,
                    batch_size=batch_size,
                    # use the configured number of epochs instead of a literal
                    epochs=epochs,
                    verbose=1,
                    validation_data=(x_test, y_test))

# once training is complete, let's see how well we have done
# score holds the loss followed by one value per compiled metric
score = model2.evaluate(x_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
# also report the remaining compiled metrics, which would otherwise be
# computed and then discarded
for metric_name, metric_value in zip(model2.metrics_names[2:], score[2:]):
    print('Test %s:' % metric_name, metric_value)

Train on 6993 samples, validate on 2998 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test loss: 0.320583623656
Test accuracy: 0.868877511489
