In [105]:
import numpy as np
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense, LSTM, Flatten, GRU
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
# fix random seed for reproducibility
np.random.seed(7)
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import pandas as pd
import os
from keras.utils import to_categorical
from collections import OrderedDict

In [106]:
from platform import python_version
print(python_version())

3.6.4


### Exp of classifing movie genres based on their events.


In [107]:
movies = pd.read_csv('../../data/metadata_events_table.tsv', sep = '\t')

In [108]:
movies.Genre_code.value_counts()

2    2003
4    1337
1    1018
3     541
0     388
Name: Genre_code, dtype: int64

In [109]:
df0 = movies[movies['Genre_code'] == 0].sample(500, replace = True)
df1 = movies[movies['Genre_code'] == 1].sample(500, replace = True)
df2 = movies[movies['Genre_code'] == 2].sample(500, replace = True)
df3 = movies[movies['Genre_code'] == 3].sample(500, replace = True)
df4 = movies[movies['Genre_code'] == 4].sample(500, replace = True)

In [110]:
movies = pd.concat([df0, df1, df2, df3, df4])

In [111]:
movies.sample(5)

Unnamed: 0,Wikipedia_id,Freebase_id,Name,Release_date,Revenue,Runtime,Languages,Countries,Genres,Events,Genre_code
2349,3408276,/m/099ysh,Song of Freedom,1936,,73.0,dict_values(['English Language']),dict_values(['United Kingdom']),Drama,"['amuse-31.1', 'render-29.90', 'put-9.1-2', 'a...",2
4226,1189804,/m/04fv_9,Presumed Innocent,1990,221303188.0,127.0,dict_values(['English Language']),dict_values(['United States of America']),Thriller,"['characterize-29.2', 'fill-9.8-1', 'conjectur...",4
4724,2269465,/m/06_nv6,Jason X,2001-07-24,16951798.0,91.0,dict_values(['English Language']),dict_values(['United States of America']),Thriller,"['steal-10.5-1', 'subjugate-42.3', 'put-9.1-2'...",4
4597,27216746,/m/0bwhkkw,Machine Gun Preacher,2011-09-11,2874510.0,129.0,"dict_values(['Arabic Language', 'English Langu...",dict_values(['United States of America']),Action,"['tape-22.4', 'help-72-1', 'discover-84', 'oth...",0
1788,33096502,/m/065y3w2,Red vs. Blue: Recreation,2009,,120.0,dict_values([]),dict_values(['United States of America']),Action,"['cooperate-73-3', 'appear-48.1.1', 'force-59-...",0


In [112]:
len(movies)

2500

### Encode as one-hot vectors

In [113]:
t = Tokenizer()

In [147]:
events = []
for e in movies.Events.tolist():
    events.append([item.split('-')[0] for item in eval(e)])

In [148]:
events[0:5]

[['seem', 'steal', 'future_having', 'tell', 'own', 'wipe_manner', 'bring'],
 ['animal_sounds',
  'investigate',
  'get',
  'cost',
  'meander',
  'sound_emission',
  'confront',
  'fill',
  'preparing',
  'escape'],
 ['chase', 'chase', 'chase', 'chase', 'chase'],
 ['performance',
  'consider',
  'assessment',
  'exist',
  'force',
  'transfer_mesg',
  'preparing',
  'escape',
  'dress',
  'split',
  'other_cos',
  'marvel',
  'lodge',
  'rummage',
  'steal',
  'give',
  'obtain',
  'discover',
  'force'],
 ['contiguous_location',
  'other_cos',
  'order',
  'amuse',
  'steal',
  'escape',
  'investigate',
  'battle',
  'other_cos']]

In [149]:
t.fit_on_texts(events)

In [150]:
t.word_index

{"1'": 1,
 '1': 2,
 "2'": 3,
 '13': 4,
 '5': 5,
 '37': 6,
 '31': 7,
 '51': 8,
 '29': 9,
 "3'": 10,
 '3': 11,
 '47': 12,
 '2': 13,
 "'amuse": 14,
 'amuse': 15,
 '36': 16,
 "'get": 17,
 'get': 18,
 "'escape": 19,
 'escape': 20,
 "7'": 21,
 '26': 22,
 "4'": 23,
 '10': 24,
 '48': 25,
 '109': 26,
 '45': 27,
 "'discover": 28,
 'discover': 29,
 'cos': 30,
 "'transfer": 31,
 'mesg': 32,
 'transfer_mesg': 33,
 '55': 34,
 "'conjecture": 35,
 'conjecture': 36,
 "'own": 37,
 "100'": 38,
 'own': 39,
 "'meander": 40,
 'meander': 41,
 '6': 42,
 "84'": 43,
 '7': 44,
 "'say": 45,
 'say': 46,
 "'become": 47,
 'become': 48,
 "'correspond": 49,
 'correspond': 50,
 '9': 51,
 "'other": 52,
 'other_cos': 53,
 '11': 54,
 "'force": 55,
 'force': 56,
 '30': 57,
 '4': 58,
 "5'": 59,
 "6'": 60,
 '42': 61,
 "'admit": 62,
 "65'": 63,
 'admit': 64,
 "'obtain": 65,
 'obtain': 66,
 '8': 67,
 "'give": 68,
 'give': 69,
 "'meet": 70,
 'meet': 71,
 "'appear": 72,
 'appear': 73,
 "'convert": 74,
 'convert': 75,
 "'future":

In [151]:
msk = np.random.rand(len(movies)) < 0.8
train = []
for item in movies[msk]['Events'].tolist():
    train.append(item)
test = []
for item in movies[~msk]['Events'].tolist():
    test.append(item)


In [152]:
# test

In [153]:
x_train = t.texts_to_sequences(train)
x_train = pad_sequences(x_train, maxlen=20)


In [154]:
x_train

array([[ 76,  77,   4, ...,  54,  11,   1],
       [  3,  40,  12, ...,  19,   8,   1],
       [  0,   0,   0, ..., 199,   8,  60],
       ...,
       [ 19,   8,   1, ..., 386,  28,  43],
       [ 35,   9,   5, ...,   8,   2,   1],
       [  1,  88,  61, ...,   5,   2,   1]], dtype=int32)

In [155]:
x_test = t.texts_to_sequences(test)
x_test = pad_sequences(x_test, maxlen=20)


In [157]:
y_train = to_categorical(movies[msk]['Genre_code'].tolist())
y_test = to_categorical(movies[~msk]['Genre_code'].tolist())

### Simple LSTM 

In [162]:
# create the model
embedding_vector_length = 16
model = Sequential()
model.add(Embedding(3000, embedding_vector_length))
model.add(LSTM(100))
model.add(Dense(5, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(x_train, y_train, epochs=3, batch_size=32)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_29 (Embedding)     (None, None, 16)          48000     
_________________________________________________________________
lstm_27 (LSTM)               (None, 100)               46800     
_________________________________________________________________
dense_28 (Dense)             (None, 5)                 505       
Total params: 95,305
Trainable params: 95,305
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x14af97550>

In [163]:
# Final evaluation of the model
scores = model.evaluate(x_test,
                        y_test)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 29.78%
