In [477]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, LSTM, Flatten, GRU
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
# fix random seed for reproducibility
np.random.seed(7)
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import pandas as pd
import os
from keras.utils import to_categorical
from keras import backend as K
from keras.models import Model
from keras.models import load_model
import random
from scipy.spatial.distance import cosine

In [2]:
from platform import python_version
print(python_version())

3.6.6


### Experiment: classifying movie genres based on their events


In [319]:
# Load the tab-separated metadata/events table into the `movies` DataFrame.
movies = pd.read_csv('../../data/metadata_events_table.tsv', sep = '\t')

In [320]:
print(len(movies))

13017


In [349]:
# Directory of individual summary files (presumably one per movie, named by
# Wikipedia id -- confirm against the data layout).
smr_path = '../../data/raw/MovieSummaries/indiv_summaries/'
# Keep every directory entry whose name contains the substring 'txt'.
smr_list = list(filter(lambda name: 'txt' in name, os.listdir(smr_path)))

In [351]:
# Map each summary file's stem (the text before its first '.') to the file's
# full contents.
d = {}
for item in smr_list:
    # Use a context manager so each handle is closed right after reading; the
    # previous bare open(...).read() left thousands of handles for the GC.
    with open(os.path.join(smr_path, item), 'r') as summary_file:
        d[item.split('.')[0]] = summary_file.read()

In [357]:
# The keys of `d` are strings, so cast the ids to str before using them as
# lookup keys; ids without a matching entry in `d` map to NaN.
wiki_ids = movies['Wikipedia_id'].astype(str)
movies['Wikipedia_id'] = wiki_ids
movies['Summary'] = wiki_ids.map(d)

In [456]:
movies.head(2)

Unnamed: 0,Wikipedia_id,Freebase_id,Name,Release_date,Revenue,Runtime,Languages,Countries,Genres,Events,vecs,Summary
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...","['send', 'hold', 'find', 'learn', 'murder', 'f...","[1.0, -0.8731093406677246, -0.0, -0.9997652173...","Set in the second half of the 22nd century, th..."
1,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}","['draw', 'learn', 'meet', 'fall', 'enter', 'fi...","[0.0, -0.0, -0.7400037050247192, -0.0, -0.0, 0...","Eva, an upper class housewife, becomes frustra..."


### Encode as one-hot vectors

In [162]:
# Keras tokenizer with default settings; fitted on the event lists below and
# then used to turn event tokens into integer ids.
t = Tokenizer()

In [163]:
# Each `Events` cell holds a stringified Python list. Parse it and keep, for
# every entry, only the part before the first '-'.
# NOTE(review): eval() executes whatever expression the TSV contains; if the
# file is not fully trusted, switch to ast.literal_eval.
events = [
    [token.split('-')[0] for token in eval(raw_events)]
    for raw_events in movies.Events.tolist()
]

In [164]:
# Size of data
print(len(events))

13017


In [165]:
# Build the tokenizer's word index from the event lists of all movies
# (this happens before the train/test split further down).
t.fit_on_texts(events)

In [166]:
# t.word_index

In [167]:
# One more than the largest index assigned by the tokenizer.
# NOTE(review): presumably the extra slot covers index 0, used for padding -- confirm.
largest_index = max(t.word_index.values())
vocab_size = largest_index + 1

In [168]:
vocab_size

3608

In [172]:
# Random split: a row goes to train when its uniform draw is below 0.9
# (roughly 90% of rows), otherwise to test.
msk = np.random.rand(len(movies)) < 0.9

# Parse the stringified event lists exactly as for `events`: keep the part of
# every entry before the first '-'.
# NOTE(review): eval() executes whatever expression the TSV contains; if the
# file is not fully trusted, switch to ast.literal_eval.
train = [
    [token.split('-')[0] for token in eval(raw_events)]
    for raw_events in movies[msk]['Events'].tolist()
]
test = [
    [token.split('-')[0] for token in eval(raw_events)]
    for raw_events in movies[~msk]['Events'].tolist()
]

In [173]:
# Integer-encode the training event lists and pad/truncate each to length 70.
train = pad_sequences(t.texts_to_sequences(train), maxlen=70)
# Inputs are the first 69 positions; the target is the final position,
# one-hot encoded over the vocabulary.
x_train = train[:, :-1]
y_train = to_categorical(train[:, -1], num_classes=vocab_size)

In [174]:
# Integer-encode the held-out event lists and pad/truncate each to length 70.
test = t.texts_to_sequences(test)
# Pad the *test* sequences. The original passed x_train here, so x_test/y_test
# were built from re-padded training inputs and the evaluation never touched
# the held-out split.
test = pad_sequences(test, maxlen=70)
# Inputs are the first 69 positions; the target is the final position,
# one-hot encoded over the vocabulary (same layout as the training tensors).
x_test = test[:,:-1]
y_test = test[:,-1]
y_test = to_categorical(y_test, num_classes=vocab_size)

In [186]:
# Encode and pad the event lists of *all* movies the same way as the training
# data, then drop the last position so the width matches the model input (69).
events_trans = pad_sequences(t.texts_to_sequences(events), maxlen=70)
x_all = events_trans[:, :-1]

In [188]:
x_all.shape

(13017, 69)

### Simple LSTM 

In [None]:
# Next-event model: embed the 69 input event ids, summarise them with a single
# LSTM layer, and predict the final event with a softmax over the vocabulary.
embedding_vector_length = 20
model = Sequential([
    Embedding(vocab_size, embedding_vector_length, input_length=69),
    LSTM(64, input_shape=(69, embedding_vector_length)),
    Dense(vocab_size, activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
# summary() prints the table itself and returns None, which is why a trailing
# "None" shows up in the cell output.
print(model.summary())
model.fit(x_train, y_train, epochs=800)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_27 (Embedding)     (None, 69, 20)            72160     
_________________________________________________________________
lstm_54 (LSTM)               (None, 64)                21760     
_________________________________________________________________
dense_27 (Dense)             (None, 3608)              234520    
Total params: 328,440
Trainable params: 328,440
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/800
Epoch 2/800
Epoch 3/800
Epoch 4/800
Epoch 5/800
Epoch 6/800

In [None]:
# Final evaluation of the model on the test tensors.
scores = model.evaluate(x_test, y_test)
# scores[1] is taken to be the accuracy metric (the model above is compiled
# with metrics=['accuracy']); report it as a percentage.
accuracy_pct = scores[1] * 100
print("Accuracy: %.2f%%" % accuracy_pct)

In [None]:
# Persist the current model to 'lstm.h5' in the working directory.
model.save('lstm.h5')

In [461]:
# Rebind `model` to the model stored in 'lstm.h5', so the cells below can run
# without retraining.
model = load_model('lstm.h5')

### Obtain embeddings

In [184]:
# #Alternative way, same result
# intermediate_layer_model = Model(inputs=model.input,
#                                  outputs=model.get_layer('lstm_53').output)


In [462]:
# Get the intermediate layer output using the Keras backend: build a function
# from the first layer's input to the second layer's output (the LSTM layer in
# the model summary above -- assumes the loaded model has the same layout).
first_layer, second_layer = model.layers[0], model.layers[1]
get_lstm_output = K.function([first_layer.input], [second_layer.output])
# Run it on every movie's padded event sequence.
output = get_lstm_output([x_all])

In [463]:
# The backend function was built with a single output tensor, so take element 0:
# one vector per row of x_all.
vecs = output[0]

In [468]:
# Pick a random row index into `vecs` to use as the query movie.
# NOTE(review): only np.random is seeded in the visible cells; the stdlib
# `random` module is not, so this index changes from run to run.
rd = random.choice(range(len(vecs)))
rd

8671

In [467]:
# rd = 2457

In [445]:
# movies[movies.Name == 'Star Wars Episode VI: Return of the Jedi']

In [469]:
# Copy of `vecs` without the query row, so the query cannot match itself.
# Rows at or after `rd` sit one position earlier in `vecs_d` than in `vecs`.
vecs_d = np.delete(vecs, rd, axis=0)

In [470]:
def find_closest(vc, dest):
    """Find the row of ``vc`` with the smallest cosine distance to ``dest``.

    Args:
        vc: Sequence (or 2-D array) of candidate vectors.
        dest: Query vector compared against every candidate.

    Returns:
        Tuple ``(dist, min_idx)``: the smallest cosine distance found and the
        position in ``vc`` of the first candidate that achieves it. If ``vc``
        is empty, ``(inf, 0)`` is returned.
    """
    # Cosine distance ranges over [0, 2], so the search must start from
    # infinity. Starting from 1 (as before) silently returned index 0
    # whenever no candidate was closer than distance 1.
    dist = float('inf')
    min_idx = 0
    for i, candidate in enumerate(vc):
        dist_u = cosine(dest, candidate)
        # Strict '<' keeps the earliest candidate on ties.
        if dist_u < dist:
            dist = dist_u
            min_idx = i
    return dist, min_idx

In [471]:
# find_closest returns a row index into `vecs_d`, which is `vecs` with row `rd`
# removed. Rows at or after `rd` are shifted down by one there, so map the
# index back to the original row numbering before using it with `movies.iloc`.
closest_dist, closest_idx = find_closest(vecs_d, vecs[rd])
rd_c = (closest_dist, (closest_idx + 1) if closest_idx >= rd else closest_idx)

In [472]:
rd_c

(0.09848213195800781, 5580)

In [473]:
movies.iloc[rd]

Wikipedia_id                                              2375811
Freebase_id                                             /m/077hq8
Name                         Jason Goes To Hell: The Final Friday
Release_date                                           1993-08-13
Revenue                                               1.59351e+07
Runtime                                                        89
Languages                      {"/m/02h40lc": "English Language"}
Countries               {"/m/09c7w0": "United States of America"}
Genres          {"/m/01q03": "Cult", "/m/03npn": "Horror", "/m...
Events          ['attack', 'blow', 'process', 'bring', 'become...
vecs            [0.0, -0.0, -1.0, -0.0, -0.3068475127220154, -...
Summary         {{Plot}} Undercover FBI agent Elizabeth Marcus...
Name: 8671, dtype: object

In [474]:
movies.iloc[rd].Summary

'{{Plot}} Undercover FBI agent Elizabeth Marcus, staying at a rundown cabin on Crystal Lake, encounters Jason and leads him to a clearing in the woods where he is attacked and blown to pieces by a government task force. The agents are celebrating the victory, and we cut to a mysterious figure in the forest questioning what the government just did saying "I don\'t think so", pointing out that Jason has been resurrected before. Jason\'s remains are sent to a morgue. The coroner is processing the autopsy making various notes, including the heart being twice the size of a normal human heart. As he brings a scalpel to the heart, it begins to beat slowly, then faster and faster. The coroner becomes hypnotized by Jason\'s beating heart and is compelled to eat it. This causes spirits to emanate from Jason\'s scarred body parts, and the coroner becomes possessed by the \'spirit\' of Jason. We see this as the reflection of newly hosted coroner is that of Jason\'s original form. The now possessed

In [475]:
movies.iloc[rd_c[1]]

Wikipedia_id                                              8266590
Freebase_id                                            /m/026ycs4
Name                                                        Billa
Release_date                                           1980-01-24
Revenue                                                       NaN
Runtime                                                       175
Languages                          {"/m/07c9s": "Tamil Language"}
Countries                                   {"/m/03rk0": "India"}
Genres          {"/m/02kdv5l": "Action", "/m/01chg": "Bollywood"}
Events          ['manage', 'replace', 'tell', 'go', 'give', 's...
vecs            [0.30103787779808044, -0.0, -0.668038666248321...
Summary         Billa is the story of one of the most powerful...
Name: 5580, dtype: object

In [476]:
movies.iloc[rd_c[1]].Summary

'Billa is the story of one of the most powerful men in the business of crime, who in spite of being one of the most wanted on the list of Interpol, remains elusive to the police. Along with the police, Billa  makes a few other enemies through his merciless approach in running his organisation, especially when he kills one of his own men, Rajesh, when Rajesh decides to leave the business. This introduces Billa to two new enemies, Kamini ([[Helen , Rajesh’s fiancee, and Priya,  Rajesh’s sister. When Kamini seduces Billa and attempts to have the police arrest him, her plan backfires as Billa outsmarts her and escapes, and in the process Kamini was killed. A shattered, revenge-seeking Priya cuts her hair short, trains in judo and karate,and then enters Billa’s gang after deceiving them into thinking that she too is on the wrong side of the law. Billa is impressed with her fighting skills and allows her to work for him, without realising her true intentions. Meanwhile, after a couple of uns