In [46]:
import numpy as np
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense, LSTM, Flatten, GRU
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
# fix random seed for reproducibility
np.random.seed(7)
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import pandas as pd
import os
from keras.utils import to_categorical
from collections import OrderedDict

### Exp of classifing movie genres based on their events.


In [2]:
movies = pd.read_csv('movie_metadata_cleaned.tsv', sep = '\t')

In [3]:
movies.head(2)

Unnamed: 0,Wikipedia_id,Freebase_id,Name,Release_date,Revenue,Runtime,Languages,Countries,Genres
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,dict_values(['English Language']),dict_values(['United States of America']),Thriller
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,dict_values(['English Language']),dict_values(['United States of America']),Drama


### Read in events data

In [4]:
event_file = [item for item in os.listdir('protag_events/') if 'txt' in item]

In [5]:
d = {}
for name in event_file:
    with open(os.path.join('protag_events/', name), 'r') as f:
        nlines = []
        for line in [line.strip() for line in f.readlines()]:
            line = line.split(' ')
            line[0] = 'protagonist'
            line = ' '.join([item for item in line])
            nlines.append(line)
        nlines = list(OrderedDict((x, True) for x in nlines).keys())
        if len(nlines) > 1:    
            d[int(name.split('_')[0])] = nlines

In [6]:
movies.Wikipedia_id = movies.Wikipedia_id.astype(int)

In [7]:
movies['Events'] = movies['Wikipedia_id'].map(d)

In [8]:
movies = movies.dropna(subset = ['Events'])

In [9]:
movies['Genre_code'] = movies.Genres.astype("category").cat.codes

In [10]:
movies.head(2)

Unnamed: 0,Wikipedia_id,Freebase_id,Name,Release_date,Revenue,Runtime,Languages,Countries,Genres,Events,Genre_code
25,156558,/m/014k4y,Baby Boy,2001-06-27,29381649.0,123.0,dict_values(['English Language']),dict_values(['United States of America']),Drama,"[protagonist urge-58.1 <NE>0 EmptyParameter, p...",2
171,28649243,/m/0cz8rml,Martha,,,116.0,dict_values([]),dict_values(['West Germany']),Drama,[protagonist conjecture-29.5-2 EmptyParameter ...,2


In [11]:
movies = movies.dropna(subset = ['Events'])

In [12]:
movies.Genre_code.value_counts()

2    204
4    162
1     94
3     61
0     33
Name: Genre_code, dtype: int64

In [13]:
df0 = movies[movies['Genre_code'] == 0].sample(50, replace = True)
df1 = movies[movies['Genre_code'] == 1].sample(50, replace = True)
df2 = movies[movies['Genre_code'] == 2].sample(50, replace = True)
df3 = movies[movies['Genre_code'] == 3].sample(50, replace = True)
df4 = movies[movies['Genre_code'] == 4].sample(50, replace = True)

In [14]:
movies = pd.concat([df0, df1, df2, df3, df4])

In [15]:
movies.sample(5)

Unnamed: 0,Wikipedia_id,Freebase_id,Name,Release_date,Revenue,Runtime,Languages,Countries,Genres,Events,Genre_code
12876,6326152,/m/0g13g7,The Seedling,1974,,131.0,dict_values(['Hindi Language']),dict_values(['India']),Romance Film,[protagonist stop-55.4-1 Synset('study.n.09') ...,3
8617,3724436,/m/09xcn4,The 39 Steps,1935-06,,86.0,dict_values(['English Language']),dict_values(['United Kingdom']),Thriller,[protagonist search-35.2 Synset('presentation....,4
1432,2651593,/m/07vfy4,Water,2005-09-08,10422387.0,114.0,"dict_values(['Hindi Language', 'English Langua...","dict_values(['Canada', 'India'])",Romance Film,[protagonist appear-48.1.1 Synset('wrath.n.02'...,3
29101,2563837,/m/07n90q,Femme Fatale,2002-04-30,16838910.0,114.0,"dict_values(['French Language', 'English Langu...",dict_values(['France']),Thriller,[protagonist cooperate-73-2 EmptyParameter Syn...,4
52264,11051391,/m/02qz9m7,Nikaah,1982,,144.0,dict_values(['Hindi Language']),dict_values(['India']),Drama,[protagonist amuse-31.1 EmptyParameter EmptyPa...,2


In [16]:
len(movies)

250

### Encode as one-hot vectors

In [17]:
t = Tokenizer()

In [40]:
events = []
for e in movies.Events.tolist():
    events.append(e)

In [41]:
t.fit_on_texts(events)

In [70]:
# t.word_index

In [42]:
msk = np.random.rand(len(movies)) < 0.8
train = []
for item in movies[msk]['Events'].tolist():
    train.append(item)
test = []
for item in movies[~msk]['Events'].tolist():
    test.append(item)


In [24]:
# test

In [47]:
x_train = t.texts_to_sequences(train)
x_train = pad_sequences(x_train, maxlen=20)


In [48]:
x_test = t.texts_to_sequences(test)
x_test = pad_sequences(x_test, maxlen=20)


In [49]:
x_train

array([[   0,    0,    0, ...,  128,  129,  130],
       [ 345,  131,  172, ...,  360,  361,  362],
       [   0,    0,    0, ...,  183,  184,  185],
       ...,
       [   0,    0,    0, ..., 2788, 2789, 2790],
       [2792, 2793, 2794, ..., 2808, 2809, 2810],
       [   0,    0, 2811, ..., 2822, 2823,    3]], dtype=int32)

In [53]:
y_train = to_categorical(movies[msk]['Genre_code'].tolist())
y_test = to_categorical(movies[~msk]['Genre_code'].tolist())

### Simple LSTM 

In [92]:
# create the model
embedding_vector_length = 32
model = Sequential()
model.add(Embedding(3000, embedding_vector_length))
model.add(LSTM(10))
model.add(Dense(5, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(x_train, y_train, epochs=3, batch_size=32)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_21 (Embedding)     (None, None, 32)          96000     
_________________________________________________________________
lstm_21 (LSTM)               (None, 10)                1720      
_________________________________________________________________
dense_21 (Dense)             (None, 5)                 55        
Total params: 97,775
Trainable params: 97,775
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1351207f0>

In [93]:
# Final evaluation of the model
scores = model.evaluate(x_test,
                        y_test)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 68.29%
