In [84]:
import numpy as np
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
# fix random seed for reproducibility
np.random.seed(7)
from keras.preprocessing.text import Tokenizer
import pandas as pd
import os
from keras.utils import to_categorical

### Exp of classifing movie genres based on their events.


In [61]:
movies = pd.read_csv('movie_metadata_cleaned.tsv', sep = '\t')

In [62]:
movies.head(2)

Unnamed: 0,Wikipedia_id,Freebase_id,Name,Release_date,Revenue,Runtime,Languages,Countries,Genres
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,dict_values(['English Language']),dict_values(['United States of America']),Thriller
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,dict_values(['English Language']),dict_values(['United States of America']),Drama


### Read in events data

In [118]:
event_file = [item for item in os.listdir('protag_events/') if 'txt' in item]

In [119]:
d = {}
for name in event_file:
    with open(os.path.join('protag_events/', name), 'r') as f:
        nlines = []
        for line in [line.strip() for line in f.readlines()]:
            line = line.split(' ')
            line[0] = 'protagonist'
            line = ' '.join([item for item in line])
            nlines.append(line)
        d[int(name.split('_')[0])] = nlines

In [120]:
movies.Wikipedia_id = movies.Wikipedia_id.astype(int)

In [121]:
movies['Events'] = movies['Wikipedia_id'].map(d)

In [122]:
movies = movies.dropna(subset = ['Events'])

In [123]:
movies['Genre_code'] = movies.Genres.astype("category").cat.codes

In [124]:
movies.head(2)

Unnamed: 0,Wikipedia_id,Freebase_id,Name,Release_date,Revenue,Runtime,Languages,Countries,Genres,Events,Genre_code
25,156558,/m/014k4y,Baby Boy,2001-06-27,29381649.0,123.0,dict_values(['English Language']),dict_values(['United States of America']),Drama,"[protagonist urge-58.1 <NE>0 EmptyParameter, p...",2
171,28649243,/m/0cz8rml,Martha,,,116.0,dict_values([]),dict_values(['West Germany']),Drama,[protagonist conjecture-29.5-2 EmptyParameter ...,2


In [125]:
len(movies)

311

### Encode as one-hot vectors

In [126]:
t = Tokenizer()

In [127]:
events = []
for e in movies.Events.tolist():
    events.extend(e)

In [128]:
t.fit_on_texts(events)

In [129]:
msk = np.random.rand(len(movies)) < 0.8
train = movies[msk]['Events'].tolist()
test = movies[~msk]['Events'].tolist()

In [141]:
# test

In [132]:
x_train = t.texts_to_matrix(train,mode='count')

In [133]:
x_test = t.texts_to_matrix(test,mode='count')

In [139]:
x_train.shape

(232, 2178)

In [134]:
y_train = to_categorical(movies[msk]['Genre_code'].tolist())
y_test = to_categorical(movies[~msk]['Genre_code'].tolist())

In [135]:
y_train.shape

(232, 5)

### Simple LSTM 

In [140]:
# create the model
embedding_vector_length = 32
model = Sequential()
model.add(Embedding(1000, embedding_vector_length))
model.add(LSTM(100))
model.add(Dense(5, activation='softmax'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=5, batch_size=64)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, None, 32)          32000     
_________________________________________________________________
lstm_8 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_8 (Dense)              (None, 5)                 505       
Total params: 85,705
Trainable params: 85,705
Non-trainable params: 0
_________________________________________________________________
None
Train on 232 samples, validate on 79 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x12a913f28>

In [138]:
# Final evaluation of the model
scores = model.evaluate(x_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 80.00%
