In [69]:
import re
import tensorflow as tf
import numpy as np
import pandas as pd
import spacy
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from plotly.offline import iplot, plot, init_notebook_mode
import plotly.graph_objs as go
from plotly import tools
init_notebook_mode(connected=True)

print("GPU Available: ", tf.test.is_gpu_available())

GPU Available:  True


In [2]:
nlp = spacy.load("en_core_web_lg")

In [132]:
with open('../data/django_unchained_script.txt', 'r', encoding='utf8') as file:
    django = file.read()
    file.close()
    
with open('../data/inglorious_basterds_script.txt', encoding='utf8') as file:
    ingbast = file.read()
    file.close()
    
with open('../data/pulp_fiction_script.txt', 'r', encoding='utf8') as file:
    pulp = file.read()
    file.close()
    
with open('../data/reservoir_dogs_screenplay.txt', 'r', encoding='utf8') as file:
    dogs = file.read()
    file.close()

## Looks like these scripts have character names and other key details in all Caps. 

## Removing these words to help obscure the text from the film. 

## Also removing punctuation.

## Also Lemmatizing words

In [133]:
def text_cleaner(line):
    line = re.sub(r'\b[A-Z]+\b', '', line)
    line = re.sub(r'[^\w\s]','', line)
    line = line.lower()
    line = ' '.join(line.split())
    
    lemma_line = []
    line = nlp(line)
    for word in line:
        lemma_line.append(word.lemma_)
    
    #lemma_line = [l for l in lemma_line]
    return ' '.join(lemma_line)

In [134]:
django = text_cleaner(django)
ingbast = text_cleaner(ingbast)
pulp = text_cleaner(pulp)
dogs = text_cleaner(dogs)

In [135]:
# need to batch these into multiple dataframes

def line_splitter(line, input_len=25):
    split_line = line.split()
    line_list = []
    for i in range(0, len(line), input_len):
        line_list.append(split_line[i:i+input_len])
        
    return line_list

In [136]:
django = pd.DataFrame([line_splitter(django)]).T
ingbast = pd.DataFrame([line_splitter(ingbast)]).T
pulp = pd.DataFrame([line_splitter(pulp)]).T
dogs = pd.DataFrame([line_splitter(dogs)]).T


django.columns = ingbast.columns = pulp.columns = dogs.columns = ['line'] #['index', 'line']

In [140]:
movie_dict = {
    'Django Unchained': 0,
    'Inglorius Bastards': 1,
    'Resevoir Dogs': 2,
    'Pulp Fiction': 3
}

In [141]:
django['film'] = 0
ingbast['film'] = 1
dogs['film'] = 2
pulp['film'] = 3

In [10]:
df = pd.concat([django, ingbast, dogs, pulp])

df = df[df.astype(str)['line'] != '[]']

In [11]:
# looks like these are relatively even, Not as much for inglorious bastards as the others, but it should be fine.
df['film'].value_counts()

0    1400
1    1166
3    1011
2     708
Name: film, dtype: int64

# Begin to assemble word2idx and idx2word

In [12]:
df['text'] = df['line'].apply(lambda x: ' '.join(x))

all_text = ' '.join(df['text'].values.tolist())
all_test_l = list(set(all_text.split()))

In [13]:
# all_test_l contains a unique list of words used in all of the texts
print(f'{len(all_test_l)} unique vocab words')

6683 unique vocab words


In [14]:
word2idx = {'<PAD>': 0, '<UNK>': 1}
idx2word = {0: '<PAD>', 1: '<UNK>'}

for i in range(len(all_test_l)):
    idx2word[i+2] = all_test_l[i]
    word2idx[idx2word[i]] = i

In [15]:
def word2idx_mapper(line):
    idx_list = []
    for word in line:
        try:
            idx_list.append(word2idx[word])
        except:
            idx_list.append(word2idx['<UNK>'])
            pass
    return idx_list

In [16]:
df['tokenized'] = df['line'].map(word2idx_mapper)

In [17]:
df.head()

Unnamed: 0,line,film,text,tokenized
0,"[write, by, quentin, tarantino, as, the, film,...",0,write by quentin tarantino as the film play co...,"[4545, 889, 5521, 2727, 5700, 3525, 3985, 3943..."
1,"[location, be, somewhere, in, texas, the, blac...",0,location be somewhere in texas the black man a...,"[2516, 3977, 2508, 2948, 5446, 3525, 5566, 520..."
2,"[be, two, slave, trader, call, the, and, one, ...",0,be two slave trader call the and one of the se...,"[3977, 2305, 447, 3398, 2518, 3525, 4278, 6568..."
3,"[or, may, not, notice, a, tiny, small, r, burn...",0,or may not notice a tiny small r burn into -PR...,"[2732, 6490, 981, 179, 6126, 2836, 5159, 584, ..."
4,"[have, be, by, bull, whip, beating, as, the, o...",0,have be by bull whip beating as the operatic o...,"[3414, 3977, 889, 3193, 453, 1238, 5700, 3525,..."


In [38]:
X = df['tokenized']
y = df['film']

maxlen = 25

X = pad_sequences(
    X,
    maxlen=maxlen,
    padding='post'
)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

# Time to model!

In [146]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import GlobalAveragePooling1D, Embedding, Dense, LSTM, Flatten
from tensorflow.keras.layers import Conv1D, MaxPool1D
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [150]:
embedding_dim = 32
vocab_size = len(word2idx)

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=maxlen))

#model.add(Flatten())
model.add(LSTM(100))
#model.add(Dense(16, activation='relu'))
model.add(Dense(4, activation='softmax'))

model.summary()

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

history = model.fit(X, y,
    epochs=30,
    batch_size=128,
    validation_split=0.2)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_20 (Embedding)     (None, 25, 32)            213856    
_________________________________________________________________
lstm_6 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_28 (Dense)             (None, 4)                 404       
Total params: 267,460
Trainable params: 267,460
Non-trainable params: 0
_________________________________________________________________
Train on 3428 samples, validate on 857 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30


Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30


Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30


Epoch 28/30
Epoch 29/30
Epoch 30/30


In [151]:
history = pd.DataFrame(history.history)

loss = go.Scatter(
    x = history.index,
    y = history.loss,
    name = 'loss'
)

val_loss = go.Scatter(
    x = history.index,
    y = history.val_loss,
    name = 'val_loss'
)

acc = go.Scatter(
    x = history.index,
    y = history.acc,
    name = 'acc'
)

val_acc = go.Scatter(
    x = history.index,
    y = history.val_acc,
    name = 'val_acc'
)

fig = tools.make_subplots(rows=1, cols=2)

fig.append_trace(loss, 1, 1)
fig.append_trace(val_loss, 1, 1)

fig.append_trace(acc, 1, 2)
fig.append_trace(val_acc, 1, 2)

fig['layout'].update(title='Loss | Accuracy')

iplot(fig)

This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]

