In [33]:
import os
import re
import tensorflow as tf
import numpy as np
import pandas as pd
import spacy
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from plotly.offline import iplot, plot, init_notebook_mode
import plotly.graph_objs as go
from plotly import tools
init_notebook_mode(connected=True)

from tensorflow.keras import Sequential
from tensorflow.keras.layers import GlobalAveragePooling1D, Embedding, Dense, LSTM, Flatten
from tensorflow.keras.layers import Conv1D, MaxPool1D
from tensorflow.keras.preprocessing.sequence import pad_sequences

print("GPU Available: ", tf.test.is_gpu_available())

GPU Available:  True


In [2]:
nlp = spacy.load("en_core_web_lg")

In [3]:
comedy_dir = '../data/comedy/'
horror_dir = '../data/horror/'

# Combining Comedy and Horror Scripts into 2 large scripts

In [12]:
comedy = ''
for file in os.listdir(comedy_dir):
    
    with open(comedy_dir + file) as script:
        s = script.read()
        comedy += ' ' + s
        script.close()
        
horror = ''
for file in os.listdir(horror_dir):
    
    with open(horror_dir + file) as script:
        s = script.read()
        horror += ' ' + s
        script.close()

## Looks like these scripts have character names and other key details in all Caps. 

## Removing these words to help obscure the text from the film. 

## Also removing punctuation.

## Also Lemmatizing words

In [15]:
def text_cleaner(line):
    line = re.sub(r'\b[A-Z]+\b', '', line)
    line = re.sub(r'[^\w\s]','', line)
    line = re.sub(" \d+", "", line)
    line = line.lower()
    line = ' '.join(line.split())
    
    lemma_line = []
    line = nlp(line)
    for word in line:
        lemma_line.append(word.lemma_)
    
    #lemma_line = [l for l in lemma_line]
    return ' '.join(lemma_line)

In [18]:
if len(comedy) > len(horror):
    nlp.max_length = len(comedy) + 1
else:
    nlp.max_length = len(horror) + 1

comedy = text_cleaner(comedy)
horror = text_cleaner(horror)

In [19]:
# need to batch these into multiple dataframes

def line_splitter(line, input_len=25):
    split_line = line.split()
    line_list = []
    for i in range(0, len(line), input_len):
        line_list.append(split_line[i:i+input_len])
        
    return line_list

In [20]:
comedy = pd.DataFrame([line_splitter(comedy)]).T
horror = pd.DataFrame([line_splitter(horror)]).T


comedy.columns = horror.columns = ['line'] #['index', 'line']

In [21]:
genre_dict = {
    'Comedy': 0,
    'Horror': 1
}

In [22]:
comedy['genre'] = 0
horror['genre'] = 1

In [23]:
df = pd.concat([comedy, horror])

df = df[df.astype(str)['line'] != '[]']

In [24]:
# looks like these are relatively even, Not as much for inglorious bastards as the others, but it should be fine.
df['genre'].value_counts()

1    10031
0     6841
Name: genre, dtype: int64

# Begin to assemble word2idx and idx2word

In [25]:
df['text'] = df['line'].apply(lambda x: ' '.join(x))

all_text = ' '.join(df['text'].values.tolist())
all_test_l = list(set(all_text.split()))

In [26]:
# all_test_l contains a unique list of words used in all of the texts
print(f'{len(all_test_l)} unique vocab words')

17092 unique vocab words


In [27]:
word2idx = {'<PAD>': 0, '<UNK>': 1}
idx2word = {0: '<PAD>', 1: '<UNK>'}

for i in range(len(all_test_l)):
    idx2word[i+2] = all_test_l[i]
    word2idx[idx2word[i]] = i

In [28]:
def word2idx_mapper(line):
    idx_list = []
    for word in line:
        try:
            idx_list.append(word2idx[word])
        except:
            idx_list.append(word2idx['<UNK>'])
            pass
    return idx_list

In [29]:
df['tokenized'] = df['line'].map(word2idx_mapper)

In [30]:
df.head()

Unnamed: 0,line,genre,text,tokenized
0,"[endless, green, hill, bisect, by, a, ribbon, ...",0,endless green hill bisect by a ribbon of highw...,"[14979, 4158, 9320, 13556, 111, 8402, 14125, 1..."
1,"[through, midafternoon, labor, flank, the, wor...",0,through midafternoon labor flank the work pris...,"[11449, 16351, 8205, 12513, 9685, 8069, 5288, ..."
2,"[name, get, out, and, start, curse, and, kick,...",0,name get out and start curse and kick the vehi...,"[6961, 647, 10155, 3517, 1722, 15727, 3517, 11..."
3,"[gang, smile, at, the, oncoming, man, poke, a,...",0,gang smile at the oncoming man poke a prisoner...,"[13538, 1889, 15095, 9685, 2799, 14309, 11337,..."
4,"[yeah, there, go, the, neighborhood, brady, la...",0,yeah there go the neighborhood brady laugh as ...,"[3223, 17055, 8417, 9685, 8895, 11165, 13656, ..."


In [36]:
X = df['tokenized']
y = df['genre']

maxlen = 25

X = pad_sequences(
    X,
    maxlen=maxlen,
    padding='post'
)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

# Time to model!

In [46]:
embedding_dim = 16
vocab_size = len(word2idx)

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=maxlen))
model.add(GlobalAveragePooling1D())
#model.add(Flatten())
#model.add(LSTM(100))
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.summary()

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

history = model.fit(X, y,
    epochs=30,
    batch_size=512,
    validation_split=0.3)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 25, 16)            273472    
_________________________________________________________________
global_average_pooling1d_5 ( (None, 16)                0         
_________________________________________________________________
dense_10 (Dense)             (None, 16)                272       
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 17        
Total params: 273,761
Trainable params: 273,761
Non-trainable params: 0
_________________________________________________________________
Train on 11810 samples, validate on 5062 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30


In [47]:
history = pd.DataFrame(history.history)

loss = go.Scatter(
    x = history.index,
    y = history.loss,
    name = 'loss'
)

val_loss = go.Scatter(
    x = history.index,
    y = history.val_loss,
    name = 'val_loss'
)

acc = go.Scatter(
    x = history.index,
    y = history.acc,
    name = 'acc'
)

val_acc = go.Scatter(
    x = history.index,
    y = history.val_acc,
    name = 'val_acc'
)

fig = tools.make_subplots(rows=1, cols=2)

fig.append_trace(loss, 1, 2)
fig.append_trace(val_loss, 1, 2)

fig.append_trace(acc, 1, 1)
fig.append_trace(val_acc, 1, 1)

fig['layout'].update(title='Accuracy | Loss')

iplot(fig)

This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]

