In [1]:
import os
import re
import tensorflow as tf
import numpy as np
import pandas as pd
import spacy
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from plotly.offline import iplot, plot, init_notebook_mode
import plotly.graph_objs as go
from plotly import tools
init_notebook_mode(connected=True)

from tensorflow.keras import Sequential
from tensorflow.keras.layers import GlobalAveragePooling1D, Embedding, Dense, LSTM, Flatten
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Conv2D, MaxPooling2D, Dropout, GRU, BatchNormalization
from tensorflow.keras.layers import GlobalMaxPooling1D
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2

print("GPU Available: ", tf.test.is_gpu_available())

GPU Available:  True


In [2]:
nlp = spacy.load("en_core_web_lg")

In [3]:
comedy_dir = '../data/' + 'comedy/'
horror_dir = '../data/' + 'horror/'
thriller_dir = '../data/' + 'thriller/'

# Combining Comedy,  Horror, Thriller Scripts into 3 large scripts

In [4]:
comedy = ''
for file in os.listdir(comedy_dir):
    
    with open(comedy_dir + file) as script:
        s = script.read()
        comedy += ' ' + s
        script.close()
        
horror = ''
for file in os.listdir(horror_dir):
    
    with open(horror_dir + file) as script:
        s = script.read()
        horror += ' ' + s
        script.close() 
        
thriller = ''
for file in os.listdir(thriller_dir):
    
    with open(thriller_dir + file) as script:
        s = script.read()
        thriller += ' ' + s
        script.close()

## Looks like these scripts have character names and other key details in all Caps. 

## Removing these words to help obscure the text from the film. 

## Also removing punctuation.

## Also Lemmatizing words

In [5]:
def text_cleaner(line):
    line = re.sub(r'\b[A-Z]+\b', '', line)
    line = re.sub(r'[^\w\s]','', line)
    line = re.sub(" \d+", "", line)
    line = line.lower()
    line = ' '.join(line.split())
    
    lemma_line = []
    line = nlp(line)
    for word in line:
        lemma_line.append(word.lemma_)
    
    #lemma_line = [l for l in lemma_line]
    return ' '.join(lemma_line)

In [6]:
nlp.max_length = max([len(comedy), len(horror), len(thriller)])

comedy = text_cleaner(comedy)
horror = text_cleaner(horror)
thriller = text_cleaner(thriller)

In [7]:
# need to batch these into multiple dataframes

def line_splitter(line, input_len=25):
    split_line = line.split()
    line_list = []
    for i in range(0, len(line), input_len):
        line_list.append(split_line[i:i+input_len])
        
    return line_list

In [8]:
comedy = pd.DataFrame([line_splitter(comedy)]).T
horror = pd.DataFrame([line_splitter(horror)]).T
thriller = pd.DataFrame([line_splitter(thriller)]).T


comedy.columns = horror.columns = thriller.columns = ['line']

In [9]:
genre_dict = {
    0: 'Comedy',
    1: 'Horror',
    2: 'Thriller'
}

In [10]:
comedy['genre'] = 0
horror['genre'] = 1
thriller['genre'] = 2

In [11]:
df = pd.concat([comedy, horror, thriller])

df = df[df.astype(str)['line'] != '[]']

In [12]:

df['genre'].value_counts()

2    11519
1    10031
0     7600
Name: genre, dtype: int64

# Begin to assemble word2idx and idx2word

In [13]:
df['text'] = df['line'].apply(lambda x: ' '.join(x))

all_text = ' '.join(df['text'].values.tolist())
all_test_l = list(set(all_text.split()))

In [14]:
# all_test_l contains a unique list of words used in all of the texts
print(f'{len(all_test_l)} unique vocab words')

22840 unique vocab words


In [15]:
word2idx = {'<PAD>': 0, '<UNK>': 1}
idx2word = {0: '<PAD>', 1: '<UNK>'}

for i in range(len(all_test_l)):
    idx2word[i+2] = all_test_l[i]
    word2idx[idx2word[i]] = i

In [16]:
def word2idx_mapper(line):
    idx_list = []
    for word in line:
        try:
            idx_list.append(word2idx[word])
        except:
            idx_list.append(word2idx['<UNK>'])
            pass
    return idx_list

In [17]:
df['tokenized'] = df['line'].map(word2idx_mapper)

In [18]:
df.head()

Unnamed: 0,line,genre,text,tokenized
0,"[endless, green, hill, bisect, by, a, ribbon, ...",0,endless green hill bisect by a ribbon of highw...,"[16708, 12632, 13704, 8211, 4499, 1775, 15519,..."
1,"[through, -PRON-, midafternoon, labor, flank, ...",0,through -PRON- midafternoon labor flank the wo...,"[13505, 2292, 10529, 20560, 10907, 18330, 1816..."
2,"[shudder, stall, big, blackfoot, indian, name,...",0,shudder stall big blackfoot indian name get ou...,"[7998, 16303, 16322, 2219, 11611, 22654, 3336,..."
3,"[near, the, center, of, the, work, gang, -PRON...",0,near the center of the work gang -PRON- smile ...,"[2593, 18330, 5331, 20924, 18330, 18165, 7591,..."
4,"[of, the, prisoner, be, who, look, up, grin, a...",0,of the prisoner be who look up grin at brady y...,"[20924, 18330, 14303, 10666, 3311, 939, 4319, ..."


In [19]:
X = df['tokenized']
y = df['genre']

maxlen = 25

X = pad_sequences(
    X,
    maxlen=maxlen,
    padding='post'
)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

# Time to model!

In [113]:
embedding_dim = 128
vocab_size = len(word2idx)

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=maxlen))

model.add(LSTM(64, return_sequences=True))

model.add(Conv1D(filters=64, kernel_size=5, activation='relu', kernel_regularizer=l2(0.5)))
model.add(BatchNormalization())
model.add(MaxPooling1D(pool_size=4))


#model.add(Conv1D(32, 4, activation='relu', kernel_regularizer=l2(0.5) ))
#model.add(BatchNormalization())
#model.add(MaxPooling1D(pool_size=2))

model.add(Flatten())
model.add(Dense(96, activation='relu'))

model.add(Dropout(0.5))
model.add(Dense(3, activation='softmax'))

model.summary()


#opt = Adam(lr = 0.1)

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

history = model.fit(X_train, y_train,
                   epochs=10,
                   batch_size=512,
                   verbose=2, 
                   validation_data = (X_test, y_test),
                   shuffle=True)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (None, 25, 128)           2923520   
_________________________________________________________________
lstm_13 (LSTM)               (None, 25, 64)            49408     
_________________________________________________________________
conv1d_17 (Conv1D)           (None, 21, 64)            20544     
_________________________________________________________________
batch_normalization_6 (Batch (None, 21, 64)            256       
_________________________________________________________________
max_pooling1d_16 (MaxPooling (None, 5, 64)             0         
_________________________________________________________________
flatten_10 (Flatten)         (None, 320)               0         
_________________________________________________________________
dense_20 (Dense)             (None, 96)                30816     
__________

In [114]:
history = pd.DataFrame(history.history)

loss = go.Scatter(
    x = history.index,
    y = history.loss,
    name = 'loss'
)

val_loss = go.Scatter(
    x = history.index,
    y = history.val_loss,
    name = 'val_loss'
)

acc = go.Scatter(
    x = history.index,
    y = history.acc,
    name = 'acc',
    line = dict(color='rgb(0, 255, 255)')
)

val_acc = go.Scatter(
    x = history.index,
    y = history.val_acc,
    name = 'val_acc',
    line = dict(color='rgb(0, 255, 0)')
)

fig = tools.make_subplots(rows=1, cols=2, subplot_titles=('Accuracy', 'Loss'))

fig.append_trace(loss, 1, 2)
fig.append_trace(val_loss, 1, 2)

fig.append_trace(acc, 1, 1)
fig.append_trace(val_acc, 1, 1)

fig['layout'].update(title='Model Training')

iplot(fig)

This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]



In [115]:
pred = model.predict(X_test)

preds = []

for p in pred:
    result = np.where(p == np.amax(p))
    preds.append(result[0][0])
    
print(classification_report(preds, y_test))

              precision    recall  f1-score   support

           0       0.70      0.76      0.73      1372
           1       0.72      0.88      0.79      1632
           2       0.88      0.72      0.79      2826

    accuracy                           0.78      5830
   macro avg       0.77      0.79      0.77      5830
weighted avg       0.79      0.78      0.78      5830



In [116]:
def genre(pred):
    preds = []
    for p in pred:
        result = np.where(p == np.amax(pred))
        preds.append(result[0][0])

    return genre_dict[preds[0]]

In [117]:
def model_pipeline(line):
    line = text_cleaner(line)
    line = word2idx_mapper(line.split())
    line = np.array(line).reshape(1, -1)
    line = pad_sequences(line, maxlen=maxlen, padding='post')
    return line

In [118]:
genre_dict

{0: 'Comedy', 1: 'Horror', 2: 'Thriller'}

In [119]:
text = '''Sometimes The Clothes At Gap Kids Are Too Flashy, So I'm Forced To Go To The American Girl Store 
        And Order Clothes For Large Colonial Dolls'''

genre(
    model.predict(
        model_pipeline(text)
    )
)

'Thriller'

In [120]:
model.evaluate(X_test, y_test)



[0.6386782437614061, 0.777701543718832]

# Fun Testing!

In [127]:
comedy_lines = [
    'Sometimes The Clothes At Gap Kids Are Too Flashy, So I\'m Forced To Go To The American Girl Store And Order Clothes For Large Colonial Dolls',   # Angela from The Office
    'Just get a job? Why dont I strap on my job helmet, and squeeze down into a job cannon, AND FIRE OFF INTO JOBLAND, WHERE JOBS GROW ON JOBBIES?!!!',
    'Yeah, well, we won\'t get got though. We gonna get. See, Dee, people like us, we don\'t get got. We go get.',
    '[Holds up iPod] TOM PUT ALL MY RECORDS INTO THIS RECTANGLE. THE SONGS JUST PLAY ONE AFTER THE OTHER. THIS IS AN EXCELLENT RECTANGLE.',
    'A bookstore is one of the only pieces of evidence we have that people are still thinking.',
    'The IRS! Theyâ€™re like the Mafia, they can take anything they want!',
]

In [147]:
comedy_lines = df[df['genre'] == 0]['text'].sample(10).values.tolist()
horror_lines = df[df['genre'] == 1]['text'].sample(10).values.tolist()
thriller_lines = df[df['genre'] == 2]['text'].sample(10).values.tolist()

In [148]:
print('------------COMEDY------------')
for line in comedy_lines:
    g = genre(
        model.predict(
            model_pipeline(line)
        )
    )
    sample_text = ' '.join(line.split()[0:10]) + '... '
    print(f'GENRE: {g} | LINE: {sample_text}')
    
print('------------------------\n')
    
print('------------HORROR------------')
for line in horror_lines:
    g = genre(
        model.predict(
            model_pipeline(line)
        )
    )
    sample_text = ' '.join(line.split()[0:10]) + '... '
    print(f'GENRE: {g} | LINE: {sample_text}')
    
print('------------------------\n')

print('------------THRILLER------------')
for line in thriller_lines:
    g = genre(
        model.predict(
            model_pipeline(line)
        )
    )
    sample_text = ' '.join(line.split()[0:10]) + '... '
    print(f'GENRE: {g} | LINE: {sample_text}')

------------COMEDY------------
GENRE: Comedy | LINE: be on the sofa make out when dana enter -PRON-... 
GENRE: Comedy | LINE: keep -PRON- there do whatever -PRON- have to but do... 
GENRE: Comedy | LINE: when -PRON- see -PRON- -PRON- will know when -PRON- see... 
GENRE: Comedy | LINE: from a point in the center of the screen a... 
GENRE: Comedy | LINE: the window and look out at the big jumbotron with... 
GENRE: Comedy | LINE: false one take -PRON- nose off well well -PRON- do... 
GENRE: Comedy | LINE: the building m tired so m go to sleep -PRON-... 
GENRE: Comedy | LINE: ganz be be chase cate fire at ganz ganz duck... 
GENRE: Comedy | LINE: dick who now who s be stupid the guy who... 
GENRE: Comedy | LINE: of reporter all shout question at once as babble be... 
------------------------

------------HORROR------------
GENRE: Horror | LINE: see -PRON- the classical music and freddy can be hear... 
GENRE: Thriller | LINE: customer but if -PRON- say this be an matter will... 
GENRE: Horro

In [149]:
model.save('../models/model.h5')