In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras

import re
from spacy.lang.en.stop_words import STOP_WORDS
import spacy



In [2]:
from spacy.lang.en.stop_words import STOP_WORDS
import spacy

In [3]:
print(spacy.__version__)

3.7.5


In [4]:
spacy.cli.download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [5]:
bechdel_df_long = pd.read_csv("parsedScripts.csv")
bechdel_df_long.head()

Unnamed: 0,imdb_movie_id,script,bechdel_rating
0,2024544,T 12 YEARS A SLAVE\nD Written by\nC John Ridle...,1
1,116922,S Lost Highway\nN A 21st Century Noir Horror F...,0
2,119822,T AS GOOD AS IT GETSAS GOOD AS IT GETS\nO by\n...,1
3,251736,"S ""HOUSE OF 1000 CORPSES""\nD Written by\nC R.W...",1
4,266543,T --------------------------------------------...,0


In [6]:
bechdel_df = bechdel_df_long[['script', 'bechdel_rating']]
bechdel_df['script'] = bechdel_df['script'].astype(str)
bechdel_df.head()

Unnamed: 0,script,bechdel_rating
0,T 12 YEARS A SLAVE\nD Written by\nC John Ridle...,1
1,S Lost Highway\nN A 21st Century Noir Horror F...,0
2,T AS GOOD AS IT GETSAS GOOD AS IT GETS\nO by\n...,1
3,"S ""HOUSE OF 1000 CORPSES""\nD Written by\nC R.W...",1
4,T --------------------------------------------...,0


In [7]:
bechdel_df = bechdel_df.reset_index(drop=True)
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = spacy.lang.en.stop_words.STOP_WORDS
STOPWORDS |= {'ve', 'int', 'ext', 'just', 'don', 'know', 'like', 'day', 'continued', 'got', 'away', 'looks', 'cont',
                    'door', 'll', 'night', 'room', 'right', 'look', 'www', 'pdf', 'little', 'house', 'eyes', 'face',
                    'screentalk', 'later', 'revs', '03', '10', '12', '21', 'draft', 'screening', 'script', 'revised', 'screenplay'}

def clean_script(text):
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
    text = BAD_SYMBOLS_RE.sub('', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing.
    text = text.replace('x', '')
#    text = re.sub(r'\W+', '', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwors from text
    return text
bechdel_df['script'] = bechdel_df['script'].apply(clean_script)
bechdel_df['script'] = bechdel_df['script'].str.replace('\d+', '')

In [8]:
bechdel_df['script'][42]

'n men origins wolverinen written byn david benioff skip woodss et rural road dayn twolane road carves endless forest toweringn black spruce flock pintails skims treetops n heading south snowfalln young boy undersized feral trudges home bookbagn strapped blackhaired paleskinned boyn hasnt good meal long timen hear boy turns watching bend inn road moment hear thrum hardcharging enginen oldsmobile super 88 rounds bend accelerating itn hits straightaway boy steps road butn car comes straight himn boy closes oldsmobiles brakes clamp downn wheels car shivers halt inches ton spare laughter spills cars open windowsn high school lettermen pile car wearing theirn leathersleeved football jackets gilman largest shakes head laughsc gilmand kid didnt thatd wheres survival instincts d boyn boy says readjusts bookbag resumesn long walk northwardc gilman contd d heyn gilman grabs boys shoulder spins aroundc gilman contd d hell going hear med talking youn football players mason walks andn inspects boys

In [9]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(bechdel_df['script'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 218860 unique tokens.


In [10]:
total_vocab = len(word_index) + 1

In [11]:
total_vocab

218861

In [12]:
X = tokenizer.texts_to_sequences(bechdel_df['script'].values)
X = tf.keras.utils.pad_sequences(X)
print('Shape of data tensor:', X.shape)

Shape of data tensor: (414, 30090)


In [13]:
Y = bechdel_df['bechdel_rating'].values
print('Shape of label tensor:', Y.shape)

Shape of label tensor: (414,)


In [14]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.25, random_state = 42)
print(f'X_train is: {X_train.shape} and Y_train is: {Y_train.shape}')
print(f'X_test is: {X_test.shape} and Y_test is: {Y_test.shape}')

X_train is: (310, 30090) and Y_train is: (310,)
X_test is: (104, 30090) and Y_test is: (104,)


In [15]:
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(total_vocab, 100, input_length=X.shape[1]))
model.add(tf.keras.layers.SpatialDropout1D(0.2))
model.add(tf.keras.layers.LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['binary_accuracy'])


In [16]:
model.summary()

In [17]:
epochs = 5
batch_size = 64

history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, validation_split=0.1)

Epoch 1/5
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m615s[0m 122s/step - binary_accuracy: 0.4901 - loss: 0.6933 - val_binary_accuracy: 0.4516 - val_loss: 0.6948
Epoch 2/5
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m611s[0m 123s/step - binary_accuracy: 0.5500 - loss: 0.6858 - val_binary_accuracy: 0.4516 - val_loss: 0.6973
Epoch 3/5
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m615s[0m 121s/step - binary_accuracy: 0.5347 - loss: 0.6747 - val_binary_accuracy: 0.4516 - val_loss: 0.7044
Epoch 4/5
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m597s[0m 120s/step - binary_accuracy: 0.5784 - loss: 0.6311 - val_binary_accuracy: 0.4516 - val_loss: 0.7064
Epoch 5/5
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m622s[0m 120s/step - binary_accuracy: 0.8162 - loss: 0.5735 - val_binary_accuracy: 0.4194 - val_loss: 0.6939


In [18]:
# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(X_test, Y_test, batch_size=64)

# Print the evaluation results
print(f'Test Loss: {test_loss}')
print(f'Test Accuracy: {test_accuracy}')

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 12s/step - binary_accuracy: 0.5489 - loss: 0.6925
Test Loss: 0.6924281716346741
Test Accuracy: 0.557692289352417


In [19]:
from sklearn.metrics import f1_score
Y_pred = model.predict(X_test)
Y_pred = (Y_pred > 0.5)
f1_score(Y_test, Y_pred, average='micro')

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 12s/step


0.5576923076923077