# Imports

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import spacy

from tqdm._tqdm_notebook import tqdm_notebook
from tqdm import tqdm
from nltk import word_tokenize
from ast import literal_eval

In [2]:
import functions as fun

Using TensorFlow backend.


In [3]:
pd.set_option('display.max_columns', 999)
pd.set_option('display.max_rows', 999)

In [4]:
tqdm_notebook.pandas()

# Scrubbing Data

Reading back in data that was created in 'obtaining' file.

In [5]:
df_good = pd.read_csv('df_good.csv')
df_bad = pd.read_csv('df_bad.csv')
rotten_df = pd.read_csv('rotten_df.csv')

In [6]:
# df_good = pd.read_csv('df_good_obtain.csv')
# df_bad = pd.read_csv('df_bad_obtain.csv')
# rotten_df = pd.read_csv('rotten_df_obtain.csv')

In [7]:
try:
    rotten_df.drop('Unnamed: 0', axis=1, inplace=True)
except:
    pass

In [8]:
rotten_df.columns = ['titles', 'titles_formatted', 'rotten_scores', 
                     'scripts', 'all_together_now', 'no_stop', 'just_words']

In [9]:
rotten_df.head()

Unnamed: 0,titles,titles_formatted,rotten_scores,scripts,all_together_now,no_stop,just_words
0,the_gallows_act_ii,gallows-act-ii-the,0,"['\n\n 1', ' - Do it.',...","['\n\n 1', ' - Do it.',...","['\n\n 1', ' - it.', ' - right.', ' oh, gosh....",nn 1 right oh gosh oh gosh bro charlie mexica...
1,portals,portals,0,"['\n\n 1', ' [boulderli...","['\n\n 1', ' [boulderli...","['\n\n 1', ' [boulderlight pictures theme]', ...",nn 1 boulderlight pictures theme bloody disgu...
2,mob_town,mob-town,0,"['\n\n 1', ' Police rai...","['\n\n 1', ' Police rai...","['\n\n 1', ' police raid organized crime meet...",nn 1 police raid organized crime meeting home...
3,solis,solis,0,"['\n\n 1', ' [EERIE]', ...","['\n\n 1', ' [EERIE]', ...","['\n\n 1', ' [eerie]', ' woman: come in, 2024...",nn 1 eerie woman come 2024 hathor 18 confirmi...
4,welcome_to_curiosity,welcome-to-curiosity,0,"['\n\n Help!', ' Someon...","['\n\n Help!', ' Someon...","['\n\n help!', ' help!', ' help!', ' paging d...",nn help help help paging dr jones paging dr j...


## metacritic

In [10]:
def stop_it(text, punct=False):
    """Removes stop words and punctuation.
    
    Arguments:
    
        text: string
            Text to have stop words and punctuation removed from..
            
        punct: bool
            If set to true, will also remove punctuation.
            
    Returns:
        String of words with stop words removed, and punctuation removed if
        selected."""
    
    text=text
    
    # Remove punctuation if indicated in arguments to do so.
    if punct == True:
        punctuations = """!()-[]{};:'"\,<>./?@#$%^&*_~"""
        for x in text: 
            if x in punctuations: 
                text = text.replace(x, '')
    
    # Split the string into a list of words to compare against the
    # spacy nlp.Defaults.stop_words list.
    split_it = text.split()
    stopped = [word.lower() for word in split_it \
               if word.lower() not in nlp.Defaults.stop_words]
    
    # Converting list back into a continuous string.
    last_word = ''
    for word in stopped:
        last_word += (' ' + word)
        
    return last_word

### Setting up DataFrame

In [11]:
# Re-setting columns and index after re-importing.
df_good.columns = ['titles', 'scripts', 'good_or_bad']
# df_good.set_index('titles', inplace=True)
df_bad.columns = ['titles', 'scripts', 'good_or_bad']
# df_bad.set_index('titles', inplace=True)

In [12]:
# Adding labels, combining good and bad, and dropping missing scripts.
df_good['good_or_bad'] = 1
df_bad['good_or_bad'] = 0

screenplays = pd.concat([df_good, df_bad])
screenplays.columns = ['titles', 'scripts', 'good_or_bad']

screenplays.dropna(inplace=True)

In [13]:
print('Good: ', len(screenplays[screenplays['good_or_bad'] == 1]))
print('Bad: ', len(screenplays[screenplays['good_or_bad'] == 0]))

Good:  1270
Bad:  1514


In [14]:
# converting imported screenplays back to lists from strings
screenplays.scripts = screenplays.scripts.apply(literal_eval)

In [15]:
# Series with a list of lines for each screenplay.
good_to_count = screenplays[screenplays['good_or_bad'] == 1]
bad_to_count = screenplays[screenplays['good_or_bad'] == 0]

In [16]:
# Single string of all good words.
splice_scripts = ''
for script in good_to_count['scripts']:
    splice_scripts += ''.join(script)

all_good_words = ''.join(splice_scripts)

In [17]:
# Single string of all bad words.
splice_scripts = ''
for script in bad_to_count['scripts']:
    splice_scripts += ''.join(script)

all_bad_words = ''.join(splice_scripts)

In [18]:
# Lists of all words lumped together and tokenized
good_data = word_tokenize(all_good_words)
bad_data = word_tokenize(all_bad_words)

### Script Metrics

In [19]:
print('good words total: ', len(good_data))
print('bad words total: ', len(bad_data))

good words total:  14020073
bad words total:  15477699


In [20]:
print('good vocabulary: ', len(set(good_data)))
print('bad vocabulary: ', len(set(bad_data)))

good vocabulary:  172220
bad vocabulary:  183840


In [21]:
print('good % vocab to total: ', round(len(set(good_data)) / len(good_data),4))
print('good % vocab to total: ', round(len(set(bad_data)) / len(bad_data),4))

good % vocab to total:  0.0123
good % vocab to total:  0.0119


In [22]:
# Total words divided by total number of sripts.
print('Average Good # Words: ', len(good_data) / len(good_to_count))
print('Average Bad # Words: ', len(bad_data) / len(bad_to_count))

Average Good # Words:  11039.427559055119
Average Bad # Words:  10223.050858652576


In [23]:
print('Ave difference by words, good vs bad: ', ((len(good_data) / len(good_to_count)) - (len(bad_data) / len(bad_to_count)))\
    / (len(bad_data) / len(bad_to_count)))

Ave difference by words, good vs bad:  0.07985646473739089


Counting punctuation and comparing good to bad.

In [24]:
good_colons = good_data.count(':') / len(good_data)
bad_colons = bad_data.count(':') / len(bad_data)
good_semis = good_data.count(';') / len(good_data)
bad_semis = bad_data.count(';') / len(bad_data)
good_commas = good_data.count(',') / len(good_data)
bad_commas = bad_data.count(',') / len(bad_data)
good_elipses = good_data.count('...') / len(good_data)
bad_elipses = bad_data.count('...') / len(bad_data)
good_exclam = good_data.count('!') / len(good_data)
bad_exclam = bad_data.count('!') / len(bad_data)

In [25]:
print('Good : ratio: ', good_colons)
print('Bad : ratio: ', bad_colons)
print('Good ; ratio: ', np.format_float_positional(good_semis))
print('Bad ; ratio: ', np.format_float_positional(bad_semis))
print('Good , ratio: ', good_commas)
print('Bad , ratio: ', bad_commas)
print('Good ... ratio: ', good_elipses)
print('Bad ... ratio: ', bad_elipses)
print('Good ! ratio: ', good_exclam)
print('Bad ! ratio: ', bad_exclam)

Good : ratio:  0.0016251698546790733
Bad : ratio:  0.0014276669936532556
Good ; ratio:  0.000047645971600861136
Bad ; ratio:  0.00011416425658620187
Good , ratio:  0.04718106674622878
Bad , ratio:  0.04844324728113655
Good ... ratio:  0.011329113621590985
Bad ... ratio:  0.010440957664314314
Good ! ratio:  0.010267706879985575
Bad ! ratio:  0.0142683999734069


In [26]:
print('Good-Bad % for : ', (good_colons - bad_colons) / bad_colons)
print('Good-Bad % for ; ', (good_semis - bad_semis) / bad_semis)
print('Good-Bad % for , ', (good_commas - bad_commas) / bad_commas)
print('Good-Bad % for ... ', (good_elipses - bad_elipses) / bad_elipses)
print('Good-Bad % for ! ', (good_exclam - bad_exclam) / bad_exclam)

Good-Bad % for :  0.1383395861246521
Good-Bad % for ;  -0.5826542122237257
Good-Bad % for ,  -0.026054829222797712
Good-Bad % for ...  0.08506460669908276
Good-Bad % for !  -0.2803883477388999


Bad scripts on average use 28% more exclamation marks and 58% more semicolons.

In [27]:
splice_scripts = ''
for script in screenplays['scripts']:
    splice_scripts += ''.join(script)

all_words = ''.join(splice_scripts)

In [28]:
temp = []
for script in screenplays['scripts']:
    temp.append(''.join(script))

In [29]:
# This has each script as one long string inside of its cell, 
# as opposed with a list of lines.
screenplays['all_together_now'] = temp

In [30]:
data = word_tokenize(all_words)

Looking for # of unique tokens so I know roughly how many to play with when modeling.

In [31]:
len(set(data))

278462

In [32]:
nlp = spacy.load('en_core_web_sm')

The below function will be used in an apply function remove the stop words for purposes further down.

In [33]:
screenplays['no_stop'] = screenplays['all_together_now']\
    .progress_apply(stop_it, punct=False)

HBox(children=(IntProgress(value=0, max=2784), HTML(value='')))




In [34]:
screenplays['just_words'] = screenplays['all_together_now']\
    .progress_apply(stop_it, punct=True)

HBox(children=(IntProgress(value=0, max=2784), HTML(value='')))




## Rottentomatoes

### Setting up DataFrame

In [35]:
try:
    rotten_df.drop(rotten_df['Unnamed 0'], inplace=True, axis=1)
except:
    pass

In [36]:
rotten_df.dropna(inplace=True)

In [38]:
temp = []
for script in rotten_df.scripts:
    temp.append(''.join(script))

In [39]:
# This has each script as one long string inside of its cell, 
# as opposed with a list of lines.
rotten_df['all_together_now'] = temp

In [40]:
nlp = spacy.load('en_core_web_sm')

The below function will be used in an apply function remove the stop words for purposes further down.

In [41]:
rotten_df['no_stop'] = rotten_df['all_together_now']\
    .progress_apply(stop_it, punct=False)

rotten_df['just_words'] = rotten_df['all_together_now']\
    .progress_apply(stop_it, punct=True)

HBox(children=(IntProgress(value=0, max=1536), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1536), HTML(value='')))




In [42]:
rotten_df.rename(columns={'.':'PER'}, inplace=True)

In [43]:
rotten_df[40:50]

Unnamed: 0,titles,titles_formatted,rotten_scores,scripts,all_together_now,no_stop,just_words
40,twisted,twisted,1,['\n\n I can hear your ...,['\n\n I can hear your ...,"['\n\n hear heart beating.', ' sounds like li...",nn hear heart beating sounds like little anim...
41,kings_ransom,kings-ransom,1,"['\n\n Wake up, Chicago...","['\n\n Wake up, Chicago...","['\n\n wake up, chicago.', ' morning man...',...",nn wake chicago morning man getting way 810 w...
42,getting_even_with_dad,getting-even-with-dad,2,"[""\n\n (MONEY THAT'S WH...","[""\n\n (MONEY THAT'S WH...","[""\n\n (money that's wan playing)"", ' best th...",nn money thats wan playing best things life f...
43,passion_play,passion-play,2,"['\n\n Hey, Billy.', "" ...","['\n\n Hey, Billy.', "" ...","['\n\n hey, billy.', "" it's thursday."", ' com...",nn hey billy thursday come tomorrow actually ...
44,half_past_dead,half-past-dead,2,"['\n\n So, Nick...', ' ...","['\n\n So, Nick...', ' ...","['\n\n so, nick...', ' ...this it?', ' promis...",nn nick promised lets plane catch know sonny ...
45,feardotcom,feardotcom,2,"['\n\n No!', ' Damn.', ...","['\n\n No!', ' Damn.', ...","['\n\n no!', ' damn.', ' thanks present, benn...",nn damn thanks present bennie guess business ...
46,the_darkness,darkness-the,2,"['\n\n Now, I want you ...","['\n\n Now, I want you ...","['\n\n now, want try tell us', ' happened.', ...",nn want try tell happened remember cant cant ...
47,jack_and_jill,jack-and-jill,2,['\n\n And we were born...,['\n\n And we were born...,"['\n\n born', ' september 15...', "" and... sh...",nn born september 15 shes older older twin im...
48,bless_the_child,bless-the-child,2,"[""\n\n - [ Bell Ringing...","[""\n\n - [ Bell Ringing...","[""\n\n - [ bell ringing ]|- ~ 'tis season jol...",nn bell ringing | tis season jolly fa la la l...
49,battlefield_earth,battlefield-earth,2,['\n\n The entire tribe...,['\n\n The entire tribe...,"['\n\n entire tribe not|be endangered. . .', ...",nn entire tribe not|be endangered defiance of...


In [44]:
# rotten_df.dropna(inplace=True)
# screenplays.dropna(inplace=True)

rotten_df = rotten_df.drop_duplicates(subset=['titles']).copy()
screenplays = screenplays.drop_duplicates(['titles']).copy()

screenplays.to_csv('screenplays_scrub.csv')
rotten_df.to_csv('rotten_df_scrub.csv')