# Imports

In [28]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import spacy

from tqdm._tqdm_notebook import tqdm_notebook
from tqdm import tqdm
from nltk import word_tokenize
from ast import literal_eval
from importlib import reload

nlp = spacy.load('en_core_web_sm')

# Importing my own functions file.
import functions as fun

In [2]:
# Lift pandas' display limits so every column and every row is shown
# whenever a DataFrame is rendered in this notebook.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [3]:
# Register tqdm's pandas integration so that Series/DataFrame objects gain
# the `progress_apply` method that later cells in this notebook call.
tqdm_notebook.pandas()

# Scrubbing Data

Reading back in data that was created in 'obtaining' file.

In [4]:
# I had intended to use files directly from the 'obtain' notebook to be cleaner,
# but I can't get back into the screenplay site, so I'll have to improvise.
# df_good = pd.read_csv('df_good_obtain.csv')
# df_bad = pd.read_csv('df_bad_obtain.csv')
# rotten_df = pd.read_csv('rotten_df_obtain.csv')

In [5]:
# Rotten Tomatoes frame: the first CSV column is used as the index, and the
# remaining seven columns are given explicit names.
rotten_df = pd.read_csv('../project_resources/rotten_df.csv', index_col=0)
rotten_df.columns = [
    'titles',
    'titles_formatted',
    'rotten_scores',
    'scripts',
    'all_together_now',
    'no_stop',
    'just_words',
]

# "Good" and "bad" screenplay sets. index_col=False tells pandas not to use
# the first CSV column as the index.
df_bad = pd.read_csv('../project_resources/df_bad.csv', index_col=False)
df_good = pd.read_csv('../project_resources/df_good.csv', index_col=False)

## metacritic

### Setting up DataFrame

In [8]:
# Restore the column headers on both re-imported frames and stamp each with
# its class label: 1 for good screenplays, 0 for bad ones.
for frame, label in ((df_good, 1), (df_bad, 0)):
    frame.columns = ['titles', 'scripts', 'good_or_bad']
    frame['good_or_bad'] = label

# Stack the two classes into a single frame, re-apply the headers, and
# discard (in place) every row that has a missing value.
screenplays = pd.concat([df_good, df_bad])
screenplays.columns = ['titles', 'scripts', 'good_or_bad']
screenplays.dropna(inplace=True)

In [10]:
# Number of screenplays in each class (1 = good, 0 = bad). Summing the
# boolean mask counts the matching rows without building a filtered frame.
print('# of good screenplays: ', (screenplays['good_or_bad'] == 1).sum())
print('# of bad screenplays: ', (screenplays['good_or_bad'] == 0).sum())

# of good screeenplays:  1270
# of bad screenplays:  1514


In [11]:
# Convert the imported screenplays back to lists from strings. The isinstance
# guard keeps this cell safe to re-run: literal_eval raises on values that
# have already been converted to lists.
screenplays['scripts'] = screenplays['scripts'].apply(
    lambda script: literal_eval(script) if isinstance(script, str) else script
)

# One sub-frame per class; each row's 'scripts' cell is a list of lines.
good_to_count = screenplays[screenplays['good_or_bad'] == 1]
bad_to_count = screenplays[screenplays['good_or_bad'] == 0]

# Single string holding every script of a class. Lines and scripts are
# concatenated with no separator, exactly as before.
# NOTE(review): if the stored lines carry no trailing whitespace, the last
# word of one line fuses with the first word of the next — confirm.
all_good_words = ''.join(''.join(script) for script in good_to_count['scripts'])
all_bad_words = ''.join(''.join(script) for script in bad_to_count['scripts'])

# Lists of all words lumped together and tokenized.
good_data = word_tokenize(all_good_words)
bad_data = word_tokenize(all_bad_words)

### Script Metrics

Creating some simple word metrics.

In [12]:
# Token totals and vocabulary (distinct-token) sizes, computed once per class
# instead of being rebuilt for every print.
good_total, bad_total = len(good_data), len(bad_data)
good_vocab, bad_vocab = len(set(good_data)), len(set(bad_data))

# Average tokens per script: total words divided by total number of scripts.
good_avg = good_total / len(good_to_count)
bad_avg = bad_total / len(bad_to_count)

divider = '----' * 20

print('good words total: ', good_total)
print('bad words total: ', bad_total)
print(divider)

print('good vocabulary: ', good_vocab)
print('bad vocabulary: ', bad_vocab)
print(divider)

# The second line previously carried the "good" label while reporting the
# bad set's ratio.
print('good % vocab to total: ', round(good_vocab / good_total, 4))
print('bad % vocab to total: ', round(bad_vocab / bad_total, 4))
print(divider)

print('Average Good # Words: ', good_avg)
print('Average Bad # Words: ', bad_avg)
print(divider)

# Relative difference of the good average versus the bad average.
print('Ave difference by words, good vs bad: ',
      round((good_avg - bad_avg) / bad_avg, 2))
print(divider)

good words total:  14020073
bad words total:  15477699
--------------------------------------------------------------------------------
good vocabulary:  172220
bad vocabulary:  183840
--------------------------------------------------------------------------------
good % vocab to total:  0.0123
good % vocab to total:  0.0119
--------------------------------------------------------------------------------
Average Good # Words:  11039.427559055119
Average Bad # Words:  10223.050858652576
--------------------------------------------------------------------------------
Ave difference by words, good vs bad:  0.08
--------------------------------------------------------------------------------


Counting punctuation and comparing good to bad.

In [13]:
# For each punctuation token, report its share of all tokens in the good and
# bad sets, then the relative difference (good - bad) / bad.
for mark in (':', ';', ',', '...', '!'):
    good_share = good_data.count(mark) / len(good_data)
    bad_share = bad_data.count(mark) / len(bad_data)
    print(f"Good '{mark}' ratio: ",
          np.format_float_positional(round(good_share, 6)))
    print(f"Bad '{mark}' ratio: ",
          np.format_float_positional(round(bad_share, 6)))
    print(f"Good-Bad % for '{mark}'",
          round((good_share - bad_share) / bad_share, 2))
    print('-----' * 10)

Good ':' ratio:  0.001625
Bad ':' ratio:  0.001428
Good-Bad % for ':' 0.14
--------------------------------------------------
Good ';' ratio:  0.000048
Bad ';' ratio:  0.000114
Good-Bad % for ';' -0.58
--------------------------------------------------
Good ',' ratio:  0.047181
Bad ',' ratio:  0.048443
Good-Bad % for ',' -0.03
--------------------------------------------------
Good '...' ratio:  0.011329
Bad '...' ratio:  0.010441
Good-Bad % for '...' 0.09
--------------------------------------------------
Good '!' ratio:  0.010268
Bad '!' ratio:  0.014268
Good-Bad % for '!' -0.28
--------------------------------------------------


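Relative to bad scripts, good scripts use about 28% fewer exclamation marks and about 58% fewer semicolons per token (equivalently, bad scripts use roughly 39% more exclamation marks and about 2.4 times as many semicolons as good scripts).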

Now creating additional columns for scripts in different formats. Right now, each script is a list whose elements are its lines. Joining those lines into one long string per script.

In [14]:
# Each script as one long string inside its cell (its lines concatenated with
# no separator), as opposed to a list of lines. Built once and reused below,
# rather than joining every script twice.
screenplays['all_together_now'] = [
    ''.join(script) for script in screenplays['scripts']
]

# The whole corpus as a single string, then tokenized. A single join avoids
# repeated `+=` concatenation and the redundant character-by-character
# re-join of an already complete string.
all_words = ''.join(screenplays['all_together_now'])
data = word_tokenize(all_words)

Looking for # of unique tokens so I know roughly how many to play with when modeling.

In [15]:
# Number of distinct tokens across all screenplays (good and bad combined).
len(set(data))

278462

The function below is used with `progress_apply` to remove the stop words for use further down. One call keeps punctuation and the other removes it.

In [34]:
# Two processed versions of every full-script string, both produced by
# fun.stop_it (defined in functions.py), once with punct=False and once with
# punct=True. progress_apply shows a progress bar while it runs.
screenplays['no_stop'] = (
    screenplays['all_together_now']
    .progress_apply(fun.stop_it, punct=False)
)

screenplays['just_words'] = (
    screenplays['all_together_now']
    .progress_apply(fun.stop_it, punct=True)
)

HBox(children=(IntProgress(value=0, max=2784), HTML(value='')))

HBox(children=(IntProgress(value=0, max=2784), HTML(value='')))

## Rottentomatoes

### Setting up DataFrame

In [35]:
# Drop every row that has a missing value in any column; rotten_df is
# modified in place.
rotten_df.dropna(inplace=True)

In [36]:
# One joined string per Rotten Tomatoes script, in row order.
# NOTE(review): unlike screenplays.scripts, these values are not passed
# through literal_eval anywhere in this notebook. If they are still plain
# strings, ''.join returns them unchanged — confirm this is intended.
temp = [''.join(script) for script in rotten_df.scripts]

In [37]:
# This has each script as one long string inside of its cell,
# as opposed to a list of lines. The assignment overwrites the
# 'all_together_now' column that already exists on rotten_df.
rotten_df['all_together_now'] = temp

Using this function again, this time on rotten_df, to remove stop words for one column of scripts and to remove stop words plus punctuation for another.

In [39]:
# Same two fun.stop_it passes as for the screenplays frame, applied to the
# Rotten Tomatoes scripts: one with punct=False, one with punct=True.
rotten_df['no_stop'] = (
    rotten_df['all_together_now']
    .progress_apply(fun.stop_it, punct=False)
)

rotten_df['just_words'] = (
    rotten_df['all_together_now']
    .progress_apply(fun.stop_it, punct=True)
)

HBox(children=(IntProgress(value=0, max=1536), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1536), HTML(value='')))

In [40]:
# Need to rename the period since it causes problems later on.
# NOTE(review): the column names assigned to rotten_df earlier in this
# notebook include no '.' column, and rename() skips labels it cannot find
# by default, so this looks like a no-op here — confirm it is still needed.
rotten_df.rename(columns={'.':'PER'}, inplace=True)

In [41]:
# Keep only the first row for each title in both frames; .copy() detaches
# each result from the frame it was derived from.
screenplays = screenplays.drop_duplicates(subset=['titles']).copy()
rotten_df = rotten_df.drop_duplicates(subset=['titles']).copy()

# Write the scrubbed frames (index included) to the working directory.
for frame, destination in (
    (screenplays, 'screenplays_scrub.csv'),
    (rotten_df, 'rotten_df_scrub.csv'),
):
    frame.to_csv(destination)