# Import Libraries

In [129]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import string
import requests
import re
from bs4 import BeautifulSoup
import csv
from pdfminer import high_level

# Function Engineering

In [1]:
def pull_script_titles(url):
    '''pull titles from url using Beautiful Soup and clean titles before appending them to script_titles list
    
    url: url from IMSDB
        example: https://www.imsdb.com/alphabetical/A'''
    temp_titles1 = []
    temp_titles2 = []
    html_page = requests.get(url)
    soup = BeautifulSoup(html_page.content,'html.parser')
    titles = list(soup.findAll('p'))
    for x in titles:
        temp_titles1.append(str(x).split('title="')[1])
    for x in temp_titles1:
        temp_titles2.append(x.split('">')[0])
    for x in temp_titles2:
        script_titles.append(x.replace(' Script',''))

In [2]:
def pull_drew_scripts(url):
    '''pull titles from Drews Script-o-Rama
    url: url that points to Drews Script-o-Rama index page'''
    html_page = requests.get(url)
    soup = BeautifulSoup(html_page.content, 'html.parser')
    titles = list(soup.findAll('p', align = 'LEFT'))[::2]
    for x in titles:
        temp_titles.append(str(x).split('<a')[1])

In [3]:
def script_pull_imsdb(url):
    '''pull scripts from IMSDB.com'''
    html_page = requests.get(url)
    soup = BeautifulSoup(html_page.content, 'html.parser')
    script = str(soup.findAll('pre'))
    imsdb_scripts.append(script)

In [4]:
def clean_script_html(script):
    '''removes html tags from scripts scraped from the web'''
    cleaned_script = script.replace('[','')
    cleaned_script = cleaned_script.replace(']','')
    cleaned_script = cleaned_script.replace('<pre>','')
    cleaned_script = cleaned_script.replace('</pre>','')
    cleaned_script = cleaned_script.replace('<b>','')
    cleaned_script = cleaned_script.replace('</b>','')
    cleaned_script = cleaned_script.replace('</pre>','')
    cleaned_script = cleaned_script.replace('\n','')
    cleaned_script = cleaned_script.replace('\r','')
    cleaned_script = cleaned_script.replace('<html>','')
    cleaned_script = cleaned_script.replace('</html>','')
    cleaned_script = cleaned_script.replace('<head','')
    cleaned_script = cleaned_script.replace('</head>','')
    cleaned_script = cleaned_script.replace('<title>','')
    cleaned_script = cleaned_script.replace('</title>','')
    return cleaned_script

# Re-Loaded Data Frame from Web Scraping Notebook

In [2]:
all_films = pd.read_csv('all_films.csv', index_col= 'Unnamed: 0')
all_films

Unnamed: 0,title,year,production_budget,domestic_gross,worldwide_gross,profit,ROI,success
0,Avengers: Endgame,2019,400000000,858373000,2797800564,2397800564,5.99,1
1,Pirates of the Caribbean: On Stranger Tides,2011,379000000,241063875,1045663875,666663875,1.76,1
2,Avengers: Age of Ultron,2015,365000000,459005868,1396099202,1031099202,2.82,1
3,Star Wars Ep. VII: The Force Awakens,2015,306000000,936662225,2068223624,1762223624,5.76,1
4,Avengers: Infinity War,2018,300000000,678815482,2048359754,1748359754,5.83,1
...,...,...,...,...,...,...,...,...
5867,"20,000 Leagues Under the Sea",2013,200000,8000000,8000000,7800000,39.00,1
5868,Swingers,2000,200000,4505922,6618578,6418578,32.09,1
5899,She's Gotta Have It,2010,175000,7137502,7137502,6962502,39.79,1
5908,Sweet Sweetback's Baad Asssss Song,2015,150000,15200000,15200000,15050000,100.33,1


# Scrape Available Titles from IMSDB.com

Before I can begin webscraping scripts, I need to find on which websites they reside. Starting with IMSDB which is the most popular database for scripts on the internet. IMSDB pages are arranged with a similar url with a capitalized letter attached to the end. Created a list of capital letters and the number 0 to iterate through to scrape a list of all scripts available on IMSDB

In [4]:
alphabet_string = string.ascii_uppercase

In [5]:
alphabet_list = list(alphabet_string)

In [6]:
alphabet_list.append('0')

## Pull Titles from IMSDB

In [8]:
script_titles = []
for i in alphabet_list:
    pull_script_titles('https://www.imsdb.com/alphabetical/{}'.format(i))

In [9]:
len(script_titles)

1210

Script titles pulled had ",The" at the end. Used for loop to move "The" to the beginning of the string and remove comma

In [10]:
for x in script_titles:
    if x[-3:] == 'The':
        script_titles.append(str(x[-3:] + ' ' + x[:-3]).replace(', ',''))

Moved through list to look for matches in data frame

In [11]:
matches = []
for x in list(all_films['title']):
    for y in script_titles:
        if x == y:
            matches.append(x)

In [12]:
len(matches)

578

Appended matches to data frame by creating new column

In [13]:
all_films['script_available_IMSDB'] = ['yes' if x in matches else 'no' for x in list(all_films['title'])]

In [14]:
all_films['script_available_IMSDB'].value_counts()

no     2796
yes     578
Name: script_available_IMSDB, dtype: int64

## Pull Titles from The Daily Script

Pulled titles from Daily Script A-M using Beautiful Soup

In [15]:
html_page = requests.get('http://www.dailyscript.com/movie.html')
soup = BeautifulSoup(html_page.content,'html.parser')
titles = list(soup.findAll('a'))
daily_titles = list(titles)[8::2]

In [16]:
str(daily_titles[0]).split('">')[1][:-4]

'10 Things I Hate About You'

In [17]:
daily_titles

[<a href="scripts/10Things.html">10 Things I Hate About You</a>,
 <a href="scripts/12+and+Holding.pdf">12 And Holding</a>,
 <a href="scripts/twelve_monkeys.html">12 Monkeys</a>,
 <a href="scripts/13_days.html">13 Days</a>,
 <a href="scripts/thirteen_ghosts.pdf">13 Ghosts</a>,
 <a href="scripts/15minutes.html">15 Minutes</a>,
 <a href="scripts/16_Blocks_by_Richard_Wenk.pdf">16 Blocks</a>,
 <a href="scripts/25thhour_all.pdf">25th Hour</a>,
 <a href="scripts/three-kings_shooting.html">3 Kings</a>,
 <a href="scripts/threekings_shootingdraft.pdf">3 Kings</a>,
 <a href="scripts/three-kings_unproduced.html">3 Kings (Spoils of War)</a>,
 <a href="scripts/40_year_old_virgin.pdf">40 Year Old Virgin</a>,
 <a href="scripts/8MILE.pdf">8 Mile (aka Untitled Detroit Project)</a>,
 <a href="scripts/eight-millimeter.html">8 Millimeter</a>,
 <a href="scripts/84%20Charlie%20MoPic.txt">84 Charlie MoPic</a>,
 <a href="scripts/Above_the_Law.pdf">Above the Law</a>,
 <a href="scripts/Absolute_Power.PDF">Absolu

Iterate through Daily Titles A-M to remove HTML tags

In [18]:
new_daily_titles = []
for x in daily_titles:
    new_daily_titles.append(str(x).split('">')[1][:-4])

In [19]:
len(new_daily_titles)

662

Pulled N-Z Daily Script Titles and iterated through to remove HTML tags

In [20]:
html_page = requests.get('http://www.dailyscript.com/movie_n-z.html')
soup = BeautifulSoup(html_page.content,'html.parser')
titles = list(soup.findAll('a'))
daily_titles = list(titles)[8::2]
for x in daily_titles:
    new_daily_titles.append(str(x).split('">')[1][:-4])

In [21]:
len(new_daily_titles)

1014

In [22]:
new_daily_titles

['10 Things I Hate About You',
 '12 And Holding',
 '12 Monkeys',
 '13 Days',
 '13 Ghosts',
 '15 Minutes',
 '16 Blocks',
 '25th Hour',
 '3 Kings',
 '3 Kings',
 '3 Kings (Spoils of War)',
 '40 Year Old Virgin',
 '8 Mile (aka Untitled Detroit Project)',
 '8 Millimeter',
 '84 Charlie MoPic',
 'Above the Law',
 'Absolute Power',
 'The Abyss',
 'Ace Ventura: Pet Detective',
 'The Addams Family',
 'The Adventures of Ford Fairlane (Ford Fairlane )',
 'An Affair to Remember',
 'Affliction',
 'The African Queen',
 'Airforce One',
 'Airplane II: The Sequel',
 'Airplane!',
 'Alfie',
 'Ali',
 'Alien',
 'Alien',
 'Alien Nation',
 'Alien vs. Predator',
 'Aliens',
 'Aliens',
 'All About Eve',
 'All About Eve',
 "All The King's Men",
 "All The President's Men",
 "All The President's Men",
 'Almost Famous',
 'Almost Famous',
 'Amadeus',
 'American Beauty',
 'American Beauty',
 'American Graffiti',
 'American Madness',
 'American Outlaws',
 'American Pie',
 'American Psycho',
 'American Psycho',
 'Americ

Look for matches and appended matches to data frame

In [23]:
matches = []
for x in list(all_films['title']):
    for y in new_daily_titles:
        if x == y:
            matches.append(x)

In [24]:
len(matches)

158

In [25]:
all_films['script_available_daily_script'] = ['yes' if x in matches else 'no' for x in list(all_films['title'])]

In [26]:
all_films['script_available_daily_script'].value_counts()

no     3248
yes     126
Name: script_available_daily_script, dtype: int64

In [27]:
all_films

Unnamed: 0,title,year,production_budget,domestic_gross,worldwide_gross,profit,ROI,success,script_available_IMSDB,script_available_daily_script
0,Avengers: Endgame,2019,400000000,858373000,2797800564,2397800564,5.99,1,no,no
1,Pirates of the Caribbean: On Stranger Tides,2011,379000000,241063875,1045663875,666663875,1.76,1,no,no
2,Avengers: Age of Ultron,2015,365000000,459005868,1396099202,1031099202,2.82,1,no,no
3,Star Wars Ep. VII: The Force Awakens,2015,306000000,936662225,2068223624,1762223624,5.76,1,no,no
4,Avengers: Infinity War,2018,300000000,678815482,2048359754,1748359754,5.83,1,no,no
...,...,...,...,...,...,...,...,...,...,...
5867,"20,000 Leagues Under the Sea",2013,200000,8000000,8000000,7800000,39.00,1,no,no
5868,Swingers,2000,200000,4505922,6618578,6418578,32.09,1,yes,yes
5899,She's Gotta Have It,2010,175000,7137502,7137502,6962502,39.79,1,no,no
5908,Sweet Sweetback's Baad Asssss Song,2015,150000,15200000,15200000,15050000,100.33,1,no,no


## Pull Titles from MovieScriptsAndScreenplays.com

Used Beautiful Soup to Pull Titles from MovieScriptsAndScreenplays.com

In [28]:
html_page = requests.get('http://www.moviescriptsandscreenplays.com/index.html#top')
soup = BeautifulSoup(html_page.content,'html.parser')
titles = list(soup.findAll('a'))
msas_titles = list(titles)[13::6]
msas_titles
#print(soup)

[<a href="http://www.dailyscript.com/scripts/10Things.html" target="_blank">10 Things I Hate About You</a>,
 <a href="http://www.dailyscript.com/scripts/12+and+Holding.pdf" target="_blank">12 And Holding</a>,
 <a href="http://www.dailyscript.com/scripts/13_days.html" target="_blank">13 Days</a>,
 <a href="http://www.dailyscript.com/scripts/16_Blocks_by_Richard_Wenk.pdf" target="_blank">16 Blocks</a>,
 <a href="http://www.angelfire.com/movies/ridleyscott/script/1492-ConquestOfParadise.txt" target="_blank">1492: Conquest of Paradise:</a>,
 <a href="http://www.dailyscript.com/scripts/25thhour_all.pdf" target="_blank">25th Hour</a>,
 <a href="http://www.scifiscripts.com/scripts/2001.txt" target="_blank">2001: A Space Odyssey</a>,
 <a href="http://ez-files.net/download.php?file=fbd7411b" target="_blank">3001 (filmed as Idiocracy)</a>,
 <a href="http://www.dailyscript.com/scripts/three-kings_shooting.html" target="_blank">3 Kings</a>,
 <a href="http://www.dailyscript.com/scripts/40_year_old_

Used for loop to remove HTML tags from titles

In [29]:
new_msas_titles = []
for x in msas_titles:
    new_msas_titles.append(str(x).split('">')[1][:-4])

In [30]:
new_msas_titles

['10 Things I Hate About You',
 '12 And Holding',
 '13 Days',
 '16 Blocks',
 '1492: Conquest of Paradise:',
 '25th Hour',
 '2001: A Space Odyssey',
 '3001 (filmed as Idiocracy)',
 '3 Kings',
 '40 Year Old Virgin',
 'Five Easy Pieces',
 'The 5th Element',
 '531',
 '7 Days to Live',
 'Eight Legged Freaks (Originally Titled Arac Attack)',
 '8 Millimeter',
 '84 Charlie MoPic',
 'Above the Law',
 'The Abyss',
 'Adaptation',
 'Addicted to Murder 5: The Last Vampire',
 'The Adventure',
 'The Adventures of Fartman',
 'An Affair to Remember',
 'The African Queen',
 'After School Special',
 'Airforce One',
 'Alien',
 'Alien',
 'Aliens',
 'Alien 3',
 'Alien 3',
 'Alien 3',
 'Alien 3',
 'Alien 3',
 'Alien 4 Resurrection',
 'Alien 5',
 'Alien Nation',
 'Alien vs. Predator',
 'All About Eve',
 "All The King's Men",
 "All The President's Men",
 'Almost Famous',
 'Amadeus',
 'American Beauty',
 'American Beauty',
 'American Graffiti',
 'American History X',
 'American Madness',
 'American Pie',
 'Amer

Looked for matches and append to data frame

In [31]:
matches = []
for x in list(all_films['title']):
    for y in new_msas_titles:
        if x == y:
            matches.append(x)

In [32]:
len(matches)

75

In [33]:
html_page = requests.get('http://www.moviescriptsandscreenplays.com/movie-scripts.html')
soup = BeautifulSoup(html_page.content,'html.parser')
titles = list(soup.findAll('a'))
msas_titles = list(titles)[13::6]
msas_titles

[<a href="http://www.angelfire.com/movies/ridleyscott/script/GIJane.txt" target="_blank">G. I. Jane</a>,
 <a href="http://www.scifiscripts.com/scripts/GalaxyQuest.txt" target="_blank">Galaxy Quest</a>,
 <a href="http://www.dailyscript.com/scripts/the-game_shooting.html" target="_blank">The Game</a>,
 <a href="http://www.dailyscript.com/scripts/Game_6.pdf" target="_blank">Game 6</a>,
 <a href="http://www.weeklyscript.com/Gandhi.txt" target="_blank">Gandhi</a>,
 <a href="http://sfy.ru/sfy.html?script=gangs_of_new_york_ds" target="_blank">Gangs of New York</a>,
 <a href="http://n.1asphost.com/cinemaobsession/read/General.doc" target="_blank">General</a>,
 <a href="http://www.fortunecity.com/tattooine/clarke/38/scripts/TheGermanLieutenant.txt" target="_blank">The German Lieutenant</a>,
 <a href="http://www.dailyscript.com/scripts/Get_Carter.pdf" target="_blank">Get Carter</a>,
 <a href="http://www.scifiscripts.com/msol/get_shorty.txt" target="_blank">Get Shorty</a>,
 <a href="http://www.da

In [34]:
for x in msas_titles:
    new_msas_titles.append(str(x).split('">')[1][:-4])

In [35]:
html_page = requests.get('http://www.moviescriptsandscreenplays.com/movie-scripts2.html')
soup = BeautifulSoup(html_page.content,'html.parser')
titles = list(soup.findAll('a'))
msas_titles = list(titles)[13::6]
msas_titles

[<a href="http://www.dailyscript.com/scripts/The_Pacifier.pdf" target="_blank">The Pacifier</a>,
 <a href="http://www.dailyscript.com/scripts/Panic_Room_Koepp.html" target="_blank">Panic Room</a>,
 <a href="http://www.bttf.com/ImageFolio3_files/gallery/Scripts/19881219.pdf" target="_blank">Paradox</a>,
 <a href="http://www.scifiscripts.com/msol/passenger57.html" target="_blank">Passenger 57</a>,
 <a href="http://www.screentalk.biz/scripts/pathsofglory.pdf" target="_blank">Paths of Glory</a>,
 <a href="http://www.dailyscript.com/scripts/Patriot_1999.html" target="_blank">The Patriot</a>,
 <a href="http://www.weeklyscript.com/Peeping%20Tom.txt" target="_blank">Peeping Tom</a>,
 <a href="http://www.dailyscript.com/scripts/The+Perfect+Neighbor+the+password+is+perfection.pdf" target="_blank">The Perfect Neighbor (was The Perfect Stranger)</a>,
 <a href="http://www.dailyscript.com/scripts/A_Perfect_World.pdf" target="_blank">A Perfect World</a>,
 <a href="http://www.dailyscript.com/scripts/p

In [36]:
for x in msas_titles:
    new_msas_titles.append(str(x).split('">')[1][:-4])

In [37]:
matches = []
for x in list(all_films['title']):
    for y in new_msas_titles:
        if x == y:
            matches.append(x)
len(matches)

175

In [38]:
all_films['script_available_msas'] = ['yes' if x in matches else 'no' for x in list(all_films['title'])]
all_films['script_available_msas'].value_counts()

no     3211
yes     163
Name: script_available_msas, dtype: int64

## Pull Titles from Drew's Script-o-Rama

In [42]:
html_page = requests.get('http://www.script-o-rama.com/table.shtml')
soup = BeautifulSoup(html_page.content,'html.parser')
#print(soup)
titles = list(soup.findAll('p', align = 'LEFT'))
titles#[::2]
#msas_titles = list(titles)[13::6]
#msas_titles

[<p align="LEFT"><a href="http://www.dailyscript.com/scripts/10Things.html">10
 			Things I Hate About You</a></p>, <p align="LEFT">Revised Draft 
 			</p>, <p align="LEFT"><a href="http://www.12.org/script.html">12 (2003)</a></p>, <p align="LEFT">Unspecified Draft</p>, <p align="LEFT"><a href="http://www.dailyscript.com/scripts/12+and+Holding.pdf">12
 			And Holding</a></p>, <p align="LEFT">4/6/04 Draft</p>, <p align="LEFT"><a href="http://www.raindance.org/wp-content/uploads/downloads/2013/03/12monkeys-Production-Draft.pdf">12
 			Monkeys</a></p>, <p align="LEFT">Production Draft 
 			</p>, <p align="LEFT"><a href="http://thescriptsavant.com/pdf/12YearsASlave.pdf">12
 			Years A Slave</a></p>, <p align="LEFT">FYC Draft</p>, <p align="LEFT"><a href="http://www.pages.drexel.edu/~ina22/splaylib/Screenplay-127_Hours.pdf">127
 			Hours</a></p>, <p align="LEFT">Unspecified Draft</p>, <p align="LEFT"><a href="http://www.moviemalls.com/papers/13days.txt">13
 			Days</a></p>, <p align="LEFT">

In [43]:
drew_scripts = []
temp_titles = []

In [44]:
pull_drew_scripts('http://www.script-o-rama.com/table.shtml')

In [45]:
temp_titles

[' href="http://www.dailyscript.com/scripts/10Things.html">10\n\t\t\tThings I Hate About You</a></p>',
 ' href="http://www.12.org/script.html">12 (2003)</a></p>',
 ' href="http://www.dailyscript.com/scripts/12+and+Holding.pdf">12\n\t\t\tAnd Holding</a></p>',
 ' href="http://www.raindance.org/wp-content/uploads/downloads/2013/03/12monkeys-Production-Draft.pdf">12\n\t\t\tMonkeys</a></p>',
 ' href="http://thescriptsavant.com/pdf/12YearsASlave.pdf">12\n\t\t\tYears A Slave</a></p>',
 ' href="http://www.pages.drexel.edu/~ina22/splaylib/Screenplay-127_Hours.pdf">127\n\t\t\tHours</a></p>',
 ' href="http://www.moviemalls.com/papers/13days.txt">13\n\t\t\tDays</a></p>',
 ' href="http://www.dailyscript.com/scripts/thirteen_ghosts.pdf">13\n\t\t\tGhosts</a></p>',
 ' href="http://screenplayexplorer.com/wp-content/scripts/1408.pdf">1408</a></p>',
 ' href="http://www.angelfire.com/movies/ridleyscott/script/1492-ConquestOfParadise.txt">1492:\n\t\t\tConquest Of Paradise</a></p>',
 ' href="http://www.dail

## Pull Titles from Kaggle Dataset (Dialogue Only)

Found Kaggle project similar to mine where a user put together Dialogue Only scripts for the below films. Repo is available here: https://www.kaggle.com/torloweidadata/movies-and-movie-scripts

In [46]:
kaggle_df = pd.read_csv('/Users/will4856/Downloads/moviedataset.csv', index_col='Unnamed: 0')
kaggle_df.head()

Unnamed: 0,actors,characters,movie title,genres,release year,IMDB ID,Scripts
0,Meryl Streep,Clarissa Vaughan,The Hours,Drama,2002,tt0274558,"Dearest, I feel certain that I am going mad ag..."
1,Meryl Streep,Blue Mecha (voice),A.I. Artificial Intelligence,"Adventure, Drama, Science Fiction",2001,tt0212720,Al That was when the icecaps had melted becaus...
2,Meryl Streep,Francesca Johnson,The Bridges of Madison County,"Drama, Romance",1995,tt0112579,"- MICHAEL: Hi, sis. - CAROLYN: Hey. - CAROLYN:..."
3,Meryl Streep,Miranda Priestly,The Devil Wears Prada,"Comedy, Drama, Romance",2006,tt0458352,"Good luck. Hi. Uh, I have an appointment with ..."
4,Meryl Streep,Clara del Valle Trueba,The House of the Spirits,"Drama, Romance",1994,tt0107151,THE HOUSE OF THE SPIRITS. It’s good to be back...


In [47]:
print(kaggle_df['Scripts'][1])

Al That was when the icecaps had melted because of the greenhouse gases... ...and the oceans had drowned cities... ...along all the shorelines of the world. Amsterdam, Venice, New York... ...forever lost. Millions of people were displaced. Climate became chaotic. Hundreds of millions of people starved in poorer countries. A high degree of prosperity survived when the developed world... ...introduced sanctions to license pregnancies... ...which was why robots, who did not consume resources... ...beyond those of their first manufacture... ...were so essential an economic link... ...in the chain mail of society. To create an artificial being has been man's dream... ...since the birth of science. Not merely from the modern age when our forebears created... ...the first thinking machines, primitive ones that played chess. How far we have come. The artificial being is a reality, a perfect simulacrum... ...articulated in limb, articulate in speech... ...and not lacking in human response. And 

Looked for matches in data frame and appended matches to dataframe

In [48]:
matches = []
for x in list(all_films['title']):
    for y in list(kaggle_df['movie title']):
        if x == y:
            matches.append(x)
len(matches)

1633

In [49]:
all_films['script_available_kaggle_df'] = ['yes' if x in matches else 'no' for x in list(all_films['title'])]
all_films['script_available_kaggle_df'].value_counts()

no     2289
yes    1085
Name: script_available_kaggle_df, dtype: int64

Only 42% of titles have been found so far

In [50]:
len(all_films.loc[(all_films['script_available_IMSDB'] == 'yes') | (all_films['script_available_daily_script'] == 'yes')
                 | (all_films['script_available_msas'] == 'yes') | (all_films['script_available_kaggle_df'] == 'yes')])/len(all_films)

0.4226437462951986

# Adding Scripts to Data Frame

## Kaggle Dataset Scripts

Pulled in Kaggle dataset as a Pandas dataframe

In [55]:
all_films_kaggle = all_script_films.loc[all_script_films['script_available_kaggle_df'] == 'yes']

In [56]:
kaggle_df = kaggle_df[['movie title','Scripts']]

In [57]:
kaggle_df.head()

Unnamed: 0,movie title,Scripts
0,The Hours,"Dearest, I feel certain that I am going mad ag..."
1,A.I. Artificial Intelligence,Al That was when the icecaps had melted becaus...
2,The Bridges of Madison County,"- MICHAEL: Hi, sis. - CAROLYN: Hey. - CAROLYN:..."
3,The Devil Wears Prada,"Good luck. Hi. Uh, I have an appointment with ..."
4,The House of the Spirits,THE HOUSE OF THE SPIRITS. It’s good to be back...


Merged Kaggle dataframe and all_films dataframe to building_df. Will continue to use building_df as I add more scripts

In [58]:
building_df = all_films_kaggle.merge(kaggle_df, left_on='title', right_on='movie title').drop_duplicates('title')
building_df = building_df.drop(columns = 'movie title')
building_df

Unnamed: 0,title,year,production_budget,domestic_gross,worldwide_gross,profit,ROI,success,script_available_IMSDB,script_available_daily_script,script_available_msas,script_available_kaggle_df,script_available_aa,Scripts
0,Pirates of the Caribbean: On Stranger Tides,2011,379000000,241063875,1045663875,666663875,1.76,1,no,no,no,yes,no,Sub.Re-Edit.by. :: npdv.indoheroes[at]gmail.co...
1,Avengers: Age of Ultron,2015,365000000,459005868,1396099202,1031099202,2.82,1,no,no,no,yes,no,(DISTANT EXPLOSION) STRUCKER ON PA: Report to ...
5,Justice League,2017,300000000,229024295,655945209,355945209,1.19,0,no,no,no,yes,no,"There he is! Oh, sorry. Superman, Superman, ca..."
7,Spectre,2015,300000000,200074175,879620923,579620923,1.93,1,no,no,no,yes,no,"Where are you going? I won't be long. Welcome,..."
8,Batman v Superman: Dawn of Justice,2016,263000000,330360194,872395091,609395091,2.32,1,no,no,no,yes,no,There was a time above. A time before. There w...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1627,Winter's Bone,2012,2000000,6531503,16131551,14131551,7.07,1,no,no,no,yes,no,"Come on, let's go. Ashlee darling. Hm? Wake up..."
1628,The Company You Keep,2006,2000000,5132442,21087760,19087760,9.54,1,no,no,no,yes,no,A Federal Grand Jury in Detroit today charged ...
1630,Harsh Times,2012,2000000,3337931,6225304,4225304,2.11,1,no,no,no,yes,no,Cease fire! Cease fire! Heads up! Man down! Ce...
1631,Knock Knock,2011,2000000,36336,6328516,4328516,2.16,1,no,no,no,yes,no,"I'm sorry. - Oh, it's okay. - I'm sorry, baby...."


## IMSDB Script Scrape

Pulled out all titles from dataframe where the script is available on IMSDB

In [59]:
imsdb_df = all_films.loc[(all_films['script_available_IMSDB'] == 'yes') & (all_films['script_available_kaggle_df'] == 'no')]

In [60]:
imsdb_df = imsdb_df.drop_duplicates('title')

In [61]:
imsdb_df

Unnamed: 0,title,year,production_budget,domestic_gross,worldwide_gross,profit,ROI,success,script_available_IMSDB,script_available_daily_script,script_available_msas,script_available_kaggle_df,script_available_aa
12,The Lion King,2019,260000000,543638043,1656943394,1396943394,5.37,1,yes,no,no,no,no
16,Harry Potter and the Half-Blood Prince,2009,250000000,302089278,935213767,685213767,2.74,1,yes,no,no,no,no
21,Avatar,2009,237000000,760507625,2788701337,2551701337,10.77,1,yes,no,no,no,yes
41,King Kong,2005,207000000,218080025,550517357,343517357,1.66,1,yes,no,yes,no,yes
43,Black Panther,2018,200000000,700059566,1346103376,1146103376,5.73,1,yes,no,no,no,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5479,Frozen River,2013,1000000,2511476,6030129,5030129,5.03,1,yes,no,no,no,yes
5578,Fruitvale Station,2015,900000,16098998,17549645,16649645,18.50,1,yes,no,no,no,yes
5615,American Graffiti,2013,777000,115000000,140000000,139223000,179.18,1,yes,yes,yes,no,yes
5756,Napoleon Dynamite,2008,400000,44540956,46122713,45722713,114.31,1,yes,no,no,no,no


Pull titles to search for on IMSDB using the IMSDB dataset. Rearranged titles to have ",The" at the end and add "-" in between all the words of the title

In [62]:
search_list_the = []
search_list = []
for x in list(imsdb_df['title']):
    if x[:3] == 'The':
        search_list_the.append((x[4:] + ', ' + x[:3]).replace(' ','-'))
    else:
        search_list.append(x.replace(' ','-'))

Pulled test script from IMSDB to make sure everything was working correctly

In [63]:
html_page = requests.get('https://www.imsdb.com/scripts/Last-Samurai,-The.html')
soup = BeautifulSoup(html_page.content,'html.parser')
titles = list(soup.findAll('pre'))
titles

[<pre>for educational use only                                        for educational use only
 
                                     www.script-fix.com
 
 
 
 
 <b>                                 THE LAST SAMURAI
 </b>
                                            by
 
                                       John Logan
 
 
 
                                      Revisions by
                            Edward Zwick &amp; Marshall Herskovitz
 <b>
 </b><b>                                                                                  1.
 </b>Fade In:
 
 <b>A BRIGHT BLUE TIGER
 </b>
 Surrounded by a pack of dogs, ten of them snarling and gnashing their teeth.
 The TIGER'S, eyes burn with fury as he wheels in a circle, lunging at one
 dog clawing at another, keeping them all at bay.
 
 Suddenly, the TIGER leaps over the dogs and transforms into a WHITE
 BIRD, soaring majestically into the sky.
 
 
 <b>THE FACE OF A JAPANESE MAN
 </b>
 
 Sits up into frame, sweating, waking from a dream. H

In [64]:
imsdb_scripts = []

Iterated through titles to pull scripts from IMSDB using script_pull_imsdb function

In [65]:
for x in search_list:
    script_pull_imsdb('https://www.imsdb.com/scripts/{}.html'.format(x))

In [66]:
len(imsdb_scripts)

In [67]:
len(search_list)

In [68]:
full_search = dict(zip(search_list, imsdb_scripts))

In [69]:
full_search = pd.DataFrame(full_search, index = range(1)).T

In [70]:
full_search = full_search.rename(columns = {0:'script'})

In [71]:
full_search = full_search.reset_index()

In [72]:
full_search = full_search.rename(columns = {'index':'title'})

Collected missing titles into missing_imsdb dataframe

In [73]:
missing_imsdb = full_search.loc[full_search['script'] == '[]']

In [74]:
missing_imsdb.to_csv('/Users/will4856/Downloads/missing_imsdb.csv')
missing_imsdb = pd.read_csv('/Users/will4856/Downloads/missing_imsdb.csv', index_col = 'Unnamed: 0')
missing_imsdb

Unnamed: 0,title,script
0,Harry-Potter-and-the-Half-Blood-Prince,[]
15,Harry-Potter-and-the-Goblet-of-Fire,[]
20,Harry-Potter-and-the-Prisoner-of-Azkaban,[]
25,Harry-Potter-and-the-Chamber-of-Secrets,[]
106,Full-Metal-Jacket,[]
127,Goodfellas,[]
171,Lethal-Weapon,[]
225,Donnie-Darko,[]
237,Casablanca,[]
244,Napoleon-Dynamite,[]


In [75]:
#full_search.to_csv('/Users/will4856/Downloads/full_search.csv')
full_search = pd.read_csv('/Users/will4856/Downloads/full_search.csv', index_col= 'Unnamed: 0')
full_search

Unnamed: 0,title1,script
1,Avatar,[<pre> \n<b> \n</b><b> ...
2,King Kong,[<pre>\n\n<b> ...
3,Black Panther,[<pre>\r\n\r\n\r\n \r\n<b> ...
4,Guardians of the Galaxy Vol 2,[<pre>\r\n\r\n\r\n\r\n<b> ...
5,Aladdin,[<pre>ALADDIN: THE COMPLETE SCRIPT\n<b>COMPIL...
...,...,...
240,Blue Valentine,[<pre>\r\n\r\n\r\n<b> ...
241,Frozen River,[<pre>\n\n\n<b> FR...
242,Fruitvale Station,[<pre>\r\n\r\n\r\n<b> ...
243,American Graffiti,"[<pre> ""AMER..."


In [76]:
full_search = full_search.loc[full_search['script'] != '[]']
full_search = full_search.rename(columns = {'title1':'title'})

In [80]:
imsdb1 = all_films.merge(full_search, left_on='title', right_on='title').drop_duplicates('title')
#imsdb1 = imsdb1.drop(columns = 'title')
imsdb1

Unnamed: 0,title,year,production_budget,domestic_gross,worldwide_gross,profit,ROI,success,script_available_IMSDB,script_available_daily_script,script_available_msas,script_available_kaggle_df,script_available_aa,script
0,Avatar,2009,237000000,760507625,2788701337,2551701337,10.77,1,yes,no,no,no,yes,[<pre> \n<b> \n</b><b> ...
1,Aladdin,2019,182000000,355559216,1050959216,868959216,4.77,1,yes,no,no,no,yes,[<pre>ALADDIN: THE COMPLETE SCRIPT\n<b>COMPIL...
3,Up,2009,175000000,293004164,731463377,556463377,3.18,1,yes,no,no,no,yes,[<pre> \n<b> \n</b>\n<b> ...
4,Coco,2017,175000000,209726015,799976015,624976015,3.57,1,yes,no,no,no,no,[<pre>\r\n\r\n\r\n\r\n<b> ...
5,Godzilla,2014,160000000,200676069,529076069,369076069,2.31,1,yes,no,yes,no,yes,[<pre><html>\n\n<head>\n <title>GODZILLA fir...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,Kids,2000,1500000,7412216,20412216,18912216,12.61,1,yes,no,yes,no,yes,[<pre> \n\n<b> \n</b> ...
80,Saw,2018,1200000,55968727,103880027,102680027,85.57,1,yes,no,no,no,yes,[<pre>\n\n<b> ...
81,Rocky,2011,1000000,117235147,225000000,224000000,224.00,1,yes,yes,yes,no,yes,[<pre><html>\n<head>\n<script>\n<b><!--\n</b>\...
82,Hellraiser,2014,1000000,14564000,14575148,13575148,13.58,1,yes,no,yes,no,yes,[<pre><html>\n<head>\n<script>\n<b><!--\n</b>i...


In [81]:
building_df = building_df.rename(columns = {'Scripts': 'script'})
building_df

Unnamed: 0,title,year,production_budget,domestic_gross,worldwide_gross,profit,ROI,success,script_available_IMSDB,script_available_daily_script,script_available_msas,script_available_kaggle_df,script_available_aa,script
0,Pirates of the Caribbean: On Stranger Tides,2011,379000000,241063875,1045663875,666663875,1.76,1,no,no,no,yes,no,Sub.Re-Edit.by. :: npdv.indoheroes[at]gmail.co...
1,Avengers: Age of Ultron,2015,365000000,459005868,1396099202,1031099202,2.82,1,no,no,no,yes,no,(DISTANT EXPLOSION) STRUCKER ON PA: Report to ...
5,Justice League,2017,300000000,229024295,655945209,355945209,1.19,0,no,no,no,yes,no,"There he is! Oh, sorry. Superman, Superman, ca..."
7,Spectre,2015,300000000,200074175,879620923,579620923,1.93,1,no,no,no,yes,no,"Where are you going? I won't be long. Welcome,..."
8,Batman v Superman: Dawn of Justice,2016,263000000,330360194,872395091,609395091,2.32,1,no,no,no,yes,no,There was a time above. A time before. There w...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1627,Winter's Bone,2012,2000000,6531503,16131551,14131551,7.07,1,no,no,no,yes,no,"Come on, let's go. Ashlee darling. Hm? Wake up..."
1628,The Company You Keep,2006,2000000,5132442,21087760,19087760,9.54,1,no,no,no,yes,no,A Federal Grand Jury in Detroit today charged ...
1630,Harsh Times,2012,2000000,3337931,6225304,4225304,2.11,1,no,no,no,yes,no,Cease fire! Cease fire! Heads up! Man down! Ce...
1631,Knock Knock,2011,2000000,36336,6328516,4328516,2.16,1,no,no,no,yes,no,"I'm sorry. - Oh, it's okay. - I'm sorry, baby...."


In [82]:
building_df = pd.concat([building_df, imsdb1])
building_df

Unnamed: 0,title,year,production_budget,domestic_gross,worldwide_gross,profit,ROI,success,script_available_IMSDB,script_available_daily_script,script_available_msas,script_available_kaggle_df,script_available_aa,script
0,Pirates of the Caribbean: On Stranger Tides,2011,379000000,241063875,1045663875,666663875,1.76,1,no,no,no,yes,no,Sub.Re-Edit.by. :: npdv.indoheroes[at]gmail.co...
1,Avengers: Age of Ultron,2015,365000000,459005868,1396099202,1031099202,2.82,1,no,no,no,yes,no,(DISTANT EXPLOSION) STRUCKER ON PA: Report to ...
5,Justice League,2017,300000000,229024295,655945209,355945209,1.19,0,no,no,no,yes,no,"There he is! Oh, sorry. Superman, Superman, ca..."
7,Spectre,2015,300000000,200074175,879620923,579620923,1.93,1,no,no,no,yes,no,"Where are you going? I won't be long. Welcome,..."
8,Batman v Superman: Dawn of Justice,2016,263000000,330360194,872395091,609395091,2.32,1,no,no,no,yes,no,There was a time above. A time before. There w...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,Kids,2000,1500000,7412216,20412216,18912216,12.61,1,yes,no,yes,no,yes,[<pre> \n\n<b> \n</b> ...
80,Saw,2018,1200000,55968727,103880027,102680027,85.57,1,yes,no,no,no,yes,[<pre>\n\n<b> ...
81,Rocky,2011,1000000,117235147,225000000,224000000,224.00,1,yes,yes,yes,no,yes,[<pre><html>\n<head>\n<script>\n<b><!--\n</b>\...
82,Hellraiser,2014,1000000,14564000,14575148,13575148,13.58,1,yes,no,yes,no,yes,[<pre><html>\n<head>\n<script>\n<b><!--\n</b>i...


In [83]:
#building_df.to_csv('building_csv')

In [84]:
building_df = pd.read_csv('/Users/will4856/Downloads/building_csv', index_col='Unnamed: 0')

In [85]:
dirty_script_df = building_df.loc[(building_df['script_available_kaggle_df'] == 'no') 
                & (building_df['script_available_IMSDB'] == 'yes')]
dirty_script_df.head()

Unnamed: 0,title,year,production_budget,domestic_gross,worldwide_gross,profit,ROI,success,script_available_IMSDB,script_available_daily_script,script_available_msas,script_available_kaggle_df,script_available_aa,script
0,Avatar,2009,237000000,760507625,2788701337,2551701337,10.77,1,yes,no,no,no,yes,[<pre> \n<b> \n</b><b> ...
1,King Kong,2005,207000000,218080025,550517357,343517357,1.66,1,yes,no,yes,no,yes,[<pre>\n\n<b> ...
4,Black Panther,2018,200000000,700059566,1346103376,1146103376,5.73,1,yes,no,no,no,no,[<pre>\r\n\r\n\r\n \r\n<b> ...
5,Guardians of the Galaxy Vol 2,2017,200000000,389813101,869113101,669113101,3.35,1,yes,no,no,no,no,[<pre>\r\n\r\n\r\n\r\n<b> ...
6,Aladdin,2019,182000000,355559216,1050959216,868959216,4.77,1,yes,no,no,no,yes,[<pre>ALADDIN: THE COMPLETE SCRIPT\n<b>COMPIL...


In [86]:
test_script = dirty_script_df['script'][0]
print(test_script)

[<pre>           
<b>          
</b><b>          
</b><b>                                        AVATAR
</b><b>          
</b><b>          
</b><b>          
</b><b>          
</b>                                      Written by
<b>          
</b>                                    James Cameron
<b>          
</b><b>          
</b><b>          
</b><b>          
</b><b>          
</b>          THE SOUND OF DRUMS, from a great distance, growing louder.
<b>          
</b><b>          FADE IN:
</b><b>          
</b>          WE ARE FLYING through mist, a dimly glimpsed forest below.
<b>          
</b><b>                               VOICE (V.O.)
</b>                     When I was lying there in the VA
                     hospital, with a big hole blown through
                     the middle of my life, I started having
                     these dreams of flying.
<b>          
</b>          We are very low over the forest now, gliding fast, the drums
          BUILDING to a PEAK --
<b

In [88]:
dirty_script_df['script'] = dirty_script_df['script'].map(clean_script_html)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [89]:
dirty_script_df

Unnamed: 0,title,year,production_budget,domestic_gross,worldwide_gross,profit,ROI,success,script_available_IMSDB,script_available_daily_script,script_available_msas,script_available_kaggle_df,script_available_aa,script
0,Avatar,2009,237000000,760507625,2788701337,2551701337,10.77,1,yes,no,no,no,yes,...
1,King Kong,2005,207000000,218080025,550517357,343517357,1.66,1,yes,no,yes,no,yes,KING...
4,Black Panther,2018,200000000,700059566,1346103376,1146103376,5.73,1,yes,no,no,no,no,BLACK PANTHER ...
5,Guardians of the Galaxy Vol 2,2017,200000000,389813101,869113101,669113101,3.35,1,yes,no,no,no,no,GUARDIANS OF THE GALAXY V...
6,Aladdin,2019,182000000,355559216,1050959216,868959216,4.77,1,yes,no,no,no,yes,ALADDIN: THE COMPLETE SCRIPTCOMPILED BY BEN S...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
243,Blue Valentine,2005,1000000,9737892,16566240,15566240,15.57,1,yes,no,no,no,yes,BLUE VALENTINE ...
244,Frozen River,2013,1000000,2511476,6030129,5030129,5.03,1,yes,no,no,no,yes,FROZEN RIVER ...
245,Fruitvale Station,2015,900000,16098998,17549645,16649645,18.50,1,yes,no,no,no,yes,FRUITVALE STATIO...
246,American Graffiti,2013,777000,115000000,140000000,139223000,179.18,1,yes,yes,yes,no,yes,"""AMERICAN G..."


In [90]:
building_df = pd.concat([building_df, dirty_script_df]).drop_duplicates('title', keep='last')
building_df

Unnamed: 0,title,year,production_budget,domestic_gross,worldwide_gross,profit,ROI,success,script_available_IMSDB,script_available_daily_script,script_available_msas,script_available_kaggle_df,script_available_aa,script
0,Pirates of the Caribbean: On Stranger Tides,2011,379000000,241063875,1045663875,666663875,1.76,1,no,no,no,yes,no,Sub.Re-Edit.by. :: npdv.indoheroes[at]gmail.co...
1,Avengers: Age of Ultron,2015,365000000,459005868,1396099202,1031099202,2.82,1,no,no,no,yes,no,(DISTANT EXPLOSION) STRUCKER ON PA: Report to ...
5,Justice League,2017,300000000,229024295,655945209,355945209,1.19,0,no,no,no,yes,no,"There he is! Oh, sorry. Superman, Superman, ca..."
7,Spectre,2015,300000000,200074175,879620923,579620923,1.93,1,no,no,no,yes,no,"Where are you going? I won't be long. Welcome,..."
8,Batman v Superman: Dawn of Justice,2016,263000000,330360194,872395091,609395091,2.32,1,no,no,no,yes,no,There was a time above. A time before. There w...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
243,Blue Valentine,2005,1000000,9737892,16566240,15566240,15.57,1,yes,no,no,no,yes,BLUE VALENTINE ...
244,Frozen River,2013,1000000,2511476,6030129,5030129,5.03,1,yes,no,no,no,yes,FROZEN RIVER ...
245,Fruitvale Station,2015,900000,16098998,17549645,16649645,18.50,1,yes,no,no,no,yes,FRUITVALE STATIO...
246,American Graffiti,2013,777000,115000000,140000000,139223000,179.18,1,yes,yes,yes,no,yes,"""AMERICAN G..."


In [91]:
len(search_list_the)

44

In [92]:
imsdb_scripts_the = []
def script_pull_imsdb(url):
    html_page = requests.get(url)
    soup = BeautifulSoup(html_page.content, 'html.parser')
    script = str(soup.findAll('pre'))
    imsdb_scripts_the.append(script)

In [93]:
for x in search_list_the:
    script_pull_imsdb('https://www.imsdb.com/scripts/{}.html'.format(x))

In [94]:
imsdb_scripts_the[0]

'[<pre> <html>\n<head>\n<title>Lion King, The Script at IMSDb.</title>\n<meta content="Lion King, The script" name="description"/>\n<meta content="Lion King, The script, Lion King, The movie script, Lion King, The film script" name="keywords"/>\n<p align="center">\n<iframe frameborder="0" height="90" scrolling="no" src="http://www.imsdb.com/adbanner.html" width="728"></iframe>\n<br/><br/>\nThis script was brought to you by :<br/>\n  <a href="http://www.imsdb.com">The Internet Movie Script Database</a><br/>\n  www.IMSDb.com\n</p><table cellpadding="0" cellspacing="0" width="99%">\n<tr>\n<td valign="top" width="120">\n\n<iframe border="0" frameborder="0" height="600" marginheight="0" marginwidth="0" scrolling="no" src="http://www.filemania.com/ad_sky.html" style="border:none;" width="120"></iframe>\n\n<td><td valign="top" width="120">\n\n<iframe border="0" frameborder="0" height="240" marginheight="0" marginwidth="0" scrolling="no" src="http://rcm.amazon.com/e/cm?t=everlongsoftw-20&amp;o

In [95]:
imsdb2 = pd.DataFrame(dict(zip(search_list_the,imsdb_scripts_the)), index = range(1)).T
imsdb2

Unnamed: 0,0
"Lion-King,-The","[<pre> <html>\n<head>\n<title>Lion King, The S..."
"Last-Samurai,-The",[<pre>for educational use only ...
"Patriot,-The",[<pre><html>\n<head>\n<script>\n<b><!--\n</b>\...
"Kingdom,-The",[<pre>\n\n\n\n\n<b> THE ...
"Abyss,-The",[<pre><html>\n<head>\n<script>\n<b><!--\n</b>i...
"Green-Mile,-The",[<pre>\n<b> T H E G R E E N M...
"Boxtrolls,-The",[<pre>\r\n\r\n\r\n\r\n<b> ...
"Relic,-The","[<pre><html>\n\n<head>\n <title>""The Relic"",..."
"Pacifier,-The",[<pre>\n\n\n<b> ...
"Bounty-Hunter,-The",[<pre>\n\n\n<b> THE...


In [96]:
imsdb2 = imsdb2.reset_index()
imsdb2 = imsdb2.rename(columns = {'index':'title', 0:'script'})

In [97]:
def dehyphenate(title):
    new_title = title.replace('-',' ')
    return new_title

In [98]:
imsdb2['title'] = imsdb2['title'].map(dehyphenate)

In [99]:
def no_comma(title):
    new_title = title.replace(', ', '')

In [100]:
imsdb2['title'][0][:-5]

'Lion King'

In [101]:
imsdb2['title'][0][-3:] + ' ' + imsdb2['title'][0][:-5]

'The Lion King'

In [102]:
def return_title(title):
    new_title = title[-3:] + ' ' + title[:-5]
    return new_title

In [103]:
imsdb2['title'] = imsdb2['title'].map(return_title)

In [104]:
imsdb2['script'] = imsdb2['script'].map(clean_script_html)

In [105]:
len(imsdb2)

44

In [106]:
imsdb2 = imsdb2.loc[(imsdb2['title'] != 'The Lion King') & (imsdb2['title'] != 'The Rage: Carrie 2') & 
           (imsdb2['title'] != 'The Apartment')]

In [107]:
imsdb2

Unnamed: 0,title,script
1,The Last Samurai,for educational use only ...
2,The Patriot,><script><!--/*Break-out-of-frames scriptBy We...
3,The Kingdom,THE KINGDOM ...
4,The Abyss,><script><!--if (window!= top)top.location.hre...
5,The Green Mile,T H E G R E E N M I L E ...
6,The Boxtrolls,THE BOXTROLLS ...
7,The Relic,"> ""The Relic"", early draft, by Amy Holden Jo..."
8,The Pacifier,THE PACIFIER ...
9,The Bounty Hunter,THE BOUNTY HUNTER ...
10,The Hitchhiker's Guide to the Galaxy,HITCHHIKER'S ...


In [108]:
pd.concat([building_df, imsdb2])#.drop_duplicates('title', keep='last')
#building_df

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


Unnamed: 0,ROI,domestic_gross,production_budget,profit,script,script_available_IMSDB,script_available_aa,script_available_daily_script,script_available_kaggle_df,script_available_msas,success,title,worldwide_gross,year
0,1.76,241063875.0,379000000.0,6.666639e+08,Sub.Re-Edit.by. :: npdv.indoheroes[at]gmail.co...,no,no,no,yes,no,1.0,Pirates of the Caribbean: On Stranger Tides,1.045664e+09,2011.0
1,2.82,459005868.0,365000000.0,1.031099e+09,(DISTANT EXPLOSION) STRUCKER ON PA: Report to ...,no,no,no,yes,no,1.0,Avengers: Age of Ultron,1.396099e+09,2015.0
5,1.19,229024295.0,300000000.0,3.559452e+08,"There he is! Oh, sorry. Superman, Superman, ca...",no,no,no,yes,no,0.0,Justice League,6.559452e+08,2017.0
7,1.93,200074175.0,300000000.0,5.796209e+08,"Where are you going? I won't be long. Welcome,...",no,no,no,yes,no,1.0,Spectre,8.796209e+08,2015.0
8,2.32,330360194.0,263000000.0,6.093951e+08,There was a time above. A time before. There w...,no,no,no,yes,no,1.0,Batman v Superman: Dawn of Justice,8.723951e+08,2016.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38,,,,,><script><!--if (window!= top)top.location.hre...,,,,,,,The Sweet Hereafter,,
39,,,,,...,,,,,,,The Visitor,,
41,,,,,FADE IN -- Title:For nearly forty years this s...,,,,,,,The Wizard of Oz,,
42,,,,,><script><!--if (window!= top)top.location.hre...,,,,,,,The French Connection,,


In [109]:
imsdb2 = all_films.merge(imsdb2)
building_df = pd.concat([building_df, imsdb2])

In [110]:
building_df

Unnamed: 0,title,year,production_budget,domestic_gross,worldwide_gross,profit,ROI,success,script_available_IMSDB,script_available_daily_script,script_available_msas,script_available_kaggle_df,script_available_aa,script
0,Pirates of the Caribbean: On Stranger Tides,2011,379000000,241063875,1045663875,666663875,1.76,1,no,no,no,yes,no,Sub.Re-Edit.by. :: npdv.indoheroes[at]gmail.co...
1,Avengers: Age of Ultron,2015,365000000,459005868,1396099202,1031099202,2.82,1,no,no,no,yes,no,(DISTANT EXPLOSION) STRUCKER ON PA: Report to ...
5,Justice League,2017,300000000,229024295,655945209,355945209,1.19,0,no,no,no,yes,no,"There he is! Oh, sorry. Superman, Superman, ca..."
7,Spectre,2015,300000000,200074175,879620923,579620923,1.93,1,no,no,no,yes,no,"Where are you going? I won't be long. Welcome,..."
8,Batman v Superman: Dawn of Justice,2016,263000000,330360194,872395091,609395091,2.32,1,no,no,no,yes,no,There was a time above. A time before. There w...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37,The Sweet Hereafter,2004,5000000,4306697,7951247,2951247,0.59,0,yes,yes,no,no,no,><script><!--if (window!= top)top.location.hre...
38,The Visitor,2000,4000000,9427026,19174817,15174817,3.79,1,yes,no,no,no,no,...
39,The Wizard of Oz,2007,2777000,34685891,34949452,32172452,11.59,1,yes,no,no,no,no,FADE IN -- Title:For nearly forty years this s...
40,The French Connection,2006,2200000,41158757,41158757,38958757,17.71,1,yes,no,no,no,no,><script><!--if (window!= top)top.location.hre...


In [119]:
daily_script_list = all_films.loc[(all_films['script_available_IMSDB'] == 'no') & (all_films['script_available_kaggle_df'] == 'no') 
             & (all_films['script_available_daily_script'] == 'yes')]
daily_script_list

Unnamed: 0,title,year,production_budget,domestic_gross,worldwide_gross,profit,ROI,success,script_available_IMSDB,script_available_daily_script,script_available_msas,script_available_kaggle_df,script_available_aa
311,Stuart Little 2,2002,120000000,64956806,166000000,46000000,0.38,0,no,yes,no,no,no
1024,Superman,2017,55000000,134218018,300200000,245200000,4.46,1,no,yes,no,no,no
1708,A Nightmare on Elm Street,2011,35000000,63075011,117729621,82729621,2.36,1,no,yes,no,no,no
1753,Untraceable,2012,35000000,28687835,52649951,17649951,0.5,0,no,yes,no,no,no
1890,Out of Africa,2010,31000000,79096868,258210860,227210860,7.33,1,no,yes,no,no,no
2216,Silverado,2001,26000000,33200000,33200000,7200000,0.28,0,no,yes,yes,no,no
2370,Possession,2005,25000000,10103647,14805812,-10194188,-0.41,0,no,yes,yes,no,no
2829,Thirteen Ghosts,2018,19000000,41867960,68467960,49467960,2.6,1,no,yes,no,no,no
4720,The Rules of Attraction,2014,4000000,6525762,11799060,7799060,1.95,1,no,yes,no,no,no
4804,Airplane!,2015,3500000,83453539,83453539,79953539,22.84,1,no,yes,no,no,no


In [120]:
msas_script_list = all_films.loc[(all_films['script_available_IMSDB'] == 'no') & (all_films['script_available_kaggle_df'] == 'no') 
             & (all_films['script_available_msas'] == 'yes')]
msas_script_list

Unnamed: 0,title,year,production_budget,domestic_gross,worldwide_gross,profit,ROI,success,script_available_IMSDB,script_available_daily_script,script_available_msas,script_available_kaggle_df,script_available_aa
314,The Fantastic Four,2015,120000000,56117548,167849187,47849187,0.4,0,no,no,yes,no,no
765,Doom,2016,70000000,28212337,58757178,-11242822,-0.16,0,no,no,yes,no,no
1214,The Iron Giant,2011,50000000,23159305,31333917,-18666083,-0.37,0,no,no,yes,no,no
1311,Galaxy Quest,2005,45000000,71423726,90523726,45523726,1.01,0,no,no,yes,no,no
1528,Rent,2001,40000000,29077547,31670620,-8329380,-0.21,0,no,no,yes,no,no
1531,The Island of Dr. Moreau,2017,40000000,27682712,27682712,-12317288,-0.31,0,no,no,yes,no,no
1573,Raise the Titanic,2015,40000000,7000000,7000000,-33000000,-0.82,0,no,no,yes,no,no
1826,Chill Factor,2001,34000000,11263966,11263966,-22736034,-0.67,0,no,no,yes,no,no
1972,Glory Road,2002,30000000,42647449,42799060,12799060,0.43,0,no,no,yes,no,no
2216,Silverado,2001,26000000,33200000,33200000,7200000,0.28,0,no,yes,yes,no,no


In [121]:
aa_script_list = all_films.loc[(all_films['script_available_IMSDB'] == 'no') & (all_films['script_available_kaggle_df'] == 'no') 
             & (all_films['script_available_aa'] == 'yes')]
aa_script_list

Unnamed: 0,title,year,production_budget,domestic_gross,worldwide_gross,profit,ROI,success,script_available_IMSDB,script_available_daily_script,script_available_msas,script_available_kaggle_df,script_available_aa


In [122]:
all_films = all_films.drop(columns = 'script_available_aa')

In [123]:
len(daily_script_list) + len(msas_script_list)

38

In [124]:
daily_script_list['title']

311               Stuart Little 2
1024                     Superman
1708    A Nightmare on Elm Street
1753                  Untraceable
1890                Out of Africa
2216                    Silverado
2370                   Possession
2829              Thirteen Ghosts
4720      The Rules of Attraction
4804                    Airplane!
5010                     The Omen
5248    A Nightmare on Elm Street
Name: title, dtype: object

## Pulling Text From PDF Scripts Found Online

Used pdfminer's high_level extract_text function to test scraping script text from pdf documents

In [131]:
high_level.extract_text('/Users/will4856/Downloads/thirteen_ghosts.pdf')

'THIRTEEN GHOSTS\n\nby\n\nNeal Marshall Stevens\n\nbased on the screenplay by\nRobb White\n\nprevious revisions by\nTodd Alcott\nTodd Alcott and Neal Marshall Stevens\nRichard D\'Ovidio\nRichard D\'Ovidio and James Gunn\n\ncurrent revision by\nRichard D\'Ovidio\n\nREVISED PRODUCTION DRAFT\n\nNovember 4, 2000\n\n\x0cTHIRTEEN GHOSTS - Rev. 10/27/00\n\nBLACKNESS.  SUPERIMPOSE:  "The most beautiful thing we\ncan experience is the mysterious." - Albert Einstein\n\nThen:  "I do believe in spooks.  I do believe in spooks.\nI do, I do, I do believe in spooks." - The Cowardly Lion\n\n1\n\nTITLES\n\nAbstract images begin to form.  They move beneath the\ntitles, brilliant against the darkness, the ghost-like\napparitions fighting a current as if caught in the\nwhirlwind of time.\n\nOur IMAGE SHARPENS.  The apparitions are no longer there.\nInstead, police tape comes INTO FOCUS, the seemingly\nmiles of it entangled on stacks of derelict cars that\nfill the void.  The tape dances before us, undulat

## PDFs from Daily Script PDF Files

In [301]:
new_scripts = []
def pdf_text_pull(list_of_titles):
    for x in list_of_titles:
        new_scripts.append(high_level.extract_text('/Users/will4856/Downloads/{}.pdf'.format(x)))

In [302]:
pdf_text_pull(daily_script_pdf_scripts)

In [303]:
len(new_scripts)

7

In [304]:
new_scripts[0]



In [305]:
daily_script_list_pdf = daily_script_list_pdf.loc[daily_script_list_pdf['title'] != 'The Omen']

In [326]:
#pd.concat([daily_script_list_pdf,
new_daily1 = pd.DataFrame([dict(zip(daily_script_pdf_scripts,new_scripts))]).T.reset_index().rename(columns = {'index':'title1',
                                                                                                 0:'script'})
new_daily1

Unnamed: 0,title1,script
0,The Rules of Attraction,THE RULES OF ATTRACTION\n\nscreenplay by\n\nRo...
1,Thirteen Ghosts,THIRTEEN GHOSTS\n\nby\n\nNeal Marshall Stevens...
2,Possession,POSSESSION\n\nwritten by\nDAVID HENRY HWANG\n\...
3,Out of Africa,...
4,Untraceable,Robert Fyvolent & Mark R. Brinker \n\nUntracea...
5,Superman,"L, \n\nJuly 2 6 , 2002 \n\nS U P\n\nE\n\nR ..."
6,Stuart Little 2,written by\n\nBruce Joel Rubin\n\nrevisions by...


In [337]:
daily_script_list_pdf = daily_script_list_pdf.merge(new_daily1, left_on= 'title', right_on = 'title1')
daily_script_list_pdf#.drop(columns = ['title1','pdf_html'])

Unnamed: 0,title,year,production_budget,domestic_gross,worldwide_gross,profit,ROI,success,script_available_IMSDB,script_available_daily_script,script_available_msas,script_available_kaggle_df,script_available_aa,pdf_html,title1_x,script_x,title1_y,script_y,title1,script
0,Stuart Little 2,2002,120000000,64956806,166000000,46000000,0.38,0,no,yes,no,no,no,pdf,Stuart Little 2,written by\n\nBruce Joel Rubin\n\nrevisions by...,Stuart Little 2,written by\n\nBruce Joel Rubin\n\nrevisions by...,Stuart Little 2,written by\n\nBruce Joel Rubin\n\nrevisions by...
1,Superman,2017,55000000,134218018,300200000,245200000,4.46,1,no,yes,no,no,no,pdf,Superman,"L, \n\nJuly 2 6 , 2002 \n\nS U P\n\nE\n\nR ...",Superman,"L, \n\nJuly 2 6 , 2002 \n\nS U P\n\nE\n\nR ...",Superman,"L, \n\nJuly 2 6 , 2002 \n\nS U P\n\nE\n\nR ..."
2,Untraceable,2012,35000000,28687835,52649951,17649951,0.5,0,no,yes,no,no,no,pdf,Untraceable,Robert Fyvolent & Mark R. Brinker \n\nUntracea...,Untraceable,Robert Fyvolent & Mark R. Brinker \n\nUntracea...,Untraceable,Robert Fyvolent & Mark R. Brinker \n\nUntracea...
3,Out of Africa,2010,31000000,79096868,258210860,227210860,7.33,1,no,yes,no,no,no,pdf,Out of Africa,...,Out of Africa,...,Out of Africa,...
4,Possession,2005,25000000,10103647,14805812,-10194188,-0.41,0,no,yes,yes,no,no,pdf,Possession,POSSESSION\n\nwritten by\nDAVID HENRY HWANG\n\...,Possession,POSSESSION\n\nwritten by\nDAVID HENRY HWANG\n\...,Possession,POSSESSION\n\nwritten by\nDAVID HENRY HWANG\n\...
5,Thirteen Ghosts,2018,19000000,41867960,68467960,49467960,2.6,1,no,yes,no,no,no,pdf,Thirteen Ghosts,THIRTEEN GHOSTS\n\nby\n\nNeal Marshall Stevens...,Thirteen Ghosts,THIRTEEN GHOSTS\n\nby\n\nNeal Marshall Stevens...,Thirteen Ghosts,THIRTEEN GHOSTS\n\nby\n\nNeal Marshall Stevens...
6,The Rules of Attraction,2014,4000000,6525762,11799060,7799060,1.95,1,no,yes,no,no,no,pdf,The Rules of Attraction,THE RULES OF ATTRACTION\n\nscreenplay by\n\nRo...,The Rules of Attraction,THE RULES OF ATTRACTION\n\nscreenplay by\n\nRo...,The Rules of Attraction,THE RULES OF ATTRACTION\n\nscreenplay by\n\nRo...


In [339]:
daily_script_list_pdf = daily_script_list_pdf.drop(columns = ['pdf_html','title1_x', 'script_x','title1_y','script_y','title1'])
daily_script_list_pdf

Unnamed: 0,title,year,production_budget,domestic_gross,worldwide_gross,profit,ROI,success,script_available_IMSDB,script_available_daily_script,script_available_msas,script_available_kaggle_df,script_available_aa,script
0,Stuart Little 2,2002,120000000,64956806,166000000,46000000,0.38,0,no,yes,no,no,no,written by\n\nBruce Joel Rubin\n\nrevisions by...
1,Superman,2017,55000000,134218018,300200000,245200000,4.46,1,no,yes,no,no,no,"L, \n\nJuly 2 6 , 2002 \n\nS U P\n\nE\n\nR ..."
2,Untraceable,2012,35000000,28687835,52649951,17649951,0.5,0,no,yes,no,no,no,Robert Fyvolent & Mark R. Brinker \n\nUntracea...
3,Out of Africa,2010,31000000,79096868,258210860,227210860,7.33,1,no,yes,no,no,no,...
4,Possession,2005,25000000,10103647,14805812,-10194188,-0.41,0,no,yes,yes,no,no,POSSESSION\n\nwritten by\nDAVID HENRY HWANG\n\...
5,Thirteen Ghosts,2018,19000000,41867960,68467960,49467960,2.6,1,no,yes,no,no,no,THIRTEEN GHOSTS\n\nby\n\nNeal Marshall Stevens...
6,The Rules of Attraction,2014,4000000,6525762,11799060,7799060,1.95,1,no,yes,no,no,no,THE RULES OF ATTRACTION\n\nscreenplay by\n\nRo...


In [342]:
daily_script_list_pdf['script'] = daily_script_list_pdf['script'].map(clean_script_html)
daily_script_list_pdf

Unnamed: 0,title,year,production_budget,domestic_gross,worldwide_gross,profit,ROI,success,script_available_IMSDB,script_available_daily_script,script_available_msas,script_available_kaggle_df,script_available_aa,script
0,Stuart Little 2,2002,120000000,64956806,166000000,46000000,0.38,0,no,yes,no,no,no,written byBruce Joel Rubinrevisions byLowell G...
1,Superman,2017,55000000,134218018,300200000,245200000,4.46,1,no,yes,no,no,no,"L, July 2 6 , 2002 S U PER M A N FADE IN: ..."
2,Untraceable,2012,35000000,28687835,52649951,17649951,0.5,0,no,yes,no,no,no,Robert Fyvolent & Mark R. Brinker UntraceableB...
3,Out of Africa,2010,31000000,79096868,258210860,227210860,7.33,1,no,yes,no,no,no,...
4,Possession,2005,25000000,10103647,14805812,-10194188,-0.41,0,no,yes,yes,no,no,POSSESSIONwritten byDAVID HENRY HWANGBased on ...
5,Thirteen Ghosts,2018,19000000,41867960,68467960,49467960,2.6,1,no,yes,no,no,no,THIRTEEN GHOSTSbyNeal Marshall Stevensbased on...
6,The Rules of Attraction,2014,4000000,6525762,11799060,7799060,1.95,1,no,yes,no,no,no,THE RULES OF ATTRACTIONscreenplay byRoger Avar...


Appended Daily Script PDF scripts to building_df

In [345]:
building_df = pd.concat([building_df, daily_script_list_pdf])

In [346]:
building_df

Unnamed: 0,title,year,production_budget,domestic_gross,worldwide_gross,profit,ROI,success,script_available_IMSDB,script_available_daily_script,script_available_msas,script_available_kaggle_df,script_available_aa,script
0,Pirates of the Caribbean: On Stranger Tides,2011,379000000,241063875,1045663875,666663875,1.76,1,no,no,no,yes,no,Sub.Re-Edit.by. :: npdv.indoheroes[at]gmail.co...
1,Avengers: Age of Ultron,2015,365000000,459005868,1396099202,1031099202,2.82,1,no,no,no,yes,no,(DISTANT EXPLOSION) STRUCKER ON PA: Report to ...
5,Justice League,2017,300000000,229024295,655945209,355945209,1.19,0,no,no,no,yes,no,"There he is! Oh, sorry. Superman, Superman, ca..."
7,Spectre,2015,300000000,200074175,879620923,579620923,1.93,1,no,no,no,yes,no,"Where are you going? I won't be long. Welcome,..."
8,Batman v Superman: Dawn of Justice,2016,263000000,330360194,872395091,609395091,2.32,1,no,no,no,yes,no,There was a time above. A time before. There w...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2,Untraceable,2012,35000000,28687835,52649951,17649951,0.50,0,no,yes,no,no,no,Robert Fyvolent & Mark R. Brinker UntraceableB...
3,Out of Africa,2010,31000000,79096868,258210860,227210860,7.33,1,no,yes,no,no,no,...
4,Possession,2005,25000000,10103647,14805812,-10194188,-0.41,0,no,yes,yes,no,no,POSSESSIONwritten byDAVID HENRY HWANGBased on ...
5,Thirteen Ghosts,2018,19000000,41867960,68467960,49467960,2.60,1,no,yes,no,no,no,THIRTEEN GHOSTSbyNeal Marshall Stevensbased on...


In [349]:
msas_script_list['title']

314                 The Fantastic Four
765                               Doom
1214                    The Iron Giant
1311                      Galaxy Quest
1528                              Rent
1531          The Island of Dr. Moreau
1573                 Raise the Titanic
1826                      Chill Factor
1972                        Glory Road
2216                         Silverado
2370                        Possession
2602                             Holes
3163           Halloween: Resurrection
3358                   Far From Heaven
3476                        Barbershop
4157                     The Godfather
4170         Good Night, and Good Luck
4408                             Shine
4528                      The Forsaken
4687                   The Crying Game
4889                        Goldfinger
4905    Beneath the Planet of the Apes
5011                 Hustle &amp; Flow
5097            Friday the 13th Part 3
5454                       The Howling
5567                     

In [352]:
def nav_to_script(title, url):
    search = title + ' ' + 'site' + ' ' + url
    driver = webdriver.Chrome('/Users/will4856/Downloads/chromedriver')
    driver.get("https://www.google.com")
    elem = driver.find_element_by_name("q")
    elem.clear()
    elem.send_keys(str(search))
    driver.find_element_by_id('gbqfbb').click()

In [353]:
for x in list(msas_script_list['title']):
    nav_to_script(title = x, url = 'www.moviescriptsandscreenplays.com')

KeyboardInterrupt: 

In [367]:
driver = webdriver.Chrome('/Users/will4856/Downloads/chromedriver')
driver.get("https://www.moviescriptsandscreenplays.com")
elem = driver.find_element_by_name("search")
elem.clear()
elem.send_keys('Test')
elem.send_keys(Keys.RETURN)

In [368]:
def nav_script_msas(title):
    driver = webdriver.Chrome('/Users/will4856/Downloads/chromedriver')
    driver.get("https://www.moviescriptsandscreenplays.com")
    elem = driver.find_element_by_name("search")
    elem.clear()
    elem.send_keys(title)
    elem.send_keys(Keys.RETURN)

In [370]:
for x in list(msas_script_list['title'][0:5]):
    nav_script_msas(x)

In [361]:
building_df

Unnamed: 0,title,year,production_budget,domestic_gross,worldwide_gross,profit,ROI,success,script_available_IMSDB,script_available_daily_script,script_available_msas,script_available_kaggle_df,script_available_aa,script
0,Pirates of the Caribbean: On Stranger Tides,2011,379000000,241063875,1045663875,666663875,1.76,1,no,no,no,yes,no,Sub.Re-Edit.by. :: npdv.indoheroes[at]gmail.co...
1,Avengers: Age of Ultron,2015,365000000,459005868,1396099202,1031099202,2.82,1,no,no,no,yes,no,(DISTANT EXPLOSION) STRUCKER ON PA: Report to ...
5,Justice League,2017,300000000,229024295,655945209,355945209,1.19,0,no,no,no,yes,no,"There he is! Oh, sorry. Superman, Superman, ca..."
7,Spectre,2015,300000000,200074175,879620923,579620923,1.93,1,no,no,no,yes,no,"Where are you going? I won't be long. Welcome,..."
8,Batman v Superman: Dawn of Justice,2016,263000000,330360194,872395091,609395091,2.32,1,no,no,no,yes,no,There was a time above. A time before. There w...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2,Untraceable,2012,35000000,28687835,52649951,17649951,0.50,0,no,yes,no,no,no,Robert Fyvolent & Mark R. Brinker UntraceableB...
3,Out of Africa,2010,31000000,79096868,258210860,227210860,7.33,1,no,yes,no,no,no,...
4,Possession,2005,25000000,10103647,14805812,-10194188,-0.41,0,no,yes,yes,no,no,POSSESSIONwritten byDAVID HENRY HWANGBased on ...
5,Thirteen Ghosts,2018,19000000,41867960,68467960,49467960,2.60,1,no,yes,no,no,no,THIRTEEN GHOSTSbyNeal Marshall Stevensbased on...


In [371]:
all_films

Unnamed: 0,title,year,production_budget,domestic_gross,worldwide_gross,profit,ROI,success,script_available_IMSDB,script_available_daily_script,script_available_msas,script_available_kaggle_df
0,Avengers: Endgame,2019,400000000,858373000,2797800564,2397800564,5.99,1,no,no,no,no
1,Pirates of the Caribbean: On Stranger Tides,2011,379000000,241063875,1045663875,666663875,1.76,1,no,no,no,yes
2,Avengers: Age of Ultron,2015,365000000,459005868,1396099202,1031099202,2.82,1,no,no,no,yes
3,Star Wars Ep. VII: The Force Awakens,2015,306000000,936662225,2068223624,1762223624,5.76,1,no,no,no,no
4,Avengers: Infinity War,2018,300000000,678815482,2048359754,1748359754,5.83,1,no,no,no,no
...,...,...,...,...,...,...,...,...,...,...,...,...
5867,"20,000 Leagues Under the Sea",2013,200000,8000000,8000000,7800000,39.00,1,no,no,no,no
5868,Swingers,2000,200000,4505922,6618578,6418578,32.09,1,yes,yes,no,no
5899,She's Gotta Have It,2010,175000,7137502,7137502,6962502,39.79,1,no,no,no,no
5908,Sweet Sweetback's Baad Asssss Song,2015,150000,15200000,15200000,15050000,100.33,1,no,no,no,no


In [375]:
all_films = all_films.drop(columns = 'found')

In [376]:
all_films['found'] = ['yes' if x in list(building_df['title']) else 'no' for x in all_films['title']]

In [377]:
all_films['found'].value_counts()

no     1992
yes    1382
Name: found, dtype: int64

In [381]:
films_at_large = all_films.loc[all_films['found'] == 'no']
films_at_large = films_at_large[['title','year','production_budget','domestic_gross','worldwide_gross','profit','ROI','success']]
films_at_large

Unnamed: 0,title,year,production_budget,domestic_gross,worldwide_gross,profit,ROI,success
0,Avengers: Endgame,2019,400000000,858373000,2797800564,2397800564,5.99,1
3,Star Wars Ep. VII: The Force Awakens,2015,306000000,936662225,2068223624,1762223624,5.76,1
4,Avengers: Infinity War,2018,300000000,678815482,2048359754,1748359754,5.83,1
5,Pirates of the Caribbean: At Worldâs End,2007,300000000,309420425,963420425,663420425,2.21,1
8,Star Wars: The Rise of Skywalker,2019,275000000,515202542,1073469600,798469600,2.90,1
...,...,...,...,...,...,...,...,...
5866,Mad Max,2015,200000,8750000,99750000,99550000,497.75,1
5867,"20,000 Leagues Under the Sea",2013,200000,8000000,8000000,7800000,39.00,1
5899,She's Gotta Have It,2010,175000,7137502,7137502,6962502,39.79,1
5908,Sweet Sweetback's Baad Asssss Song,2015,150000,15200000,15200000,15050000,100.33,1


In [464]:
html_page = requests.get('http://www.script-o-rama.com/snazzy/table.html')
soup = BeautifulSoup(html_page.content, 'html.parser')
scripts = list(soup.findAll('td', align = 'left'))
scripts[::2]

[<td align="left" width="372">
 <a href="http://www.dailyscript.com/scripts/three-kings_unproduced.html">3 Kings</a></td>,
 <td align="left" width="372">
 <a href="http://www.dailyscript.com/scripts/threekings_shootingdraft.pdf">
 				3 Kings</a></td>,
 <td align="left" width="372">
 <a href="http://www.dailyscript.com/scripts/three-kings_shooting.html">
 				3 Kings</a></td>,
 <td align="left" width="372">
 <a href="http://snarketeria.com/reads/310.pdf">3:10 To Yuma</a></td>,
 <td align="left" width="372">
 <a href="http://www.godamongdirectors.com/scripts/ele5ment.txt">
 				5th Element, The</a></td>,
 <td align="left" width="372">
 <a href="http://www.godamongdirectors.com/scripts/ele5ment.txt">
 				5th Estate, The</a></td>,
 <td align="left" width="372"><a href="http://sfy.ru/sfy.html?script=five_easy_pieces">5 Easy 
 				Pieces</a></td>,
 <td align="left" width="372">
 <a href="http://www.dailyscript.com/scripts/fivefeetandrising.html">5 
 				Feet And Rising</a></td>,
 <td align

In [421]:
page_numbers = ['','2','3','4']

In [465]:
titles = []
for page in page_numbers:
    html_page = requests.get('http://www.script-o-rama.com/snazzy/table{}.html'.format(page))
    soup = BeautifulSoup(html_page.content, 'html.parser')
    titles1 = list(soup.findAll('td', align = 'left'))
    titles1 = titles1[::2]
    for title in titles1:
        titles.append(title)

In [466]:
len(titles)

1416

In [467]:
titles[-1]

<td align="left">
<a href="http://www.rorkesdriftvc.com/zulu_dawn_script.pdf">Zulu Dawn</a></td>

In [431]:
html_page = requests.get('http://www.script-o-rama.com/snazzy/table.html')
soup = BeautifulSoup(html_page.content, 'html.parser')
drafts = list(soup.findAll('td', align = 'left'))
drafts[1::2]

[<td align="left">First Draft </td>,
 <td align="left">Revised Final Draft</td>,
 <td align="left">Shooting Draft </td>,
 <td align="left">9/15/04 Draft </td>,
 <td align="left">Revised Draft</td>,
 <td align="left">Late Draft</td>,
 <td align="left">Unspecified Draft</td>,
 <td align="left">Unspecified Draft</td>,
 <td align="left">4/16/08 Draft</td>,
 <td align="left">Unspecified Draft</td>,
 <td align="left">Unspecified Draft</td>,
 <td align="left">Second Draft</td>,
 <td align="left">Shooting Script</td>,
 <td align="left">July 2000 Draft</td>,
 <td align="left">First Rewrite</td>,
 <td align="left">First Draft</td>,
 <td align="left">Shooting Draft</td>,
 <td align="left">Unspecified Draft</td>,
 <td align="left">Unspecified Draft</td>,
 <td align="left">Revised Draft </td>,
 <td align="left">Unspecified Draft</td>,
 <td align="left">4/6/04 Draft</td>,
 <td align="left">Production Draft </td>,
 <td align="left">Unspecified Draft</td>,
 <td align="left">Revised Draft</td>,
 <td al

In [432]:
drafts = []
for page in page_numbers:
    html_page = requests.get('http://www.script-o-rama.com/snazzy/table{}.html'.format(page))
    soup = BeautifulSoup(html_page.content, 'html.parser')
    drafts1 = list(soup.findAll('td', align = 'left'))[1::2]
    for draft in drafts1:
        drafts.append(draft)

In [433]:
len(drafts)

1416

In [450]:
html_page = requests.get('http://www.script-o-rama.com/snazzy/table.html')
soup = BeautifulSoup(html_page.content, 'html.parser')
file_types = list(soup.findAll('td'))
file_types[4::3]

[<td>.html</td>,
 <td>.pdf</td>,
 <td>.html</td>,
 <td>.pdf</td>,
 <td>.txt</td>,
 <td>.html</td>,
 <td>.html</td>,
 <td>.html</td>,
 <td>.pdf</td>,
 <td>.pdf</td>,
 <td>.pdf</td>,
 <td>.doc</td>,
 <td>.doc</td>,
 <td>.pdf</td>,
 <td>.pdf</td>,
 <td>.txt</td>,
 <td>.txt</td>,
 <td>.pdf</td>,
 <td>.html</td>,
 <td>.html</td>,
 <td>.html</td>,
 <td>.pdf</td>,
 <td>.txt</td>,
 <td>.txt</td>,
 <td>.pdf</td>,
 <td>.html</td>,
 <td align="center">.pdf</td>,
 <td>.txt</td>,
 <td>.pdf</td>,
 <td>.pdf</td>,
 <td>.pdf</td>,
 <td align="center">.pdf</td>,
 <td align="center">.pdf</td>,
 <td align="center">.pdf</td>,
 <td>.pdf</td>,
 <td>.pdf</td>,
 <td align="center">.txt</td>,
 <td align="center">.pdf</td>,
 <td>.pdf</td>,
 <td>.pdf</td>,
 <td>.pdf</td>,
 <td>.txt</td>,
 <td>.html</td>,
 <td>.pdf</td>,
 <td>.pdf</td>,
 <td>.pdf</td>,
 <td>.doc</td>,
 <td>.txt</td>,
 <td>.pdf</td>,
 <td align="center">
 <p align="center">.txt</p></td>,
 <td align="center">
 <p align="center">.html</p></td>,
 <td 

In [451]:
file_types = []
for page in page_numbers:
    html_page = requests.get('http://www.script-o-rama.com/snazzy/table{}.html'.format(page))
    soup = BeautifulSoup(html_page.content, 'html.parser')
    file_types1 = list(soup.findAll('td'))
    file_types1 = file_types1[4::3]
    for file_type in file_types1:
        file_types.append(file_type)

In [452]:
len(file_types)

1415

In [492]:
type(file_types[0])

bs4.element.Tag

In [491]:
pd.DataFrame(dict(zip(titles, drafts)), columns = ['title','draft'])

Unnamed: 0,title,draft


In [493]:
titles1 = []
for title in titles:
    titles1.append(str(title))
drafts1 = []
for draft in drafts:
    drafts1.append(str(draft))

In [502]:
drew_scripts = pd.DataFrame(dict(zip(titles1, drafts1)),index = range(1)).T.reset_index().rename(columns = {'index':'title', 0: 'draft'})
drew_scripts

Unnamed: 0,title,draft
0,"<td align=""left"" width=""372"">\n<a href=""http:/...","<td align=""left"">First Draft </td>"
1,"<td align=""left"" width=""372"">\n<a href=""http:/...","<td align=""left"">Revised Final Draft</td>"
2,"<td align=""left"" width=""372"">\n<a href=""http:/...","<td align=""left"">Shooting Draft </td>"
3,"<td align=""left"" width=""372"">\n<a href=""http:/...","<td align=""left"">9/15/04 Draft </td>"
4,"<td align=""left"" width=""372"">\n<a href=""http:/...","<td align=""left"">Revised Draft</td>"
...,...,...
1404,"<td align=""left""><a href=""http://www.sellascri...","<td align=""left"">Unspecified Draft</td>"
1405,"<td align=""left"">\n<a href=""http://www.preterh...","<td align=""left"">Unspecified Draft</td>"
1406,"<td align=""left"">\n<a href=""http://www.horrorl...","<td align=""left"">Shooting Script</td>"
1407,"<td align=""left"">\n<a href=""http://www.mediafi...","<td align=""left"">9/18/07 Draft</td>"


In [509]:
drew_scripts['title'][0].split('">')[2][:-9]

'3 Kings'

In [516]:
drew_scripts['title'].map(lambda x: x.split('">')[2])

IndexError: list index out of range

In [522]:
drew_titles = list(drew_scripts['title'])

In [525]:
cleaned_titles = []
for title in drew_titles[0:100]:
    cleaned_titles.append(title.split('">')[2])

In [528]:
for title in drew_titles[100:300]:
    cleaned_titles.append(title.split('">')[2])

In [533]:
for title in drew_titles[300:325]:
    cleaned_titles.append(title.split('">')[2])

In [534]:
for title in drew_titles[325:350]:
    cleaned_titles.append(title.split('">')[2])

IndexError: list index out of range

In [540]:
for title in drew_titles[325:336]:
    cleaned_titles.append(title.split('">')[2])

In [542]:
for title in drew_titles[338:]:
    cleaned_titles.append(title.split('">')[2])

In [551]:
final_cleaned_titles = []
for title in cleaned_titles:
    final_cleaned_titles.append(str(title[:-9]).replace('\n\t\t\t\t',''))
final_cleaned_titles

['3 Kings',
 '3 Kings',
 '3 Kings',
 '3:10 To Yuma',
 '5th Element, The',
 '5th Estate, The',
 '5 Easy Pieces',
 '5 Feet And Rising',
 '500 Days Of Summer',
 '6th Sense, The',
 '7th Seal</a>,',
 '7 Days To Live',
 '7 Days To Live',
 '\n<a href="http://www.dailyscript.com/scripts/arac_a',
 '8 Mile',
 '8 Millimeter',
 '84 Charlie MoPic',
 '9 (2009)',
 '9th Gate, The',
 '10 Things I Hate About You',
 '12 (2003)',
 '12 And Holding',
 '12 Monkeys',
 '13 Days',
 '13 Ghosts',
 '15 Minutes',
 '1408',
 '1492: Conquest Of Paradise',
 '16 Blocks',
 '17 Again',
 '187',
 '25th Hour',
 '28 Weeks Later',
 '30 Days Of Night',
 '40 Year Old Virgin, The',
 '48 Hours',
 '2001: A Space Odyssey',
 '2001 Maniacs',
 '2010',
 'Above The Law',
 'Absolute Power',
 'The Abyss',
 'Ace Ventura: Pet Detective ',
 'Adaptation',
 'Adaptation',
 'The Addams Family',
 'Addicted To Murder 2',
 "The Adventure (L'Avventura)",
 'Adventureland',
 'Adventures Of Buckaroo Banzai',
 'Adventures Of Ford Fairlaine',
 'An Affair 

In [555]:
missing_titles = pd.DataFrame(final_cleaned_titles).rename(columns = {0:'title'}).drop_duplicates('title')
missing_titles

Unnamed: 0,title
0,3 Kings
3,3:10 To Yuma
4,"5th Element, The"
5,"5th Estate, The"
6,5 Easy Pieces
...,...
2001,Young Soul Rebels
2002,You've Got Mail
2003,Zodiac
2004,Zombieland


In [558]:
complete_list = []
for x in list(missing_titles['title']):
    if x[-3:] == 'The':
        complete_list.append(str(x[-3:] + ' ' + x[:-3]).replace(', ',''))
    else:
        complete_list.append(x)

In [559]:
complete_list

['3 Kings',
 '3:10 To Yuma',
 'The 5th Element',
 'The 5th Estate',
 '5 Easy Pieces',
 '5 Feet And Rising',
 '500 Days Of Summer',
 'The 6th Sense',
 '7th Seal</a>,',
 '7 Days To Live',
 '\n<a href="http://www.dailyscript.com/scripts/arac_a',
 '8 Mile',
 '8 Millimeter',
 '84 Charlie MoPic',
 '9 (2009)',
 'The 9th Gate',
 '10 Things I Hate About You',
 '12 (2003)',
 '12 And Holding',
 '12 Monkeys',
 '13 Days',
 '13 Ghosts',
 '15 Minutes',
 '1408',
 '1492: Conquest Of Paradise',
 '16 Blocks',
 '17 Again',
 '187',
 '25th Hour',
 '28 Weeks Later',
 '30 Days Of Night',
 'The 40 Year Old Virgin',
 '48 Hours',
 '2001: A Space Odyssey',
 '2001 Maniacs',
 '2010',
 'Above The Law',
 'Absolute Power',
 'The Abyss',
 'Ace Ventura: Pet Detective ',
 'Adaptation',
 'The Addams Family',
 'Addicted To Murder 2',
 "The Adventure (L'Avventura)",
 'Adventureland',
 'Adventures Of Buckaroo Banzai',
 'Adventures Of Ford Fairlaine',
 'An Affair To Remember',
 'Afterlife',
 'The African Queen',
 'After Schoo

In [562]:
films_at_large['available_drews_scripts'] = ['yes' if x in complete_list else 'no' for x in films_at_large['title']]
films_at_large['available_drews_scripts'].value_counts()

no     1926
yes      66
Name: available_drews_scripts, dtype: int64

In [569]:
available_drew_films = films_at_large.loc[films_at_large['available_drews_scripts'] == 'yes']
available_drew_films.sort_values('title')

Unnamed: 0,title,year,production_budget,domestic_gross,worldwide_gross,profit,ROI,success,available_drews_scripts
3169,28 Weeks Later,2006,15000000,28638916,64232714,49232714,3.28,1,yes
3320,A Perfect Getaway,2009,14000000,15515460,22815460,8815460,0.63,0,yes
3902,Adventureland,2008,9800000,16044025,17553055,7753055,0.79,0,yes
4804,Airplane!,2015,3500000,83453539,83453539,79953539,22.84,1,yes
4680,Animal Kingdom,2006,4200000,1044039,8078683,3878683,0.92,0,yes
...,...,...,...,...,...,...,...,...,...
3181,Urban Legends: Final Cut,2004,15000000,21468807,38574362,23574362,1.57,1,yes
4048,Wes Craven's New Nightmare,2008,8000000,18090181,18090181,10090181,1.26,0,yes
2756,Willard,2007,20000000,6882696,6882696,-13117304,-0.66,0,yes
281,X-Men 2,2003,125000000,214949694,406348630,281348630,2.25,1,yes


In [566]:
building_df = building_df[['title','year','production_budget','domestic_gross','worldwide_gross','profit','ROI','success','script']]

In [574]:
films_at_large['script'] = None

In [596]:
films_at_large

Unnamed: 0,title,year,production_budget,domestic_gross,worldwide_gross,profit,ROI,success,available_drews_scripts,script
0,Avengers: Endgame,2019,400000000,858373000,2797800564,2397800564,5.99,1,no,
3,Star Wars Ep. VII: The Force Awakens,2015,306000000,936662225,2068223624,1762223624,5.76,1,no,
4,Avengers: Infinity War,2018,300000000,678815482,2048359754,1748359754,5.83,1,no,
5,Pirates of the Caribbean: At Worldâs End,2007,300000000,309420425,963420425,663420425,2.21,1,no,
8,Star Wars: The Rise of Skywalker,2019,275000000,515202542,1073469600,798469600,2.90,1,no,
...,...,...,...,...,...,...,...,...,...,...
5866,Mad Max,2015,200000,8750000,99750000,99550000,497.75,1,no,
5867,"20,000 Leagues Under the Sea",2013,200000,8000000,8000000,7800000,39.00,1,no,
5899,She's Gotta Have It,2010,175000,7137502,7137502,6962502,39.79,1,no,
5908,Sweet Sweetback's Baad Asssss Song,2015,150000,15200000,15200000,15050000,100.33,1,no,


In [614]:
def BS(url, find_key, class_ = None):
    html_page = requests.get(url)
    soup = BeautifulSoup(html_page.content, 'html.parser')
    script = soup.findAll(find_key, class_ = class_)
    return str(script)

In [595]:
def add_script_to_df(df,index, script):
    df.at[index,'script'] = script

In [597]:
BS('https://transcripts.fandom.com/wiki/Avengers:_Infinity_War', 'p')



In [598]:
add_script_to_df(films_at_large, 4, BS('https://transcripts.fandom.com/wiki/Avengers:_Infinity_War', 'p'))

In [604]:
force_awakens = BS('https://www.imsdb.com/scripts/Star-Wars-The-Force-Awakens.html','pre')

In [605]:
add_script_to_df(films_at_large, 3, force_awakens)

Unnamed: 0,title,year,production_budget,domestic_gross,worldwide_gross,profit,ROI,success,available_drews_scripts,script
0,Avengers: Endgame,2019,400000000,858373000,2797800564,2397800564,5.99,1,no,
3,Star Wars Ep. VII: The Force Awakens,2015,306000000,936662225,2068223624,1762223624,5.76,1,no,[<pre>\r\n\r\n \r\n<b> ...
4,Avengers: Infinity War,2018,300000000,678815482,2048359754,1748359754,5.83,1,no,"[<p>[<i>Marvel Opening Credits</i>]\n</p>, <p>..."
5,Pirates of the Caribbean: At Worldâs End,2007,300000000,309420425,963420425,663420425,2.21,1,no,
8,Star Wars: The Rise of Skywalker,2019,275000000,515202542,1073469600,798469600,2.90,1,no,
...,...,...,...,...,...,...,...,...,...,...
5866,Mad Max,2015,200000,8750000,99750000,99550000,497.75,1,no,
5867,"20,000 Leagues Under the Sea",2013,200000,8000000,8000000,7800000,39.00,1,no,
5899,She's Gotta Have It,2010,175000,7137502,7137502,6962502,39.79,1,no,
5908,Sweet Sweetback's Baad Asssss Song,2015,150000,15200000,15200000,15050000,100.33,1,no,


In [606]:
films_at_large[0:10]

Unnamed: 0,title,year,production_budget,domestic_gross,worldwide_gross,profit,ROI,success,available_drews_scripts,script
0,Avengers: Endgame,2019,400000000,858373000,2797800564,2397800564,5.99,1,no,
3,Star Wars Ep. VII: The Force Awakens,2015,306000000,936662225,2068223624,1762223624,5.76,1,no,[<pre>\r\n\r\n \r\n<b> ...
4,Avengers: Infinity War,2018,300000000,678815482,2048359754,1748359754,5.83,1,no,"[<p>[<i>Marvel Opening Credits</i>]\n</p>, <p>..."
5,Pirates of the Caribbean: At Worldâs End,2007,300000000,309420425,963420425,663420425,2.21,1,no,
8,Star Wars: The Rise of Skywalker,2019,275000000,515202542,1073469600,798469600,2.9,1,no,
9,Solo: A Star Wars Story,2018,275000000,213767512,393151347,118151347,0.43,0,no,
10,John Carter,2012,263700000,73058679,282778100,19078100,0.07,0,no,
12,The Lion King,2019,260000000,543638043,1656943394,1396943394,5.37,1,yes,
13,Tangled,2010,260000000,200821936,585727091,325727091,1.25,0,no,
16,Harry Potter and the Half-Blood Prince,2009,250000000,302089278,935213767,685213767,2.74,1,no,


In [608]:
lion_king = BS('https://transcripts.fandom.com/wiki/The_Lion_King_(2019)','p')

In [610]:
add_script_to_df(films_at_large, 12, lion_king)
films_at_large[0:10]

Unnamed: 0,title,year,production_budget,domestic_gross,worldwide_gross,profit,ROI,success,available_drews_scripts,script
0,Avengers: Endgame,2019,400000000,858373000,2797800564,2397800564,5.99,1,no,
3,Star Wars Ep. VII: The Force Awakens,2015,306000000,936662225,2068223624,1762223624,5.76,1,no,[<pre>\r\n\r\n \r\n<b> ...
4,Avengers: Infinity War,2018,300000000,678815482,2048359754,1748359754,5.83,1,no,"[<p>[<i>Marvel Opening Credits</i>]\n</p>, <p>..."
5,Pirates of the Caribbean: At Worldâs End,2007,300000000,309420425,963420425,663420425,2.21,1,no,
8,Star Wars: The Rise of Skywalker,2019,275000000,515202542,1073469600,798469600,2.9,1,no,
9,Solo: A Star Wars Story,2018,275000000,213767512,393151347,118151347,0.43,0,no,
10,John Carter,2012,263700000,73058679,282778100,19078100,0.07,0,no,
12,The Lion King,2019,260000000,543638043,1656943394,1396943394,5.37,1,yes,"[<p>[<i>Disney opening credits</i>]\n</p>, <p>..."
13,Tangled,2010,260000000,200821936,585727091,325727091,1.25,0,no,
16,Harry Potter and the Half-Blood Prince,2009,250000000,302089278,935213767,685213767,2.74,1,no,


In [627]:
def pdf_text_pull(title):
    return str(high_level.extract_text('/Users/will4856/Downloads/scripts_to_scrape/{}.pdf'.format(title)))

In [623]:
avengers_endgame = pdf_text_pull('Avengers Endgame')

In [624]:
add_script_to_df(films_at_large, 0, avengers_endgame)

In [631]:
films_at_large[0:10]

Unnamed: 0,title,year,production_budget,domestic_gross,worldwide_gross,profit,ROI,success,available_drews_scripts,script
0,Avengers: Endgame,2019,400000000,858373000,2797800564,2397800564,5.99,1,no,F O R Y O U R C O N S I D E R AT I O N\n\n...
3,Star Wars Ep. VII: The Force Awakens,2015,306000000,936662225,2068223624,1762223624,5.76,1,no,[<pre>\r\n\r\n \r\n<b> ...
4,Avengers: Infinity War,2018,300000000,678815482,2048359754,1748359754,5.83,1,no,"[<p>[<i>Marvel Opening Credits</i>]\n</p>, <p>..."
5,Pirates of the Caribbean: At Worldâs End,2007,300000000,309420425,963420425,663420425,2.21,1,no,
8,Star Wars: The Rise of Skywalker,2019,275000000,515202542,1073469600,798469600,2.9,1,no,
9,Solo: A Star Wars Story,2018,275000000,213767512,393151347,118151347,0.43,0,no,
10,John Carter,2012,263700000,73058679,282778100,19078100,0.07,0,no,
12,The Lion King,2019,260000000,543638043,1656943394,1396943394,5.37,1,yes,"[<p>[<i>Disney opening credits</i>]\n</p>, <p>..."
13,Tangled,2010,260000000,200821936,585727091,325727091,1.25,0,no,
16,Harry Potter and the Half-Blood Prince,2009,250000000,302089278,935213767,685213767,2.74,1,no,


In [630]:
tangled = pdf_text_pull('Tangled')
tangled

"Scripts.com\n\nTangled\n\nBy Dan Fogelman\n\nPage 1/35\n\n\x0c(Flynn narrates)\nThis is the story of how I died.\nDon't worry, this is actually\na very fun story. And the truth is, it isn't even mine.\nThis is the story of a girl named,\nRapunzel.\nAnd it starts, with the sun.\nNow, once upon a time, a single\ndrop of sunlight fell from the heavens.\nAnd from this small drop of sun,\ngrew a magic, golden, flower.\nIt had the ability to heal the sick,\nand injured.\nOh, you see that old woman,\nover there?\nYou might want to remember her.\nShe's kind of important.\nWell, centuries pass and a hop\nskip and a bump right away there grew a kingdom.\nThe kingdom was ruled by a beloved\nKing and Queen.\nAnd the Queen, well she was about\nto have a baby,\nand she got sick,\nreally, sick.\nShe was running out of time.\nAnd that's when people usually start\nto look for a miracle.\nOr in this case, a magic golden flower.\nAhhh, I told you she'd\nbe important.\nYou see instead of sharing the\nsun

In [632]:
add_script_to_df(films_at_large, 13, tangled)

In [633]:
films_at_large[0:10]

Unnamed: 0,title,year,production_budget,domestic_gross,worldwide_gross,profit,ROI,success,available_drews_scripts,script
0,Avengers: Endgame,2019,400000000,858373000,2797800564,2397800564,5.99,1,no,F O R Y O U R C O N S I D E R AT I O N\n\n...
3,Star Wars Ep. VII: The Force Awakens,2015,306000000,936662225,2068223624,1762223624,5.76,1,no,[<pre>\r\n\r\n \r\n<b> ...
4,Avengers: Infinity War,2018,300000000,678815482,2048359754,1748359754,5.83,1,no,"[<p>[<i>Marvel Opening Credits</i>]\n</p>, <p>..."
5,Pirates of the Caribbean: At Worldâs End,2007,300000000,309420425,963420425,663420425,2.21,1,no,
8,Star Wars: The Rise of Skywalker,2019,275000000,515202542,1073469600,798469600,2.9,1,no,
9,Solo: A Star Wars Story,2018,275000000,213767512,393151347,118151347,0.43,0,no,
10,John Carter,2012,263700000,73058679,282778100,19078100,0.07,0,no,
12,The Lion King,2019,260000000,543638043,1656943394,1396943394,5.37,1,yes,"[<p>[<i>Disney opening credits</i>]\n</p>, <p>..."
13,Tangled,2010,260000000,200821936,585727091,325727091,1.25,0,no,Scripts.com\n\nTangled\n\nBy Dan Fogelman\n\nP...
16,Harry Potter and the Half-Blood Prince,2009,250000000,302089278,935213767,685213767,2.74,1,no,


In [636]:
half_blood = pdf_text_pull('Harry Potter and the Half-Blood Prince')
half_blood

"Rev. 09/13/07 (Blue)\nRev. 10/02/07 (Pink)\nRev. 11/06/07 (Yellow)\n\nHARRY POTTER AND THE HALF-BLOOD PRINCE\n\nscreenplay by\n\nSteve Kloves\n\nbased on the novel by J.K. Rowling\n\nThis script is the confidential and proprietary \nproperty of Warner Bros. Pictures and no portion of \nit may be performed, distributed, reproduced, used, \nquoted or published without prior written permission.\n\nWARNER BROS. PICTURES INC.\n4000 Warner Boulevard\nBurbank, California  91522\n\nAugust 28, 2007\n© 2007\nWARNER BROS. ENT.\nAll Rights Reserved\n\n\x0cHARRY POTTER... HALF-BLOOD PRINCE - Rev. 11/6/07       1.\n\nDARKNESS.\n\nTHUNDER -- or something like it -- sounds in the \ndistance.\n\nBOOM.  Then again.  BOOM.\n\nWe GLIDE THROUGH the inky blackness.  Ambient flashes \nilluminate the silhouette of the WB LOGO.  We PASS \nTHROUGH.\n\nINTO more darkness.  Lost.  More FLASHES.  And we --\n\nA SINGLE EYE\n\nBlank behind glasses.  FLASH!  The PUPIL CONTRACTS and  \nwe --\n\nCUT TO:\n\nCUT TO:\n\n

In [637]:
add_script_to_df(films_at_large, 16, half_blood)

In [638]:
john_carter = pdf_text_pull('John Carter')
john_carter

'\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c'

In [639]:
films_at_large.to_csv('/Users/will4856/Downloads/films_at_large.csv')

In [641]:
building_df.to_csv('/Users/will4856/Downloads/building_csv.csv')