In [1]:
from narrative_understanding.cache import GetPage

import pandas as pd
import bs4
import requests
import os
import re
import urllib3
import urllib
import wget
import datetime
import collections
import subprocess
import pdftotext

In [12]:
cache_dir = os.path.join(os.getenv("DATA_DIR"), "mica_narrative_understanding/data/scrape")
movie_scripts_dir = os.path.join(os.getenv("DATA_DIR"), "mica_narrative_understanding/data/movie_scripts")

def write_text(text: str, text_file: str):
    """Write `text` to the `text_file` text file."""
    with open(text_file, "w") as fw:
        fw.write(text)


def download_pdf(pdf_url: str, pdf_file: str) -> bool:
    """Download pdf from the `pdf_url` url (must end in .pdf or .PDF), save the document to the `pdf_file` path, 
    convert the pdf file to a text file, and save the text file to the same directory as the `pdf_file` path 
    with the same filename but with the .txt extension.

    Args:
        pdf_url (str) : url to a pdf document.
        pdf_file (str) : filepath where pdf document will be saved.
    
    Returns:
        success flag (bool) : True if the pdf is successfully downloaded and converted to a text file, else False.
    """
    pdf_file = pdf_file if re.search(r"\.pdf$", pdf_file) is not None else pdf_file + ".pdf"
    text_file = re.sub(r"\.pdf$", ".txt", pdf_file)
    try:
        response = requests.get(pdf_url)
        if response.status_code == 200:
            with open(pdf_file, "wb") as fw:
                fw.write(response.content)
            with open(pdf_file, "rb") as fr:
                pdf = pdftotext.PDF(fr)
            text = "\n\n".join(pdf)
            n_words = len(text.split())
            assert n_words > 1000
            write_text(text, text_file)
        return True
    except Exception:
        if os.path.exists(pdf_file):
            os.remove(pdf_file)
        if os.path.exists(text_file):
            os.remove(text_file)
    return False


def get_movie_script(url: str, getpage: GetPage) -> tuple[str, int] | None:
    """Check if `url` links to a PDF script or find the script (PDF url or text) within the `url` webpage.
    
    Args:
        url (str) : url 
        getpage (GetPage) : GetPage object to retrieve webpages

    Returns:
        str : a pdf url or script text
        int : status code
    """
    # url links to a PDF document
    if url.endswith("pdf") or url.endswith("PDF"):
        return url, 200

    # retrieve page
    soup_dict, status_code_dict = getpage(url, disable_progress_bar=True)

    # url links to a TEXT document
    if url.endswith("txt") or url.endswith("TXT"):
        if url in soup_dict:
            soup = soup_dict[url]
            return soup.text, 200
        elif url in status_code_dict:
            return None, status_code_dict[url]

    # script url links to scriptslug
    elif re.search("scriptslug.com", url) is not None:
        if url in soup_dict:
            soup = soup_dict[url]
            for a_element in soup.find_all("a"):
                if a_element.text.lower().strip() == "read the script":
                    pdf_url = a_element["href"]
                    if pdf_url.endswith("pdf") or pdf_url.endswith("PDF"):
                        return pdf_url, 200
        elif url in status_code_dict:
            return None, status_code_dict[url]
    
    # script url links to imsdb html script
    # script url links to dailyscript html script
    # script url links to screenplays for you
    elif re.search(r"((imsdb)|(dailyscript)|(horrorlair))\.com.*\.html?$", url) or (
         re.search("sfy.ru", url) is not None):
        if url in soup_dict:
            soup = soup_dict[url]
            preformatted_element = soup.find("pre")
            if preformatted_element is not None:
                return preformatted_element.text, 200
        elif url in status_code_dict:
            return None, status_code_dict[url]

In [4]:
# scripts on screen url and current date
scripts_on_screen_url = "https://scripts-onscreen.com/movie-script-index/"
date_accessed = datetime.datetime.now().strftime("%b %d %Y")

# initialize GetPage object to retrieve, parse, and cache webpages
getpage = GetPage(cache_dir)

# get scripts on screen index page and find the category pages of movie links
parsed_pages, _ = getpage(scripts_on_screen_url, disable_progress_bar=True)
index_soup = parsed_pages[scripts_on_screen_url]
index_links_soup = index_soup.find("div", attrs={"class":"soslinks"})
links = [a_element["href"] for a_element in index_links_soup.find_all("a")]
print("parsed Scripts On Screen index page\n")

# get the category pages and find the movie links
link_soups, _ = getpage(*links)
movie_links = []
for link_soup in link_soups.values():
    list_elements = link_soup.find("div", {"class": "sosindex"}).find_all("li")
    for list_element in list_elements:
        movie_link = "https://scripts-onscreen.com" + list_element.find("a")["href"]
        movie_links.append(movie_link)
print("retrieved Scripts on Screen movie link category pages\n")

# get the movie pages
movie_pages, _ = getpage(*movie_links)
print(f"retrieved {len(movie_pages)} movie pages\n")

# initialize the movie scripts index
movie_scripts_directory = movie_scripts_dir
movie_scripts_index_file = os.path.join(movie_scripts_directory, "index.csv")
movie_scripts_index = {}
next_file = 0

# populate the movie scripts index with existing entries
# the index is a mapping from script url to filename (without the extension), download date, IMDB id, MOVIEDB id,
# and the synopsis on scripts on screen website
if os.path.exists(movie_scripts_index_file):
    movie_index_df = pd.read_csv(movie_scripts_index_file, index_col=None)
    for _, row in movie_index_df.iterrows():
        movie_scripts_index[row["url"]] = (row["file"], row["date"], row["imdb_id"], row["moviedb_id"], 
                                        row["script_on_screen_synopsis"])
        next_file = max(next_file, row["file"] + 1)

parsed Scripts On Screen index page



https://scripts-onscreen.com/movie-script-index/numeric-movie-script-index/: 100%|██████████| 27/27 [00:01<00:00, 22.14url/s]


retrieved Scripts on Screen movie link category pages



https://scripts-onscreen.com/movie/99-homes-script-links/: 100%|██████████| 11059/11059 [02:33<00:00, 72.26url/s]                                                            


retrieved 9349 movie pages



In [13]:
n = 100
error_urls = []

# loop over each movie page
for i, movie_page in enumerate(movie_pages.values()):
    main_div_soup = movie_page.find("div", {"class": "main_div"})
    synopsis, imdb_id, moviedb_id = None, None, None

    # get scripts on screen synopsis
    movie_synopsis_soups = [movie_prop_soup 
                            for movie_prop_soup in main_div_soup.find_all("div", {"class": "movie-prop"}) 
                                if movie_prop_soup.text.startswith("Script Synopsis")]
    if movie_synopsis_soups:
        synopsis = re.sub(f"^Script Synopsis:", "", movie_synopsis_soups[0].text).strip()
    
    # get script urls
    movie_links_soup = main_div_soup.find("div", {"class": "movie-links"})
    urls = []
    if movie_links_soup is not None:
        list_elements = movie_links_soup.find_all("li")
        
        # loop over each link
        # find imdb and moviedb id
        # find all script links that are not paid and does not contain the word 'Transcript' in the text
        for list_element in list_elements:
            url = list_element.find("a")["href"]
            text = list_element.text
            imdb_match = re.search(r"\(\s*(tt\d+)\s*\)", text)
            moviedb_match = re.search(r"\(\s*(\d+)\s*\)", text)
            if "IMDb" in text and imdb_match is not None:
                imdb_id = imdb_match.group(1)
            elif "TheMovieDB.org" in text and moviedb_match is not None:
                moviedb_id = moviedb_match.group(1)
            elif "$" not in text and "Transcript" not in text and url not in movie_scripts_index:
                urls.append(url)

    # get pdf urls or scrape the texts
    # script url is the url on the movie page of scripts on screen used as index
    # script url might not always be the same as the pdf url, for cases where more webpages needs to be retrieved
    # to find the actual url to the movie script
    script_url_to_pdf_url, script_url_to_text = {}, {}
    for url in urls:
        response = get_movie_script(url, getpage)
        if response is not None:
            if response[1] == 200:
                if re.match(r"http", response[0]) is not None:
                    script_url_to_pdf_url[url] = response[0]
                else:
                    script_url_to_text[url] = response[0]
        else:
            print(f"{url:150s}")
            error_urls.append(url)
    
    # download pdfs
    for script_url, pdf_url in script_url_to_pdf_url.items():
        pdf_file = os.path.join(movie_scripts_directory, f"{next_file}.pdf")
        if download_pdf(pdf_url, pdf_file):
            movie_scripts_index[script_url] = (next_file, date_accessed, imdb_id, moviedb_id, synopsis)
            next_file += 1
        else:
            print(f"{pdf_url:150s} failed to download")
            error_urls.append(pdf_url)
    
    # write texts
    for script_url, text in script_url_to_text.items():
        movie_scripts_index[script_url] = (next_file, date_accessed, imdb_id, moviedb_id, synopsis)
        text_file = os.path.join(movie_scripts_directory, f"{next_file}.txt")
        next_file += 1
        write_text(text, text_file)

    # write movie scripts index and cache index periodically
    if (i + 1) % 10 == 0 or i == len(movie_pages) - 1:
        rows = []
        for url, (file_id, date, imdb_id, moviedb_id, synopsis) in movie_scripts_index.items():
            row = [url, file_id, date, imdb_id, moviedb_id, synopsis]
            rows.append(row)
        movie_index_df = pd.DataFrame(rows, columns=["url", "file", "date", "imdb_id", "moviedb_id", 
                                                        "script_on_screen_synopsis"])
        movie_index_df.to_csv(movie_scripts_index_file, index=False)
        getpage._write_index()
    
    if len(error_urls) >= n:
        break

https://web.archive.org/web/20180713235209/http://www.joblo.com:80/scripts/The%20A-Team.pdf                                                            failed to download
https://www.docdroid.net/2GJdbnu/about-time-script-richard-curtis-pdf                                                                                  failed to download
http://www.dailyscript.com/scripts/ace_ventura_shoot.html                                                                                             


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://drive.google.com/file/d/15FGH5Nl2QXcMEJHI9XaGXjm2lC2cjJbH/view                                                                                
https://www.docdroid.net/muJitYr/ad-astra.pdf#page=2                                                                                                  
http://www.beingcharliekaufman.com/index.php/scripts-writing/adaptation-2nd-draft-24-sept-1999/download                                               
http://www.beingcharliekaufman.com/index.php/scripts-writing/adaptation-2nd-draft-revised-21-nov-2000/download                                        
https://assets.scriptslug.com/live/pdf/scripts/the-adjustment-bureau-2011.pdf                                                                          failed to download
http://www.screenplaydb.com/film/scripts/adventureland20070805/                                                                                       
https://thescriptsavant.com/movies/The_Adventures_Of_TinTin.pdf            

ConnectTimeout: HTTPConnectionPool(host='content.cdlib.org', port=8088): Max retries exceeded with url: /xtf/view?docId=ft7b69p14j&chunk.id=d0e9232&toc.depth=1&toc.id=&brand=ucpress&query=riskin&html.parser (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7f02650708e0>, 'Connection to content.cdlib.org timed out. (connect timeout=None)'))