In [12]:
import collections
import os
import re
import requests
import pdftotext

data_dir = os.path.join(os.getenv("DATA_DIR"), "mica_narrative_understanding/")

In [3]:
error_urls_file = os.path.join(data_dir, "data/movie_scripts/error_urls.txt")
error_urls = set()
with open(error_urls_file, "r") as fr:
    for line in fr:
        if line.strip():
            error_urls.add(line.strip())

In [4]:
error_urls = sorted(error_urls)

In [9]:
domain_to_urls = collections.defaultdict(set)

for error_url in error_urls:
    match = re.search("^https?://([^/]+)/", error_url)
    if match is not None:
        domain = match.group(1)
        domain_to_urls[domain].add(error_url)

domains = sorted(domain_to_urls.keys(), key=lambda domain: len(domain_to_urls[domain]), reverse=True)
for domain in domains:
    print(domain, len(domain_to_urls[domain]))

web.archive.org 287
thescriptsavant.com 164
www.scifiscripts.com 153
www.dailyscript.com 133
www.horrorlair.com 127
www.scriptslug.com 65
www.awesomefilm.com 56
assets.scriptslug.com 48
www.scribd.com 42
www.sonyclassics.com 41
www.imsdb.com 38
wikileaks.org 37
drive.google.com 36
www.aellea.com 33
sfy.ru 22
www.cinefile.biz 22
www.screenplaydb.com 21
www.hundland.org 12
twcawards.com 11
www.mymoviescripts.com 11
www.sendspace.com 11
film.netflixawards.com 9
twcguilds.com 8
waltdisneystudiosawards.com 8
www.hitchcockwiki.com 8
www.vantageguilds.com 8
www.dropbox.com 8
www.whiskeyloosetongue.com 7
www.amazonstudiosguilds.com 7
focusfeaturesguilds2016.com 6
leonscripts.tripod.com 6
noamkroll.com 6
thescriptlab.com 6
www.universalexports.net 6
0f1b361a5a35d46c59b38689aef7623c.fslcdn.net 5
www.angelfire.com 5
www.annapurnaguilds.com 5
www.beingcharliekaufman.com 5
www.docstoc.com 5
www.scenebyscene.net 5
universalpicturesawards.com 5
www.imdb.com 5
gointothestory.blcklst.com 4
omenchronicl

In [14]:
def write_text(text: str, text_file: str):
    """Write `text` to the `text_file` text file."""
    with open(text_file, "w") as fw:
        fw.write(text)

def download_pdf(pdf_url: str, pdf_file: str, timeout: int) -> bool:
    """Download pdf from the `pdf_url` url (must end in .pdf or .PDF), save the document to the `pdf_file` path, 
    convert the pdf file to a text file, and save the text file to the same directory as the `pdf_file` path 
    with the same filename but with the .txt extension.

    Args:
        pdf_url (str) : url to a pdf document.
        pdf_file (str) : filepath where pdf document will be saved.
        timeout (int) : timeout in seconds
    
    Returns:
        success flag (bool) : True if the pdf is successfully downloaded and converted to a text file, else False.
    """
    pdf_file = pdf_file if re.search(r"\.pdf$", pdf_file) is not None else pdf_file + ".pdf"
    text_file = re.sub(r"\.pdf$", ".txt", pdf_file)
    try:
        response = requests.get(pdf_url, timeout=timeout)
        if response.status_code == 200:
            with open(pdf_file, "wb") as fw:
                fw.write(response.content)
            with open(pdf_file, "rb") as fr:
                pdf = pdftotext.PDF(fr)
            text = "\n\n".join(pdf)
            n_words = len(text.split())
            assert n_words > 1000
            write_text(text, text_file)
            return True
    except Exception:
        pass
        # if os.path.exists(pdf_file):
        #     os.remove(pdf_file)
        # if os.path.exists(text_file):
        #     os.remove(text_file)
    return False

In [10]:
domain = "thescriptsavant.com"

for url in domain_to_urls[domain]:
    print(url)

https://thescriptsavant.com/movies/Hard_Candy.pdf
https://thescriptsavant.com/movies/Green_Lantern.pdf
https://thescriptsavant.com/movies/The_Cocoanuts.pdf
https://thescriptsavant.com/movies/Robocop.pdf
https://thescriptsavant.com/movies/Scott_Pilgrim.pdf
https://thescriptsavant.com/movies/The_Karate_Kid.pdf
https://thescriptsavant.com/movies/Elizabeth_Town.pdf
https://thescriptsavant.com/movies/Untitled_50_Cent_Project.pdf
https://thescriptsavant.com/movies/12.pdf
https://thescriptsavant.com/movies/Born_On_The_Forth_Of_July_P2.pdf
https://thescriptsavant.com/movies/High_Noon.pdf
https://thescriptsavant.com/movies/Idle_Hands.pdf
https://thescriptsavant.com/movies/The_Four_Seasons.pdf
https://thescriptsavant.com/movies/The_Silver_Streak.pdf
https://thescriptsavant.com/movies/Melvin_And_Howard.pdf
https://thescriptsavant.com/movies/The_Muppets.pdf
https://thescriptsavant.com/movies/Naked_Gun_33.pdf
https://thescriptsavant.com/movies/Fried_Green_Tomatoes.pdf
https://thescriptsavant.com/mo

In [16]:
pdf_url = "https://thescriptsavant.com/movies/Green_Lantern.pdf"
download_pdf(pdf_url, "test.pdf", 120)

False

In [19]:
domain = "web.archive.org"

for url in domain_to_urls[domain]:
    print(url)

https://web.archive.org/web/20150906113737/http://www.joblo.com/scripts/Edge%20of%20Darkness.pdf
https://web.archive.org/web/20170110153336/http://screenplayexplorer.com:80/wp-content/scripts/The-Great-Outdoors.pdf
https://web.archive.org/web/20020203200523/http://geocities.com/emruf1/andalou.html
https://web.archive.org/web/20140214181940/http://www.awesomefilm.com/script/requiem.txt
https://web.archive.org/web/20130924084950/http://www.joblo.com/scripts/yourhighness.pdf
https://web.archive.org/web/20150501090237/http://twcguilds.com:80/assets/downloads/ScreenplayTIG.pdf?
https://web.archive.org/web/20200212093959/http://www.script-o-rama.com:80/movie_scripts/m/miracle-at-st-anna-script.html
https://web.archive.org/web/20090828140623/http://www.imsdb.com:80/scripts/Goodfellas.html
https://web.archive.org/web/20130530065510/http://www.joblo.com/scripts/schmucks.pdf
https://web.archive.org/web/20080510111023/http://www.startingoutfilm.com/pdf/script.pdf
https://web.archive.org/web/20170

In [20]:
pdf_url = "https://web.archive.org/web/20150906113737/http://www.joblo.com/scripts/Edge%20of%20Darkness.pdf"

download_pdf(pdf_url, "test.pdf", 120)

False