# Process Movie Scripts

In [2]:
import os
import requests
import pandas as pd

In [12]:
# Location of the tab-separated movie/URL listing inside the data directory
file_path = 'data/raw_script_urls.tsv'

# The file carries no header row, so the three column labels are supplied here
df = pd.read_csv(
    file_path,
    sep='\t',
    header=None,
    names=['movieID', 'movieName', 'url'],
)
# One print call, newline-separated, emits the same two blocks of text
print("DataFrame head:", df.head(), sep="\n")

DataFrame head:
  movieID                   movieName  \
0      m0  10 things i hate about you   
1      m1  1492: conquest of paradise   
2      m2                  15 minutes   
3      m3       2001: a space odyssey   
4      m4                     48 hrs.   

                                                 url  
0   http://www.dailyscript.com/scripts/10Things.html  
1  http://www.hundland.org/scripts/1492-ConquestO...  
2  http://www.dailyscript.com/scripts/15minutes.html  
3       http://www.scifiscripts.com/scripts/2001.txt  
4      http://www.awesomefilm.com/script/48hours.txt  


In [16]:
# Build a plain Python list of URL strings, skipping rows with a missing URL
urls = (
    df['url']
    .dropna()
    .astype(str)
    .tolist()
)
print(f"Found {len(urls)} URLs in the file.")

Found 616 URLs in the file.


In [18]:
# Accumulator for the text of each successfully downloaded script
scripts = list()

# Function to fetch script content from URL with error handling
def fetch_script(url, timeout=10):
    """Download one script and return its text, or None if the fetch failed.

    Args:
        url: Address to request. Normally a string; any value that makes the
            request raise is reported as a failed fetch rather than crashing.
        timeout: Value forwarded to ``requests.get`` as ``timeout``.
            Defaults to 10, the value that used to be hard-coded.

    Returns:
        ``response.text`` when the status code is exactly 200, otherwise
        None. Every failure path prints a one-line message first.
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            return response.text
        print(f"Failed to fetch {url}: Status code {response.status_code}")
        return None
    except Exception as e:
        # Deliberately broad: this is a best-effort bulk download, so one bad
        # URL must not abort the whole run. The message is built with an
        # f-string instead of "+" so that a non-string url cannot raise a
        # TypeError from inside this handler.
        print(f"Error fetching {url}: {e}")
        return None

In [20]:
# Download each movie script, keeping only the ones that were fetched.
# The list is rebuilt from scratch in this cell so that re-running it does not
# append a second copy of every script to results left from an earlier run
# (which would also make the summary below report more scripts than URLs).
scripts = []
for url in urls:
    print("Fetching script from: " + url)
    script = fetch_script(url)
    if script:  # skips failed fetches (None) and empty response bodies
        scripts.append(script)

print("Successfully fetched " + str(len(scripts)) + " scripts out of " + str(len(urls)) + " URLs.")

Fetching script from: http://www.dailyscript.com/scripts/10Things.html
Fetching script from: http://www.hundland.org/scripts/1492-ConquestOfParadise.txt
Error fetching http://www.hundland.org/scripts/1492-ConquestOfParadise.txt: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Fetching script from: http://www.dailyscript.com/scripts/15minutes.html
Fetching script from: http://www.scifiscripts.com/scripts/2001.txt
Fetching script from: http://www.awesomefilm.com/script/48hours.txt
Fetching script from: http://www.scifiscripts.com/scripts/5thelement.txt
Fetching script from: http://www.dailyscript.com/scripts/eight-millimeter.html
Fetching script from: http://www.hundland.org/scripts/A-Nightmare-on-Elm-Street-4.txt
Error fetching http://www.hundland.org/scripts/A-Nightmare-on-Elm-Street-4.txt: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Fetching script from: http://www.hundland.org/scripts/A-Nigh

In [22]:
 # Concatenate all downloaded scripts into one string, delimited by a marker line
separator = "\n\n--- SCRIPT SEPARATOR ---\n\n"
corpus = separator.join(scripts)

In [24]:
# Write the combined corpus to disk as UTF-8 text
corpus_file = 'data/movie_scripts_corpus.txt'
with open(corpus_file, mode='w', encoding='utf-8') as out_handle:
    out_handle.write(corpus)

print(f"Corpus created and saved to {corpus_file}")

Corpus created and saved to data/movie_scripts_corpus.txt


In [26]:
# Expose the per-script texts under a second name (same list object, not a copy)
documents = scripts

print(
    f"Total documents in corpus: {len(documents)}",
    "Script complete.",
    sep="\n",
)

Total documents in corpus: 528
Script complete.
