# IMSDb scraper

The following code will scrape all scripts available on the Internet Movie Script Database (http://www.imsdb.com) and store the raw data in individual json objects in a folder titled "scripts" for processing and import into the project database

## Imports

In [2]:
import requests as r                  # HTTP request handling
import bs4 as bs                      # HTML parsing
import json as j                      # JSON utilities
import pandas as pd                   # Data utilities
from tqdm import tqdm_notebook as tn  # Progress bar
from datetime import datetime         # Datetime object
import os                             # OS interaction utilities

## Global variable declarations

In [3]:
URL = 'http://www.imsdb.com{}' # IMSDb site URL template
OUT_FILE = './scripts/{}.json'      # Script json file output directory

## Extracting movie links

In [4]:
page = r.get(URL.format('/all scripts')) # Retreieve "All Scripts" page                
film_list = bs.BeautifulSoup(page.text, "lxml") # Convert response object to BeautifulSoup
films = film_list.find_all('table')[1].find_all('a')[61:] # Extract film links
film_data = pd.DataFrame() # Create DataFrame for data storage
film_data['title'] = [x.string for x in films] # Extract film titles
film_data['link'] = [x['href'] for x in films] # Extract film links
film_data.head() # Check output

Unnamed: 0,title,link
0,10 Things I Hate About You,/Movie Scripts/10 Things I Hate About You Scri...
1,12,/Movie Scripts/12 Script.html
2,12 and Holding,/Movie Scripts/12 and Holding Script.html
3,12 Monkeys,/Movie Scripts/12 Monkeys Script.html
4,12 Years a Slave,/Movie Scripts/12 Years a Slave Script.html


In [44]:
try:
    film_data.to_csv('films.csv')
    print("Data succesfully saved to {}".format(filename))
except:
    print("Saving failed. Check Pandas 'film_data' object.")

Data succesfully saved to ./scripts/films.csv


## Extracting film scripts and related info

In [5]:
links = pd.read_csv('films.csv', index_col=0)
links.head()

Unnamed: 0,title,link
0,10 Things I Hate About You,/Movie Scripts/10 Things I Hate About You Scri...
1,12,/Movie Scripts/12 Script.html
2,12 and Holding,/Movie Scripts/12 and Holding Script.html
3,12 Monkeys,/Movie Scripts/12 Monkeys Script.html
4,12 Years a Slave,/Movie Scripts/12 Years a Slave Script.html


In [16]:
errors = {}
for title, link in tn(zip(links['title'], links['link'])):
    
    # Create film dictionary object for storing data
    film_info = {
        'title':title,
        'film_link':link,
        'script_link':None,
        'info_raw':None,
        'script':None,
        'scraped':str(datetime.now()),
    }
    
    # Retrieve film page and raw film info
    film = r.get(URL.format(link))
    info = bs.BeautifulSoup(film.text, "lxml").find("table", attrs={"class":"script-details"})
    film_info['info_raw'] = str(info)
    film_info['script_link'] = info.find_all("a")[-1]['href']
    
    # Retrieve script page and raw script text
    try:
        script = r.get(URL.format(film_info['script_link']))
        script_text = str(bs.BeautifulSoup(script.text, "lxml").find("td", attrs={"class":"scrtext"}))
        film_info['script'] = script_text
    except:
        errors[title] = film_info['script_link']
    
    # Output dictionary to json object
    with open(OUT_FILE.format(film_info['title']), 'w') as f:
        j.dump(film_info, f)

# Print status message on completion
print("Scraping run complete. {} files created from original list of {} films.".format(len(os.listdir('./scripts')), len(links)))




Exception in thread Thread-6:
Traceback (most recent call last):
  File "/Users/matt.tranquada/anaconda3/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/Users/matt.tranquada/anaconda3/lib/python3.6/site-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/Users/matt.tranquada/anaconda3/lib/python3.6/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration




Scraping run complete. 1170 files created from original list of 1171 films.


In [17]:
# Check for errors
print(errors.keys())
print(len(errors.keys()))

dict_keys([])
0


In [81]:
# Print error log to 'errors.csv'
import csv
with open('errors.csv', 'w') as csv_file:
    writer = csv.writer(csv_file)
    for key, value in errors.items():
        writer.writerow([key, value])