 # Romantic Comedies
 ### Scraped by Tim
 ### From [Wikipedia's Index of American Romantic Comedy Films](https://en.wikipedia.org/wiki/Category:American_romantic_comedy_films)

In [142]:
from urllib import request
import dateutil.parser
import bs4 as bs
import json
import random
import re

In [249]:
WIKIPEDIA = "https://en.wikipedia.org"
ROOT = WIKIPEDIA + "/wiki/Category:American_romantic_comedy_films"

errors = []

In [209]:
movies = []
page_params = ""

def get_movies_from_page(url):
    page = request.urlopen(url).read()
    soup = bs.BeautifulSoup(page, 'lxml')
    
    url_list = [li.find("a") for li in soup.body.select(".mw-category")[0].find_all("li")]
    
    parsed_movies = [{
                        "title": re.sub("(\s*\(\d+)\s.+\)|(\s\(.+\))", strip_film_title, url.text),
                        "url": url.get("href")
                    } for url in url_list]
    movies.extend(parsed_movies)
    
    next_page = soup.body.find("a", text="next page")
    if next_page:
        page_params = next_page.get("href")
        get_movies_from_page(WIKIPEDIA + page_params)
    else:
        return
    
def strip_film_title(match):
    g = match.group
    if (g(2)):
        return ""
    if (g(1)):
        return g(1) + ")"

In [210]:
get_movies_from_page(ROOT)
len(movies)

1740

## TODO:

1. Make a sample of 10 movies that I can test
* Grab the starring lineup duo and set them back in the movies dict
* Grab budget and revenue <- cut out all movies that don't have both
* roll up the data of the actors/actresses (years performed & # of movies)
* Maybe grab IMDB scores from API
* learn to use matplotlib again
* plot movies per year
* plot movies relative of budget & revenue
* plot cast based on frequency
* plot cast based on average revenue they bring in


* Can keep (1953) from (1953 film), but remove (film) 

In [63]:
sample_set = [movies[i] for i in random.sample(range(0, len(movies)), 10)]


sample = sample_set[0]

sample_page = request.urlopen(WIKIPEDIA_ROOT + sample["url"]).read()
sample_soup = bs.BeautifulSoup(sample_page, 'lxml')

In [252]:
def get_movie_info(movie):
    movie_page = request.urlopen(WIKIPEDIA_ROOT + movie["url"]).read()
    movie_soup = bs.BeautifulSoup(movie_page, 'lxml')
    
    starring = []
    release_date = None
    box_office = None
    budget = None
    
    try:
        infobox = movie_soup.find("table", class_="infobox").find_all("tr")
        for info in infobox[2:]:
            row_header = info.find("th").text.lower();
            if   "starring"     in row_header:
                starring = [re.sub("\(\w+\)","",name) for name in info.td.text.split("\n") if len(name) != 0]
            elif "release date" in row_header:
                release_date = string_to_date(info.find("td").text, movie["title"])
            elif "box office"   in row_header:
                box_office = dollar_to_float(info.find("td").text, movie["title"])
            elif "budget"       in row_header:
                budget = dollar_to_float(info.find("td").text, movie["title"])
    except Exception as e:
        global errors
        errors.append("error for {}: ".format(movie["title"], str(e)))
    return starring, box_office, budget, release_date

def dollar_to_float(value, title):
    pattern = re.compile(r"\$|,|£|(\[\d+\])|(\(\w+\))|(\s*\[.+\])|(\s*\(.+\))")
    cleaned_string = pattern.sub("", value)
    try:
        whitespace = re.compile(r"\s+")
        split = whitespace.split(cleaned_string)
        if len(split) == 1 or len(split[1]) == 0:
            return float(split[0])
        else:
            amount, multiplier = split
            amount = float(amount)
            if "million" in multiplier:
                multiplier = 1e6
            elif "billion" in multiplier:
                multiplier = 1e9
            else:
                #Error case; easier to find this way
                global errors
                errors.append("error parsing amount {} for {}".format(value, title))
                multiplier = None
            return amount * multiplier
    except Exception as err:
        return None
    
def string_to_date(string, title):
    global errors
    date = None

    match = re.search("\((\d+-\d+-\d+)\)", string)
    match = match.group(1) if match else None
    if not match:
        match = re.search("(\d+ \w+ \d+)", string)
        match = match.group(1) if match else None
    if not match:
        match = re.search("(\w+ \d+, \d+)", string)
        match = match.group(1) if match else None
    if not match:
        match = re.search("(\d{4})", string)
        match = match.group(1)
        if match:
            return dateutil.parser.parse(match).strftime("%Y")
        
    if match:
        try:
            date = dateutil.parser.parse(match).strftime("%Y-%m-%d")
        except Exception:
            global errors
            errors.append("error parsing date {} for".format(string, title))
    return date    

In [253]:
sample_set = [movies[i] for i in random.sample(range(0, len(movies)), 25)]

for sample in sample_set[:]:
    sample["cast"], sample["box_office"], sample["budget"], sample["release_date"] =  get_movie_info(sample)

In [255]:
for i, movie in enumerate(movies):
    movie["cast"], movie["box_office"], movie["budget"], movie["release_date"] =  get_movie_info(movie)
    if i % (len(movies)/100) == 0:
        print(".", end="")

.

In [258]:
with open('raw_data.json', 'w') as out:  
    json.dump(movies, out, ensure_ascii=False, indent=4)


In [259]:
for error in errors:
    print(error)

error for And So They Were Married: 
error for Bitter Sweet (2009): 
error for The Boob: 
error parsing amount $9,000,000 USD for Chasing Papi
error for The Dog Problem: 
error for Eve Knew Her Apples: 
error for Face in the Sky: 
error for Flirting with Forty: 
error for Flirty Birdy: 
error for Gideon: 
error for His Wife's Lover: 
error for Hullabaloo: 
error parsing amount $2,500,000[3]
$2,000,000 (theatrical rentals) for It Happened One Night
error for Lady in a Jam: 
error for Make Believe Ballroom: 
error for A Modern Affair: 
error for The Mouse Comes to Dinner: 
error parsing amount $23,681,338 (domestic)[1]
$11,645,000 (rentals) for The Owl and the Pussycat
error for The Pill: 
error for Set It Up: 
error for Signature Move: 
error parsing amount $10, 000 for Sleeping Beauties
error for Solid Serenade: 
error parsing amount $2 Million for Straight A's
error for Sweet Lies: 
error parsing amount $34,335,025 (USA)[1]
$40,000,000 (Worldwide)[2] for Thoroughly Modern Millie
error