In [1]:
from bs4 import BeautifulSoup
import requests
import re

In [2]:
# Box Office Mojo page for the title being scraped (title id tt2488496).
url = 'https://www.boxofficemojo.com/title/tt2488496/rankings/?ref_=bo_tt_tab#tabs'

# Without a timeout a stalled connection would hang this cell indefinitely.
response = requests.get(url, timeout=30)
# Fail here on a 4xx/5xx status instead of parsing an error page further down.
response.raise_for_status()

In [3]:
# Body of the HTTP response decoded to text; this is what gets parsed below.
page = response.text

In [4]:
# Parse the downloaded HTML into a searchable tree using the lxml parser.
soup = BeautifulSoup(markup=page, features='lxml')

In [5]:
# Text of the page's <title> tag; presumably '<movie title> - <site name>'.
title_string = soup.find('title').text
# Strip only the LAST '-'-separated segment. Cutting at the first '-' would
# truncate any movie title that itself contains a hyphen.
# NOTE(review): assumes the site name is always the final segment - confirm.
title = title_string.rsplit('-', 1)[0].strip()

In [6]:
# Pattern for the label whose value we want.
domdist_regex = re.compile('Domestic Distributor')
# Find the label text, then take the text of the element that follows it.
# `string=` and `find_next()` are the current names of the legacy
# `text=` keyword and `findNext()` alias.
domdist_string = soup.find(string=domdist_regex).find_next().text
# Keep only what precedes 'See' (presumably a trailing "See ..." link label).
domestic_dist = domdist_string.split('See')[0].strip()

In [7]:
def get_movie_value(soup, field_name):

    '''Grab a value from Box Office Mojo HTML

    Searches the parsed page for the first text node matching ``field_name``
    and returns the text of the element that follows it (the value for that
    attribute).

    Parameters:
        soup: parsed page exposing ``find(string=...)``.
        field_name: label to look for. It is compiled as a regular
            expression, so regex metacharacters keep their special meaning.

    Returns:
        The ``.text`` of the element after the label, or None when either
        the label or a following element is not found.
    '''

    # `string=` is the current name of the legacy `text=` keyword.
    label = soup.find(string=re.compile(field_name))

    if not label:
        return None

    # this works for most of the values
    # (find_next() is the current name of the legacy findNext() alias)
    value_element = label.find_next()

    if value_element:
        return value_element.text
    return None

In [8]:
# get_movie_value() returns None when a field is not on the page, so values
# that need string clean-up are checked for None before being split.

# Domestic distributor
raw_domestic_dist = get_movie_value(soup, 'Domestic Distributor')
domestic_dist = None if raw_domestic_dist is None else raw_domestic_dist.split('See')[0].strip()

# Budget (raw)
raw_budget = get_movie_value(soup, 'Budget')

# Domestic opening revenue (raw)
raw_domestic_opening = get_movie_value(soup, 'Domestic Opening')

# Earliest release date (raw); only the first line of the value is kept
raw_earliest_release = get_movie_value(soup, 'Earliest Release')
if raw_earliest_release is not None:
    raw_earliest_release = raw_earliest_release.split('\n')[0].strip()

# Rating
rating = get_movie_value(soup, 'MPAA')

# Runtime (raw)
raw_runtime = get_movie_value(soup, 'Running Time')

# Genres (raw); a missing field yields [''], which clean_genres() reduces to []
raw_genres = (get_movie_value(soup, 'Genres') or '').split('\n')

In [9]:
import dateutil.parser

def money_to_int(moneystring):
    '''Convert a money string such as '$1,234,567' to an int

    Dollar signs and thousands separators are removed before conversion.

    Returns None when given None (what get_movie_value() reports for a field
    missing from the page). Any other input that int() cannot parse still
    raises ValueError.
    '''
    if moneystring is None:
        return None
    return int(moneystring.replace('$', '').replace(',', ''))

def runtime_to_minutes(runtimestring):
    '''Convert a runtime string such as '2 hr 18 min' to total minutes

    Accepted shapes (whitespace separated):
      * '<hours> <unit> <minutes> ...' - first and third tokens are read as
        hours and minutes
      * '<n> <unit>' - a single value whose unit starts with 'h' (hours) or
        'm' (minutes), e.g. '2 hr' or '45 min'

    Returns the number of minutes as an int, or None when the input is
    None/empty or does not fit either shape.
    '''
    if not runtimestring:
        return None

    parts = runtimestring.split()
    try:
        if len(parts) >= 3:
            return int(parts[0]) * 60 + int(parts[2])
        if len(parts) == 2:
            # Hours-only or minutes-only runtime: the unit decides the scale.
            amount = int(parts[0])
            unit = parts[1].lower()
            if unit.startswith('h'):
                return amount * 60
            if unit.startswith('m'):
                return amount
    except ValueError:
        # A token expected to be numeric was not; treat as unparseable.
        return None
    return None
    
def to_date(datestring):
    '''Parse a date string with dateutil's parser and return the result'''
    return dateutil.parser.parse(datestring)
    
def clean_genres(genre_list):
    '''Trim whitespace from each genre string and drop the blank ones'''
    cleaned = []
    for entry in genre_list:
        trimmed = entry.strip()
        if trimmed:
            cleaned.append(trimmed)
    return cleaned

In [10]:
# A raw value is None when its field was not found on the page; keep the
# cleaned value as None in that case instead of passing None to a converter.
budget = money_to_int(raw_budget) if raw_budget is not None else None

domestic_opening = money_to_int(raw_domestic_opening) if raw_domestic_opening is not None else None

runtime = runtime_to_minutes(raw_runtime) if raw_runtime is not None else None

genres = clean_genres(raw_genres)

In [11]:
# Every <span class="money"> inside the performance summary table, as a list.
raw_sales_list = list(
    soup.find(class_='mojo-performance-summary-table')
        .find_all('span', class_='money')
)

def clean_sales(lst):

    '''Convert the summary table's money elements to integers

    Takes the raw list of elements holding money strings, reads each
    element's text and returns the list of values converted to int
    '''

    money_strings = [element.text for element in lst]

    return list(map(money_to_int, money_strings))

In [12]:
# Every <span class="a-size-small"> inside the performance summary table, as a list.
raw_sales_regions = list(
    soup.find(class_='mojo-performance-summary-table')
        .find_all('span', class_='a-size-small')
)

def clean_regions(lst):

    '''Extract region names from the summary table's label elements

    Takes the raw list of elements, skips those whose text is blank, and
    returns each remaining text trimmed and cut off before its first '('
    '''

    texts = [element.text for element in lst]

    regions = []
    for raw in texts:
        # whitespace-only entries carry no region name
        if not raw.split():
            continue
        name = raw.strip().split('(')[0]
        regions.append(name.strip())

    return regions

In [13]:
# Region labels from the summary table, trimmed and without the "(...)" suffix.
regions_list = clean_regions(raw_sales_regions)
# Matching sales figures from the summary table, converted to int.
sales_list = clean_sales(raw_sales_list)

In [14]:
# Pair each region with its sales figure by position. zip() stops at the
# shorter list, so a region missing from the page is NOT detected here: the
# pairs would be silently truncated (and possibly misaligned).
# The pairs are materialised immediately because a bare zip object is a
# one-shot iterator; displaying it with list() would leave `summary_sales`
# empty for any later cell.
summary_sales = list(zip(regions_list, sales_list))
summary_sales

[('Domestic', 936662225),
 ('International', 1131791908),
 ('Worldwide', 2068454133)]

In [19]:
# Label to anchor on. NOTE(review): this pattern also matches longer strings
# such as 'Domestic Distributor'; the first match in document order is used.
domsales_regex = re.compile('Domestic')
# Search with the sales regex defined on the line above, not the distributor one.
# `class_` (trailing underscore) is Beautiful Soup's keyword for the CSS
# class; `_class=` is read as a filter on an attribute literally named
# "_class" and matches nothing.
# Despite its name, this holds the matched element (or None), not its text.
domsales_string = soup.find(string=domsales_regex).find_next(class_='money')

In [20]:
# Show the current value of domsales_string as the cell output.
domsales_string