In [1]:
from urllib2 import urlopen as URL # Access internet resources
from bs4 import BeautifulSoup as BS # Parse HTML pages
import pandas as pd, numpy as np, re # importing some other packages

# Let's get some data for LaLa Land

In [2]:
# Open page
pg = 'http://www.imdb.com/title/' # root of the URL shared by IMDb title pages
tt = 'tt3783958' # IMDb id of the movie to scrape (this one is La La Land)
sc = URL(pg+tt).read() # download the raw HTML source of the title page
# Name the parser explicitly: otherwise BeautifulSoup picks whichever parser it
# considers best on this machine and emits a warning, so the same notebook can
# parse the page differently on another system.
soup = BS(sc, 'lxml') # parse the HTML into a searchable soup object



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


# Let's go to Chrome to inspect this page
![lalaland](./figures/lalaland.png)

# Getting average aggregate rating

In [3]:
# Collect every <div> whose class is imdbRating, then count the matches
rating = soup.find_all('div', attrs={'class': 'imdbRating'})
len(rating)

1

In [4]:
# Keep the first (and only) imdbRating <div>. The tag is looked up from `soup`
# again instead of indexing `rating`, so the cell can be re-run safely: once
# `rating` has been rebound to a single tag, `rating[0]` would be treated as an
# attribute lookup on that tag and raise KeyError.
rating = soup.find_all('div', {'class':'imdbRating'})[0]

In [5]:
rating # this is what the HTML code looks like
# notice there are several places where the data we want is located
# also, <span> tags hold a bunch of the data we want

<div class="imdbRating" itemprop="aggregateRating" itemscope="" itemtype="http://schema.org/AggregateRating">\n<div class="ratingValue">\n<strong title="8.1 based on 348,146 user ratings"><span itemprop="ratingValue">8.1</span></strong><span class="grey">/</span><span class="grey" itemprop="bestRating">10</span> </div>\n<a href="/title/tt3783958/ratings?ref_=tt_ov_rt"><span class="small" itemprop="ratingCount">348,146</span></a>\n<div class="hiddenImportant">\n<span itemprop="reviewCount">1,296 user</span>\n<span itemprop="reviewCount">639 critic</span>\n</div>\n</div>

In [6]:
rating.find_all('span') # list every <span> tag nested inside the rating block

[<span itemprop="ratingValue">8.1</span>,
 <span class="grey">/</span>,
 <span class="grey" itemprop="bestRating">10</span>,
 <span class="small" itemprop="ratingCount">348,146</span>,
 <span itemprop="reviewCount">1,296 user</span>,
 <span itemprop="reviewCount">639 critic</span>]

In [7]:
rating.find_all('span')[0].text # .text returns the string inside the first <span>

u'8.1'

In [8]:
# Pull the text out of every <span>; together these hold all the data we need
span_tags = rating.find_all('span')
ratingdata = [tag.text for tag in span_tags]
ratingdata

[u'8.1', u'/', u'10', u'348,146', u'1,296 user', u'639 critic']

In [9]:
# Drop the '/' separator entry; every remaining entry carries a number we want
ratingdata = [entry for entry in ratingdata if '/' not in entry]
ratingdata

[u'8.1', u'10', u'348,146', u'1,296 user', u'639 critic']

In [10]:
# How can we extract numbers only from a blob of text?
# 1. findall() returns every run of digits, dots and commas found in the string
# 2. keep the first run, remove its thousands separators, convert it to a float
blob = 'asfdjla34,412rqupoieqr31431.,341231r3'
number_runs = re.findall('[0-9.,]+', blob)
float(number_runs[0].replace(',', ''))

34412.0

In [11]:
# Same recipe as the single-string example, applied to every entry: take the
# first run of digits/dots/commas, remove the commas, convert to float
ratingdata = [
    float(re.findall('[0-9.,]+', entry)[0].replace(',', ''))
    for entry in ratingdata
]
ratingdata

[8.1, 10.0, 348146.0, 1296.0, 639.0]

In [12]:
# Names for the scraped numbers, listed in the order the numbers appear
labels = [
    'rating',
    'out_of',
    'volume',
    'userreviews',
    'criticreviews',
]

In [13]:
# Pair each label with the number sitting at the same position
moviedata = {label: value for label, value in zip(labels, ratingdata)}
moviedata

{'criticreviews': 639.0,
 'out_of': 10.0,
 'rating': 8.1,
 'userreviews': 1296.0,
 'volume': 348146.0}

# Make this into a function 

In [14]:
def imdb_rating_test(tt='tt3315342'):
    """Scrape the aggregate rating numbers from an IMDb title page.

    Parameters
    ----------
    tt : str
        IMDb title id (e.g. 'tt3783958'); it is appended to the title URL root.

    Returns
    -------
    dict
        Floats keyed by 'rating', 'out_of', 'volume', 'userreviews' and
        'criticreviews'. Labels are paired *by position* with the numeric
        <span> tags of the page's imdbRating block, so a page with fewer
        spans yields fewer keys.

    Raises
    ------
    IndexError
        If the page has no imdbRating block, or if one of the kept spans
        contains no digits.
    """
    pg = 'http://www.imdb.com/title/' # This is the root of the url for movies
    response = URL(pg+tt)
    try:
        sc = response.read() # read the page source
    finally:
        response.close() # always release the HTTP connection
    # Explicit parser: avoids BeautifulSoup's "no parser specified" warning and
    # keeps the parse identical on machines with a different set of parsers.
    soup = BS(sc, 'lxml')
    blocks = soup.find_all('div', {'class':'imdbRating'}) # Find div tags with class as imdbRating
    if not blocks:
        # Same exception type the bare [0] used to raise, but with a message
        # that says which title failed.
        raise IndexError('no imdbRating block found on the page for %r' % (tt,))
    rating = blocks[0]
    ratingdata = [s.text for s in rating.find_all('span')] # text of every span in the block
    ratingdata = [r for r in ratingdata if ('/' not in r)] # drop the '/' separator entry
    # First run of digits/dots/commas in each entry, commas removed, as a float
    ratingdata = [float(re.findall('[0-9.,]+',r)[0].replace(',', '')) for r in ratingdata]
    labels = ['rating','out_of', 'volume', 'userreviews', 'criticreviews']
    moviedata = dict(zip(labels, ratingdata))
    return moviedata

In [15]:
# Run the function on the La La Land id and display the resulting dictionary
movie_id = 'tt3783958'
moviedata = imdb_rating_test(tt=movie_id)
moviedata

{'criticreviews': 639.0,
 'out_of': 10.0,
 'rating': 8.1,
 'userreviews': 1296.0,
 'volume': 348146.0}

# Ok, we got the aggregate review data, what else can we get?

## Metacritic score?
![MetaCritic](./figures/lalaland_metacritic.png)

In [16]:
# Look up the <div> tags with class metacriticScore -- there is only one on the page
soup.find_all('div', attrs={'class': 'metacriticScore'})

[<div class="metacriticScore score_favorable titleReviewBarSubItem">\n<span>93</span>\n</div>]

In [17]:
# .text on the first match gives the score together with its line breaks
soup.find_all('div', attrs={'class': 'metacriticScore'})[0].text

u'\n93\n'

In [18]:
# .strip() removes the line breaks surrounding the score
soup.find_all('div', attrs={'class': 'metacriticScore'})[0].text.strip()

u'93'

In [19]:
# Take the Metacritic <div>, strip the line breaks (\n) around the score,
# convert it to float and insert it into the moviedata dictionary
metacritic_div = soup.find_all('div', attrs={'class': 'metacriticScore'})[0]
moviedata['metacritic'] = float(metacritic_div.text.strip())
moviedata

{'criticreviews': 639.0,
 'metacritic': 93.0,
 'out_of': 10.0,
 'rating': 8.1,
 'userreviews': 1296.0,
 'volume': 348146.0}

# Your turn
### First get box office data (budget, opening weekend, gross)
![BoxOffice](./figures/lalaland_boxoffice.png)

In [20]:
# Collect every <div> whose class is txt-block
txtblock = soup.find_all('div', attrs={'class': 'txt-block'})

In [21]:
# Text of the first <h4> inside the first txt-block
first_block = txtblock[0]
header = first_block.find_all('h4')[0].text

In [22]:
# Remove the header (plus the line break after it) from the block's text, then trim whitespace
header_line = header + u'\n'
txtblock[0].text.replace(header_line, '').strip()

u"Here's to the fools who dream."

In [23]:
out = list() # (header, text) pairs, one per txt-block that has an <h4>
for r in txtblock: # loop through txtblock
    h4_tags = r.find_all('h4')
    if not h4_tags:
        # No <h4> means there is no header to key on: skip the block explicitly
        # instead of hiding every possible error behind a bare `except: pass`
        # (which would also swallow KeyboardInterrupt and genuine bugs).
        continue
    header = h4_tags[0].text
    # Only a header that is immediately followed by a line break is removed;
    # otherwise the header text stays at the start of `dat`.
    dat = r.text.replace(header+u'\n', '').strip()
    out.append((header, dat))

In [24]:
out # display the collected (header, text) pairs

[(u'Taglines:', u"Here's to the fools who dream."),
 (u'Motion Picture Rating\n                    (MPAA)\n                ',
  u'Rated PG-13 for some language\n| \n See all certifications\xa0\xbb'),
 (u'Parents Guide:', u'View content advisory\xa0\xbb'),
 (u'Official Sites:',
  u'Official Facebook Page\n|\nOfficial site\n|\n\nSee more\xa0\xbb'),
 (u'Country:', u'USA\n|\nHong Kong'),
 (u'Language:', u'English'),
 (u'Release Date:',
  u'Release Date: 25 December 2016 (USA)\n    \nSee more\xa0\xbb'),
 (u'Also Known As:', u'Also Known As: LaLa Land\n      \nSee more\xa0\xbb'),
 (u'Filming Locations:', u'Los Angeles, California, USA\n\nSee more\xa0\xbb'),
 (u'Budget:', u'Budget:$30,000,000\n            (estimated)'),
 (u'Opening Weekend USA:',
  u'Opening Weekend USA: $881,104,\n9 December 2016, Limited Release'),
 (u'Gross USA:', u'Gross USA: $151,101,803'),
 (u'Cumulative Worldwide Gross:', u'Cumulative Worldwide Gross: $445,636,919'),
 (u'Production Co:',
  u'Summit Entertainment,      