In [1]:
from urllib2 import urlopen as URL # Access internet resources
from bs4 import BeautifulSoup as BS # Parse HTML pages
import pandas as pd, numpy as np, re # importing some other packages

# Let's get some data for LaLa Land

In [2]:
# Open page
pg = 'http://www.imdb.com/title/' # This is the root of the url for movies
tt = 'tt3783958' # this is the imdb id for a movie (this one is lalaland)
sc = URL(pg+tt).read() # read the page source
soup = BS(sc) # create soup object



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


# Let's go to chrome to inspect this page
![Chrome](https://www.dropbox.com/s/ayn7zdqfpsagxfc/lalaland.png?dl=1)

# Getting some data from this page

In [3]:
rating = soup.find_all('div', {'class':'imdbRating'}) # Find div tags with class as imdbRating
len(rating)

1

In [4]:
rating = rating[0] # since we know we only get one that fits this description, let's keep the first item

In [5]:
rating # this is what the HTML code looks like
# notice there are several places where the data wa want is located
# also, span tags a bunch of the data we want

<div class="imdbRating" itemprop="aggregateRating" itemscope="" itemtype="http://schema.org/AggregateRating">\n<div class="ratingValue">\n<strong title="8.2 based on 308,847 user ratings"><span itemprop="ratingValue">8.2</span></strong><span class="grey">/</span><span class="grey" itemprop="bestRating">10</span> </div>\n<a href="/title/tt3783958/ratings?ref_=tt_ov_rt"><span class="small" itemprop="ratingCount">308,847</span></a>\n<div class="hiddenImportant">\n<span itemprop="reviewCount">1,211 user</span>\n<span itemprop="reviewCount">618 critic</span>\n</div>\n</div>

In [6]:
rating.find_all('span')

[<span itemprop="ratingValue">8.2</span>,
 <span class="grey">/</span>,
 <span class="grey" itemprop="bestRating">10</span>,
 <span class="small" itemprop="ratingCount">308,847</span>,
 <span itemprop="reviewCount">1,211 user</span>,
 <span itemprop="reviewCount">618 critic</span>]

In [7]:
rating.find_all('span')[0].text

u'8.2'

In [8]:
ratingdata = [s.text for s in rating.find_all('span')] # seems to get all the data we need
ratingdata

[u'8.2', u'/', u'10', u'308,847', u'1,211 user', u'618 critic']

In [9]:
ratingdata = [r for r in ratingdata if ('/' not in r)] # let's just get rid of the entry we don't want
ratingdata

[u'8.2', u'10', u'308,847', u'1,211 user', u'618 critic']

In [10]:
# How can we extract  numbers only from a blob of text?
float(re.findall('[0-9.,]+','asfdjla34,412rqupoieqr31431.,341231r3')[0].replace(',', ''))

34412.0

In [11]:
ratingdata = [float(re.findall('[0-9.,]+',r)[0].replace(',', '')) for r in ratingdata] # why does this work?
ratingdata

[8.2, 10.0, 308847.0, 1211.0, 618.0]

In [12]:
labels = ['rating','out_of', 'volume', 'userreviews', 'criticreviews']

In [13]:
moviedata = dict(zip(labels, ratingdata))
moviedata

{'criticreviews': 618.0,
 'out_of': 10.0,
 'rating': 8.2,
 'userreviews': 1211.0,
 'volume': 308847.0}

In [28]:
title = soup.find_all('div', {'class':'title_wrapper'})[0]
moviename = u' '.join(title.find('h1').text.strip().split())

In [29]:
moviedata['movie'] = moviename
moviedata

{'criticreviews': 618.0,
 'movie': u'La La Land (2016)',
 'out_of': 10.0,
 'rating': 8.2,
 'userreviews': 1211.0,
 'volume': 308847.0}

# Make this into a function 

In [30]:
def imdb_rating_test(tt='tt3315342'):
    pg = 'http://www.imdb.com/title/' # This is the root of the url for movies
    sc = URL(pg+tt).read() # read the page source
    soup = BS(sc)
    rating = soup.find_all('div', {'class':'imdbRating'})[0] # Find div tags with class as imdbRating
    ratingdata = [s.text for s in rating.find_all('span')] # seems to get all the data we need
    ratingdata = [r for r in ratingdata if ('/' not in r)] # let's just get rid of the entry we don't want
    ratingdata = [float(re.findall('[0-9.,]+',r)[0].replace(',', '')) for r in ratingdata] # why does this work?
    labels = ['rating','out_of', 'volume', 'userreviews', 'criticreviews']
    moviedata = dict(zip(labels, ratingdata))
    title = soup.find('div', {'class':'title_wrapper'})
    moviename = u' '.join(title.find('h1').text.strip().split())
    moviedata['movie']=moviename
    return moviedata

In [31]:
moviedata = imdb_rating_test(tt='tt3783958')
moviedata

{'criticreviews': 618.0,
 'movie': u'La La Land (2016)',
 'out_of': 10.0,
 'rating': 8.2,
 'userreviews': 1211.0,
 'volume': 308847.0}

# Let's get a bunch of these

In [32]:
import pandas as pd
links = pd.read_csv('https://www.dropbox.com/s/eiadju0vt3wkjrd/links.csv?dl=1')

In [33]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [34]:
imdbids = ['tt'+str(l) for l in list(links.imdbId)]
len(imdbids)

9125

In [35]:
# We can write a loop to do this...
out = dict()
for l in imdbids[:10]: # let's get the first 10 for now
    out[l]=imdb_rating_test(tt=l)

In [36]:
df = pd.DataFrame(out).transpose().reset_index()
df.rename(columns={'index':'tt'}, inplace = True)
df.head()

Unnamed: 0,tt,criticreviews,movie,out_of,rating,userreviews,volume
0,tt112302,14,Tom and Huck (1995),10,5.6,20,8149
1,tt113041,37,Father of the Bride Part II (1995),10,5.9,66,28656
2,tt113189,145,GoldenEye (1995),10,7.2,429,209584
3,tt113228,26,Grumpier Old Men (1995),10,6.6,60,20718
4,tt113277,183,Heat (1995),10,8.2,913,470061


In [60]:
from ipyparallel import Client
cli = Client('/home/ec2-user/.starcluster/ipcluster/SecurityGroup:@sc-smallcluster2-us-west-2.json', 
             sshkey='/home/ec2-user/.ssh/mykey.rsa', packer = 'json')
lbview = cli.load_balanced_view() #define parallel block type (balanced)
pnodes = len(cli.ids)
print pnodes
# cli = Client()
# lbview = cli.load_balanced_view() #define parallel block type (balanced)
# pnodes = len(cli.ids)
# print pnodes

10


In [61]:
@lbview.parallel(block=True)
def imdb_rating_par(ttlist):
    from urllib2 import urlopen as URL
    from bs4 import BeautifulSoup as BS
    import re
    pg = 'http://www.imdb.com/title/' # This is the root of the url for movies
    output = list()
    for tt in ttlist:
        try:
            sc = URL(pg+tt).read() # read the page source
            soup = BS(sc)
            rating = soup.find_all('div', {'class':'imdbRating'})[0] # Find div tags with class as imdbRating
            ratingdata = [s.text for s in rating.find_all('span')] # seems to get all the data we need
            ratingdata = [r for r in ratingdata if ('/' not in r)] # let's just get rid of the entry we don't want
            ratingdata = [float(re.findall('[0-9.,]+',r)[0].replace(',', '')) for r in ratingdata] # why does this work?
            labels = ['rating','out_of', 'volume', 'userreviews', 'criticreviews']
            moviedata = dict(zip(labels, ratingdata))
            title = soup.find('div', {'class':'title_wrapper'})
            moviename = u' '.join(title.find('h1').text.strip().split())
            moviedata['movie']=moviename
            output.append((tt, moviedata))
        except:
            pass
    return output

In [62]:
from itertools import izip_longest
def grouper(iterable, n, fillvalue=None):
    args = [iter(iterable)] * n
    return izip_longest(*args, fillvalue=fillvalue)

In [63]:
inputs = imdbids[:100]
inputs = list(grouper(inputs, int(len(inputs)/pnodes+1), fillvalue=None))

In [64]:
outputs = imdb_rating_par.map(inputs) # execute in parallel

In [66]:
from itertools import chain
df = pd.DataFrame(dict(list(chain(*outputs)))).transpose().reset_index()
df.rename(columns = {'index':'tt'}, inplace = True)
df.head()

Unnamed: 0,tt,criticreviews,movie,out_of,rating,userreviews,volume
0,tt110299,17,Lamerica (1994),10,7.6,12,1824
1,tt110877,36,Il Postino (1994),10,7.7,108,27999
2,tt111173,13,Shopping (1994),10,5.4,39,2234
3,tt112281,62,Ace Ventura: When Nature Calls (1995),10,6.3,180,172301
4,tt112286,7,Across the Sea of Time (1995),10,6.4,9,217


# Ok, we got the aggregate review data, what else can we get?

## Metacritic score?
![MetaCritic](https://www.dropbox.com/s/baurxvpegadz8kp/lalaland_metacritic.png?dl=1)

In [None]:
soup.find_all('div', {'class':'metacriticScore'}) # looks like there's only one of these on the page

In [None]:
soup.find_all('div', {'class':'metacriticScore'})[0].text # we can get the text like this

In [None]:
soup.find_all('div', {'class':'metacriticScore'})[0].text.strip() # we can strip the extra stuff

In [None]:
moviedata['metacritic'] = float(soup.find_all('div', {'class':'metacriticScore'})[0].text.strip()) 
# strip the line breaks (\n) and convert to float
# insert into moviedata dictionary
moviedata

# Your turn
### First get box office data (budget, opening weekend, gross)
![BoxOffice](https://www.dropbox.com/s/uuxnqlxt2rkni1n/lalaland_boxoffice.png?dl=1)

In [None]:
txtblock = soup.find_all('div', {'class':'txt-block'})

In [None]:
header = txtblock[0].find_all('h4')[0].text

In [None]:
txtblock[0].text.replace(header+u'\n', '').strip()

In [None]:
out = list() # creating empty list
for r in txtblock: # loop through txtblock
    try:
        header = r.find_all('h4')[0].text
        dat = r.text.replace(header+u'\n', '').strip()
        out.append((header, dat))
    except:
        pass

In [None]:
out

In [None]:
### Your code, insert lines here as needed