# Web scraping using `BeautifulSoup` and accessing APIs

In [1]:
import json
import numpy as np
import pandas as pd
import re
import requests

from bs4 import BeautifulSoup
from time import sleep

## Web scraping using `BeautifulSoup`

### Read HTML

In [2]:
# Fetch the film page. The timeout stops a stalled connection from hanging the
# notebook, and raise_for_status() surfaces 4xx/5xx responses here rather than
# letting an error page be parsed further down.
response = requests.get('http://www.imdb.com/title/tt2084970/', timeout=10)
response.raise_for_status()
html = response.text

In [3]:
print(html[:2000])  # Preview only: the full page is far too long to read in the output










<!DOCTYPE html>
<html
xmlns:og="http://ogp.me/ns#"
xmlns:fb="http://www.facebook.com/2008/fbml">
    <head>
        <meta charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=edge">

    <meta name="apple-itunes-app" content="app-id=342792525, app-argument=imdb:///title/tt2084970?src=mdot">
            <script type="text/javascript">var ue_t0=window.ue_t0||+new Date();</script>
            <script type="text/javascript">
                var ue_mid = "A1EVAM02EL8SFB"; 
                var ue_sn = "www.imdb.com";  
                var ue_furl = "fls-na.amazon.com";
                var ue_sid = "000-0000000-0000000";
                var ue_id = "15R3ZEA9DWNEPD6ATXHH";
                (function(e){var c=e;var a=c.ue||{};a.main_scope="mainscopecsm";a.q=[];a.t0=c.ue_t0||+new Date();a.d=g;function g(h){return +new Date()-(h?0:a.t0)}function d(h){return function(){a.q.push({n:h,a:arguments,t:a.d()})}}function b(m,l,h,j,i){var k={m:m,f:l,l:h,c:""+j,err:i,fromOnError:

### Parse HTML into a `BeautifulSoup` object

In [4]:
soup = BeautifulSoup(html, 'lxml')

### Retrieve the title

In [9]:
soup.find(name='h1', attrs={'itemprop': 'name'})

<h1 class="" itemprop="name">The Imitation Game <span id="titleYear">(<a href="/year/2014/?ref_=tt_ov_inf">2014</a>)</span> </h1>

In [11]:
# string=True matches text nodes; recursive=False restricts the search to the
# <h1>'s direct children, so the year inside the nested <span> is skipped.
soup.find(name='h1', attrs={'itemprop': 'name'}).find(string=True, recursive=False)

'The Imitation Game\xa0'

In [12]:
# Take the <h1>'s own text node and strip the trailing non-breaking space.
soup.find(name='h1', attrs={'itemprop': 'name'}).find(string=True, recursive=False).strip()

'The Imitation Game'

### Retrieve the genre(s)

In [13]:
soup.find_all(name='span', attrs={'itemprop': 'genre'})

[<span class="itemprop" itemprop="genre">Biography</span>,
 <span class="itemprop" itemprop="genre">Drama</span>,
 <span class="itemprop" itemprop="genre">Thriller</span>]

In [14]:
[x.text for x in soup.find_all('span', itemprop='genre')]

['Biography', 'Drama', 'Thriller']

### Retrieve the description

In [15]:
soup.find('div', itemprop='description').text.strip()

'During World War II, mathematician Alan Turing tries to crack the enigma code with help from fellow mathematicians.'

### Retrieve the duration (in minutes)

In [16]:
soup.find('time', itemprop='duration')

<time datetime="PT114M" itemprop="duration">
                        1h 54min
                    </time>

In [21]:
re.findall(r'(\d+)', soup.find('time', itemprop='duration')['datetime'])

# Regular expression: \d matches a single digit
#                     +  means "one or more" of the preceding token
# NOTE(review): this relies on a minutes-only value such as 'PT114M'; a value
# like 'PT1H54M' would yield ['1', '54'] instead - confirm the format holds.

['114']

In [22]:
int(re.findall(r'(\d+)', soup.find('time', itemprop='duration')['datetime'])[0])

114

### Retrieve the content rating

In [38]:
soup.find('meta', itemprop='contentRating')

<meta content="12A" itemprop="contentRating"/>

In [11]:
soup.find('meta', itemprop='contentRating')['content']

'12A'

### Retrieve the rating

In [40]:
soup.find('span', itemprop='ratingValue')

<span itemprop="ratingValue">8.0</span>

In [41]:
float(soup.find('span', itemprop='ratingValue').text)

8.0

### Retrieve the rating and number of user ratings

In [25]:
soup.find('div', class_='ratingValue')

<div class="ratingValue">
<strong title="8.0 based on 547,588 user ratings"><span itemprop="ratingValue">8.0</span></strong><span class="grey">/</span><span class="grey" itemprop="bestRating">10</span> </div>

In [26]:
soup.find('div', class_='ratingValue').strong['title']

'8.0 based on 547,588 user ratings'

In [27]:
soup.find('div', 'ratingValue').strong['title']

'8.0 based on 547,588 user ratings'

In [77]:
re.findall(r'^([\d\.]+).+?([\d,]+)', soup.find('div', 'ratingValue').strong['title'])

[('8.0', '547,463')]

In [29]:
# The title attribute looks like '8.0 based on 547,588 user ratings':
#   ^([\d\.]+)  captures the leading score,
#   .+?         lazily skips the text in between,
#   ([\d,]+)    captures the comma-grouped number of ratings.
rating, n = re.findall(r'^([\d\.]+).+?([\d,]+)', soup.find('div', 'ratingValue').strong['title'])[0]
rating = float(rating)
n = int(n.replace(',', ''))  # Drop the thousands separators before converting
rating, n

(8.0, 547588)

### Define a function to do all of the above given an IMDb ID

In [30]:
def scrape_film_info(imdb_id, timeout=10):
    """Scrape headline details for one film from its IMDb title page.

    Parameters
    ----------
    imdb_id : str
        IMDb title identifier, e.g. 'tt2084970'.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).

    Returns
    -------
    dict
        Keys, in insertion order: 'title' (str), 'genres' (list of str),
        'description' (str), 'duration' (int, the first run of digits in the
        duration tag's ``datetime`` attribute, e.g. 114 for 'PT114M'),
        'content_rating' (str), 'rating' (float) and 'n' (int, the number of
        user ratings).

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails, times out, or returns a 4xx/5xx status.
    AttributeError, TypeError, IndexError
        If the page does not contain one of the expected elements or the
        rating text does not match the expected pattern.
    """
    response = requests.get('http://www.imdb.com/title/' + imdb_id, timeout=timeout)
    # Stop here on an HTTP error instead of failing obscurely while parsing
    # an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    info = {}
    # string=True with recursive=False picks the <h1>'s own text node only,
    # leaving out text held in nested tags.
    info['title'] = (
        soup.find('h1', itemprop='name')
        .find(string=True, recursive=False)
        .strip()
    )
    info['genres'] = [tag.text for tag in soup.find_all('span', itemprop='genre')]
    info['description'] = soup.find('div', itemprop='description').text.strip()

    duration_attr = soup.find('time', itemprop='duration')['datetime']
    info['duration'] = int(re.findall(r'(\d+)', duration_attr)[0])

    info['content_rating'] = soup.find('meta', itemprop='contentRating')['content']

    # The <strong> title reads like '8.0 based on 547,588 user ratings':
    # capture the leading score and the comma-grouped count.
    rating_title = soup.find('div', 'ratingValue').strong['title']
    rating, n = re.findall(r'^([\d\.]+).+?([\d,]+)', rating_title)[0]
    info['rating'] = float(rating)
    info['n'] = int(n.replace(',', ''))
    return info

In [31]:
scrape_film_info('tt2084970')

{'content_rating': '12A',
 'description': 'During World War II, mathematician Alan Turing tries to crack the enigma code with help from fellow mathematicians.',
 'duration': 114,
 'genres': ['Biography', 'Drama', 'Thriller'],
 'n': 547588,
 'rating': 8.0,
 'title': 'The Imitation Game'}

### Get the *Top 250 as rated by IMDb Users* list

In [32]:
# Fetch the chart page with a timeout and surface HTTP errors before parsing.
top_response = requests.get('http://www.imdb.com/chart/top', timeout=10)
top_response.raise_for_status()
soup = BeautifulSoup(top_response.text, 'lxml')

### Retrieve the list of IMDb IDs

In [33]:
# Every chart row has a <td class="titleColumn"> cell; the IMDb ID is the
# 'tt' + digits path segment of that cell's link.
title_cells = soup.find_all(name='td', attrs={'class': 'titleColumn'})
# Keep only the top 10 films; slicing first means only those rows are parsed.
imdb_ids = [re.findall(r'/(tt[0-9]+)/', cell.a['href'])[0] for cell in title_cells[:10]]
imdb_ids

['tt0111161',
 'tt0068646',
 'tt0071562',
 'tt0468569',
 'tt0050083',
 'tt0108052',
 'tt0110912',
 'tt0167260',
 'tt0060196',
 'tt0137523']

### Call `scrape_film_info` for each ID

In [34]:
# Scrape each film in turn, collecting one info dict per IMDb ID.
films = []
for imdb_id in imdb_ids:
    films.append(scrape_film_info(imdb_id))
    sleep(1)  # NOTE(review): presumably throttles requests to go easy on the server - confirm

In [35]:
films[:2]

[{'content_rating': '15',
  'description': 'Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.',
  'duration': 142,
  'genres': ['Crime', 'Drama'],
  'n': 1860959,
  'rating': 9.3,
  'title': 'The Shawshank Redemption'},
 {'content_rating': 'X',
  'description': 'The aging patriarch of an organized crime dynasty transfers control of his clandestine empire to his reluctant son.',
  'duration': 175,
  'genres': ['Crime', 'Drama'],
  'n': 1270470,
  'rating': 9.2,
  'title': 'The Godfather'}]

In [36]:
films = pd.DataFrame(films, index=imdb_ids)  # Convert to a DataFrame
films

Unnamed: 0,content_rating,description,duration,genres,n,rating,title
tt0111161,15,Two imprisoned men bond over a number of years...,142,"[Crime, Drama]",1860959,9.3,The Shawshank Redemption
tt0068646,X,The aging patriarch of an organized crime dyna...,175,"[Crime, Drama]",1270470,9.2,The Godfather
tt0071562,X,The early life and career of Vito Corleone in ...,202,"[Crime, Drama]",875598,9.0,The Godfather: Part II
tt0468569,12,When the menace known as the Joker emerges fro...,152,"[Action, Crime, Drama]",1838986,9.0,The Dark Knight
tt0050083,U,A jury holdout attempts to prevent a miscarria...,96,"[Crime, Drama]",508371,8.9,12 Angry Men
tt0108052,15,"In German-occupied Poland during World War II,...",195,"[Biography, Drama, History]",955721,8.9,Schindler's List
tt0110912,18,"The lives of two mob hit men, a boxer, a gangs...",154,"[Crime, Drama]",1456216,8.9,Pulp Fiction
tt0167260,12A,Gandalf and Aragorn lead the World of Men agai...,201,"[Adventure, Drama, Fantasy]",1331575,8.9,The Lord of the Rings: The Return of the King
tt0060196,X,A bounty hunting scam joins two men in an unea...,148,[Western],551687,8.9,"The Good, the Bad and the Ugly"
tt0137523,18,"An insomniac office worker, looking for a way ...",139,[Drama],1491516,8.8,Fight Club


## Accessing APIs

### Send API request

In [37]:
# Bulk lookup: POST a JSON body listing the postcodes. The timeout keeps a
# stalled connection from hanging the notebook.
req = requests.post('http://api.postcodes.io/postcodes',
                    json={'postcodes': ['E1 7PT', 'EC2M 7PP']},
                    timeout=10)

### Check HTTP status code (2xx = success, 4xx = client error, 5xx = server error)

In [38]:
req.status_code

200

### Get the raw response

In [39]:
req.text

'{"status":200,"result":[{"query":"E1 7PT","result":{"postcode":"E1 7PT","quality":1,"eastings":533842,"northings":181367,"country":"England","nhs_ha":"London","longitude":-0.0725132699729764,"latitude":51.5153793466949,"european_electoral_region":"London","primary_care_trust":"Tower Hamlets","region":"London","lsoa":"Tower Hamlets 015D","msoa":"Tower Hamlets 015","incode":"7PT","outcode":"E1","parliamentary_constituency":"Bethnal Green and Bow","admin_district":"Tower Hamlets","parish":"Tower Hamlets, unparished area","admin_county":null,"admin_ward":"Spitalfields & Banglatown","ccg":"NHS Tower Hamlets","nuts":"Tower Hamlets","codes":{"admin_district":"E09000030","admin_county":"E99999999","admin_ward":"E05009333","parish":"E43000220","parliamentary_constituency":"E14000555","ccg":"E38000186","nuts":"UKI42"}}},{"query":"EC2M 7PP","result":{"postcode":"EC2M 7PP","quality":1,"eastings":533204,"northings":181534,"country":"England","nhs_ha":"London","longitude":-0.0816392701323494,"latit

### Decode the JSON response into a dictionary

In [40]:
res = req.json()

In [41]:
res

{'result': [{'query': 'E1 7PT',
   'result': {'admin_county': None,
    'admin_district': 'Tower Hamlets',
    'admin_ward': 'Spitalfields & Banglatown',
    'ccg': 'NHS Tower Hamlets',
    'codes': {'admin_county': 'E99999999',
     'admin_district': 'E09000030',
     'admin_ward': 'E05009333',
     'ccg': 'E38000186',
     'nuts': 'UKI42',
     'parish': 'E43000220',
     'parliamentary_constituency': 'E14000555'},
    'country': 'England',
    'eastings': 533842,
    'european_electoral_region': 'London',
    'incode': '7PT',
    'latitude': 51.5153793466949,
    'longitude': -0.0725132699729764,
    'lsoa': 'Tower Hamlets 015D',
    'msoa': 'Tower Hamlets 015',
    'nhs_ha': 'London',
    'northings': 181367,
    'nuts': 'Tower Hamlets',
    'outcode': 'E1',
    'parish': 'Tower Hamlets, unparished area',
    'parliamentary_constituency': 'Bethnal Green and Bow',
    'postcode': 'E1 7PT',
    'primary_care_trust': 'Tower Hamlets',
    'quality': 1,
    'region': 'London'}},
  {'que

In [42]:
res['result'][0]['result']['country']

'England'