# Getting data from single movie in IMDB

In [2]:
# Fetch the first results page of IMDb's advanced title search for 2019 releases,
# sorted by number of votes in descending order.
# Tutorial followed: https://www.dataquest.io/blog/web-scraping-beautifulsoup/

from requests import get

url ='http://www.imdb.com/search/title?release_date=2019&sort=num_votes,desc&page=1'
# Without a timeout, get() can block forever if the server never answers.
r = get(url, timeout=30)
# Fail loudly on a 4xx/5xx answer instead of parsing an error page in later cells.
r.raise_for_status()
print(r.text[:500])  # peek at the first 500 characters of the response body




<!DOCTYPE html>
<html
    xmlns:og="http://ogp.me/ns#"
    xmlns:fb="http://www.facebook.com/2008/fbml">
    <head>
         
        <meta charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=edge">

    <meta name="apple-itunes-app" content="app-id=342792525, app-argument=imdb:///?src=mdot">



        <script type="text/javascript">var IMDbTimer={starttime: new Date().getTime(),pt:'java'};</script>

<script>
    if (typeof uet == 'function') {
      uet("bb", "LoadTitle"


In [3]:
# After inspecting the page's HTML in the browser, parse the response body with
# Beautiful Soup so the 50 movie <div> containers can be extracted from it.
# 'html.parser' selects Python's built-in HTML parser.

from bs4 import BeautifulSoup
html_soup = BeautifulSoup(r.text, 'html.parser')
type(html_soup)


bs4.BeautifulSoup

In [4]:
# Inspecting the page shows that each of the 50 movies sits in its own <div> whose
# class attribute is 'lister-item mode-advanced'; find_all() collects all of them.

container_class = 'lister-item mode-advanced'
movie_containers = html_soup.find_all('div', class_=container_class)
# Report the result type and the number of containers found, one per line.
print(type(movie_containers), len(movie_containers), sep='\n')

<class 'bs4.element.ResultSet'>
50


In [5]:
# Now select only the first container and extract, in turn, each item of interest:
#     - the name of the movie
#     - the release year
#     - the IMDb rating
#     - the Metascore
#     - the number of votes

# movie_containers can be indexed like a list, so [0] is the first movie on the page

first_movie = movie_containers[0]

In [6]:
# Inspect the container's (long) HTML again to locate the information we need.
# The title is under lister-item-content, then lister-item-header: the name is inside
# an anchor tag <a>, which is nested in a header tag <h3>, which is nested in a <div> tag.


# Using a tag name as an attribute selects only the first tag with that name,
# so first_movie.div returns just the first <div> inside the container, as shown.
first_movie.div  # tag object


<div class="lister-top-right">
<div class="ribbonize" data-caller="filmosearch" data-tconst="tt7286456"></div>
</div>

In [7]:
# Let's try accessing the first anchor tag <a>.
first_movie.a


# This does not give us the movie's name: the first <a> (somewhere within the
# second <div>) wraps the poster <img>, not the title text.

<a href="/title/tt7286456/"> <img alt="小丑" class="loadlate" data-tconst="tt7286456" height="98" loadlate="https://m.media-amazon.com/images/M/MV5BNGVjNWI4ZGUtNzE0MS00YTJmLWE0ZDctN2ZiYTk2YmI3NTYyXkEyXkFqcGdeQXVyMTkxNjUyNQ@@._V1_UX67_CR0,0,67,98_AL_.jpg" src="https://m.media-amazon.com/images/G/01/imdb/images/nopicture/large/film-184890147._CB466725069_.png" width="67"/>
</a>

In [8]:
# The first <h3> tag, however, brings us very close to the title.
first_movie.h3

<h3 class="lister-item-header">
<span class="lister-item-index unbold text-primary">1.</span>
<a href="/title/tt7286456/">小丑</a>
<span class="lister-item-year text-muted unbold">(2019)</span>
</h3>

In [9]:
# The first <a> inside the <h3> is the title link; now we only need the text of this tag.
first_movie.h3.a

<a href="/title/tt7286456/">小丑</a>

In [10]:
# .text gives the string inside the title's <a> tag, i.e. the movie name.
first_name = first_movie.h3.a.text
first_name  # perfect!

'小丑'

In [11]:
# Next, the year of release. Inspecting the page shows it lives in a <span> inside
# the <h3>, but the <h3> holds two <span> elements and attribute access
# (first_movie.h3.span) only returns the first one. find() lets us filter by
# attributes instead; it returns the first match itself, whereas find_all(limit=1)
# would return that same match wrapped in a list.

# The year <span> has class="lister-item-year text-muted unbold". Matching on the single
# distinguishing class 'lister-item-year' does not depend on the exact order of the
# class names, and it is the selector the page-level scraping loop uses too.

first_year = first_movie.h3.find('span', class_ = 'lister-item-year').text
first_year

'(2019)'

In [12]:
# Now the IMDb rating. It is nested as: <div class="ratings-bar">, then
# <div class="inline-block ratings-imdb-rating">, and finally a <strong> tag.

rating_div = first_movie.find('div', class_='inline-block ratings-imdb-rating')
first_rating = rating_div.strong.text
first_rating

# Note: walking the full path like this is the long and tedious way of doing it.

'8.5'

In [13]:
# Shortcut: attribute access returns the first <strong> tag in the container,
# which gives the 8.5 rating right away.
first_movie.strong

<strong>8.5</strong>

In [14]:
# Now the Metascore, found in another <span>.
# Its class attribute has two parts: 'metascore' plus a qualifier (favourable, mixed or
# unfavourable) that only says whether the score is good or bad (it changes the colour).
# Searching for the exact string 'metascore mixed' returns None for every movie whose
# score is not "mixed", and None.text then raises AttributeError. What all Metascore
# spans share is the 'metascore' class, so match on that; it is also the selector
# used when scraping the entire page.
first_mscore = first_movie.find('span', class_ = 'metascore')
first_mscore = int(first_mscore.text)
print(first_mscore)

59


In [15]:
# Now the number of votes.
# This <span> is not identified by a class but by a 'name' attribute with the value 'nv'.
# The name attribute is different from the class attribute; Beautiful Soup can match on
# any attribute: find() and find_all() accept a parameter named `attrs`, a dictionary
# of the attribute names and values we are searching for.


first_vote = first_movie.find('span', attrs = {'name':'nv'})
first_vote

<span data-value="853238" name="nv">853,238</span>

In [16]:
# The <span name="nv"> tag carries the raw count in its 'data-value' attribute, so
# reading that instead of .text means no thousands separator has to be removed
# before converting to int.
# The tag is looked up from first_movie again rather than read from first_vote:
# overwriting first_vote with an int while reading from it would make this cell
# fail with a TypeError the second time it is run.
first_vote = int(first_movie.find('span', attrs = {'name':'nv'})['data-value'])
first_vote

853238

#  script for single page

<blockquote>
    Before parsing a whole page, note that some movies don't have a Metascore, so we need a condition that lets us skip movies without one.
</blockquote>

<blockquote>
    We see that the Metascore section is contained within a 'div' tag. Its class attribute has two values: inline-block and the distinctive ratings-metascore.
</blockquote>

In [17]:
# Use find() to search a movie container for a <div> carrying that distinctive class.
# If find() doesn't find anything it returns None; the scraping loop uses this in an
# `if` condition to control whether a movie is scraped.

# The fourth movie on this page (Chernobyl) does not have a Metascore.

fourth_movie_mscore = movie_containers[3].find('div', class_ = 'ratings-metascore')
type(fourth_movie_mscore)

NoneType

In [18]:
# Put the single-movie steps together, as compactly as possible, and loop over all
# 50 containers of the page.

# Lists that accumulate one value per scraped movie
names = []
years = []
ratings = []
metascores = []
votes = []

for container in movie_containers:
    # Skip movies without a Metascore: find() returns None when the div is absent
    if container.find('div', class_ = 'ratings-metascore') is None:
        continue

    # name of the movie
    names.append(container.h3.a.text)
    # release year, kept exactly as displayed on the page, e.g. '(2019)'
    years.append(container.h3.find('span', class_ = 'lister-item-year').text)
    # rating has a decimal part, so store it as a float
    ratings.append(float(container.strong.text))
    # Metascore, converted to int so the column is numeric rather than text
    # (the multi-page loop stores it as int as well)
    metascores.append(int(container.find('span', class_ = 'metascore').text))
    # votes: 'data-value' holds the count without thousands separators; store as int
    votes.append(int(container.find('span', attrs = {'name':'nv'})['data-value']))
        

In [19]:
# Assemble the scraped lists into a DataFrame, one column per field.
import pandas as pd

column_names = ['movie', 'year', 'rating', 'metascore', 'votes']
column_values = [names, years, ratings, metascores, votes]
df = pd.DataFrame(dict(zip(column_names, column_values)))

In [20]:
# Display the table built from the single scraped page
df

Unnamed: 0,movie,year,rating,metascore,votes
0,小丑,(2019),8.5,59,853238
1,復仇者聯盟：終局之戰,(2019),8.4,78,753959
2,從前，有個好萊塢,(2019),7.6,83,506314
3,寄生上流,(2019),8.6,96,469548
4,驚奇隊長,(2019),6.9,64,422422
5,鋒迴路轉,(2019),7.9,82,376294
6,1917,(2019),8.3,78,362015
7,STAR WARS：天行者的崛起,(2019),6.6,53,346930
8,蜘蛛人：離家日,(2019),7.5,69,304373
9,愛爾蘭人,(2019),7.9,94,304252


# scraping more than one page


In [21]:
# Scraping more than one page needs three additions:
#     1. make all the requests inside one loop
#     2. control the loop's rate to avoid bombarding the server with requests
#     3. monitor the loop while it runs

# Example URL of a later results page:
# https://www.imdb.com/search/title/?release_date=2019-01-01,2019-12-31&sort=num_votes,desc&start=51&ref_=adv_nxt
# The URLs follow a pattern: the `start` parameter begins at 1 and grows in
# increments of 50, so build the list of those start values.
import numpy as np

first_start, stop_before, page_step = 1, 500, 50
pages = list(range(first_start, stop_before, page_step))
pages  # 10 start values, i.e. 10 pages




[1, 51, 101, 151, 201, 251, 301, 351, 401, 451]

In [41]:
# We also need the first and last day of every year to scrape, formatted as
# 'YYYY-MM-DD' strings for the release_date URL parameter.
# The lists are built with the standard library only: the previous pandas version
# relied on the 'Y' frequency alias, which recent pandas releases deprecate.
import datetime
base = datetime.datetime(2010,1,1)
n_years = 10

scrape_years = range(base.year, base.year + n_years)
start_dates = [datetime.date(y, 1, 1).isoformat() for y in scrape_years]
end_dates = [datetime.date(y, 12, 31).isoformat() for y in scrape_years]
start_dates #check

# 10 years with 10 result pages each: in total we will be scraping 100 pages

['2010-01-01',
 '2011-01-01',
 '2012-01-01',
 '2013-01-01',
 '2014-01-01',
 '2015-01-01',
 '2016-01-01',
 '2017-01-01',
 '2018-01-01',
 '2019-01-01']

In [43]:
# Join each year's first and last day into the 'start,end' value that the
# release_date URL parameter takes.
year_url = [first_day + ',' + end_dates[idx] for idx, first_day in enumerate(start_dates)]


year_url

['2010-01-01,2010-12-31',
 '2011-01-01,2011-12-31',
 '2012-01-01,2012-12-31',
 '2013-01-01,2013-12-31',
 '2014-01-01,2014-12-31',
 '2015-01-01,2015-12-31',
 '2016-01-01,2016-12-31',
 '2017-01-01,2017-12-31',
 '2018-01-01,2018-12-31',
 '2019-01-01,2019-12-31']

# Controlling crawl-rate

In [24]:
# Controlling the crawl rate matters both for us and for the website: hammering the
# server with many requests per second might get our IP address banned, and it
# disrupts the website's ability to respond to other users' requests.

# We control the loop's rate with the sleep() function from Python's time module.
# sleep() pauses the execution of the loop for a specified number of seconds.
# To mimic human behaviour, we vary the waiting time between requests with randint()
# from Python's random module. randint() generates integers within a specified interval.

# importing libraries
from time import sleep
from random import randint

In [25]:
# Since we are scraping a lot of pages, it is best to monitor the process.
# We will monitor:
#     1. the frequency of requests, to make sure our program is not overloading the server;
#     2. the number of requests, so the loop can be halted if it exceeds the expected count;
#     3. the status code of each request, to make sure the server is sending back correct responses.

# To get a frequency value, we divide the number of requests by the time elapsed since the first request,
# similar to computing the speed of a vehicle (distance divided by time).

In [26]:
# Dry run of the monitoring logic: note the starting time, then after every simulated
# request report the request count and the average number of requests per second.
from time import time

start_time = time()
requests = 0
for requests in range(1, 6):
    # A request would go here
    sleep(randint(1, 3))
    elapsed_time = time() - start_time
    report = 'Request: {}; Frequency: {} requests/s'.format(requests, requests / elapsed_time)
    print(report)
    

Request: 1; Frequency: 0.4999496391481066 requests/s
Request: 2; Frequency: 0.66644892918683 requests/s
Request: 3; Frequency: 0.7496992215239955 requests/s
Request: 4; Frequency: 0.6664186182341179 requests/s
Request: 5; Frequency: 0.7140180802694945 requests/s


In [30]:
# To keep the output tidy, clear it after each iteration so that only the information
# about the most recent request is shown. With wait=True, clear_output() waits until
# new output is available before clearing, so the last message stays visible.

# clear_output belongs to the public IPython.display API; importing it from
# IPython.core.display is deprecated.
from IPython.display import clear_output
start_time = time()
requests = 0
for _ in range(5):
    # A request would go here
    requests += 1
    sleep(randint(1,3))
    current_time = time()
    elapsed_time = current_time - start_time
    print('Request: {}; Frequency: {} request/s'.format(requests, requests/elapsed_time))
    clear_output(wait = True)

Request: 5; Frequency: 0.3843617349714595 request/s


In [34]:
# A successful request is indicated by status code 200; for any other code the
# scraping loop will raise a warning with warn() from the warnings module.
# Simulate such a warning here to see what it looks like.
from warnings import warn
warn("Warning Simulation")

  after removing the cwd from sys.path.


# Now we piece everything together

In [57]:
# Scrape every result page of every year.
#
# Fixes over the first version of this cell:
#   * the outer loop variable is no longer called `year`; the per-movie
#     `year = ...` assignment used to overwrite it, so every page after the first
#     one of each year was requested with release_date set to text like '(2010)'
#     instead of the 'start,end' date range;
#   * the safety stop leaves BOTH loops (a bare `break` only left the page loop);
#   * the request limit is derived from the inputs instead of a hard-coded,
#     off-by-one 101.

# Reset the accumulators so results from the single-page run are not mixed in
names = []
years = []
ratings = []
metascores = []
votes = []

# Monitoring state
start_time = time()
requests = 0
# One request per (year, page) pair; more than that means the loop misbehaved
expected_requests = len(year_url) * len(pages)
stop_scraping = False

# loop over the release-date ranges, one per year
for date_range in year_url:

    # iterate over all result pages of that year
    for page in pages:

        # make a get request
        r = get('https://www.imdb.com/search/title/?release_date=' + date_range
                + '&sort=num_votes,desc&start=' + str(page) + '&ref_=adv_nxt')

        # pause for a random 8-15 seconds so the server is not hammered
        sleep(randint(8,15))

        # monitor requests: running count and average frequency since the start
        requests += 1
        elapsed_time = time() - start_time
        print('Request: {}; Frequency: {} request/s'.format(requests, requests/elapsed_time))
        clear_output(wait=True)

        # throw a warning for any non-200 status
        if r.status_code != 200:
            warn('Request: {}; Status code: {}'.format(requests, r.status_code))

        # safety net: stop everything if more requests were made than expected
        if requests > expected_requests:
            warn('Number of requess was greater than expected.')
            stop_scraping = True
            break

        # parse content
        soup = BeautifulSoup(r.text, 'html.parser')

        # select the movie containers of this page (50 on a full page)
        movie_containers = soup.find_all('div', class_ = 'lister-item mode-advanced')

        # loop for the info of each movie
        for container in movie_containers:

            # only scrape movies that have a Metascore
            if container.find('div', class_ = 'ratings-metascore') is not None:

                # name
                name = container.h3.a.text
                names.append(name)

                # release year as displayed, e.g. '(2010)'; deliberately NOT named
                # `year`/`date_range` so the URL value of the outer loop stays intact
                release_year = container.h3.find('span', class_ = 'lister-item-year').text
                years.append(release_year)

                # rating
                rating = float(container.strong.text)
                ratings.append(rating)

                # metascore
                score = container.find('span', class_ = 'metascore').text
                metascores.append(int(score))

                # votes: 'data-value' holds the count without thousands separators
                vote = container.find('span', attrs = {'name':'nv'})['data-value']
                votes.append(int(vote))

    # propagate the safety stop from the page loop to the year loop
    if stop_scraping:
        break
                
                

Request: 100; Frequency: 0.07914021934857986 request/s


In [58]:
# Collect the lists filled by the multi-page scrape into a DataFrame, one column per field
df = pd.DataFrame(dict(movie=names, year=years, rating=ratings,
                       metascore=metascores, votes=votes))


In [59]:
# Display the table built from the multi-page scrape
df

Unnamed: 0,movie,year,rating,metascore,votes
0,全面啟動,(2010),8.8,74,2001638
1,隔離島,(2010),8.2,63,1095541
2,玩具總動員3,(2010),8.3,92,739522
3,鋼鐵人2,(2010),7.0,57,706089
4,黑天鵝,(2010),8.0,79,686011
5,馴龍高手,(2010),8.1,75,651321
6,王者之聲：宣戰時刻,(2010),8.0,88,628187
7,社群網戰,(2010),7.7,95,608531
8,特攻聯盟,(2010),7.6,66,517624
9,神偷奶爸,(2010),7.6,72,491149
