In [1]:
from requests import get
url = 'http://www.imdb.com/search/title?release_date=2017&sort=num_votes,desc&page=1'
response = get(url)
print(response.text[:500])




<!DOCTYPE html>
<html
    xmlns:og="http://ogp.me/ns#"
    xmlns:fb="http://www.facebook.com/2008/fbml">
    <head>
         
        <meta charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=edge">

    <meta name="apple-itunes-app" content="app-id=342792525, app-argument=imdb:///?src=mdot">



        <script type="text/javascript">var IMDbTimer={starttime: new Date().getTime(),pt:'java'};</script>

<script>
    if (typeof uet == 'function') {
      uet("bb", "LoadTitle"


In [2]:
#print(response.text)

In [3]:
# To parse our HTML document and extract the 50 div containers, we’ll use a Python module 
# called BeautifulSoup,

In [4]:
from bs4 import BeautifulSoup
html_soup = BeautifulSoup(response.text, 'html.parser')
type(html_soup)

bs4.BeautifulSoup

In [5]:
# Before extracting the 50 div containers, we need to figure out what distinguishes them 
# from other div elements on that page. Often, the distinctive mark resides in the class 
# attribute. If you inspect the HTML lines of the containers of interest, you’ll notice 
# that the class attribute has two values: lister-item and mode-advanced. This combination
# is unique to these div containers. We can see that’s true by doing a quick search 
# (Ctrl + F). We have 50 such containers, so we expect to see only 50 matches:

In [6]:
# Now let’s use the find_all() method to extract all the div containers that have a class 
# attribute of lister-item mode-advanced:

In [7]:
movie_containers = html_soup.find_all('div', class_ = 'lister-item mode-advanced')
print(type(movie_containers))
print(len(movie_containers))

<class 'bs4.element.ResultSet'>
50


In [8]:
# find_all() returned a ResultSet object which is a list containing all the 50 divs we 
# are interested in.

In [9]:
# Now we’ll select only the first container, and extract, by turn, each item of interest:

# The name of the movie.
# The year of release.
# The IMDB rating.
# The Metascore.
# The number of votes.

In [10]:
# Extracting the data for a single movie
# We can access the first container, which contains information about a single movie, by 
# using list notation on movie_containers.

In [11]:
# As you can see, the HTML content of one container is very long. To find out the HTML 
# line specific to each data point, we’ll use DevTools once again.

In [12]:
## The name of the movie
# We begin with the movie’s name, and locate its correspondent HTML line by using DevTools
# You can see that the name is contained within an anchor tag (<a>). This tag is nested 
# within a header tag (<h3>). The <h3> tag is nested within a <div> tag. This <div> is the
# third of the divs nested in the container of the first movie. We stored the content of 
# this container in the first_movie variable.

In [13]:
first_movie = html_soup.find('div', class_ = 'lister-item mode-advanced')

In [14]:
# first_movie is a Tag object, and the various HTML tags within it are stored as its 
# attributes. We can access them just like we would access any attribute of a Python 
# object. However, using a tag name as an attribute will only select the first tag by that
# name. If we run first_movie.div, we only get the content of the first div tag:

In [15]:
first_movie.div

<div class="lister-top-right">
<div class="ribbonize" data-caller="filmosearch" data-tconst="tt3315342"></div>
</div>

In [16]:
first_movie.a

<a href="/title/tt3315342/"> <img alt="Logan" class="loadlate" data-tconst="tt3315342" height="98" loadlate="https://m.media-amazon.com/images/M/MV5BYzc5MTU4N2EtYTkyMi00NjdhLTg3NWEtMTY4OTEyMzJhZTAzXkEyXkFqcGdeQXVyNjc1NTYyMjg@._V1_UX67_CR0,0,67,98_AL_.jpg" src="https://m.media-amazon.com/images/G/01/imdb/images/nopicture/large/film-184890147._CB466725069_.png" width="67"/>
</a>

In [17]:
first_movie.h3

<h3 class="lister-item-header">
<span class="lister-item-index unbold text-primary">1.</span>
<a href="/title/tt3315342/">Logan</a>
<span class="lister-item-year text-muted unbold">(2017)</span>
</h3>

In [18]:
first_movie.h3.a

<a href="/title/tt3315342/">Logan</a>

In [19]:
# Now it’s all just a matter of accessing the text from within that <a> tag:
first_name = first_movie.h3.a.text
first_name

'Logan'

In [20]:
# The year of the movie’s release
# We move on with extracting the year. This data is stored within the <span> tag below 
# the <a> that contains the name.

In [21]:
# Dot notation will only access the first span element. We’ll search by the distinctive 
# mark of the second <span>. We’ll use the find() method which is almost the same as 
# find_all(), except that it only returns the first match. In fact, find() is equivalent 
# to find_all(limit = 1). The limit argument limits the output to the first match.
# The distinguishing mark consists of the values lister-item-year text-muted unbold 
# assigned to the class attribute. So we look for the first <span> with these values 
# within the <h3> tag:

In [22]:
first_year = first_movie.h3.find('span', class_ = 'lister-item-year text-muted unbold')
first_year

<span class="lister-item-year text-muted unbold">(2017)</span>

In [23]:
# From here, we just access the text using attribute notation:
first_year = first_year.text
first_year

'(2017)'

In [24]:
# We could easily clean that output and convert it to an integer. But if you explore more 
# pages, you will notice that for some movies the year takes unpredictable values like 
# (2017)(I) or (2015)(V). It’s more efficient to do the cleaning after the scraping, when 
# we’ll know all the year values.

In [25]:
# The IMDB rating
# We now focus on extracting the IMDB rating of the first movie.
# There are a couple of ways to do that, but we’ll first try the easiest one. If you 
# inspect the IMDB rating using DevTools, you’ll notice that the rating is contained 
# within a <strong> tag.

In [26]:
first_movie.strong

<strong>8.1</strong>

In [27]:
# Great! We’ll access the text, convert it to the float type, and assign it to the 
# variable first_imdb:
first_imdb = float(first_movie.strong.text)
first_imdb

8.1

In [28]:
# The Metascore
# If we inspect the Metascore using DevTools, we’ll notice that we can find it within a 
# span tag.

In [29]:
# Attribute notation clearly isn’t a solution. There are many <span> tags before that. 
# You can see one right above the <strong> tag. We’d better use the distinctive values of 
# the class attribute (metascore favorable).

In [30]:
first_mscore = first_movie.find('span', class_ = 'metascore favorable')
first_mscore = int(first_mscore.text)
print(first_mscore)

77


In [31]:
# The favorable value indicates a high Metascore and sets the rating’s background color to
# green. The other two possible values are unfavorable and mixed. What is specific to all 
# Metascore ratings though is only the metascore value. This is the one we are going to 
# use when we’ll write the script for the entire page.

In [32]:
# The favorable value indicates a high Metascore and sets the rating’s background color to
# green. The other two possible values are unfavorable and mixed. What is specific to all 
# Metascore ratings though is only the metascore value. This is the one we are going to 
# use when we’ll write the script for the entire page.

In [33]:
# The name attribute is different from the class attribute. Using BeautifulSoup we can 
# access elements by any attribute. The find() and find_all() functions have a parameter 
# named attrs. To this we can pass in the attributes and values we are searching for as a 
# dictionary:

In [34]:
first_votes = first_movie.find('span', attrs = {'name':'nv'})
first_votes

<span data-value="600256" name="nv">600,256</span>

In [35]:
# We could use .text notation to access the <span> tag’s content. It would be better 
# though if we accessed the value of the data-value attribute. This way we can convert the 
# extracted datapoint to an int without having to strip a comma. You can treat a Tag 
# object just like a dictionary. The HTML attributes are the dictionary’s keys. The values
# of the HTML attributes are the values of the dictionary’s keys. This is how we can 
# access the value of the data-value attribute:

In [36]:
first_votes['data-value']

'600256'

In [37]:
# Let’s convert that value to an integer, and assign it to first_votes:
first_votes = int(first_votes['data-value'])

In [38]:
# The script for a single page
# Before piecing together what we’ve done so far, we have to make sure that we’ll extract 
# the data only from the containers that have a Metascore.

In [39]:
# Before piecing together what we’ve done so far, we have to make sure that we’ll 
# extract the data only from the containers that have a Metascore. We need to 
# add a condition to skip movies without a Metascore.

In [40]:
# Using DevTools again, we see that the Metascore section is contained within a <div> tag.
# The class attribute has two values: inline-block and ratings-metascore. The distinctive 
# one is clearly ratings-metascore.

In [41]:
# We can use find() to search each movie container for a div having that distinct mark.
# When find() doesn’t find anything, it returns a None object. We can use this result in 
# an if statement to control whether a movie is scraped.

In [42]:
# Let’s look on the web page to search for a movie container that doesn’t have a Metascore
# , and see what find() returns.

In [43]:
eighth_movie_mscore = movie_containers[7].find('div', class_ = 'ratings-metascore')
eighth_movie_mscore

<div class="inline-block ratings-metascore">
<span class="metascore favorable">84        </span>
        Metascore
            </div>

In [44]:
# let’s put together the code above, and compress it as much as possible, but only 
# insofar as it’s still easily readable. In the next code block we:

In [45]:
# Lists to store the scraped data in
names = []
years = []
imdb_ratings = []
metascores = []
votes = []
# Extract data from individual movie container
for container in movie_containers:
# If the movie has Metascore, then extract:
    if container.find('div', class_ = 'ratings-metascore') is not None:
        # The name
        name = container.h3.a.text
        names.append(name)
        # The year
        year = container.h3.find('span', class_ = 'lister-item-year').text
        years.append(year)
        # The IMDB rating
        imdb = float(container.strong.text)
        imdb_ratings.append(imdb)
        # The Metascore
        m_score = container.find('span', class_ = 'metascore').text
        metascores.append(int(m_score))
        # The number of votes
        vote = container.find('span', attrs = {'name':'nv'})['data-value']
        votes.append(int(vote))

In [46]:
# Let’s check the data collected so far. Pandas makes it easy for us to see whether we’ve 
# scraped our data successfully.

In [47]:
import pandas as pd

test_df = pd.DataFrame({'movie': names,
'year': years,
'imdb': imdb_ratings,
'metascore': metascores,
'votes': votes
})

print(test_df.info())
test_df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43 entries, 0 to 42
Data columns (total 5 columns):
movie        43 non-null object
year         43 non-null object
imdb         43 non-null float64
metascore    43 non-null int64
votes        43 non-null int64
dtypes: float64(1), int64(2), object(2)
memory usage: 1.8+ KB
None


Unnamed: 0,movie,year,imdb,metascore,votes
0,Logan,(2017),8.1,77,600256
1,Thor: Ragnarok,(2017),7.9,74,534045
2,Star Wars: Episode VIII - The Last Jedi,(2017),7.0,85,524995
3,Guardians of the Galaxy Vol. 2,(2017),7.6,67,524630
4,Wonder Woman,(2017),7.4,76,517446
5,Dunkirk,(2017),7.9,94,501688
6,Spider-Man: Homecoming,(2017),7.4,73,471893
7,Get Out,(I) (2017),7.7,84,444233
8,It,(I) (2017),7.3,69,421356
9,Blade Runner 2049,(2017),8.0,81,413893


In [48]:
# As a side note, if you run the code from a country where English is not the main 
# language, it’s very likely that you’ll get some of the movie names translated into the 
# main language of that country.

# Most likely, this happens because the server infers your location from your IP address. 
# Even if you are located in a country where English is the main language, you may still 
# get translated content. This may happen if you’re using a VPN while you’re making the 
# GET requests.

# If you run into this issue, pass the following values to the headers parameter of the 
# get() function:
headers = {"Accept-Language": "en-US, en;q=0.5"}

In [49]:
## The script for multiple pages

In [50]:
# We’ll scrape the first 4 pages of each year in the interval 2000-2017. 4 pages for each 
# of the 18 years makes for a total of 72 pages. Each page has 50 movies, so we’ll scrape 
# data for 3600 movies at most. But not all the movies have a Metascore, so the number 
# will be lower than that. Even so, we are still very likely to get data for over 2000 
# movies.

In [51]:
# As we are making the requests, we’ll only have to vary the values of only two parameters
# of the URL: the release_date parameter, and page. Let’s prepare the values we’ll need 
# for the forthcoming loop. In the next code cell we will:

# Create a list called pages, and populate it with the strings corresponding to the first 
# 4 pages.

# Create a list called years_url and populate it with the strings corresponding to the 
# years 2000-2017.

In [52]:
pages = [str(i) for i in range(1,5)]
years_url = [str(i) for i in range(2000,2018)]

In [53]:
## Controlling the crawl-rate

In [54]:
# Controlling the rate of crawling is beneficial for us, and for the website we are 
# scraping. If we avoid hammering the server with tens of requests per second, then we are
# much less likely to get our IP address banned. We also avoid disrupting the activity of 
# the website we scrape by allowing the server to respond to other users’ requests too.

In [55]:
# We’ll control the loop’s rate by using the sleep() function from Python’s time module. 
# sleep() will pause the execution of the loop for a specified amount of seconds.

# To mimic human behavior, we’ll vary the amount of waiting time between requests by using
# the randint() function from the Python’s random module. randint() randomly generates 
# integers within a specified interval.

In [56]:
# e.g
from time import sleep
from random import randint

for _ in range(1,4):
    print('Jesus is Lord')
    sleep(randint(1,5))

Jesus is Lord
Jesus is Lord
Jesus is Lord


In [57]:
## Monitoring the loop as it’s still going

In [58]:
# Given that we’re scraping 72 pages, it would be nice if we could find a way to monitor 
# the scraping process as it’s still going. This feature is definitely optional, but it 
# can be very helpful in the testing and debugging process. Also, the greater the number 
# of pages, the more helpful the monitoring becomes. If you are going to scrape hundreds 
# or thousands of web pages in a single code run, I would say that this feature becomes a 
# must.

In [59]:
# For our script, we’ll make use of this feature, and monitor the following parameters:

# The frequency (speed) of requests, so we make sure our program is not overloading the 
# server.

# The number of requests, so we can halt the loop in case the number of expected requests 
# is exceeded.

# The status code of our requests, so we make sure the server is sending back the proper 
# responses.

In [60]:
# To get a frequency value we’ll divide the number of requests by the time elapsed since 
# the first request. This is similar to computing the speed of a car – we divide the 
# distance by the time taken to cover that distance. Let’s experiment with this monitoring
# technique at a small scale first. In the following code cell we will:

In [61]:
# Set a starting time using the time() function from the time module, and assign the 
# value to start_time.

# Assign 0 to the variable requests which we’ll use to count the number of requests.
# Start a loop, and then with each iteration:
    # Simulate a request.
    # Increment the number of requests by 1.
    # Pause the loop for a time interval between 8 and 15 seconds.
    # Calculate the elapsed time since the first request, and assign the value to 
    # elapsed_time.
    # Print the number of requests and the frequency.

In [66]:
from time import time

start_time = time()
requests = 0
for _ in range(5):
# A request would go here
    requests += 1
    sleep(randint(1,3))
    elapsed_time = time() - start_time
    print('Request: {}; Frequency: {} requests/s'.format(requests, requests/elapsed_time))

Request: 1; Frequency: 0.9763882028077158 requests/s
Request: 2; Frequency: 0.6601893365743332 requests/s
Request: 3; Frequency: 0.5958592629581828 requests/s
Request: 4; Frequency: 0.5681640612631259 requests/s
Request: 5; Frequency: 0.4977322900367531 requests/s


In [67]:
# Since we’re going to make 72 requests, our work will look a bit untidy as the output 
# accumulates. To avoid that, we’ll clear the output after each iteration, and replace it 
# with information about the most recent request. To do that we’ll use the clear_output()
# function from the IPython’s core.display module. We’ll set the wait parameter of 
# clear_output() to True to wait with replacing the current output until some new output 
# appears.

In [69]:
from IPython.core.display import clear_output

start_time = time()
requests = 0
for _ in range(5):
# A request would go here
    requests += 1
    sleep(randint(1,3))
    current_time = time()
    elapsed_time = current_time - start_time
    print('Request: {}; Frequency: {} requests/s'.format(requests, requests/elapsed_time))
clear_output(wait = True)

Request: 1; Frequency: 0.9982312072914624 requests/s
Request: 2; Frequency: 0.4996610199374599 requests/s
Request: 3; Frequency: 0.5990871225315276 requests/s
Request: 4; Frequency: 0.5706816348394799 requests/s
Request: 5; Frequency: 0.49931130034095467 requests/s


In [70]:
# To monitor the status code we’ll set the program to warn us if there’s something off. 
# A successful request is indicated by a status code of 200. We’ll use the warn() function
# from the warnings module to throw a warning if the status code is not 200.

In [72]:
from warnings import warn
warn("Warning Simulation")

  


In [73]:
# We chose a warning over breaking the loop because there’s a good possibility we’ll 
# scrape enough data, even if some of the requests fail. We will only break the loop if 
# the number of requests is greater than expected.

In [74]:
### Piecing everything together