# Raspagem de Dados com Python e BeautifulSoup

https://medium.com/machina-sapiens/raspagem-de-dados-com-python-e-beautifulsoup-1b1b7019774c

In [1]:
from requests import get
from bs4 import BeautifulSoup

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
# IMDB search results for 2017 releases, ordered by vote count (page 1)
url = ('http://www.imdb.com/search/title'
       '?release_date=2017&sort=num_votes,desc&page=1')

In [3]:
# Download the results page. The timeout keeps the cell from hanging forever on
# an unresponsive server, and raise_for_status() stops on 4xx/5xx answers
# instead of letting an error page be parsed by the following cells.
response = get(url, timeout=30)
response.raise_for_status()
# Preview the beginning of the returned HTML
print(response.text[:500])




<!DOCTYPE html>
<html
    xmlns:og="http://ogp.me/ns#"
    xmlns:fb="http://www.facebook.com/2008/fbml">
    <head>
         
        <meta charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=edge">

    <meta name="apple-itunes-app" content="app-id=342792525, app-argument=imdb:///?src=mdot">



        <script type="text/javascript">var IMDbTimer={starttime: new Date().getTime(),pt:'java'};</script>

<script>
    if (typeof uet == 'function') {
      uet("bb", "LoadTitle"


In [4]:
# Build a parse tree from the downloaded HTML with Python's built-in parser
html_soup = BeautifulSoup(markup=response.text, features='html.parser')
type(html_soup)

bs4.BeautifulSoup

In [5]:
# Every movie on the page sits in a <div> carrying the class string
# "lister-item mode-advanced"; collect them all.
movie_containers = html_soup.find_all(name='div', attrs={'class': 'lister-item mode-advanced'})
print(type(movie_containers), len(movie_containers), sep='\n')

<class 'bs4.element.ResultSet'>
50


### Extraindo dados de um único filme

In [6]:
# Use the first container on the page as a worked example and display its HTML
first_movie = movie_containers[0]
first_movie

<div class="lister-item mode-advanced">
<div class="lister-top-right">
<div class="ribbonize" data-caller="filmosearch" data-tconst="tt3315342"></div>
</div>
<div class="lister-item-image float-left">
<a href="/title/tt3315342/"> <img alt="Logan" class="loadlate" data-tconst="tt3315342" height="98" loadlate="https://m.media-amazon.com/images/M/MV5BYzc5MTU4N2EtYTkyMi00NjdhLTg3NWEtMTY4OTEyMzJhZTAzXkEyXkFqcGdeQXVyNjc1NTYyMjg@._V1_UX67_CR0,0,67,98_AL_.jpg" src="https://m.media-amazon.com/images/G/01/imdb/images/nopicture/large/film-184890147._CB466725069_.png" width="67"/>
</a> </div>
<div class="lister-item-content">
<h3 class="lister-item-header">
<span class="lister-item-index unbold text-primary">1.</span>
<a href="/title/tt3315342/">Logan</a>
<span class="lister-item-year text-muted unbold">(2017)</span>
</h3>
<p class="text-muted">
<span class="certificate">16</span>
<span class="ghost">|</span>
<span class="runtime">137 min</span>
<span class="ghost">|</span>
<span class="genre">
Ac

#### O nome do filme


In [7]:
# First <div> nested inside the movie container
first_movie.find('div')

<div class="lister-top-right">
<div class="ribbonize" data-caller="filmosearch" data-tconst="tt3315342"></div>
</div>

In [8]:
# First <a> in the container (the poster link, not the title link)
first_movie.find('a')

<a href="/title/tt3315342/"> <img alt="Logan" class="loadlate" data-tconst="tt3315342" height="98" loadlate="https://m.media-amazon.com/images/M/MV5BYzc5MTU4N2EtYTkyMi00NjdhLTg3NWEtMTY4OTEyMzJhZTAzXkEyXkFqcGdeQXVyNjc1NTYyMjg@._V1_UX67_CR0,0,67,98_AL_.jpg" src="https://m.media-amazon.com/images/G/01/imdb/images/nopicture/large/film-184890147._CB466725069_.png" width="67"/>
</a>

In [9]:
# The <h3> header holds the ranking index, the title link and the year
first_movie.find('h3')

<h3 class="lister-item-header">
<span class="lister-item-index unbold text-primary">1.</span>
<a href="/title/tt3315342/">Logan</a>
<span class="lister-item-year text-muted unbold">(2017)</span>
</h3>

In [10]:
# The title is the text of the link nested in the <h3> header
title_link = first_movie.h3.a
print(title_link)
print(title_link.text)

<a href="/title/tt3315342/">Logan</a>
Logan


#### O ano de lançamento do filme

In [11]:
# Keep the movie title: the text of the link inside the <h3> header
first_name = first_movie.h3.a.get_text()

print(first_name)

Logan


In [12]:
# Locate the year <span> by its distinguishing class only, so the lookup does
# not depend on the styling classes that accompany it.
first_year = first_movie.h3.find('span', class_ = 'lister-item-year')
print(first_year.text)
# Cleanup: keep the four characters before the closing parenthesis. Unlike
# strip('()'), this also copes with labels such as "(I) (2017)".
print(first_year.text[-5:-1])

(2017)
2017


#### A avaliação IMDB


In [13]:
# The IMDB rating is the only <strong> element inside the container
rating_tag = first_movie.find('strong')
first_imdb = float(rating_tag.get_text())

print(rating_tag)
first_imdb

<strong>8.1</strong>


8.1

#### O Metascore


In [14]:
# Match on the 'metascore' class alone. The accompanying class ('favorable' for
# this movie) is not the same for every title, so hard-coding it would make
# find() return None for other movies.
first_mscore = first_movie.find('span', class_ = 'metascore')
# float() tolerates the whitespace surrounding the number in the tag text
first_mscore = float(first_mscore.text)
first_mscore

77.0

#### O número de votos


In [15]:
# The vote count is stored unformatted in the data-value attribute of the
# <span name="nv"> element.
votes_tag = first_movie.find('span', attrs={'name': 'nv'})
first_votes = int(votes_tag['data-value'])
first_votes

602828

#### O script para uma página

In [16]:
# Accumulators for the scraped fields: one entry per movie that is kept
names, years, imdb_ratings, metascores, votes = [], [], [], [], []

# Walk over every movie container found on the page
for container in movie_containers:

    # Movies without a Metascore block are skipped entirely
    if container.find('div', class_='ratings-metascore') is None:
        continue

    header = container.h3

    # Title: text of the link inside the header
    names.append(header.a.text)

    # Year label, kept raw here (e.g. "(2017)")
    years.append(header.find('span', class_='lister-item-year').text)

    # IMDB rating
    imdb_ratings.append(float(container.strong.text))

    # Metascore
    metascores.append(int(container.find('span', class_='metascore').text))

    # Number of votes, read from the data-value attribute
    votes.append(int(container.find('span', attrs={'name': 'nv'})['data-value']))

#### Pandas

In [17]:
import pandas as pd

# Assemble the scraped columns into a single table
test_df = pd.DataFrame({'movie': names,
                       'year': years,
                       'imdb': imdb_ratings,
                       'metascore': metascores,
                       'votes': votes})
# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() adds a stray "None" line to the output.
test_df.info()
test_df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43 entries, 0 to 42
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   movie      43 non-null     object 
 1   year       43 non-null     object 
 2   imdb       43 non-null     float64
 3   metascore  43 non-null     int64  
 4   votes      43 non-null     int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 1.8+ KB
None


Unnamed: 0,movie,year,imdb,metascore,votes
0,Logan,(2017),8.1,77,602828
1,Thor: Ragnarok,(2017),7.9,74,536565
2,Star Wars: Os Últimos Jedi,(2017),7.0,85,526901
3,Guardiões da Galáxia Vol. 2,(2017),7.6,67,526865
4,Mulher-Maravilha,(2017),7.4,76,519451
5,Dunkirk,(2017),7.9,94,504437
6,Homem-Aranha: De Volta ao Lar,(2017),7.4,73,473820
7,Corra!,(I) (2017),7.7,84,447014
8,It: A Coisa,(I) (2017),7.3,69,423675
9,Blade Runner 2049,(2017),8.0,81,416813


### Juntando tudo

In [None]:
from time import time
from time import sleep
from random import randint

# clear_output belongs to the public IPython.display API; importing it from
# IPython.core.display is deprecated.
from IPython.display import clear_output
from warnings import warn



# Redeclaring the lists to store data in
names = []
years = []
imdb_ratings = []
metascores = []
votes = []
# Preparing the monitoring of the loop
start_time = time()
requests = 0  # number of requests sent so far (a counter, not the requests library)

pages = [str(i) for i in range(1,5)]
years_url = [str(i) for i in range(2000,2018)]
# One request per (year, page) pair; going past this means the loop misbehaved
max_requests = len(years_url) * len(pages)

headers = {"Accept-Language": "en-US, en;q=0.5"}

# Set once the request cap is exceeded so that the outer loop stops as well
stop_scraping = False

# For every year in the interval 2000-2017
for year_url in years_url:

    # For every page in the interval 1-4
    for page in pages:

        # Make a get request (the timeout prevents an indefinite hang)
        response = get('http://www.imdb.com/search/title?release_date=' + year_url +
                       '&sort=num_votes,desc&page=' + page, headers = headers,
                       timeout = 30)
        # Pause the loop so the server is not flooded with requests
        sleep(randint(8,15))

        # Monitor the requests
        requests += 1
        elapsed_time = time() - start_time
        print('Request: {}; Frequency: {} requests/s'.format(requests, requests/elapsed_time))
        clear_output(wait = True)

        # Stop everything if the number of requests is greater than expected.
        # A bare break would only leave the page loop, hence the flag.
        if requests > max_requests:
            warn('Number of requests was greater than expected.')
            stop_scraping = True
            break

        # Warn about non-200 status codes and skip the page: an error response
        # holds no movie containers worth parsing
        if response.status_code != 200:
            warn('Request: {}; Status code: {}'.format(requests, response.status_code))
            continue

        # Parse the content of the request with BeautifulSoup
        page_html = BeautifulSoup(response.text, 'html.parser')

        # Select all the movie containers from a single page
        mv_containers = page_html.find_all('div', class_ = 'lister-item mode-advanced')

        # For every movie on the page
        for container in mv_containers:
            # If the movie has a Metascore, then:
            if container.find('div', class_ = 'ratings-metascore') is not None:

                # Scrape the name
                name = container.h3.a.text
                names.append(name)

                # Scrape the year
                year = container.h3.find('span', class_ = 'lister-item-year').text
                years.append(year)

                # Scrape the IMDB rating
                imdb = float(container.strong.text)
                imdb_ratings.append(imdb)

                # Scrape the Metascore
                m_score = container.find('span', class_ = 'metascore').text
                metascores.append(int(m_score))

                # Scrape the number of votes
                vote = container.find('span', attrs = {'name': 'nv'})['data-value']
                votes.append(int(vote))

    # Propagate the stop request from the page loop to the year loop
    if stop_scraping:
        break

Request: 12; Frequency: 0.07304733134152037 requests/s


### Examinando os dados extraídos

In [None]:
# Combine the lists filled by the scraping loop into one table
movie_ratings = pd.DataFrame({'movie': names,
                              'year': years,
                              'imdb': imdb_ratings,
                              'metascore': metascores,
                              'votes': votes})

# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line to the output.
movie_ratings.info()

In [None]:
# Peek at the first ten scraped rows
movie_ratings.iloc[:10]

In [None]:
# Summary statistics restricted to the numeric columns
movie = movie_ratings.select_dtypes(include=[np.number]).describe()
movie

### Limpando os dados extraídos


In [None]:
# Fix the column order explicitly before cleaning the values
column_order = ['movie', 'year', 'imdb', 'metascore', 'votes']
movie_ratings = movie_ratings[column_order]
movie_ratings.head(10)

In [None]:
# Distinct raw year labels, to see which formats need cleaning
movie_ratings.year.unique()

In [None]:
# Pull the four-digit release year out of labels such as "(2017)" or
# "(I) (2017)". Plain column assignment (rather than .loc[:, 'year'] = ...)
# lets the column take the new integer dtype instead of staying object, and
# casting to str first keeps the cell safe to run more than once.
movie_ratings['year'] = (
    movie_ratings['year']
    .astype(str)
    .str.extract(r'(\d{4})', expand=False)
    .astype(int)
)

In [None]:
# Distinct year values after the conversion
pd.unique(movie_ratings['year'])

In [None]:
# Smallest and largest value of each rating, to confirm the scales (0-10 vs 0-100)
movie_ratings[['imdb', 'metascore']].describe().loc[['min', 'max']]

In [None]:
# Rescale the 0-10 IMDB rating to the 0-100 Metascore scale. Multiply before
# converting to int: casting first truncates the decimal (8.1 -> 8 -> 80
# instead of 81). round() guards against float artefacts such as 80.99999.
movie_ratings['n_imdb'] = (movie_ratings['imdb'] * 10).round().astype(int)
movie_ratings.head(10)

In [None]:
# Persist the cleaned table; the DataFrame index is written as the first column
csv_path = 'movie_ratings.csv'
movie_ratings.to_csv(csv_path)

## Plotando e analisando as distribuições

In [None]:
import matplotlib.pyplot as plt
# %matplotlib inline

In [None]:
# Three panels side by side: each score on its native scale, then both on a
# common 0-100 scale for direct comparison.
fig, axes = plt.subplots(nrows = 1, ncols = 3, figsize = (16,4))
ax1, ax2, ax3 = axes

ax1.hist(movie_ratings['imdb'], bins=10, range=(0,10))
ax1.set_title('IMDB rating')

ax2.hist(movie_ratings['metascore'], bins = 10, range = (0,100))
ax2.set_title('Metascore')

# Explicit labels are required here: without labelled artists ax3.legend()
# has nothing to show.
ax3.hist(movie_ratings['n_imdb'], bins = 10, range = (0,100), histtype = 'step', label = 'n_imdb')
ax3.hist(movie_ratings['metascore'], bins = 10, range = (0,100), histtype = 'step', label = 'metascore')
ax3.legend(loc = 'upper left')
ax3.set_title('The Two Normalized Distributions')

# Drop the top and right borders for a cleaner look
for ax in axes:
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
plt.show()


In [None]:
# Distribution of the IMDB rating: density histogram with a KDE curve on top.
# sns.distplot is deprecated, so the equivalent histplot call is used;
# stat='density' and cut=3 reproduce distplot's normalisation and KDE range.
x = movie_ratings['imdb']
ax = sns.histplot(x=x, kde=True, stat='density', kde_kws={'cut': 3},
                  color='m', bins=25, edgecolor='black')
plt.show()

In [None]:
# Overlay the rescaled IMDB rating and the Metascore on the same axes.
# sns.distplot is deprecated, so the equivalent histplot calls are used;
# stat='density' and cut=3 reproduce distplot's normalisation and KDE range.
imdb_fuko = movie_ratings['n_imdb']
mscore_fuko = movie_ratings['metascore']

sns.histplot(x=imdb_fuko, kde=True, stat='density', kde_kws={'cut': 3},
             edgecolor='black', label='IMDB')
sns.histplot(x=mscore_fuko, kde=True, stat='density', kde_kws={'cut': 3},
             edgecolor='black', label='Metascore')
plt.legend()
plt.show()