# Web Scraping with BeautifulSoup

Objective: Scrape movie data from the IMDb website  
Reference tutorial: https://www.dataquest.io/blog/web-scraping-beautifulsoup/

In [1]:
from requests import get

# Page 1 of IMDb's title search for 2018 releases, sorted by vote count (descending).
url = "https://www.imdb.com/search/title?release_date=2018&sort=num_votes,desc&page=1"

# Without a timeout, requests will wait indefinitely on an unresponsive server.
REQUEST_TIMEOUT_SECONDS = 10
response = get(url, timeout=REQUEST_TIMEOUT_SECONDS)

# Stop here on a 4xx/5xx reply instead of silently parsing an error page later.
response.raise_for_status()
#print(response.text[:500])

In [2]:
from bs4 import BeautifulSoup as bs

# Build the parse tree from the downloaded page with the "html.parser" backend,
# then show the resulting object's type.
html_soup = bs(markup=response.text, features="html.parser")
type(html_soup)

bs4.BeautifulSoup

## Extract the Movie Containers and Count Them

In [3]:
# Collect every <div> whose class attribute is "lister-item mode-advanced";
# each one holds the data for a single movie.
# find_all is the current bs4 name; findAll is a deprecated legacy alias.
movie_containers = html_soup.find_all("div", class_ = "lister-item mode-advanced")
print(type(movie_containers))
print(len(movie_containers))

<class 'bs4.element.ResultSet'>
50


## Extract Data for a Single Movie

In [4]:
# First movie data block: take the first container found on the page and
# display its raw HTML so the tags and classes to target are visible.
movie01 = movie_containers[0]
movie01

<div class="lister-item mode-advanced">
<div class="lister-top-right">
<div class="ribbonize" data-caller="filmosearch" data-tconst="tt4154756"></div>
</div>
<div class="lister-item-image float-left">
<a href="/title/tt4154756/?ref_=adv_li_i"> <img alt="Avengers: Infinity War" class="loadlate" data-tconst="tt4154756" height="98" loadlate="https://m.media-amazon.com/images/M/MV5BMjMxNjY2MDU1OV5BMl5BanBnXkFtZTgwNzY1MTUwNTM@._V1_UX67_CR0,0,67,98_AL_.jpg" src="https://m.media-amazon.com/images/G/01/imdb/images/nopicture/large/film-184890147._CB470041630_.png" width="67"/>
</a> </div>
<div class="lister-item-content">
<h3 class="lister-item-header">
<span class="lister-item-index unbold text-primary">1.</span>
<a href="/title/tt4154756/?ref_=adv_li_tt">Avengers: Infinity War</a>
<span class="lister-item-year text-muted unbold">(2018)</span>
</h3>
<p class="text-muted ">
<span class="certificate">P13</span>
<span class="ghost">|</span>
<span class="runtime">149 min</span>
<span class="ghost"

## Extract Movie Name

In [5]:
# The title is the text of the first link inside the container's <h3> header.
movie01.h3.a.get_text()

'Avengers: Infinity War'

## Extract Movie Year

In [6]:
# Locate the year <span> inside the header, then drop the surrounding parentheses.
mov_year = movie01.h3.find("span", attrs={"class": "lister-item-year text-muted unbold"})
mov_year.get_text().strip("()")

'2018'

## Extract Movie Certificate

In [7]:
# A string passed as the second positional argument to find() is matched
# against the tag's CSS class.
cert = movie01.p.find("span", "certificate")
cert.get_text()

'P13'

## Extract Movie Runtime

In [8]:
# The runtime lives in a <span class="runtime"> inside the first <p> of the container.
rt = movie01.p.find("span", attrs={"class": "runtime"})
rt.get_text()

'149 min'

## Extract Movie Genre

In [9]:
# The genre text is padded with newlines and spaces; trim them from both ends.
genre = movie01.p.find("span", "genre")
genre.get_text().strip("\n ")

'Action, Adventure, Fantasy'

## Extract Movie Rating

In [10]:
# The IMDb rating is the text of the first <strong> tag in the container.
movie01.find("strong").get_text()

'8.5'

## Extract Movie Metascore

In [11]:
# Match on the "metascore" class alone, as the scraping loop does. The lookup
# then no longer depends on the span's second class ("favorable" for this
# movie), which appears to vary by score -- with the full class string, find()
# would return None for any movie whose second class differs.
metascore_tag = movie01.find("span", class_ = "metascore")

# int() tolerates surrounding whitespace in the tag text.
metascore = int(metascore_tag.text)
metascore

68

## Extract Movie Vote

In [12]:
# Read the count from the data-value attribute of the first <span name="nv">
# and convert it to an integer.
vote = int(movie01.find("span", attrs={"name": "nv"})["data-value"])
vote

591179

## Declare Lists to Store the Data

In [13]:
# One empty list per scraped field. The scraping loop appends one value to each
# list per movie, so index i of every list describes the same movie.
names, years, certificates, runtimes = [], [], [], []
genres, ratings, metascores, votes = [], [], [], []

## Loop Over the Containers and Scrape the Data

In [14]:
def _text_or_none(tag):
    """Return the text of a bs4 tag, or None when the lookup found nothing."""
    return tag.text if tag is not None else None


# Walk every movie container on the page and append one value per field to the
# result lists. Movies without a Metascore block are skipped entirely.
# NOTE: this cell only appends, so running it again without first re-running
# the cell that resets the lists will duplicate every row.
for container in movie_containers:

    # If the movie has Metascore, then extract:
    if container.find('div', class_ = 'ratings-metascore') is not None:

        # The name - same lookup as movie01.h3.a.text
        name = container.h3.a.text
        names.append(name)

        # The year - same lookup as movie01.h3.find("span", class_ = "lister-item-year text-muted unbold");
        # the text is stored as-is, parentheses included
        year = container.h3.find('span', class_ = 'lister-item-year text-muted unbold').text
        years.append(year)

        # For the three optional-looking spans below, find() returns None when
        # the span is absent. Store None for that field instead of raising
        # AttributeError mid-iteration, which would leave the lists with
        # unequal lengths and break the DataFrame construction.

        # The certificate - same lookup as movie01.p.find("span", class_ = "certificate")
        certificate = _text_or_none(container.p.find("span", class_ = "certificate"))
        certificates.append(certificate)

        # The runtime - same lookup as movie01.p.find("span", class_ = "runtime")
        runtime = _text_or_none(container.p.find("span", class_ = "runtime"))
        runtimes.append(runtime)

        # The genre - same lookup as movie01.p.find("span", class_ = "genre"),
        # with the surrounding newlines and spaces trimmed
        genre = _text_or_none(container.p.find("span", class_ = "genre"))
        if genre is not None:
            genre = genre.strip("\n ")
        genres.append(genre)

        # The IMDB rating
        imdb = float(container.strong.text)
        ratings.append(imdb)

        # The Metascore - the span must exist because of the guard above
        # (assumes the span sits inside the ratings-metascore div)
        m_score = container.find('span', class_ = 'metascore').text
        metascores.append(int(m_score))

        # The number of votes - read from the first <span name="nv">
        vote = container.find('span', attrs = {'name':'nv'})['data-value']
        votes.append(int(vote))

## Create Dataframe

In [15]:
import pandas as pd

# Assemble the parallel lists into one table, one column per scraped field.
test_df = pd.DataFrame({'movie': names,'year': years,
                        'certificate': certificates,'runtime': runtimes,
                        'genre': genres,'imdb': ratings,
                        'metascore': metascores,'votes': votes})

# info() prints its summary itself and returns None, so wrapping it in print()
# only adds a stray "None" line at the end of the output.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47 entries, 0 to 46
Data columns (total 8 columns):
movie          47 non-null object
year           47 non-null object
certificate    47 non-null object
runtime        47 non-null object
genre          47 non-null object
imdb           47 non-null float64
metascore      47 non-null int64
votes          47 non-null int64
dtypes: float64(1), int64(2), object(5)
memory usage: 3.0+ KB
None


In [16]:
# Report the (rows, columns) size, then preview the first five rows.
print(test_df.shape)
test_df.head(n=5)

(47, 8)


Unnamed: 0,movie,year,certificate,runtime,genre,imdb,metascore,votes
0,Avengers: Infinity War,(2018),P13,149 min,"Action, Adventure, Fantasy",8.5,68,591179
1,Black Panther,(2018),P13,134 min,"Action, Adventure, Sci-Fi",7.3,88,467760
2,Deadpool 2,(2018),18,119 min,"Action, Adventure, Comedy",7.8,66,352673
3,Ready Player One,(2018),P13,140 min,"Action, Adventure, Sci-Fi",7.5,64,281711
4,A Quiet Place,(2018),P13,90 min,"Drama, Horror, Mystery",7.6,82,278533
