## Web Scraping IMDb Top 250

In [36]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

### Data Preparing

In [37]:
url = 'https://www.imdb.com/chart/top/?ref_=nv_mv_250' # assign the url of the website to variable url
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}

We have to use a User-Agent header when scraping websites because some websites might block requests that don't appear to come from a web browser.  
By specifying a User-Agent, you're essentially telling the server that you're a particular type of browser making the request.

In [38]:
response = requests.get(url, headers=headers) # it use to HTTP request
soup = BeautifulSoup(response.content, 'html.parser') # to take all or parse the html content 

In [39]:
movie_data = soup.findAll('div', attrs={'class': 'ipc-metadata-list-summary-item__c'}) # take all the data of every movie
print(f'The Length of movie_data : {len(movie_data)}')
print(movie_data[0])

The Length of movie_data : 250
<div class="ipc-metadata-list-summary-item__c"><div class="ipc-metadata-list-summary-item__tc"><span aria-disabled="false" class="ipc-metadata-list-summary-item__t"></span><div class="sc-be6f1408-0 gVGktK cli-children"><div class="ipc-title ipc-title--base ipc-title--title ipc-title-link-no-icon ipc-title--on-textPrimary sc-be6f1408-9 srahg cli-title"><a class="ipc-title-link-wrapper" href="/title/tt0111161/?ref_=chttp_t_1" tabindex="0"><h3 class="ipc-title__text">1. The Shawshank Redemption</h3></a></div><div class="sc-be6f1408-7 iUtHEN cli-title-metadata"><span class="sc-be6f1408-8 fcCUPU cli-title-metadata-item">1994</span><span class="sc-be6f1408-8 fcCUPU cli-title-metadata-item">2h 22m</span><span class="sc-be6f1408-8 fcCUPU cli-title-metadata-item">18+</span></div><span class="sc-be6f1408-1 dbnleL"><div class="sc-e2dbc1a3-0 ajrIH sc-be6f1408-2 dAeZAQ cli-ratings-container" data-testid="ratingGroup--container"><span aria-label="IMDb rating: 9.3" clas

In [40]:
str(movie_data[0]).split("><")

['<div class="ipc-metadata-list-summary-item__c"',
 'div class="ipc-metadata-list-summary-item__tc"',
 'span aria-disabled="false" class="ipc-metadata-list-summary-item__t"',
 '/span',
 'div class="sc-be6f1408-0 gVGktK cli-children"',
 'div class="ipc-title ipc-title--base ipc-title--title ipc-title-link-no-icon ipc-title--on-textPrimary sc-be6f1408-9 srahg cli-title"',
 'a class="ipc-title-link-wrapper" href="/title/tt0111161/?ref_=chttp_t_1" tabindex="0"',
 'h3 class="ipc-title__text">1. The Shawshank Redemption</h3',
 '/a',
 '/div',
 'div class="sc-be6f1408-7 iUtHEN cli-title-metadata"',
 'span class="sc-be6f1408-8 fcCUPU cli-title-metadata-item">1994</span',
 'span class="sc-be6f1408-8 fcCUPU cli-title-metadata-item">2h 22m</span',
 'span class="sc-be6f1408-8 fcCUPU cli-title-metadata-item">18+</span',
 '/div',
 'span class="sc-be6f1408-1 dbnleL"',
 'div class="sc-e2dbc1a3-0 ajrIH sc-be6f1408-2 dAeZAQ cli-ratings-container" data-testid="ratingGroup--container"',
 'span aria-label="

## The Movie Name

In [41]:
# For movie the class is 'ipc-title__text'
# First in here i want to show that the title of the movie start differently
print(movie_data[0].find('h3',attrs={'class':'ipc-title__text'}).text[:]) # for the case of number 1 to 9 it has 3 character before the name of the movie is which [1,.,' ']
print(movie_data[10].find('h3',attrs={'class':'ipc-title__text'}).text[:]) # for the case of number 1 to 9 it has 4 character before the name of the movie is which [1,1,.,' ']
print(movie_data[100].find('h3',attrs={'class':'ipc-title__text'}).text[:])  # for the case of number 1 to 9 it has 5 character before the name of the movie is which [1,0,1,.,' ']

1. The Shawshank Redemption
11. Forrest Gump
101. The Apartment


In [42]:
movie_name = []
for i in range(0,9): # since it has 3 character before the name of the movie 
    judul = movie_data[i].find('h3',attrs={'class':'ipc-title__text'})
    movie_name.append(judul.text[3:]) # so it start from index 3
for i in range(9,99): # since it has 4 character before the name of the movie 
    judul = movie_data[i].find('h3',attrs={'class':'ipc-title__text'})
    movie_name.append(judul.text[4:]) # so it start from index 4
for i in range(99,250): # since it has 5 character before the name of the movie 
    judul = movie_data[i].find('h3',attrs={'class':'ipc-title__text'})
    movie_name.append(judul.text[5:]) # so it start from index 5

In [43]:
movie_name

['The Shawshank Redemption',
 'The Godfather',
 'The Dark Knight',
 'The Godfather: Part II',
 '12 Angry Men',
 "Schindler's List",
 'The Lord of the Rings: The Return of the King',
 'Pulp Fiction',
 'The Lord of the Rings: The Fellowship of the Ring',
 'The Good, the Bad and the Ugly',
 'Forrest Gump',
 'The Lord of the Rings: The Two Towers',
 'Fight Club',
 'Inception',
 'Star Wars: Episode V - The Empire Strikes Back',
 'The Matrix',
 'GoodFellas',
 "One Flew Over the Cuckoo's Nest",
 'Se7en',
 "It's a Wonderful Life",
 'Interstellar',
 'Shichinin no samurai',
 'The Silence of the Lambs',
 'Saving Private Ryan',
 'City of God',
 'Life Is Beautiful',
 'The Green Mile',
 'Terminator 2: Judgment Day',
 'Star Wars',
 'Back to the Future',
 'Spirited Away',
 'Spider-Man: Across the Spider-Verse',
 'The Pianist',
 'Parasite',
 'Psycho',
 'Gladiator',
 'The Lion King',
 'Léon',
 'The Departed',
 'American History X',
 'Whiplash',
 'The Prestige',
 'Hotaru no haka',
 'Seppuku',
 'The Usual

## Movie Rating

In [44]:
# For the movie rating it is locate in class 'ipc-rating-star--imdb' in attribute of 'aria-label'
# the Imdb rating is located in the last 3 index
movie_data[i].find('span', class_='ipc-rating-star--imdb')['aria-label']

'IMDb rating: 8.2'

In [45]:
vote = []
for i in range(len(movie_data)):
    like = movie_data[i].find('span', class_='ipc-rating-star--imdb')
    vote.append(like["aria-label"][-3:])# append the Imdb rating that is located in the last 3 index

In [46]:
vote

['9.3',
 '9.2',
 '9.0',
 '9.0',
 '9.0',
 '9.0',
 '9.0',
 '8.9',
 '8.9',
 '8.8',
 '8.8',
 '8.8',
 '8.8',
 '8.8',
 '8.7',
 '8.7',
 '8.7',
 '8.7',
 '8.6',
 '8.6',
 '8.7',
 '8.6',
 '8.6',
 '8.6',
 '8.6',
 '8.6',
 '8.6',
 '8.6',
 '8.6',
 '8.5',
 '8.6',
 '8.6',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.6',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '9.1',
 '8.5',
 '8.5',
 '8.4',
 '8.5',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.3',
 '8.3',
 '8.4',
 '8.4',
 '8.3',
 '8.4',
 '8.3',
 '8.4',
 '8.4',
 '8.3',
 '8.3',
 '8.4',
 '8.4',
 '8.3',
 '8.4',
 '8.4',
 '8.3',
 '8.4',
 '8.3',
 '8.4',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.4',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.2',
 '8.2',
 '8.3',
 '8.3',
 '8.3',
 '8.2',
 '8.2',
 '8.3',
 '8.2',


### The year, hours of the movie and the age rating

In [47]:
for i in movie_data[0].findAll('span',attrs={'class':'sc-be6f1408-8 fcCUPU cli-title-metadata-item'}):
    print(i) # to get the year, the hours and the age rating, those data located in class 'sc-be6f1408-8 fcCUPU cli-title-metadata-item'

<span class="sc-be6f1408-8 fcCUPU cli-title-metadata-item">1994</span>
<span class="sc-be6f1408-8 fcCUPU cli-title-metadata-item">2h 22m</span>
<span class="sc-be6f1408-8 fcCUPU cli-title-metadata-item">18+</span>


In [48]:
year = []
time = []
age_rating = []
for i in range(len(movie_data)):
    
    if len(movie_data[i].findAll('span',attrs={'class':'sc-be6f1408-8 fcCUPU cli-title-metadata-item'}))<3: # were found there is a data that doesn't have the age rating so it needs to be filled manually
        tahun,waktu = movie_data[i].findAll('span',attrs={'class':'sc-be6f1408-8 fcCUPU cli-title-metadata-item'})
        year.append(tahun.text)
        time.append(waktu.text)
        age_rating.append('12A')
    else:
        tahun,waktu,umur = movie_data[i].findAll('span',attrs={'class':'sc-be6f1408-8 fcCUPU cli-title-metadata-item'})
        year.append(tahun.text)
        time.append(waktu.text)
        age_rating.append(umur.text)

In [49]:
for i in range(0,250):
    print(f"{year[i]} {time[i]} {age_rating[i]}")

1994 2h 22m 18+
1972 2h 55m 18+
2008 2h 32m R
1974 3h 22m 18+
1957 1h 36m SU
1993 3h 15m R
2003 3h 21m A
1994 2h 34m 17+
2001 2h 58m R
1966 2h 41m D
1994 2h 22m 21
2002 2h 59m A
1999 2h 19m D17+
2010 2h 28m D
1980 2h 4m PG
1999 2h 16m 18+
1990 2h 25m 17+
1975 2h 13m R
1995 2h 7m 21
1946 2h 10m 13+
2014 2h 49m R
1954 3h 27m Not Rated
1991 1h 58m D
1998 2h 49m 18+
2002 2h 10m R
1997 1h 56m 21
1999 3h 9m 18+
1991 2h 17m 13+
1977 2h 1m 13+
1985 1h 56m 13+
2001 2h 5m 13+
2023 2h 20m SU
2002 2h 30m R
2019 2h 12m 17+
1960 1h 49m R
2000 2h 35m 16+
1994 1h 28m PG
1994 1h 50m R
2006 2h 31m R
1998 1h 59m R
2014 1h 46m 16+
2006 2h 10m 17+
1988 1h 29m Not Rated
1962 2h 13m Not Rated
1995 1h 46m R
1942 1h 42m PG
2011 1h 52m R
1936 1h 27m G
1988 2h 54m PG
1954 1h 52m PG
1968 2h 46m PG-13
2023 2h 27m 12A
1979 1h 57m 17+
1931 1h 27m G
1979 2h 27m R
2012 2h 45m 21+
2000 1h 53m R
2008 1h 38m SU
1981 1h 55m PG
2006 2h 17m R
1950 1h 50m Passed
1957 1h 28m Approved
2018 2h 29m 13+
2018 1h 57m SU
1980 2h 26m

### DataFrame

In [50]:
data = { # pack all of the data into one table or data frame
    'name' : movie_name,
    'year' : year,
    'duration':time,
    'rating' :age_rating,
    'rate' : vote,
}
df=pd.DataFrame(data)
df

Unnamed: 0,name,year,duration,rating,rate
0,The Shawshank Redemption,1994,2h 22m,18+,9.3
1,The Godfather,1972,2h 55m,18+,9.2
2,The Dark Knight,2008,2h 32m,R,9.0
3,The Godfather: Part II,1974,3h 22m,18+,9.0
4,12 Angry Men,1957,1h 36m,SU,9.0
...,...,...,...,...,...
245,The Help,2011,2h 26m,13+,8.1
246,It Happened One Night,1934,1h 45m,Passed,8.1
247,Les quatre cents coups,1959,1h 39m,Not Rated,8.1
248,Aladdin,1992,1h 30m,G,8.0


### Count total hours of all the movies

In [51]:
tiempo = 0
j=0
# in i am trying to calculate the total minutes of all the movies
for x in time:
    if x[1] == 'h': # as a sign to get the hours or the movie duration is more than 59 minutes, not 60 minutes because 60 minutes is written as 1 h 
        if len(x)>2: # it means that it has the minutes example: 2h 3m
            result = int(x[0])*60 +int (x[2:len(x)-1]) # X[0] is the hours that will be multiply by 60 and plus the minutes
            tiempo = tiempo + result
        else: # it means that it hasn't the minutes example: 2h 
            tiempo = tiempo + int(x[0])*60
            
    else: # the movie below than 60 minutes or 1 h
        tiempo= tiempo + int(x[:len(x)-1]) # just take the minutes and romove the unit example 59m, then just take the 59
print(f'{int(tiempo/60)}Hr {tiempo%60}m') # convert the total minutes into hours and the minutes

539Hr 57m
