In [9]:
import pandas as pd
import numpy as np

import requests
from requests import get
from bs4 import BeautifulSoup

from time import sleep
from random import randint

In [28]:
# Creating the lists we want to write into
titles = []
years = []
time = []
imdb_ratings = []
metascores = []
votes = []
us_gross = []

In [29]:
# Getting English translated titles from the movies
headers = {'Accept-Language': 'en-US, en;q=0.5'}

In [33]:
pages = np.arange(1, 301, 100)
pages

array([  1, 101, 201])

In [34]:
# Storing each of the urls of 50 movies 
for page in pages:
    # Getting the contents from the each url
#     page = requests.get('https://www.imdb.com/list/ls009318269/?st_dt=&mode=detail&page=' + str(page) + '&ref_=adv_nxt', headers=headers)
    page = requests.get('https://www.imdb.com/search/title/?groups=top_1000&start=' + str(page) + '&ref_=adv_nxt', headers=headers)
    soup = BeautifulSoup(page.text, 'html.parser')
    
    # Aiming the part of the html we want to get the information from
    movie_div = soup.find_all('div', class_='lister-item mode-advanced')
    
    # Controling the loop’s rate by pausing the execution of the loop for a specified amount of time
    # Waiting time between requests for a number between 2-10 seconds
    sleep(randint(2,10))
    
    for container in movie_div:
        # Scraping the movie's name
        name = container.h3.a.text
        titles.append(name)
        
        # Scraping the movie's year
        year = container.h3.find('span', class_='lister-item-year').text
        years.append(year)
        
        # Scraping the movie's length
        runtime = container.find('span', class_='runtime').text if container.p.find('span', class_='runtime') else '-'
        time.append(runtime)
        
        # Scraping the rating
        imdb = float(container.strong.text)
        imdb_ratings.append(imdb)
        
        # Scraping the metascore
        m_score = container.find('span', class_='metascore').text if container.find('span', class_='metascore') else '-'
        metascores.append(m_score)
        
        # Scraping votes and gross earnings
        nv = container.find_all('span', attrs={'name':'nv'})
        vote = nv[0].text
        votes.append(vote)
        grosses = nv[1].text if len(nv) > 1 else '-'
        us_gross.append(grosses)

In [35]:
movies = pd.DataFrame({'movie':titles,
                       'year':years,
                       'time_minute':time,
                       'imdb_rating':imdb_ratings,
                       'metascore':metascores,
                       'vote':votes,
                       'gross_earning':us_gross})

movies.head()

Unnamed: 0,movie,year,time_minute,imdb_rating,metascore,vote,gross_earning


In [36]:
movies.dtypes

movie            float64
year             float64
time_minute      float64
imdb_rating      float64
metascore        float64
vote             float64
gross_earning    float64
dtype: object

In [37]:
# Cleaning 'year' column
movies['year'] = movies['year'].str.extract('(\d+)').astype(int)
movies.head(3)

AttributeError: Can only use .str accessor with string values!

In [19]:
# Cleaning 'time_minute' column
movies['time_minute'] = movies['time_minute'].str.extract('(\d+)').astype(int)
movies.head(3)

Unnamed: 0,movie,year,time_minute,imdb_rating,metascore,vote,gross_earning
0,Zack Snyder's Justice League,2021,242,8.2,54,258456,-
1,The Ten Commandments,1956,220,7.9,-,65015,$93.74M
2,The Father,2020,97,8.3,88,27678,-


In [20]:
# Cleaning 'metascore' column
movies['metascore'] = movies['metascore'].str.extract('(\d+)')
# convert it to float and if there are dashes turn it into NaN
movies['metascore'] = pd.to_numeric(movies['metascore'], errors='coerce')

In [21]:
# Cleaning 'vote' column
movies['vote'] = movies['vote'].str.replace(',', '').astype(int)
movies.head(3)

Unnamed: 0,movie,year,time_minute,imdb_rating,metascore,vote,gross_earning
0,Zack Snyder's Justice League,2021,242,8.2,54.0,258456,-
1,The Ten Commandments,1956,220,7.9,,65015,$93.74M
2,The Father,2020,97,8.3,88.0,27678,-


In [22]:
# Cleaning 'gross_earning' column
# left strip $ and right strip M 
movies['gross_earning'] = movies['gross_earning'].map(lambda x: x.lstrip('$').rstrip('M'))
# convert it to float and if there are dashes turn it into NaN
movies['gross_earning'] = pd.to_numeric(movies['gross_earning'], errors='coerce')
movies.head(3)

Unnamed: 0,movie,year,time_minute,imdb_rating,metascore,vote,gross_earning
0,Zack Snyder's Justice League,2021,242,8.2,54.0,258456,
1,The Ten Commandments,1956,220,7.9,,65015,93.74
2,The Father,2020,97,8.3,88.0,27678,


In [23]:
movies.dtypes

movie             object
year               int64
time_minute        int64
imdb_rating      float64
metascore        float64
vote               int64
gross_earning    float64
dtype: object

In [24]:
movies

Unnamed: 0,movie,year,time_minute,imdb_rating,metascore,vote,gross_earning
0,Zack Snyder's Justice League,2021,242,8.2,54.0,258456,
1,The Ten Commandments,1956,220,7.9,,65015,93.74
2,The Father,2020,97,8.3,88.0,27678,
3,Sleepers,1996,147,7.6,49.0,193257,49.10
4,Harry Potter and the Half-Blood Prince,2009,153,7.6,78.0,482370,301.96
...,...,...,...,...,...,...,...
345,The Others,2001,104,7.6,74.0,341120,96.52
346,Just Mercy,2019,137,7.6,68.0,49938,
347,3 Idiots,2009,170,8.4,67.0,350228,6.53
348,Some Like It Hot,1959,121,8.2,98.0,246804,25.00


In [25]:
movies.to_csv('movies_test.csv')

In [38]:
df = pd.read_csv('movies_test.csv')

In [39]:
df

Unnamed: 0.1,Unnamed: 0,movie,year,time_minute,imdb_rating,metascore,vote,gross_earning
0,0,Zack Snyder's Justice League,2021,242,8.2,54.0,258456,
1,1,The Ten Commandments,1956,220,7.9,,65015,93.74
2,2,The Father,2020,97,8.3,88.0,27678,
3,3,Sleepers,1996,147,7.6,49.0,193257,49.10
4,4,Harry Potter and the Half-Blood Prince,2009,153,7.6,78.0,482370,301.96
...,...,...,...,...,...,...,...,...
345,345,The Others,2001,104,7.6,74.0,341120,96.52
346,346,Just Mercy,2019,137,7.6,68.0,49938,
347,347,3 Idiots,2009,170,8.4,67.0,350228,6.53
348,348,Some Like It Hot,1959,121,8.2,98.0,246804,25.00


In [40]:
df.head(10)

Unnamed: 0.1,Unnamed: 0,movie,year,time_minute,imdb_rating,metascore,vote,gross_earning
0,0,Zack Snyder's Justice League,2021,242,8.2,54.0,258456,
1,1,The Ten Commandments,1956,220,7.9,,65015,93.74
2,2,The Father,2020,97,8.3,88.0,27678,
3,3,Sleepers,1996,147,7.6,49.0,193257,49.1
4,4,Harry Potter and the Half-Blood Prince,2009,153,7.6,78.0,482370,301.96
5,5,Sound of Metal,2019,120,7.8,82.0,59593,
6,6,The Trial of the Chicago 7,2020,129,7.8,76.0,128456,
7,7,Avengers: Endgame,2019,181,8.4,78.0,854754,858.37
8,8,The Godfather,1972,175,9.2,100.0,1645702,134.97
9,9,Saving Private Ryan,1998,169,8.6,91.0,1252798,216.54


In [41]:
df.shape[0]

350

In [45]:
df.sort_values(by='imdb_rating', ascending=False)

Unnamed: 0.1,Unnamed: 0,movie,year,time_minute,imdb_rating,metascore,vote,gross_earning
165,165,The Shawshank Redemption,1994,142,9.3,80.0,2376024,28.34
15,15,The Shawshank Redemption,1994,142,9.3,80.0,2376024,28.34
8,8,The Godfather,1972,175,9.2,100.0,1645702,134.97
158,158,The Godfather,1972,175,9.2,100.0,1645702,134.97
13,13,The Dark Knight,2008,152,9.0,84.0,2339823,534.86
...,...,...,...,...,...,...,...,...
315,315,Apollo 13,1995,140,7.6,77.0,272910,173.84
154,154,Harry Potter and the Half-Blood Prince,2009,153,7.6,78.0,482370,301.96
292,292,Kick-Ass,2010,117,7.6,66.0,528696,48.07
11,11,Harry Potter and the Sorcerer's Stone,2001,152,7.6,64.0,669351,317.58


In [46]:
df.sort_values(by='metascore', ascending=False)

Unnamed: 0.1,Unnamed: 0,movie,year,time_minute,imdb_rating,metascore,vote,gross_earning
158,158,The Godfather,1972,175,9.2,100.0,1645702,134.97
8,8,The Godfather,1972,175,9.2,100.0,1645702,134.97
348,348,Some Like It Hot,1959,121,8.2,98.0,246804,25.00
318,318,Seven Samurai,1954,207,8.6,98.0,319687,0.27
132,132,Pan's Labyrinth,2006,118,8.2,98.0,624766,37.63
...,...,...,...,...,...,...,...,...
333,333,Fear and Loathing in Las Vegas,1998,118,7.6,41.0,262033,10.68
1,1,The Ten Commandments,1956,220,7.9,,65015,93.74
106,106,The Invisible Guest,2016,106,8.1,,149449,
151,151,The Ten Commandments,1956,220,7.9,,65015,93.74


In [44]:
df.sort_values(by='gross_earning', ascending=False)

Unnamed: 0.1,Unnamed: 0,movie,year,time_minute,imdb_rating,metascore,vote,gross_earning
7,7,Avengers: Endgame,2019,181,8.4,78.0,854754,858.37
157,157,Avengers: Endgame,2019,181,8.4,78.0,854754,858.37
170,170,Avengers: Infinity War,2018,149,8.4,68.0,864188,678.82
20,20,Avengers: Infinity War,2018,149,8.4,68.0,864188,678.82
28,28,Titanic,1997,194,7.8,75.0,1059437,659.33
...,...,...,...,...,...,...,...,...
162,162,Another Round,2020,117,7.8,80.0,58594,
171,171,The Gentlemen,2019,113,7.8,51.0,256382,
256,256,The Invisible Guest,2016,106,8.1,,149449,
311,311,Giant,1956,201,7.6,84.0,34566,
