In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from bs4 import BeautifulSoup
import requests

In [2]:
# Get information on top 100 movies of all time based on IMDB ratings.
url = 'https://www.imdb.com/search/title/?count=100&groups=top_1000&sort=user_rating'

# A timeout keeps this cell from hanging indefinitely if the server never answers.
source = requests.get(url, timeout=30)

# Stop here on an HTTP error status instead of parsing an error page, which
# would otherwise surface later as an unexplained empty result.
source.raise_for_status()

# Take html content of website and return as BeautifulSoup object.
soup = BeautifulSoup(source.content, 'html.parser')

In [3]:
# Empty lists that will hold the scraped data, one independent list per field.
name, year, genre = [], [], []
imdb_rating, metascore = [], []  # metascore: weighted average score of critics
runtime, number_votes, gross_earnings = [], [], []

In [4]:
# Select the <div> elements carrying the class string 'lister-item mode-advanced'.
movie_data = soup.find_all('div', attrs={'class': 'lister-item mode-advanced'})

In [5]:
# Check if number of extracted data points matches expected number.
# An empty result would otherwise flow silently through the rest of the
# notebook and produce an empty table, so fail loudly at this point instead.
assert movie_data, "No movie entries found: check the HTTP response and the class passed to find_all"
len(movie_data)

100

In [6]:
# Loop to extract relevant information from each data point, in text format.

# Empty the accumulator lists first so that re-running this cell does not
# append every movie a second time.
for column in (name, year, genre, imdb_rating, metascore,
               runtime, number_votes, gross_earnings):
    column.clear()

for store in movie_data:
    movie_name = store.h3.find('a').text
    name.append(movie_name)

    movie_year = store.h3.find('span', class_='lister-item-year text-muted unbold').text
    movie_year = movie_year.replace('(', '').replace(')', '')  # Delete parentheses around year
    year.append(movie_year)

    movie_genre = store.p.find('span', class_='genre').text.replace('\n', '')
    genre.append(movie_genre)

    movie_imdb_rating = store.find('div', class_='inline-block ratings-imdb-rating').text.replace('\n', '')
    imdb_rating.append(movie_imdb_rating)

    # Not every entry has a metascore: look the tag up once and fall back to
    # the 'na' placeholder when it is absent.
    metascore_tag = store.find('span', class_='metascore')
    movie_metascore = metascore_tag.text if metascore_tag is not None else 'na'
    movie_metascore = movie_metascore.replace(' ', '')
    metascore.append(movie_metascore)

    movie_runtime = store.p.find('span', class_='runtime').text
    movie_runtime = movie_runtime.replace(' min', '')
    runtime.append(movie_runtime)

    # The first 'nv' span holds the vote count.
    votes_and_earn = store.find_all('span', attrs={'name': 'nv'})
    movie_number_votes = votes_and_earn[0].text
    movie_number_votes = movie_number_votes.replace(',', '')
    number_votes.append(movie_number_votes)

    # Pick gross earnings by content rather than by the number of 'nv' spans,
    # so the value is kept for an entry that lists votes and gross only.
    # NOTE(review): assumes gross is always rendered with a leading '$', as in
    # the values this notebook displays (e.g. '$28.34M') -- confirm against the page.
    movie_gross_earnings = next(
        (span.text for span in votes_and_earn[1:] if span.text.strip().startswith('$')),
        'na',
    )
    gross_earnings.append(movie_gross_earnings)

In [7]:
# Assemble the scraped lists into one table, one column per field.
movie_df = pd.DataFrame(dict(
    name=name,
    year=year,
    genre=genre,
    imdb_rating=imdb_rating,
    metascore=metascore,
    runtime=runtime,
    number_votes=number_votes,
    gross_earnings=gross_earnings,
))

In [8]:
# Display the assembled DataFrame to eyeball the scraped values.
movie_df

Unnamed: 0,name,year,genre,imdb_rating,metascore,runtime,number_votes,gross_earnings
0,Jai Bhim,2021,"Crime, Drama",9.3,na,164,169810,na
1,The Shawshank Redemption,1994,Drama,9.3,80,142,2530884,$28.34M
2,The Godfather,1972,"Crime, Drama",9.2,100,175,1742398,$134.97M
3,Soorarai Pottru,2020,Drama,9.1,na,153,106492,na
4,The Dark Knight,2008,"Action, Crime, Drama",9.0,84,152,2481366,$534.86M
...,...,...,...,...,...,...,...,...
95,Vikram Vedha,2017,"Action, Crime, Drama",8.3,na,147,35593,na
96,Drishyam,2013,"Crime, Drama, Thriller",8.3,na,160,39233,na
97,The Hunt,2012,Drama,8.3,77,115,312264,$0.69M
98,A Separation,2011,Drama,8.3,95,123,235951,$7.10M


In [9]:
# Convert gross_earnings and metascore strings to (nullable) integers and
# reduce year to its four digits.

# Missing values were scraped as the placeholder 'na'. Map them to 0 so the
# numeric casts below succeed; the zeros become missing values at the end.
movie_df = movie_df.replace('na', 0)

# Rewrite e.g. '$28.34M' as '28.34e+06' so it parses as a float.
# The raw string avoids the invalid-escape warning that '\$' triggers.
movie_df['gross_earnings'] = movie_df['gross_earnings'].replace({
    r'\$': '',
    'M': 'e+06',
    'K': 'e+03',
}, regex=True)

# The year text can carry a roman-numeral prefix (e.g. 'I 2014'). Keeping only
# the four-digit year handles every numeral, whereas deleting just 'II ' and
# 'I ' left stray characters behind for 'III ' and beyond.
movie_df['year'] = movie_df['year'].astype(str).str.extract(r'(\d{4})', expand=False)

# Go through float first because the strings use exponent notation; 'Int32'
# is pandas' nullable integer dtype, so missing values can be represented.
movie_df['gross_earnings'] = movie_df['gross_earnings'].astype(float).astype('Int32')
movie_df['metascore'] = movie_df['metascore'].astype(float).astype('Int32')

# np.nan rather than np.NaN: the NaN alias was removed in NumPy 2.0.
movie_df = movie_df.replace(0, np.nan)

In [14]:
# Convert all other numerical columns to integers or floats for quantitative analysis.
# Each column is cast to str first and then to its target type, in this order.
_target_types = {
    'number_votes': int,
    'runtime': int,
    'imdb_rating': float,
    'year': int,
}
for _column, _dtype in _target_types.items():
    movie_df[_column] = movie_df[_column].astype(str).astype(_dtype)

In [17]:
# Save the cleaned table to 'top100.csv' in the working directory.
# No `index` argument is passed, so pandas' default for writing the row index applies.
movie_df.to_csv(path_or_buf='top100.csv')