In [19]:
#Project title: IMDb movie reviews - scraping and cleaning the movie ratings
#Project goal: Scrape movie reviews from IMDb and clean the data to prepare it for analysis

#MILESTONE 1 : DATASCRAPING
#importing libraries
import requests #to send http requests
from bs4 import BeautifulSoup #to do html content parsing
import pandas as pd # to work with tabular data
import json #to handle json data

#HTTP headers attached to the IMDb request made later in the notebook.
#The User-Agent is a desktop Chrome browser string, so the request looks like
#ordinary browser traffic instead of a script that the server might block.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    ),
}

In [20]:
#IMDb Top 250 movies page
url = "https://www.imdb.com/chart/top/"

#Sending a GET request to the IMDb Top 250 page to retrieve its HTML content for scraping.
#The timeout (seconds) makes a stalled connection raise requests.exceptions.Timeout
#instead of hanging the notebook kernel indefinitely.
response = requests.get(url, headers=headers, timeout=30)

#Checking for successful retrieval (HTTP 200); any other status is reported with its code
if response.status_code == 200:
    print ("Retrieved successfully.")
else:
    print (f"Error retrieving, status code: {response.status_code}")

Retrieved successfully.


In [21]:
#Parsing the downloaded HTML with BeautifulSoup
soup = BeautifulSoup(response.content, "html.parser")

#extracting the first <script type="application/ld+json"> element (the embedded JSON-LD)
json_data = soup.find("script", type="application/ld+json")

#loading json
#Stop here with a clear message when the script (or its text) is missing;
#otherwise `data` would stay undefined and the extraction step below would
#fail with an unrelated NameError.
if json_data is None or not json_data.string:
    raise ValueError(
        "No JSON-LD <script type=\"application/ld+json\"> content found in the response; "
        "the page layout may have changed or the request may have been blocked."
    )
data = json.loads(json_data.string)
    
#one list per output column; the loop below fills them in the order the items appear in the JSON
titles, urls, descriptions = [], [], []
best_ratings, worst_ratings, ratings = [], [], []
genres, durations = [], []

#walk the JSON-LD item list (when the key exists) and copy each movie's fields into the column lists
entries = data["itemListElement"] if "itemListElement" in data else []
for entry in entries:
    film = entry["item"]  #the movie record nested inside each list element
    titles.append(film["name"])
    urls.append(film["url"])
    descriptions.append(film["description"])
    score = film["aggregateRating"]  #nested dict holding the rating fields
    best_ratings.append(score["bestRating"])
    worst_ratings.append(score["worstRating"])
    ratings.append(float(score["ratingValue"]))  #stored as a float
    genres.append(film["genre"])
    durations.append(film["duration"])
        
#assembling the per-column lists into one table; each name becomes a column header
column_names = [
    "Title", "URL", "Description", "Best rating",
    "Worst rating", "Rating", "Genre", "Duration",
]
column_values = [
    titles, urls, descriptions, best_ratings,
    worst_ratings, ratings, genres, durations,
]
df = pd.DataFrame(dict(zip(column_names, column_values)))

#prepending a 1-based running number as the first column, named "ID"
df.insert(loc=0, column="ID", value=range(1, df.shape[0] + 1))

In [22]:
#previewing the first five rows of the scraped table
df.head(n=5)

Unnamed: 0,ID,Title,URL,Description,Best rating,Worst rating,Rating,Genre,Duration
0,1,The Shawshank Redemption,https://www.imdb.com/title/tt0111161/,A banker convicted of uxoricide forms a friend...,10,1,9.3,Drama,PT2H22M
1,2,The Godfather,https://www.imdb.com/title/tt0068646/,The aging patriarch of an organized crime dyna...,10,1,9.2,"Crime, Drama",PT2H55M
2,3,The Dark Knight,https://www.imdb.com/title/tt0468569/,When a menace known as the Joker wreaks havoc ...,10,1,9.0,"Action, Crime, Drama",PT2H32M
3,4,The Godfather Part II,https://www.imdb.com/title/tt0071562/,The early life and career of Vito Corleone in ...,10,1,9.0,"Crime, Drama",PT3H22M
4,5,12 Angry Men,https://www.imdb.com/title/tt0050083/,The jury in a New York City murder trial is fr...,10,1,9.0,"Crime, Drama",PT1H36M


In [23]:
#MILESTONE 2: DATA CLEANING AND PREPROCESSING
#counting the missing (NaN/None) entries in each column
df.isna().sum()

ID              0
Title           0
URL             0
Description     0
Best rating     0
Worst rating    0
Rating          0
Genre           0
Duration        0
dtype: int64

No missing values were found in any column of the dataset. This confirms that the scraped data is complete and does not require imputation or row removal.

In [24]:
#counting rows that exactly repeat an earlier row (the first occurrence is not counted)
df.duplicated(keep="first").sum()

0

No duplicate rows were found.

In [25]:
#Inspecting the dtype pandas assigned to each column of df;
#as the cell's last expression, the result is shown as the cell output
df.dtypes

ID                int64
Title            object
URL              object
Description      object
Best rating       int64
Worst rating      int64
Rating          float64
Genre            object
Duration         object
dtype: object

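The integer columns (ID, Best rating, Worst rating) and the float column (Rating) have the expected numeric types. Strings appear as `object` in pandas unless explicitly converted, so the remaining columns are also as expected and no data type errors were found.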

The clean dataset has been generated.

In [26]:
# writing the final table to a CSV file; index=False leaves the row index out of the file
output_file = "IMDb_movie_reviews.csv"
df.to_csv(output_file, index=False)

# confirmation message naming the file that was written
print(f"✅ Data successfully saved as '{output_file}'")

✅ Data successfully saved as 'IMDb_movie_reviews.csv'


Dataset exported and saved. 