### 📦 **Libraries Used for Scraping _IMDB_ 🎬**  

In [2]:
import pandas as pd 
import numpy as np
from numerize import numerize
import ast
import os
import sqlite3

### 📂 **Reading the CSV File**  

In [3]:
# Load the scraped IMDB dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/raw/data_scraping_imdb.csv')

In [None]:
# Inspect column names, dtypes and non-null counts of the raw frame before cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 190 entries, 0 to 189
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0          190 non-null    int64  
 1   movie_name          190 non-null    object 
 2   release_year        189 non-null    float64
 3   certificate         190 non-null    object 
 4   duration            189 non-null    object 
 5   rating              190 non-null    float64
 6   director            190 non-null    object 
 7   oscar_nominations   190 non-null    int64  
 8   genre               190 non-null    object 
 9   countries           190 non-null    object 
 10  language            190 non-null    object 
 11  production_company  190 non-null    object 
 12  budget              189 non-null    object 
 13  gross_worldwide     182 non-null    object 
dtypes: float64(2), int64(2), object(10)
memory usage: 20.9+ KB


In [None]:
# 'Unnamed: 0' is the integer column read from the CSV (see df.info() output);
# it is reused as the movie identifier. 'duration' is renamed here and converted
# to a number of minutes in a later cell.
df = df.rename(columns={
    'Unnamed: 0' :'movie_id',
    'duration' : 'duration_minutes'
})

# **Cleaning Data** 🎬  

In [None]:
def _parse_budget(raw):
    """Parse one scraped budget string into a dollar amount.

    The text '(estimated)', '$' and ',' are removed first. A trailing 'M' is
    read as "millions" and scales the number by 1,000,000.

    Returns:
        The amount in dollars as a float, or None when the value is missing or
        is not a plain number after the clean-up above.
    """
    if pd.isna(raw):
        return None

    text = str(raw).replace('(estimated)', '').replace('$', '').replace(',', '').strip()

    # Values written as "<n>M" must be scaled up to dollars here, because the
    # column is divided by 1,000,000 below like every other amount.
    scale = 1
    if text.endswith('M'):
        text = text[:-1].strip()
        scale = 1_000_000

    # Accept digits with at most one decimal point; anything else (for example
    # text that still carries a non-dollar currency symbol) is left unparsed.
    if text.replace('.', '', 1).isdigit():
        return float(text) * scale
    return None


# 'cleaned_budget' holds the parsed dollar amount; it is dropped in a later cell.
df['cleaned_budget'] = df['budget'].apply(_parse_budget)

# 'budget' is stored in millions of dollars; missing or unparsed values become 0.
df['budget'] = (df['cleaned_budget'].astype(float) / 1_000_000).fillna(0)

In [None]:
# Missing years are filled with 0 so the float column can be cast to int64;
# a release_year of 0 therefore stands for "unknown".
df['release_year'] = df['release_year'].fillna(0).astype('int64')

In [None]:
def clean_gross(value):
    """Convert a dollar-formatted gross string into millions.

    Args:
        value: Raw cell value, e.g. "$1,234,567".

    Returns:
        The amount divided by 1,000,000 and rounded to 2 decimals, or None
        when the value is missing, is not a string, or cannot be parsed as a
        number once '$' and ',' are removed.
    """
    is_usable = not pd.isna(value) and isinstance(value, str)
    if not is_usable:
        return None

    digits = value.strip()
    for symbol in ("$", ","):
        digits = digits.replace(symbol, "")

    try:
        millions = float(digits) / 1_000_000
    except ValueError:
        # Non-numeric leftovers cannot be converted.
        return None
    return round(millions, 2)
# Derive the gross in millions from the raw text column; values that
# clean_gross could not parse (returned as None) are stored as 0.
gross_millions = df['gross_worldwide'].apply(clean_gross)
df['gross_in_million'] = gross_millions.fillna(0)


In [None]:
# Remove the helper/raw columns that have been replaced by cleaned ones, then
# keep only the first row for each movie_name.
# NOTE(review): de-duplicating on the name alone would also drop distinct films
# that share a title - confirm this is intended.
df = (
    df
    .drop(columns=['cleaned_budget', 'gross_worldwide'])
    .drop_duplicates(subset=['movie_name'])
)

In [None]:
# Treat the literal string 'None' as a missing duration. np.nan is used as the
# replacement value because older pandas releases ignored an explicit None
# passed to replace() and forward-filled the previous value instead.
duration_text = df['duration_minutes'].replace('None', np.nan)

# Pull the "<n>h" and "<n>m" parts out of the text; a part that is absent
# (or a missing duration) counts as 0.
duration_hours = duration_text.str.extract(r'(\d+)h', expand=False).fillna(0).astype(int)
duration_mins = duration_text.str.extract(r'(\d+)m', expand=False).fillna(0).astype(int)

# Total running time in minutes, computed without temporary DataFrame columns.
df['duration_minutes'] = (duration_hours * 60) + duration_mins

In [None]:
# Re-inspect dtypes and non-null counts after the cleaning cells above.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 164 entries, 0 to 189
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   movie_id            164 non-null    int64  
 1   movie_name          164 non-null    object 
 2   release_year        164 non-null    int64  
 3   certificate         164 non-null    object 
 4   duration_minutes    164 non-null    int64  
 5   rating              164 non-null    float64
 6   director            164 non-null    object 
 7   oscar_nominations   164 non-null    int64  
 8   genre               164 non-null    object 
 9   countries           164 non-null    object 
 10  language            164 non-null    object 
 11  production_company  164 non-null    object 
 12  budget              164 non-null    float64
 13  gross_in_million    164 non-null    float64
dtypes: float64(3), int64(4), object(7)
memory usage: 19.2+ KB


In [None]:
# Summary of the still-stringified production_company column. In the output the
# most frequent value is the text '[]', i.e. an empty list stored as a string.
df['production_company'].describe()

count     164
unique     95
top        []
freq       40
Name: production_company, dtype: object

In [None]:
def convert_stringified_list(column):
    """Turn a Series of stringified Python literals into real objects.

    Args:
        column: pandas Series whose string cells hold Python literals
            (for example "['Drama', 'Crime']").

    Returns:
        A Series where every string cell has been evaluated with
        ast.literal_eval, non-string cells are passed through, and any falsy
        result (such as an empty list or None) is replaced by the string "None".
    """
    def _parse(cell):
        # Only strings are evaluated; everything else is kept as it is.
        if isinstance(cell, str):
            return ast.literal_eval(cell)
        return cell

    def _label_empty(cell):
        return cell if cell else "None"

    parsed = column.apply(_parse)
    return parsed.apply(_label_empty)


In [None]:
# Columns whose cells are lists stored as text in the CSV.
colums_to_convert = ['genre','countries','language','production_company']

# Replace each of those columns with its parsed version in a single step.
df = df.assign(**{
    name: convert_stringified_list(df[name])
    for name in colums_to_convert
})

## **Checking cleaned data**

In [None]:
# Note: earlier cells replaced missing values with sentinels (0 for budget,
# release_year, gross_in_million and duration parts; the string "None" for empty
# list columns), so zeros here do not mean the scraped data was complete.
print(df.isnull().sum())

movie_id              0
movie_name            0
release_year          0
certificate           0
duration_minutes      0
rating                0
director              0
oscar_nominations     0
genre                 0
countries             0
language              0
production_company    0
budget                0
gross_in_million      0
dtype: int64


# **Normalization**

In [None]:
# Build one long-format frame per list column: each list element gets its own
# row, paired with the movie_id it belongs to.
df_genre, df_countries, df_language, df_production = (
    df[['movie_id', list_column]].explode(list_column)
    for list_column in ('genre', 'countries', 'language', 'production_company')
)

## **Converting to CSV File** 📂

In [None]:
# The CSV exports below are commented out, so this cell currently does nothing.
# df_genre.to_csv("../data/raw/genre.csv")
# df_countries.to_csv("../data/raw/countries.csv")
# df_language.to_csv("../data/raw/language.csv")
# df_production.to_csv("../data/raw/production_company.csv")

In [None]:
data_folder = os.path.join('../data/')
%load_ext sql
%config sqlMagic.autocommit=True
connection = sqlite3.connect(os.path.join(data_folder,'./IMDB.db'))
%sql sqlite:///../data/IMDB.db --alias imdb

In [None]:
# Cursor on the sqlite3 connection.
# NOTE(review): no cell visible in this part of the notebook uses it - confirm it is still needed.
cursor = connection.cursor()

## Creating Tables for **Normalization**

In [None]:
%%sql

CREATE TABLE IF NOT EXISTS imdb_movies (
    movie_id          INTEGER PRIMARY KEY,
    movie_name        VARCHAR(200),
    release_year      INTEGER,
    certificate       VARCHAR(20),
    duration_minutes  INTEGER,
    rating            REAL,
    director          VARCHAR(100),
    oscar_nominations INTEGER,
    budget            DECIMAL(10,2),
    gross_in_million  DECIMAL(10,2)
);

-- The tables below hold the exploded list columns and point back to
-- imdb_movies through movie_id.

CREATE TABLE IF NOT EXISTS genres (
    movie_id INTEGER,
    genre    VARCHAR(150),
    FOREIGN KEY (movie_id) REFERENCES imdb_movies (movie_id)
);

CREATE TABLE IF NOT EXISTS countries (
    movie_id  INTEGER,
    countries VARCHAR(150),
    FOREIGN KEY (movie_id) REFERENCES imdb_movies (movie_id)
);

CREATE TABLE IF NOT EXISTS language (
    movie_id INTEGER,
    language VARCHAR(100),
    FOREIGN KEY (movie_id) REFERENCES imdb_movies (movie_id)
);

CREATE TABLE IF NOT EXISTS production_companies (
    movie_id           INTEGER,
    production_company VARCHAR(200),
    FOREIGN KEY (movie_id) REFERENCES imdb_movies (movie_id)
);



In [None]:
# Columns that belong to the imdb_movies table (the list columns go to their own tables).
movie_columns = ['movie_id', 'movie_name', 'release_year', 'certificate', 'duration_minutes',
                 'rating', 'director', 'oscar_nominations', 'budget', 'gross_in_million']
movies = df[movie_columns]

# Make the cell re-runnable: movie_id is the primary key, so a plain append
# fails once the rows exist. Rows for the ids about to be written are removed
# first; rows for any other movie_id are left untouched.
movies_table_exists = connection.execute(
    "SELECT 1 FROM sqlite_master WHERE type = 'table' AND name = ?", ('imdb_movies',)
).fetchone() is not None
if movies_table_exists:
    with connection:  # commits the deletes on success
        connection.executemany(
            'DELETE FROM imdb_movies WHERE movie_id = ?',
            # int() because sqlite3 cannot bind numpy integer types directly
            [(int(movie_id),) for movie_id in movies['movie_id']],
        )

movies.to_sql('imdb_movies', connection, if_exists='append', index=False)

164

## _Inserting values_ into the **tables**

In [None]:
def _replace_movie_rows(frame, table):
    """Append `frame` to `table`, replacing rows already stored for the same movie_ids.

    The target tables have no key of their own, so a plain append would store
    every row again each time the notebook is re-run. Existing rows for the
    movie_ids in `frame` are deleted first; other rows are left untouched.

    Args:
        frame: DataFrame with a 'movie_id' column plus the value column.
        table: Name of the target table (one of the fixed names used below).
    """
    table_exists = connection.execute(
        "SELECT 1 FROM sqlite_master WHERE type = 'table' AND name = ?", (table,)
    ).fetchone() is not None
    if table_exists:
        # int() because sqlite3 cannot bind numpy integer types directly
        movie_ids = [(int(movie_id),) for movie_id in frame['movie_id'].unique()]
        with connection:  # commits the deletes on success
            connection.executemany(f'DELETE FROM "{table}" WHERE movie_id = ?', movie_ids)
    frame.to_sql(table, connection, if_exists='append', index=False)


_replace_movie_rows(df_genre, 'genres')
_replace_movie_rows(df_countries, 'countries')
_replace_movie_rows(df_language, 'language')
_replace_movie_rows(df_production, 'production_companies')