In [47]:
## importing modules

In [2]:
import os
import re
import time
from urllib.parse import quote_plus

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from sqlalchemy import create_engine

In [76]:
# function to click the show more button

In [3]:
def show_all_data(driver):
    """Keep pressing the "see more" button until it can no longer be pressed.

    Every pass waits up to 10 seconds for an element with the class
    ``ipc-see-more__button`` to become clickable, scrolls it into view, pauses
    for one second and then clicks it through JavaScript.  The first exception
    of any kind raised during a pass (presumably the wait timing out once no
    button is left) ends the loop.  Nothing is returned.
    """
    more_results_pending = True
    while more_results_pending:
        try:
            # Block until the button is present and clickable.
            button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable(
                    (By.CLASS_NAME, "ipc-see-more__button")
                )
            )
            # Bring the button into the viewport before clicking it.
            driver.execute_script("arguments[0].scrollIntoView(true);", button)
            time.sleep(1)
            # Click via JavaScript rather than a native click.
            driver.execute_script("arguments[0].click();", button)
        except Exception:
            more_results_pending = False

In [6]:
# Converts abbreviated counts such as "(59K)" or "1.2M" into plain numbers

In [8]:
def convert_to_number(st):
    """Convert an abbreviated count such as "(59K)" or "1.2M" to a float.

    Parentheses, thousands separators and surrounding whitespace are removed
    first.  A trailing K, M or B (upper or lower case) multiplies the remaining
    number by one thousand, one million or one billion respectively.

    Args:
        st: Text of the count, e.g. "(225K)", "865" or "1,234".

    Returns:
        float: The expanded count.

    Raises:
        ValueError: If nothing is left after cleaning (e.g. an empty string)
            or the remaining text is not a valid number.
    """
    num_notes = {"K": 1_000, "M": 1_000_000, "B": 1_000_000_000}
    cleaned = st.replace("(", "").replace(")", "").replace(",", "").strip()
    if not cleaned:
        # Indexing cleaned[-1] on an empty string would raise an opaque IndexError.
        raise ValueError(f"no numeric value in {st!r}")
    suffix = cleaned[-1].upper()
    if suffix in num_notes:
        return float(cleaned[:-1]) * num_notes[suffix]
    return float(cleaned)

In [10]:
# Base IMDb title-search URL: feature films released during 2024.
# The genre slug is appended directly after "genres=".
url = (
    "https://www.imdb.com/search/title/"
    "?title_type=feature"
    "&release_date=2024-01-01,2024-12-31"
    "&genres="
)

In [12]:
# Function to get the data of movies based on the genre

In [18]:
def get_movies_by_genre(genre):
    """Scrape the IMDb search results for one genre into a list of dicts.

    Opens a Chrome session on ``url`` + ``genre``, expands the result list with
    ``show_all_data`` and reads name, rating, vote count and duration from each
    result card.  Cards missing any of those elements are skipped; the number
    of skipped cards is printed once the loop finishes.  The browser is closed
    even when scraping fails part-way through.

    Args:
        genre: Genre slug appended to the search URL, e.g. ``"action"``.

    Returns:
        list[dict]: One record per scraped card with the keys ``Movie_Name``,
        ``Rating`` (float), ``Voting_Count`` (float), ``Duration`` (the raw
        element text) and ``Genre`` (``genre.capitalize()``).
    """
    # Array to store movies
    movies_arr = []
    files_not_found = 0

    # Creating a Chrome driver
    driver = webdriver.Chrome()
    try:
        driver.get(f'{url}{genre}')
        # Expand the full result list before reading any card
        show_all_data(driver)
        time.sleep(3)

        # Collecting all movie containers
        movies = driver.find_elements(By.CLASS_NAME, "ipc-metadata-list-summary-item")

        # Looping through movie containers to get their respective names and info
        for movie in movies:
            try:
                name_el = movie.find_element(By.CLASS_NAME, "ipc-title__text")
                rating_el = movie.find_element(By.CLASS_NAME, "ipc-rating-star--rating")
                votes_el = movie.find_element(By.CLASS_NAME, "ipc-rating-star--voteCount")
                # NOTE(review): ".jttFlJ" looks like a build-generated class name
                # and may change between site releases - confirm it still matches.
                duration_el = movie.find_element(By.CSS_SELECTOR, ".jttFlJ:nth-of-type(2)")

                driver.execute_script("arguments[0].scrollIntoView(true);", movie)

                # The title text is expected to look like "1. Movie Name".  Cut
                # only at the first ". " so titles that contain ". " themselves
                # (e.g. "Mr. Smith") keep it; text without ". " yields "".
                _, _, title = name_el.text.partition(". ")
                conv_voting_count = convert_to_number(votes_el.text)

                # Store the info as a dictionary and append it to the movies array
                movies_arr.append({
                    "Movie_Name": title,
                    "Rating": float(rating_el.text),
                    "Voting_Count": conv_voting_count,
                    "Duration": duration_el.text,
                    "Genre": genre.capitalize()
                })
            except NoSuchElementException:
                # A card without one of the four elements is skipped and counted
                files_not_found += 1
                continue
        print(f" files not found - {files_not_found}")
    finally:
        # Always release the browser, including when a conversion above raises
        driver.quit()
    # Returns the movie array
    return movies_arr

In [28]:
def Convert_Duration(string):
    """Convert a duration such as "1h 58m" to a total number of minutes.

    Args:
        string: Duration text with an optional "<n>h" part and an optional
            "<n>m" part.  A value that is not a string (for instance a minute
            count produced by an earlier run of the conversion cell) is
            returned as ``int(string)`` so re-applying the function is safe.

    Returns:
        int: ``hours * 60 + minutes``; a part that is absent counts as 0.
    """
    if not isinstance(string, str):
        return int(string)

    # Search directly instead of testing for the letter first: text containing
    # "h" or "m" without preceding digits simply yields no match.
    hours_match = re.search(r'(\d+)h', string)
    minutes_match = re.search(r"(\d+)m", string)

    hours = int(hours_match.group(1)) if hours_match else 0
    minutes = int(minutes_match.group(1)) if minutes_match else 0
    return hours * 60 + minutes

In [None]:
# Scrape the adventure listing; get_movies_by_genre opens its own Chrome session.
adventure_movies=get_movies_by_genre('adventure')

In [50]:
# Scrape the action listing; the printed number is the count of skipped result cards.
action_movies=get_movies_by_genre('action')

 files not found - 531


In [59]:
# Scrape the comedy listing; the printed number is the count of skipped result cards.
comedy_movies=get_movies_by_genre('comedy')

 files not found - 1116


In [16]:
# Scrape the drama listing; the printed number is the count of skipped result cards.
drama_movies=get_movies_by_genre('drama')

 files not found - 3116


In [63]:
# Scrape the crime listing; the printed number is the count of skipped result cards.
crime_movies=get_movies_by_genre('crime')

 files not found - 382


In [65]:
# Turn the list of scraped adventure records into a DataFrame and preview the first rows.
adventure_movies_df=pd.DataFrame(adventure_movies)
adventure_movies_df.head()

Unnamed: 0,Movie_Name,Rating,Voting_Count,Duration,Genre
0,Mufasa: The Lion King,6.6,59000.0,1h 58m,adventure
1,Moana 2,6.6,98000.0,1h 40m,adventure
2,Kraven the Hunter,5.5,52000.0,2h 7m,adventure
3,Flow,7.9,70000.0,1h 25m,adventure
4,Gladiator II,6.5,225000.0,2h 28m,adventure


In [48]:
# Replace the "1h 58m"-style text with total minutes.
# NOTE: the column is overwritten in place, so re-running this cell feeds the
# already-converted values back into Convert_Duration.
adventure_movies_df["Duration"]=adventure_movies_df["Duration"].apply(Convert_Duration)
adventure_movies_df.head()

Unnamed: 0,Movie_Name,Rating,Voting_Count,Duration,Genre
0,Mufasa: The Lion King,6.6,59000.0,118,adventure
1,Moana 2,6.6,98000.0,100,adventure
2,Kraven the Hunter,5.5,52000.0,127,adventure
3,Flow,7.9,70000.0,85,adventure
4,Gladiator II,6.5,225000.0,148,adventure


In [20]:
# Inspect column dtypes and non-null counts of the adventure frame.
adventure_movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 390 entries, 0 to 389
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Movie_Name    390 non-null    object 
 1   Rating        390 non-null    float64
 2   Voting_Count  390 non-null    float64
 3   Duration      390 non-null    object 
 4   Genre         390 non-null    object 
dtypes: float64(2), object(3)
memory usage: 15.4+ KB


In [56]:
# Save the adventure frame as a comma-separated UTF-8 file without the index column.
adventure_movies_df.to_csv("adventure_movies.csv",index=False,encoding="utf-8",sep=",")

In [71]:
# Turn the list of scraped action records into a DataFrame and preview the first rows.
action_movies_df=pd.DataFrame(action_movies)
action_movies_df.head()

Unnamed: 0,Movie_Name,Rating,Voting_Count,Duration,Genre
0,Kraven the Hunter,5.5,52000.0,2h 7m,action
1,Gladiator II,6.5,225000.0,2h 28m,action
2,Twisters,6.5,165000.0,2h 2m,action
3,Sonic the Hedgehog 3,6.9,57000.0,1h 50m,action
4,Venom: The Last Dance,6.0,114000.0,1h 50m,action


In [56]:
# Replace the "1h 58m"-style text with total minutes.
# NOTE: the column is overwritten in place, so re-running this cell feeds the
# already-converted values back into Convert_Duration.
action_movies_df["Duration"]=action_movies_df["Duration"].apply(Convert_Duration)
action_movies_df.head()

Unnamed: 0,Movie_Name,Rating,Voting_Count,Duration,Genre
0,Kraven the Hunter,5.5,52000.0,127,action
1,Gladiator II,6.5,225000.0,148,action
2,Twisters,6.5,165000.0,122,action
3,Sonic the Hedgehog 3,6.9,57000.0,110,action
4,Venom: The Last Dance,6.0,114000.0,110,action
...,...,...,...,...,...
803,Anger,8.8,18.0,83,action
804,Gadha17,8.0,11.0,90,action
805,Mandya Haida,6.2,236.0,139,action
806,Bring Back the Colour,5.9,7.0,47,action


In [73]:
# Inspect column dtypes and non-null counts of the action frame.
action_movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 808 entries, 0 to 807
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Movie_Name    808 non-null    object 
 1   Rating        808 non-null    float64
 2   Voting_Count  808 non-null    float64
 3   Duration      808 non-null    object 
 4   Genre         808 non-null    object 
dtypes: float64(2), object(3)
memory usage: 31.7+ KB


In [58]:
# Save the action frame as a comma-separated UTF-8 file without the index column.
action_movies_df.to_csv('action_movies.csv',index=False,encoding="utf-8",sep=',')

In [77]:
# Turn the list of scraped comedy records into a DataFrame and preview the first rows.
comedy_movies_df=pd.DataFrame(comedy_movies)
comedy_movies_df.head()

Unnamed: 0,Movie_Name,Rating,Voting_Count,Duration,Genre
0,Anora,7.5,176000.0,2h 19m,comedy
1,Moana 2,6.6,98000.0,1h 40m,comedy
2,A Real Pain,7.1,85000.0,1h 30m,comedy
3,Sonic the Hedgehog 3,6.9,57000.0,1h 50m,comedy
4,Y2K,4.8,13000.0,1h 31m,comedy


In [62]:
# Replace the "1h 58m"-style text with total minutes.
# NOTE: the column is overwritten in place, so re-running this cell feeds the
# already-converted values back into Convert_Duration.
comedy_movies_df["Duration"]=comedy_movies_df["Duration"].apply(Convert_Duration)
comedy_movies_df.head()

Unnamed: 0,Movie_Name,Rating,Voting_Count,Duration,Genre
0,Anora,7.5,176000.0,139,comedy
1,Moana 2,6.6,98000.0,100,comedy
2,A Real Pain,7.1,85000.0,90,comedy
3,Sonic the Hedgehog 3,6.9,57000.0,110,comedy
4,Y2K,4.8,13000.0,91,comedy


In [81]:
# Inspect column dtypes and non-null counts of the comedy frame.
comedy_movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1734 entries, 0 to 1733
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Movie_Name    1734 non-null   object 
 1   Rating        1734 non-null   float64
 2   Voting_Count  1734 non-null   float64
 3   Duration      1734 non-null   object 
 4   Genre         1734 non-null   object 
dtypes: float64(2), object(3)
memory usage: 67.9+ KB


In [60]:
# Save the comedy frame as a comma-separated UTF-8 file without the index column.
comedy_movies_df.to_csv('comedy_movies.csv',index=False,encoding="utf-8",sep=',')

In [22]:
# Turn the list of scraped drama records into a DataFrame and preview the first rows.
# The source list is `drama_movies` (the result of the drama scrape cell); the
# previously referenced `scifi_movies` is never assigned in this notebook.
drama_movies_df=pd.DataFrame(drama_movies)
drama_movies_df.head()

Unnamed: 0,Movie_Name,Rating,Voting_Count,Duration,Genre
0,Anora,7.5,177000.0,2h 19m,drama
1,Mufasa: The Lion King,6.6,60000.0,1h 58m,drama
2,A Complete Unknown,7.4,71000.0,2h 21m,drama
3,The Substance,7.3,294000.0,2h 21m,drama
4,The Friend,6.7,865.0,1h 59m,drama


In [24]:
# Inspect column dtypes and non-null counts of the drama frame.
drama_movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3176 entries, 0 to 3175
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Movie_Name    3176 non-null   object 
 1   Rating        3176 non-null   float64
 2   Voting_Count  3176 non-null   float64
 3   Duration      3176 non-null   object 
 4   Genre         3176 non-null   object 
dtypes: float64(2), object(3)
memory usage: 124.2+ KB


In [30]:
# Replace the "1h 58m"-style text with total minutes.
# NOTE: the column is overwritten in place, so re-running this cell feeds the
# already-converted values back into Convert_Duration.
drama_movies_df["Duration"]=drama_movies_df["Duration"].apply(Convert_Duration)
drama_movies_df.head()

Unnamed: 0,Movie_Name,Rating,Voting_Count,Duration,Genre
0,Anora,7.5,177000.0,139,drama
1,Mufasa: The Lion King,6.6,60000.0,118,drama
2,A Complete Unknown,7.4,71000.0,141,drama
3,The Substance,7.3,294000.0,141,drama
4,The Friend,6.7,865.0,119,drama


In [62]:
# Save the drama frame as a comma-separated UTF-8 file without the index column.
drama_movies_df.to_csv('drama_movies.csv',index=False,encoding="utf-8",sep=',')

In [89]:
# Turn the list of scraped crime records into a DataFrame and preview the first rows.
crime_movies_df=pd.DataFrame(crime_movies)
crime_movies_df.head()

Unnamed: 0,Movie_Name,Rating,Voting_Count,Duration,Genre
0,Trap,5.8,136000.0,1h 45m,crime
1,Pushpa: The Rule - Part 2,6.1,55000.0,3h 21m,crime
2,Longlegs,6.6,183000.0,1h 41m,crime
3,Freaky Tales,7.2,906.0,1h 46m,crime
4,Juror #2,7.0,95000.0,1h 54m,crime


In [91]:
# Inspect column dtypes and non-null counts of the crime frame.
crime_movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 593 entries, 0 to 592
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Movie_Name    593 non-null    object 
 1   Rating        593 non-null    float64
 2   Voting_Count  593 non-null    float64
 3   Duration      593 non-null    object 
 4   Genre         593 non-null    object 
dtypes: float64(2), object(3)
memory usage: 23.3+ KB


In [74]:
# Replace the "1h 58m"-style text with total minutes.
# NOTE: the column is overwritten in place, so re-running this cell feeds the
# already-converted values back into Convert_Duration.
crime_movies_df["Duration"]=crime_movies_df["Duration"].apply(Convert_Duration)
crime_movies_df.head()

Unnamed: 0,Movie_Name,Rating,Voting_Count,Duration,Genre
0,Trap,5.8,136000.0,105,crime
1,Pushpa: The Rule - Part 2,6.1,55000.0,201,crime
2,Longlegs,6.6,183000.0,101,crime
3,Freaky Tales,7.2,906.0,106,crime
4,Juror #2,7.0,95000.0,114,crime


In [64]:
# Save the crime frame as a comma-separated UTF-8 file without the index column.
crime_movies_df.to_csv('crime_movies.csv',index=False,encoding="utf-8",sep=',')

In [66]:
# Stack the five per-genre frames into one table with a fresh 0..n-1 index,
# then check the combined dtypes and row count.
movies = pd.concat(
    objs=[
        adventure_movies_df,
        action_movies_df,
        comedy_movies_df,
        drama_movies_df,
        crime_movies_df,
    ],
    ignore_index=True,
)
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6701 entries, 0 to 6700
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Movie_Name    6701 non-null   object 
 1   Rating        6701 non-null   float64
 2   Voting_Count  6701 non-null   float64
 3   Duration      6701 non-null   int64  
 4   Genre         6701 non-null   object 
dtypes: float64(2), int64(1), object(2)
memory usage: 261.9+ KB


In [68]:
# Keep one row per combination of the first four columns (everything except
# Genre).  The first occurrence wins, so a title that appears under several
# genres keeps the Genre of its first row.
movies = movies.drop_duplicates(subset=list(movies.columns[:4]), keep="first")
movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5444 entries, 0 to 6700
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Movie_Name    5444 non-null   object 
 1   Rating        5444 non-null   float64
 2   Voting_Count  5444 non-null   float64
 3   Duration      5444 non-null   int64  
 4   Genre         5444 non-null   object 
dtypes: float64(2), int64(1), object(2)
memory usage: 255.2+ KB


In [70]:
# Save the combined, de-duplicated frame as a comma-separated UTF-8 file without the index column.
movies.to_csv("movies.csv",index=False,encoding="utf-8",sep=",")

In [72]:
# Reload the combined dataset from disk; this rebinds `movies` to the CSV contents.
movies=pd.read_csv("movies.csv")
movies

Unnamed: 0,Movie_Name,Rating,Voting_Count,Duration,Genre
0,Mufasa: The Lion King,6.6,59000.0,118,Adventure
1,Moana 2,6.6,98000.0,100,Adventure
2,Kraven the Hunter,5.5,52000.0,127,Adventure
3,Flow,7.9,70000.0,85,Adventure
4,Gladiator II,6.5,225000.0,148,Adventure
...,...,...,...,...,...
5439,Slapes,6.8,31.0,82,Crime
5440,7vens Law,9.4,20.0,68,Crime
5441,Little Black Submarines,6.9,57.0,123,Crime
5442,All Suspects,3.6,18.0,96,Crime


In [None]:
# Creating a connection with MySQL

In [74]:
# Connection settings are read from the environment so that no secret is stored
# in the notebook.  Only the password is mandatory; the other values fall back
# to the defaults this notebook used before.
user = os.environ.get("IMDB_DB_USER", "root")
password = os.environ.get("IMDB_DB_PASSWORD")
host = os.environ.get("IMDB_DB_HOST", "localhost")
database = os.environ.get("IMDB_DB_NAME", "imdb")

if password is None:
    raise RuntimeError(
        "Set the IMDB_DB_PASSWORD environment variable before creating the MySQL engine"
    )

# Percent-encode user and password so characters such as "@" are not mistaken
# for URL delimiters; `password` itself holds the raw, unencoded value.
engine=create_engine(
    f"mysql+pymysql://{quote_plus(user)}:{quote_plus(password)}@{host}/{database}"
)

In [174]:
# Inserting all data into mysql

In [76]:
# Write the frame to the "movies" table through `engine`; if_exists="replace"
# replaces any existing table of that name, and the DataFrame index is not stored.
movies.to_sql("movies",con=engine,if_exists="replace",index=False)

5444