In [3]:
import pandas as pd
import numpy as np 
from tmdbv3api import TMDb
import json
import requests
from dotenv import load_dotenv
import os
from pathlib import Path
from tmdbv3api import Movie

In [4]:
dotenv_path = Path(os.getcwd()).parent / ".env"
load_dotenv(dotenv_path)
api_key = os.getenv("API_KEY")

In [5]:
tmdb = TMDb()
tmdb.api_key = api_key

tmdb_movie = Movie()
def get_genre(x): #pass in the title of the movies
    genres = []
    result = tmdb_movie.search(x)
    if not result:  # If the result is an empty list, return 'unknown'
        return "unknown"
    movie_id = result[0].id #we will match the "id" with the "title"
    response = requests.get('https://api.themoviedb.org/3/movie/{}?api_key={}'.format(movie_id,tmdb.api_key)) #we will get the result from the IMDb data
    data_json = response.json() #we will then convert it to a json file
    if data_json['genres']: #in the json file we will only need to extract the "genre"
        genre_str = " " 
        for i in range(0,len(data_json['genres'])):
            genres.append(data_json['genres'][i]['name']) #we will then add the "genre" to the empty genre list we created above
        return genre_str.join(genres)
    else:
        np.nan #

In [6]:
def get_director(x):
    if isinstance(x, float) or pd.isna(x):  # Check if x is NaN (float)
        return "unknown"
    elif " (director)" in x:
        return x.split(" (director)")[0]
    elif " (directors)" in x:
        return x.split(" (directors)")[0]
    else:
        return x.split(" (director/screenplay)")[0]
    return "unknown"

In [7]:
def get_actor(x, num):
    # Ensure x is not NaN
    if pd.isna(x):
        return "unkown"
    
    # Split by the correct delimiter and get the last part (the actors' names)
    actors = x.split("screenplay); ")[-1].split(", ")
    
    # If there aren't enough actors, return NaN
    if len(actors) < num:
        return np.nan
    else:
        return actors[num - 1]

In [2]:
def preprocess(year):
    link = f"https://en.wikipedia.org/wiki/List_of_American_films_of_{year}"
    print(link)
    df1 = pd.read_html(link, header=0)[2]
    df2 = pd.read_html(link, header=0)[3]
    df3 = pd.read_html(link, header=0)[4]
    df4 = pd.read_html(link, header=0)[5]
    df = pd.concat([df1, df2, df3, df4], ignore_index=True)
    df['genres'] = df['Title'].map(lambda x: get_genre(str(x)))
    df = df[['Title','Cast and crew','genres']]
    df['director_name'] = df['Cast and crew'].map(lambda x: get_director(x))
    df['actor_1_name'] = df['Cast and crew'].map(lambda x: get_actor(x,1))
    df['actor_2_name'] = df['Cast and crew'].map(lambda x: get_actor(x,2))
    df['actor_3_name'] = df['Cast and crew'].map(lambda x: get_actor(x,3))
    df = df.rename(columns={'Title':'movie_title'})
    df['movie_title'] = df['movie_title'].str.lower()

    new_df= df.loc[:,['director_name','actor_1_name','actor_2_name','actor_3_name','genres','movie_title']]
    new_df['actor_1_name'] = new_df['actor_1_name'].replace(np.nan, 'unknown')
    new_df['actor_2_name'] = new_df['actor_2_name'].replace(np.nan, 'unknown')
    new_df['actor_3_name'] = new_df['actor_3_name'].replace(np.nan, 'unknown')
    new_df['genres'] = new_df['genres'].replace(np.nan, 'unknown')
    new_df['comb'] = new_df['actor_1_name'] + ' ' + new_df['actor_2_name'] + ' '+ new_df['actor_3_name'] + ' '+ new_df['director_name'] +' ' + new_df['genres']
 
    return new_df

    

In [52]:
df_2018 = preprocess(2018)

https://en.wikipedia.org/wiki/List_of_American_films_of_2018


In [36]:
df_2019 = preprocess(2019)



https://en.wikipedia.org/wiki/List_of_American_films_of_2019


In [39]:
df_2020 = preprocess(2020)

https://en.wikipedia.org/wiki/List_of_American_films_of_2020


In [57]:
df_2021 = preprocess(2021)

https://en.wikipedia.org/wiki/List_of_American_films_of_2021


In [58]:
df_2022 = preprocess(2022)

https://en.wikipedia.org/wiki/List_of_American_films_of_2022


In [59]:
df_2023  = preprocess(2023)

https://en.wikipedia.org/wiki/List_of_American_films_of_2023


In [60]:
df_2024 = preprocess(2024)

https://en.wikipedia.org/wiki/List_of_American_films_of_2024


In [62]:
df = pd.concat([df_2018,df_2019,df_2020,df_2021,df_2022,df_2023,df_2024], ignore_index= True)

(2294, 7)

In [63]:
old = pd.read_csv('../artifacts/new_data.csv')
old.head()

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,comb
0,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,Action Adventure Fantasy Sci-Fi,avatar,CCH Pounder Joel David Moore Wes Studi James C...
1,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,Action Adventure Fantasy,pirates of the caribbean: at world's end,Johnny Depp Orlando Bloom Jack Davenport Gore ...
2,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Action Adventure Thriller,spectre,Christoph Waltz Rory Kinnear Stephanie Sigman ...
3,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Action Thriller,the dark knight rises,Tom Hardy Christian Bale Joseph Gordon-Levitt ...
4,Doug Walker,Doug Walker,Rob Walker,unknown,Documentary,star wars: episode vii - the force awakens ...,Doug Walker Rob Walker unknown Doug Walker Doc...


In [64]:
new = pd.concat([old , df] , ignore_index= True)
new

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,comb
0,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,Action Adventure Fantasy Sci-Fi,avatar,CCH Pounder Joel David Moore Wes Studi James C...
1,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,Action Adventure Fantasy,pirates of the caribbean: at world's end,Johnny Depp Orlando Bloom Jack Davenport Gore ...
2,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Action Adventure Thriller,spectre,Christoph Waltz Rory Kinnear Stephanie Sigman ...
3,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Action Thriller,the dark knight rises,Tom Hardy Christian Bale Joseph Gordon-Levitt ...
4,Doug Walker,Doug Walker,Rob Walker,unknown,Documentary,star wars: episode vii - the force awakens ...,Doug Walker Rob Walker unknown Doug Walker Doc...
...,...,...,...,...,...,...,...
7718,Rachel Morrison,Ryan Destiny,Brian Tyree Henry,Judy Greer,History Drama,The Fire Inside,Ryan Destiny Brian Tyree Henry Judy Greer Rach...
7719,Michael Gracey,Robbie Williams,Jonno Davies,Steve Pemberton,Music Comedy Drama,Better Man,Robbie Williams Jonno Davies Steve Pemberton M...
7720,Halina Reijn,Nicole Kidman,Harris Dickinson,Sophie Wilde,Drama,Babygirl,Nicole Kidman Harris Dickinson Sophie Wilde Ha...
7721,"Michael Schwartz, Tyler Nilson (directors/scre...",Héctor Medina,Adria Arjona,Eros de la Puente,Drama,Los Frikis,Héctor Medina Adria Arjona Eros de la Puente M...


In [65]:
new.drop_duplicates(subset ="movie_title", keep = 'last', inplace = True)
new

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,comb
0,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,Action Adventure Fantasy Sci-Fi,avatar,CCH Pounder Joel David Moore Wes Studi James C...
1,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,Action Adventure Fantasy,pirates of the caribbean: at world's end,Johnny Depp Orlando Bloom Jack Davenport Gore ...
2,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Action Adventure Thriller,spectre,Christoph Waltz Rory Kinnear Stephanie Sigman ...
3,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Action Thriller,the dark knight rises,Tom Hardy Christian Bale Joseph Gordon-Levitt ...
4,Doug Walker,Doug Walker,Rob Walker,unknown,Documentary,star wars: episode vii - the force awakens ...,Doug Walker Rob Walker unknown Doug Walker Doc...
...,...,...,...,...,...,...,...
7718,Rachel Morrison,Ryan Destiny,Brian Tyree Henry,Judy Greer,History Drama,The Fire Inside,Ryan Destiny Brian Tyree Henry Judy Greer Rach...
7719,Michael Gracey,Robbie Williams,Jonno Davies,Steve Pemberton,Music Comedy Drama,Better Man,Robbie Williams Jonno Davies Steve Pemberton M...
7720,Halina Reijn,Nicole Kidman,Harris Dickinson,Sophie Wilde,Drama,Babygirl,Nicole Kidman Harris Dickinson Sophie Wilde Ha...
7721,"Michael Schwartz, Tyler Nilson (directors/scre...",Héctor Medina,Adria Arjona,Eros de la Puente,Drama,Los Frikis,Héctor Medina Adria Arjona Eros de la Puente M...


In [66]:
new.to_csv('../artifacts/final_data.csv',index=False)

In [12]:
old = pd.read_csv('../artifacts/final_data.csv')
old.tail()

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,comb
7701,Rachel Morrison,Ryan Destiny,Brian Tyree Henry,Judy Greer,History Drama,the fire inside,Ryan Destiny Brian Tyree Henry Judy Greer Rach...
7702,Michael Gracey,Robbie Williams,Jonno Davies,Steve Pemberton,Music Comedy Drama,better man,Robbie Williams Jonno Davies Steve Pemberton M...
7703,Halina Reijn,Nicole Kidman,Harris Dickinson,Sophie Wilde,Drama,babygirl,Nicole Kidman Harris Dickinson Sophie Wilde Ha...
7704,"Michael Schwartz, Tyler Nilson (directors/scre...",Héctor Medina,Adria Arjona,Eros de la Puente,Drama,los frikis,Héctor Medina Adria Arjona Eros de la Puente M...
7705,Matthew John Lawrence,Molly Brown,Jeffrey Dean Morgan,Billy Burke,Horror Comedy,bloody axe wound,Molly Brown Jeffrey Dean Morgan Billy Burke Ma...
