In [None]:
#do this if needed.
!pip install certifi

In [1]:
# urllib3 is a powerful, user-friendly HTTP client for Python
# to handle  data retrieval
import urllib3
from urllib3 import request

# to handle certificate verification
import certifi

import os.path

# to manage json data
import json

# for pandas dataframes
import pandas as pd

# Import Selenium Modules
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options

# Import Time
import time
from datetime import datetime

import requests

import csv

In [3]:
# handle certificate verification and SSL warnings:
# reference https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl
http = urllib3.PoolManager(
    cert_reqs='CERT_REQUIRED',
    ca_certs=certifi.where())

# access token for TMDB API
headers = {
    "accept": "application/json",
    "Authorization": "Bearer eyJhbGciOiJIUzI1NiJ9.eyJhdWQiOiI2ZGYzM2RhMWRiMmMxZDE1MzU0YTFkNDI2YWQyODYzMCIsIm5iZiI6MTcyMzY5NDkwMi4xMzQ2MjcsInN1YiI6IjY2YmQ2MGQ2Y2ZjNTYwN2FmMGU5YjNhYSIsInNjb3BlcyI6WyJhcGlfcmVhZCJdLCJ2ZXJzaW9uIjoxfQ.8-71HQaSOlHGMGP3SLk6awUAuQyPaai43JSXK2AS8o8"
}

In [6]:
def export_dataframe_to_csv(df, file_name):
    df.to_csv(file_name, index=False, encoding='utf-8-sig')

def read_csv_to_dataframe(file_name):
    return pd.read_csv(file_name, encoding='utf-8-sig')

In [9]:
def data_crawl_top_250_movie_titles_IMDB(use_file=False):
    file_name = './csv/top_250_movie_titles_IMDB.txt'
    check_file_exists = os.path.isfile(file_name)
    if use_file and check_file_exists:
        print("IMDB DATA FILE EXISTS: READING...", end=' ')
        with open(file_name, 'r', encoding='utf-8-sig') as file:
            data = file.read()
            titles_list = data.split('\n')
            print(f"DONE - {len(titles_list)} MOVIE TITLES IMPORTED\n")
            return titles_list

    print("EXTRACT: TOP 250 MOVIE TITLES -> IMDB")
    # create an option to run Chrome browser without opening one for this data crawling operation
    op = webdriver.ChromeOptions()
    chrome_options = Options()
    user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36'
    chrome_options.add_argument(f'user-agent={user_agent}')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--window-size=1920,1080')
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--allow-running-insecure-content')
    chrome_options.add_argument("--headless")
    # locate the driver and load it in
    driver = webdriver.Chrome()#options=chrome_options)
    # define the url of the google form and use the driver to open up the url
    url = "https://www.imdb.com/chart/top/?ref_=nv_mv_250"
    
    driver.get(url)
    driver.maximize_window()
    driver.get_screenshot_as_file("screenshot.png")
    time.sleep(2)
    time_before_extraction = datetime.now()
    
    # find the div where all the list of movies are placed in
    movies = driver.find_elements(By.XPATH, '//ul[@role="presentation"]//li//div//div//div//div//a//h3[@class="ipc-title__text"]')
    
    # extract the movie titles of the top 250 movies
    #movies = movie_box.find_elements(By.XPATH, './/h3')
    movies_list = []
    for i in range(len(movies)):
        movies_list.append(movies[i].get_attribute("innerHTML"))
    
    time_diff = datetime.now() - time_before_extraction
    driver.close()
    print(f"DONE - {len(movies_list)} MOVIE TITLES EXTRACTED -> IMDB - TIME TAKEN: {time_diff}\n")

    with open(file_name, 'w', encoding='utf-8-sig') as file:
        for title in movies_list:
            file.write(f"{title}\n")
    
    return movies_list

In [11]:
def request_movie_data_OMDB_API(title):
    # get data from the API; replace url with target source
    url = 'http://www.omdbapi.com/?t=' + title + '&plot=full&apikey=1c62e2da'

    response = http.request('GET', url)
    if response.status != 200:
        print(f"Failed to retrieve page. Status code: {response.status}")
        return {}

    # decode json data/string into a Python dict object
    data = json.loads(response.data.decode('utf-8-sig'))
    return data

In [13]:
def gather_movies_data_OMDB_API(movies_list, use_file=False):
    file_name = './csv/top_250_movies_OMDB.csv'
    check_file_exists = os.path.isfile(file_name)
    if use_file and check_file_exists:
        print("OMDB DATA FILE EXISTS: READING...", end=' ')
        df = read_csv_to_dataframe(file_name)
        print(f"DONE - {len(df)} MOVIE DATA IMPORTED\n")
        return df
        
    movie_titles = []
    print("EXTRACT: MOVIE DATA -> OMDB API")
    
    # need to remove the numbering in front of all the movie names
    for i in range(len(movies_list)):
        # find the first space char and only take the substring after that space char
        space_index = movies_list[i].find(' ')
        title = movies_list[i][space_index+1:]
        # because we need to search the titles in the OMDB database via the URL, it requires the space chars to be replaced with plus symbol (+)
        movie_titles.append(title.replace(' ', '+'))
    
    movies_data_list = []
    time_before_extraction = datetime.now()
    
    for i in range(len(movie_titles)):
        movie_dict = request_movie_data_OMDB_API(movie_titles[i])
        if len(movie_dict) > 0 and movie_dict['Response'].lower() == 'true':
            movies_data_list.append(movie_dict)
        else:
            print(f"Movie title: {movie_titles[i].replace('+', ' ')} not found.")
    
    #print(movies_data_list)
    df = pd.json_normalize(movies_data_list)
    time_diff = datetime.now() - time_before_extraction
    print(f"DONE - {len(movies_data_list)} MOVIE RECORDS EXTRACTED -> OMDB API - TIME TAKEN: {time_diff}\n")

    export_dataframe_to_csv(df, file_name)
    return df

In [15]:
# retrieve movie's TMDB ID from TMDB using title string. returns an integer
def get_movie_id_using_title_tmdb(title):
    temp = title.replace(' ', '%20')
    url = "https://api.themoviedb.org/3/search/movie?query=" + temp + "&include_adult=false&language=en-US&page=1"
    response = requests.get(url, headers=headers)
    response_dict = response.json()
    if response_dict["total_results"] > 0:
        return response_dict["results"][0]["id"]
    # return -1 when no results are found
    return -1

# retrieve movie details from TMDB using the tmdbID integer. returns a dictionary
def get_movie_details_using_tmdbID_tmdb(tmdb_id):
    url = "https://api.themoviedb.org/3/movie/" + str(tmdb_id) + "?language=en-US"
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        return
    response_dict = response.json()
    return response_dict

# retrieve actors of a movie from TMDB using the tmdb ID. returns a list of dictionaries 
def get_first_three_actors_movie_TMDB_API(tmdb_id):
    url = "https://api.themoviedb.org/3/movie/" + str(tmdb_id) + "/credits?language=en-US"
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        return
    response_dict = response.json()
    return response_dict["cast"][:3]


def get_number_of_acting_credits_TMDB_API(actor_id):
    url = "https://api.themoviedb.org/3/person/" + str(actor_id) + "/movie_credits?language=en-US"
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        return 0
    return len(response.json()["cast"])


def get_actor_details_TMDB_API(actor_id):
    url = "https://api.themoviedb.org/3/person/" + str(actor_id) + "?language=en-US"
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        return {}
    response_dict = response.json()
    return response_dict

def get_actor_id_TMDB_API(actor_name):
    name = actor_name.replace(' ', '%20')
    url = "https://api.themoviedb.org/3/search/person?query=" + name + "&include_adult=false&language=en-US&page=1"
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        return
    response_dict = response.json()
    return response_dict["results"][0]["id"]

In [17]:
def extract_movie_data_TMDB_API(movies_df, use_file=False):
    file_name = './csv/top_250_movies_TMDB.csv'
    check_file_exists = os.path.isfile(file_name)
    if use_file and check_file_exists:
        print("TMDB DATA FILE EXISTS: READING...", end=' ')
        df = read_csv_to_dataframe(file_name)
        print(f"DONE - {len(df)} MOVIE DATA IMPORTED\n")
        return df
    
    movie_tmdb_ids = []
    print("EXTRACT: MOVIE DATA -> TMDB API")
    time_before_extraction = datetime.now()
    
    # request for the ids of the movies
    for i in movies_df.index:
        id = get_movie_id_using_title_tmdb(movies_df["Title"][i])
        if id == -1:
            print("ID not found for movie: " + movies_df["Title"][i])
        movie_tmdb_ids.append(id)

    movies_df.insert(2, "tmdbID", movie_tmdb_ids)
    budget_list = []
    revenue_list = []
    
    for i in movies_df.index:
        id = movies_df["tmdbID"][i]
        result_dict = get_movie_details_using_tmdbID_tmdb(id)
        if result_dict == None:
            budget_list.append('N/A')
            revenue_list.append('N/A')
        else:
            budget_list.append(result_dict["budget"])
            revenue_list.append(result_dict["revenue"])
    
    movies_df["Production"] = budget_list
    movies_df["BoxOffice"] = revenue_list
    time_diff = datetime.now() - time_before_extraction
    print(f"DONE - {len(movies_df)} MOVIE DATA EXTRACTED -> TMDB API - TIME TAKEN: {time_diff}\n")

    export_dataframe_to_csv(movies_df, file_name)

In [57]:
def get_all_actors_id_TMDB_API(movies_df, use_file=False):
    tmdb_file_name = './csv/top_250_movies_TMDB_rev.csv'
    actors_id_file_name = './csv/actor_ids_TMDB.txt'
    check_files_exists = os.path.isfile(tmdb_file_name) and os.path.isfile(actors_id_file_name)
    if use_file and check_files_exists:
        print("TMDB DATA FILE EXISTS: READING...", end=' ')
        new_movies_df = read_csv_to_dataframe(tmdb_file_name)
        print(f"DONE - {len(new_movies_df)} MOVIE DATA IMPORTED\n")
        
        print("ACTOR ID TEXT FILE EXISTS: READING...", end=' ')
        with open(actors_id_file_name, 'r', encoding='utf-8-sig') as file:
            data = file.read()
            id_list = data.split('\n')
            actors_dict = {id: {} for id in id_list}
            print(f"DONE - {len(actors_dict)} ACTOR IDs IMPORTED\n")
        return new_movies_df, actors_dict
    
    actors_dict = {}
    print("FETCH: ACTOR IDs -> TMDB API")
    time_before_extraction = datetime.now()
    
    actors_list = []
    for i in movies_df.index:
        id = movies_df["tmdbID"][i]
        result_list = get_first_three_actors_movie_TMDB_API(id)
        if result_list == None or len(result_list) < 3:
            temp_list = movies_df["Actors"][i].split(', ')
            actors_list.append(temp_list)
            for actor in temp_list:
                actor_id = get_actor_id_TMDB_API(actor)
                if not actor_id in actors_dict:
                    actors_dict[actor_id] = {}
            continue
        
        temp_list = []
        for actor_details in result_list:
            actor_name = actor_details["name"]
            actor_id = actor_details["id"]
            temp_list.append(actor_name)
            if not actor_id in actors_dict:
                actors_dict[actor_id] = {}
        actors_list.append(temp_list)

    movies_df["Actors"] = actors_list
            
    time_diff = datetime.now() - time_before_extraction
    print(f"DONE - {len(actors_dict.keys())} ACTOR IDs FETCHED -> TMDB API - TIME TAKEN: {time_diff}\n")

    export_dataframe_to_csv(movies_df, tmdb_file_name)
    with open(actors_id_file_name, 'w', encoding='utf-8-sig') as file:
        for id in actors_dict.keys():
            file.write(f"{id}\n")
    
    return movies_df, actors_dict

In [41]:
def extract_actors_data_TMDB_API(actors_dict, use_file=False):
    file_name = './csv/actor_data_TMDB.csv'
    check_file_exists = os.path.isfile(file_name)
    if use_file and check_file_exists:
        print("TMDB DATA FILE EXISTS: READING...", end=' ')
        df = read_csv_to_dataframe(file_name)
        print(f"DONE - {len(df)} ACTOR DATA IMPORTED\n")
        return df
    
    print("EXTRACT: ACTOR DATA -> TMDB API")
    time_before_extraction = datetime.now()
    num_of_acting_credits_list = []
    for id in actors_dict.keys():
        actors_dict[id] = get_actor_details_TMDB_API(id)
        num_of_acting_credits_list.append(get_number_of_acting_credits_TMDB_API(id))
        
    time_diff = datetime.now() - time_before_extraction
    print(f"DONE - {len(actors_dict.keys())} ACTOR DATA EXTRACTED -> TMDB API - TIME TAKEN: {time_diff}\n")
    df = pd.json_normalize(actors_dict.values())
    df.insert(13, "num_of_acting_credits", num_of_acting_credits_list)

    export_dataframe_to_csv(df, file_name)
    return df

In [59]:
def extract_data():
    titles_list = data_crawl_top_250_movie_titles_IMDB(use_file=True)
    movies_df = gather_movies_data_OMDB_API(titles_list, use_file=True)
    extract_movie_data_TMDB_API(movies_df, use_file=True)

    movies_df, actors_dict = get_all_actors_id_TMDB_API(movies_df, use_file=True)
    actors_df = extract_actors_data_TMDB_API(actors_dict, use_file=True)
    return movies_df, actors_df

In [45]:
def extract_oscar_wins(awards_str):
    keywords_to_find = ["Won", "Oscars", "Oscar"]
    temp_list = awards_str.split()
    for keyword in keywords_to_find:
        if keyword in temp_list:
            temp_list.remove(keyword)
    if len(temp_list) > 1:
        return 0
    try:
        return int(temp_list[0])
    except ValueError:
        return 0

def extract_wins_and_nominations(awards_str):
    temp_list = awards_str.split()
    num_of_wins = 0
    num_of_nominations = 0
    if "win" in temp_list:
        index = temp_list.index("win")
        try:
            num_of_wins = int(temp_list[index-1])
        except ValueError:
            pass
    elif "wins" in temp_list:
        index = temp_list.index("wins")
        try:
            num_of_wins = int(temp_list[index-1])
        except ValueError:
            pass

    if "nomination" in temp_list:
        index = temp_list.index("nomination")
        try:
            num_of_nominations = int(temp_list[index-1])
        except ValueError:
            pass
    elif "nominations" in temp_list:
        index = temp_list.index("nominations")
        try:
            num_of_nominations = int(temp_list[index-1])
        except ValueError:
            pass

    return num_of_wins, num_of_nominations

def format_date(date_str):
    temp_date_list = date_str.split()
    temp_date_list.reverse()
    dates = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    month = dates.index(temp_date_list[1]) + 1
    if month < 10:
        temp_date_list[1] = '0' + str(month)
    else:
        temp_date_list[1] = str(month)
    return '-'.join(temp_date_list)

In [61]:
movies_df, actors_df = extract_data()
print(movies_df)

IMDB DATA FILE EXISTS: READING... DONE - 251 MOVIE TITLES IMPORTED

OMDB DATA FILE EXISTS: READING... DONE - 250 MOVIE DATA IMPORTED

TMDB DATA FILE EXISTS: READING... DONE - 250 MOVIE DATA IMPORTED

TMDB DATA FILE EXISTS: READING... DONE - 250 MOVIE DATA IMPORTED

ACTOR ID TEXT FILE EXISTS: READING... DONE - 592 ACTOR IDs IMPORTED

TMDB DATA FILE EXISTS: READING... DONE - 591 ACTOR DATA IMPORTED

                        Title  Year      Rated     Released  Runtime  \
0    The Shawshank Redemption  1994          R  14 Oct 1994  142 min   
1               The Godfather  1972          R  24 Mar 1972  175 min   
2             The Dark Knight  2008      PG-13  18 Jul 2008  152 min   
3       The Godfather Part II  1974          R  18 Dec 1974  202 min   
4                12 Angry Men  1957   Approved  10 Apr 1957   96 min   
..                        ...   ...        ...          ...      ...   
245                  The Help  2011      PG-13  10 Aug 2011  146 min   
246     It Happened One

In [95]:
#def transform_data(movies_df, actors_df):
movies_df_revised_2 = movies_df.loc[:, ["Title", "Year", "Rated", "Released", "Runtime", "Genre", "Plot", "Language", "Country", "Awards", "Director", "Actors", "imdbRating", "imdbVotes", "BoxOffice", "Production"]]

movies_df_revised_2.rename(columns={"Rated": "certification", "Released": "release_date", "Plot": "description"}, inplace=True)
movies_df_revised_2.rename(columns={"imdbRating": "ratings", "imdbVotes": "num_of_votes", "BoxOffice": "revenue", "Production": "budget"}, inplace=True)
movies_df_revised_2.columns = [col.lower() for col in movies_df_revised_2.columns]

votes_list = []
oscars_list = []
winnings_list = []
nominations_list = []
date_list = []

for i in movies_df_revised_2.index:
    num_of_votes = movies_df_revised_2["num_of_votes"][i]
    votes = 0
    if isinstance(num_of_votes, type('hello')):
        num_of_votes= num_of_votes.replace(',', '')
        try:
            votes = int(num_of_votes)
        except ValueError:
            pass
    votes_list.append(votes)

    date_str = movies_df_revised_2["release_date"][i]
    date_list.append(format_date(date_str))
        
    oscars = 0
    wins = 0
    nominations = 0
    awards = movies_df_revised_2["awards"][i]
    if isinstance(awards, type('hello')):
        if "BAFTA Award" in awards:
            index = awards.find("BAFTA Award")
            awards = awards[index+11:]
        awards_list = awards.split('.')
        if len(awards_list) > 1:
            oscars = extract_oscar_wins(awards_list[0])
        wins, nominations = extract_wins_and_nominations(awards_list[-1])
    oscars_list.append(oscars)
    winnings_list.append(wins)
    nominations_list.append(nominations)

movies_df_revised_2 = movies_df_revised_2.drop(["awards"], axis=1)
movies_df_revised_2.insert(10, "oscars", oscars_list)
movies_df_revised_2.insert(11, "winnings", winnings_list)
movies_df_revised_2.insert(12, "nominations", nominations_list)
movies_df_revised_2["num_of_votes"] = votes_list
movies_df_revised_2["release_date"] = date_list

movies_df_revised_2["director"] = movies_df_revised_2["director"].apply(lambda person: person.split(', '))
#movies_df_revised["actors"] = movies_df_revised["actors"].apply(lambda actors: actors[1:-1].split(', '))
#print(type(movies_df_revised["actors"][0]))

print(movies_df_revised_2)

                        title  year certification release_date  runtime  \
0    The Shawshank Redemption  1994             R   1994-10-14  142 min   
1               The Godfather  1972             R   1972-03-24  175 min   
2             The Dark Knight  2008         PG-13   2008-07-18  152 min   
3       The Godfather Part II  1974             R   1974-12-18  202 min   
4                12 Angry Men  1957      Approved   1957-04-10   96 min   
..                        ...   ...           ...          ...      ...   
245                  The Help  2011         PG-13   2011-08-10  146 min   
246     It Happened One Night  1934      Approved   1934-02-22  105 min   
247                   Aladdin  1992             G   1992-11-25   90 min   
248              Paris, Texas  1984             R   1984-08-23  145 min   
249        Gangs of Wasseypur  2012     Not Rated   2012-06-22  321 min   

                            genre  \
0                           Drama   
1                    Crim

In [99]:
print(type(movies_df_revised_2["actors"][0]))

<class 'str'>


In [101]:
#movies_df_revised_2["actors"] = movies_df_revised_2["actors"].apply(lambda result: result = actors.replace('\'', ''))
#movies_df_revised_2["actors"] = movies_df_revised_2["actors"].apply(lambda actors: actors[1:-1].split(', '))
#print(type(movies_df_revised["actors"][0]))

for i in movies_df_revised_2.index:
    temp_str = movies_df_revised_2["actors"][i]
    temp_str = temp_str.replace('\'', '')
    temp_list = temp_str[1:-1].split(', ')
    movies_df_revised_2["actors"][i] = temp_list

print(movies_df_revised_2)

                        title  year certification release_date  runtime  \
0    The Shawshank Redemption  1994             R   1994-10-14  142 min   
1               The Godfather  1972             R   1972-03-24  175 min   
2             The Dark Knight  2008         PG-13   2008-07-18  152 min   
3       The Godfather Part II  1974             R   1974-12-18  202 min   
4                12 Angry Men  1957      Approved   1957-04-10   96 min   
..                        ...   ...           ...          ...      ...   
245                  The Help  2011         PG-13   2011-08-10  146 min   
246     It Happened One Night  1934      Approved   1934-02-22  105 min   
247                   Aladdin  1992             G   1992-11-25   90 min   
248              Paris, Texas  1984             R   1984-08-23  145 min   
249        Gangs of Wasseypur  2012     Not Rated   2012-06-22  321 min   

                            genre  \
0                           Drama   
1                    Crim

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  movies_df_revised_2["actors"][i] = temp_list
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_df_revised_

In [75]:
actors_df_revised = actors_df.loc[:, ["name", "birthday", "deathday", "gender", "num_of_acting_credits"]]
actors_df_revised.rename(columns={"birthday": "date_of_birth", "deathday": "date_of_death"}, inplace=True)
actors_df_revised["gender"] = actors_df_revised["gender"].replace([0, 1, 2], ['None', 'Female', 'Male'])
print(actors_df_revised)

                 name date_of_birth date_of_death  gender  \
0         Tim Robbins    1958-10-16           NaN    Male   
1      Morgan Freeman    1937-06-01           NaN    Male   
2          Bob Gunton    1945-11-15           NaN    Male   
3       Marlon Brando    1924-04-03    2004-07-01    Male   
4           Al Pacino    1940-04-25           NaN    Male   
..                ...           ...           ...     ...   
586  Nastassja Kinski    1961-01-24           NaN  Female   
587    Dean Stockwell    1936-03-05    2021-11-07    Male   
588    Manoj Bajpayee    1969-04-23           NaN    Male   
589      Richa Chadha    1986-12-18           NaN  Female   
590  Tigmanshu Dhulia    1967-07-03           NaN    Male   

     num_of_acting_credits  
0                       88  
1                      189  
2                       88  
3                      107  
4                      114  
..                     ...  
586                     78  
587                    124  
588   

In [313]:
actors_df_revised = actors_df.loc[:, ["name", "birthday", "deathday", "gender", "num_of_acting_credits"]]
export_dataframe_to_csv(actors_df_revised, './csv/actor_data_TMDB_rev.csv')

In [281]:
titles_list = data_crawl_top_250_movie_titles_IMDB(use_file=True)

IMDB DATA FILE EXISTS: READING... DONE - 251 MOVIE TITLES IMPORTED



In [283]:
movies_df = gather_movies_data_OMDB_API(titles_list, use_file=True)

OMDB DATA FILE EXISTS: READING... DONE - 250 MOVIE DATA IMPORTED



In [None]:
movies_df = read_csv_to_dataframe('./csv/top_250_movies_OMDB.csv')
movies_df = movies_df.iloc[:, 1:]

In [None]:
export_dataframe_to_csv(movies_df, './csv/top_250_movies_OMDB.csv')
movies_df = read_csv_to_dataframe('./csv/top_250_movies_OMDB.csv')

In [285]:
extract_movie_data_TMDB_API(movies_df, use_file=False)

EXTRACT: MOVIE DATA -> TMDB API
DONE - 250 MOVIE DATA EXTRACTED -> TMDB API - TIME TAKEN: 0:01:14.550011



In [307]:
actors_dict = get_all_actors_id_TMDB_API(movies_df, use_file=False)

FETCH: ACTOR IDs -> TMDB API
Tatsuya Nakadai
Akira Terao
Jinpachi Nezu
DONE - 591 ACTOR IDs FETCHED -> TMDB API - TIME TAKEN: 0:00:18.374528



In [311]:
actors_df = extract_actors_data_TMDB_API(actors_dict, use_file=False)

EXTRACT: ACTOR DATA -> TMDB API
DONE - 591 ACTOR DATA EXTRACTED -> TMDB API - TIME TAKEN: 0:02:18.794284



In [315]:
movies_df_revised = movies_df.loc[:, ["Title", "Year", "Rated", "Released", "Runtime", "Genre", "Plot", "Language", "Country", "Awards", "Director", "Writer", "Actors", "Metascore", "imdbRating", "imdbVotes", "BoxOffice", "Production"]]
#print(movies_df_revised)
export_dataframe_to_csv(movies_df_revised, './csv/top_250_movies_TMDB_rev.csv')

In [237]:
print(actors_df_revised)

                 name    birthday    deathday  gender  num_of_acting_credits  \
0         Tim Robbins  1958-10-16        None       2                     88   
1      Morgan Freeman  1937-06-01        None       2                    189   
2          Bob Gunton  1945-11-15        None       2                     88   
3       Marlon Brando  1924-04-03  2004-07-01       2                    107   
4           Al Pacino  1940-04-25        None       2                    114   
..                ...         ...         ...     ...                    ...   
584  Nastassja Kinski  1961-01-24        None       1                     78   
585    Dean Stockwell  1936-03-05  2021-11-07       2                    124   
586    Manoj Bajpayee  1969-04-23        None       2                     89   
587      Richa Chadha  1986-12-18        None       1                     29   
588  Tigmanshu Dhulia  1967-07-03        None       2                     11   

         id  
0       504  
1       192