In [None]:
#do this if needed.
!pip install certifi

In [43]:
# urllib3 is a powerful, user-friendly HTTP client for Python,
# used here for data retrieval
import urllib3
from urllib3 import request

# to handle certificate verification
import certifi

import os.path

# to manage json data
import json

# for pandas dataframes
import pandas as pd

# Import Selenium Modules
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options

# Import Time
import time
from datetime import datetime

import requests

import csv

In [45]:
# Shared urllib3 client with certificate verification enabled (CA bundle from certifi).
# reference https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl
http = urllib3.PoolManager(
    cert_reqs='CERT_REQUIRED',
    ca_certs=certifi.where())

# Request headers for the TMDB API. The bearer token is taken from the
# TMDB_API_TOKEN environment variable when it is set.
# FIXME(security): the fallback token below is committed with the notebook;
# rotate it and remove the literal once every user supplies TMDB_API_TOKEN.
headers = {
    "accept": "application/json",
    "Authorization": "Bearer " + os.environ.get(
        "TMDB_API_TOKEN",
        "eyJhbGciOiJIUzI1NiJ9.eyJhdWQiOiI2ZGYzM2RhMWRiMmMxZDE1MzU0YTFkNDI2YWQyODYzMCIsIm5iZiI6MTcyMzY5NDkwMi4xMzQ2MjcsInN1YiI6IjY2YmQ2MGQ2Y2ZjNTYwN2FmMGU5YjNhYSIsInNjb3BlcyI6WyJhcGlfcmVhZCJdLCJ2ZXJzaW9uIjoxfQ.8-71HQaSOlHGMGP3SLk6awUAuQyPaai43JSXK2AS8o8",
    ),
}

In [48]:
def export_dataframe_to_csv(df, file_name):
    """Write ``df`` to ``file_name`` as a CSV encoded as ``utf-8-sig``, without the index.

    Args:
        df: DataFrame to export.
        file_name: destination path (or any target ``DataFrame.to_csv`` accepts).
            When it is a filesystem path, its parent directory is created first
            so an export after a long extraction does not fail on a missing folder.
    """
    if isinstance(file_name, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(file_name))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
    df.to_csv(file_name, index=False, encoding='utf-8-sig')

def read_csv_to_dataframe(file_name):
    """Load the CSV at ``file_name`` into a DataFrame, decoding it as ``utf-8-sig``."""
    loaded_frame = pd.read_csv(file_name, encoding='utf-8-sig')
    return loaded_frame

In [51]:
def data_crawl_top_250_movie_titles_IMDB(use_file=False):
    """Return the IMDB Top 250 chart titles, from the cache file or by crawling the chart.

    Args:
        use_file: when True and the cache file exists, the titles are read from
            it and no browser is started.

    Returns:
        list[str]: one entry per chart title, as shown on the page (the crawled
        text is written to the cache file one title per line).
    """
    file_name = '../resources/top_250_movie_titles_IMDB.txt'
    if use_file and os.path.isfile(file_name):
        print("IMDB DATA FILE EXISTS: READING...", end=' ')
        with open(file_name, 'r', encoding='utf-8-sig') as file:
            # The cache is written with a trailing newline; dropping blank lines
            # keeps an empty "title" out of the returned list.
            titles_list = [line for line in file.read().splitlines() if line.strip()]
        print(f"DONE - {len(titles_list)} MOVIE TITLES IMPORTED\n")
        return titles_list

    print("EXTRACT: TOP 250 MOVIE TITLES -> IMDB")
    # URL of the IMDB Top 250 chart
    url = "https://www.imdb.com/chart/top/?ref_=nv_mv_250"

    # start a Chrome session with default options
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        driver.maximize_window()
        driver.get_screenshot_as_file("screenshot.png")
        # give the page a moment to finish rendering before reading the list
        time.sleep(2)
        time_before_extraction = datetime.now()

        # locate the title heading of every movie in the chart list
        movies = driver.find_elements(By.XPATH, '//ul[@role="presentation"]//li//div//div//div//div//a//h3[@class="ipc-title__text"]')

        # textContent yields the plain title text; innerHTML would return
        # HTML-escaped markup (e.g. "&amp;" instead of "&")
        movies_list = [movie.get_attribute("textContent") for movie in movies]

        time_diff = datetime.now() - time_before_extraction
    finally:
        # quit() ends the whole driver session (close() only closes the window),
        # and the finally block releases it even when the crawl fails
        driver.quit()
    print(f"DONE - {len(movies_list)} MOVIE TITLES EXTRACTED -> IMDB - TIME TAKEN: {time_diff}\n")

    # make sure the cache folder exists before writing
    os.makedirs(os.path.dirname(file_name), exist_ok=True)
    with open(file_name, 'w', encoding='utf-8-sig') as file:
        for title in movies_list:
            file.write(f"{title}\n")

    return movies_list

In [53]:
def request_movie_data_OMDB_API(title):
    """Fetch the full-plot OMDb record for a movie title.

    Args:
        title: title to search for. ``+`` characters are treated as spaces,
            which is how the callers in this notebook join the words.

    Returns:
        dict: the decoded JSON response, or ``{}`` when the HTTP status is not 200.
    """
    from urllib.parse import quote_plus

    # FIXME(security): the fallback key is committed with the notebook; supply
    # OMDB_API_KEY through the environment and remove the literal.
    api_key = os.environ.get('OMDB_API_KEY', '1c62e2da')

    # Percent-encode the title so characters such as '&', '#' or accented
    # letters cannot corrupt the query string; spaces are emitted as '+'.
    encoded_title = quote_plus(title.replace('+', ' '))
    # HTTPS keeps the API key out of clear-text traffic.
    url = 'https://www.omdbapi.com/?t=' + encoded_title + '&plot=full&apikey=' + quote_plus(api_key)

    response = http.request('GET', url)
    if response.status != 200:
        print(f"Failed to retrieve page. Status code: {response.status}")
        return {}

    # decode json data/string into a Python dict object
    data = json.loads(response.data.decode('utf-8-sig'))
    return data

In [55]:
def gather_movies_data_OMDB_API(movies_list, use_file=False):
    """Build a DataFrame of OMDb records for the given chart titles.

    Args:
        movies_list: title strings, optionally prefixed with a chart rank such
            as ``"12. "``. Blank entries are ignored.
        use_file: when True and the cached CSV exists, it is loaded instead of
            querying the API.

    Returns:
        pandas.DataFrame: one row per title for which OMDb answered with
        ``Response == "True"``; the frame is also exported to the cache CSV.
    """
    import re

    file_name = '../resources/top_250_movies_OMDB.csv'
    if use_file and os.path.isfile(file_name):
        print("OMDB DATA FILE EXISTS: READING...", end=' ')
        df = read_csv_to_dataframe(file_name)
        print(f"DONE - {len(df)} MOVIE DATA IMPORTED\n")
        return df

    print("EXTRACT: MOVIE DATA -> OMDB API")

    # Remove only a leading "<rank>. " prefix; titles without one (or titles
    # that merely start with a number, e.g. "12 Angry Men") are left intact.
    rank_prefix = re.compile(r'^\s*\d+\.\s+')
    movie_titles = []
    for raw_title in movies_list:
        title = rank_prefix.sub('', raw_title).strip()
        if not title:
            # e.g. the empty string produced by a trailing newline in a cache file
            continue
        # the OMDB request helper expects spaces to be replaced with '+'
        movie_titles.append(title.replace(' ', '+'))

    movies_data_list = []
    time_before_extraction = datetime.now()

    for query_title in movie_titles:
        movie_dict = request_movie_data_OMDB_API(query_title)
        # .get() tolerates payloads that carry no "Response" key
        if movie_dict and str(movie_dict.get('Response', '')).lower() == 'true':
            movies_data_list.append(movie_dict)
        else:
            print(f"Movie title: {query_title.replace('+', ' ')} not found.")

    df = pd.json_normalize(movies_data_list)
    time_diff = datetime.now() - time_before_extraction
    print(f"DONE - {len(movies_data_list)} MOVIE RECORDS EXTRACTED -> OMDB API - TIME TAKEN: {time_diff}\n")

    export_dataframe_to_csv(df, file_name)
    return df

In [57]:
# retrieve movie's TMDB ID from TMDB using title string. returns an integer
def get_movie_id_using_title_tmdb(title):
    """Search TMDB for ``title`` and return the ID of the first hit.

    Args:
        title: movie title to search for.

    Returns:
        int: the ``id`` of the first search result, or ``-1`` when the request
        fails or no result is returned.
    """
    url = "https://api.themoviedb.org/3/search/movie"
    # Passing the query through ``params`` lets requests percent-encode every
    # special character, not just spaces.
    query = {"query": title, "include_adult": "false", "language": "en-US", "page": 1}
    response = requests.get(url, headers=headers, params=query)
    if response.status_code != 200:
        # an error payload has no result list; report it instead of raising KeyError
        print(f"Failed to retrieve page. Status code: {response.status_code}")
        return -1
    results = response.json().get("results") or []
    if results:
        return results[0]["id"]
    # return -1 when no results are found
    return -1

# fetch a movie's detail record from TMDB by its tmdbID. returns a dictionary
def get_movie_details_using_tmdbID_tmdb(tmdb_id):
    """Return the TMDB movie-details JSON for ``tmdb_id`` as a dict, or ``None`` on a non-200 reply."""
    endpoint = f"https://api.themoviedb.org/3/movie/{tmdb_id!s}?language=en-US"
    reply = requests.get(endpoint, headers=headers)
    return reply.json() if reply.status_code == 200 else None

# fetch the cast of a movie from TMDB by its tmdb ID. returns a list of dictionaries
def get_first_three_actors_movie_TMDB_API(tmdb_id):
    """Return the first three ``cast`` entries of the movie's TMDB credits, or ``None`` on a non-200 reply."""
    endpoint = f"https://api.themoviedb.org/3/movie/{tmdb_id!s}/credits?language=en-US"
    reply = requests.get(endpoint, headers=headers)
    if reply.status_code == 200:
        credits = reply.json()
        return credits["cast"][:3]
    return None


def get_number_of_acting_credits_TMDB_API(actor_id):
    """Return how many ``cast`` entries TMDB lists in the person's movie credits (0 on a non-200 reply)."""
    endpoint = f"https://api.themoviedb.org/3/person/{actor_id!s}/movie_credits?language=en-US"
    reply = requests.get(endpoint, headers=headers)
    if reply.status_code == 200:
        cast_entries = reply.json()["cast"]
        return len(cast_entries)
    return 0


def get_actor_details_TMDB_API(actor_id):
    """Return the TMDB person-details JSON for ``actor_id`` as a dict (``{}`` on a non-200 reply)."""
    endpoint = f"https://api.themoviedb.org/3/person/{actor_id!s}?language=en-US"
    reply = requests.get(endpoint, headers=headers)
    return reply.json() if reply.status_code == 200 else {}

def get_actor_id_TMDB_API(actor_name):
    """Search TMDB for a person by name and return the ID of the first hit.

    Args:
        actor_name: name to search for.

    Returns:
        The ``id`` of the first search result, or ``None`` when the request
        fails or the search returns no results.
    """
    url = "https://api.themoviedb.org/3/search/person"
    # ``params`` makes requests percent-encode the whole name, not only spaces
    query = {"query": actor_name, "include_adult": "false", "language": "en-US", "page": 1}
    response = requests.get(url, headers=headers, params=query)
    if response.status_code != 200:
        return None
    results = response.json().get("results") or []
    if not results:
        # an empty result list previously raised IndexError
        return None
    return results[0]["id"]

In [59]:
def extract_movie_data_TMDB_API(movies_df, use_file=False):
    """Add TMDB IDs, budget and revenue to the movies DataFrame.

    Args:
        movies_df: DataFrame with a "Title" column. It is modified in place:
            a "tmdbID" column is added, "Production" is set to the TMDB budget
            and "BoxOffice" to the TMDB revenue ('N/A' when unavailable).
        use_file: when True and the cached CSV exists, that CSV is loaded and
            returned and ``movies_df`` is left untouched.

    Returns:
        pandas.DataFrame: the cached frame, or ``movies_df`` itself after enrichment.
    """
    file_name = '../resources/top_250_movies_TMDB.csv'
    if use_file and os.path.isfile(file_name):
        print("TMDB DATA FILE EXISTS: READING...", end=' ')
        df = read_csv_to_dataframe(file_name)
        print(f"DONE - {len(df)} MOVIE DATA IMPORTED\n")
        return df

    print("EXTRACT: MOVIE DATA -> TMDB API")
    time_before_extraction = datetime.now()

    # request the TMDB id of every movie (-1 marks "not found")
    movie_tmdb_ids = []
    for title in movies_df["Title"]:
        tmdb_id = get_movie_id_using_title_tmdb(title)
        if tmdb_id == -1:
            print(f"ID not found for movie: {title}")
        movie_tmdb_ids.append(tmdb_id)

    # insert() raises when the column already exists, so overwrite on a re-run
    if "tmdbID" in movies_df.columns:
        movies_df["tmdbID"] = movie_tmdb_ids
    else:
        movies_df.insert(min(2, len(movies_df.columns)), "tmdbID", movie_tmdb_ids)

    budget_list = []
    revenue_list = []
    for tmdb_id in movie_tmdb_ids:
        # no point querying the details endpoint for a movie that was not found
        details = None if tmdb_id == -1 else get_movie_details_using_tmdbID_tmdb(tmdb_id)
        if details is None:
            budget_list.append('N/A')
            revenue_list.append('N/A')
        else:
            budget_list.append(details.get("budget", 'N/A'))
            revenue_list.append(details.get("revenue", 'N/A'))

    movies_df["Production"] = budget_list
    movies_df["BoxOffice"] = revenue_list
    time_diff = datetime.now() - time_before_extraction
    print(f"DONE - {len(movies_df)} MOVIE DATA EXTRACTED -> TMDB API - TIME TAKEN: {time_diff}\n")

    export_dataframe_to_csv(movies_df, file_name)
    # return the enriched frame so both code paths hand back a DataFrame
    return movies_df

In [61]:
def get_all_actors_id_TMDB_API(movies_df, use_file=False):
    """Collect each movie's lead actors and the set of their TMDB person IDs.

    Args:
        movies_df: DataFrame with "tmdbID" and "Actors" columns. On the fetch
            path its "Actors" column is replaced in place by a list of names
            per movie.
        use_file: when True and both cache files exist, the movies CSV and the
            actor-ID text file are loaded instead of querying the API.

    Returns:
        tuple: ``(movies DataFrame, actors_dict)`` where ``actors_dict`` maps
        each integer actor ID to an empty dict placeholder.
    """
    tmdb_file_name = '../resources/top_250_movies_TMDB_rev.csv'
    actors_id_file_name = '../resources/actor_ids_TMDB.txt'
    check_files_exists = os.path.isfile(tmdb_file_name) and os.path.isfile(actors_id_file_name)
    if use_file and check_files_exists:
        print("TMDB DATA FILE EXISTS: READING...", end=' ')
        new_movies_df = read_csv_to_dataframe(tmdb_file_name)
        print(f"DONE - {len(new_movies_df)} MOVIE DATA IMPORTED\n")

        print("ACTOR ID TEXT FILE EXISTS: READING...", end=' ')
        with open(actors_id_file_name, 'r', encoding='utf-8-sig') as file:
            id_lines = file.read().splitlines()
        # Skip blank / non-numeric lines (the file ends with a newline) and
        # convert to int so the keys have the same type as on the fetch path.
        actors_dict = {int(line): {} for line in id_lines if line.strip().isdigit()}
        print(f"DONE - {len(actors_dict)} ACTOR IDs IMPORTED\n")
        return new_movies_df, actors_dict

    actors_dict = {}
    print("FETCH: ACTOR IDs -> TMDB API")
    time_before_extraction = datetime.now()

    actors_list = []
    for i in movies_df.index:
        tmdb_id = movies_df.loc[i, "tmdbID"]
        result_list = get_first_three_actors_movie_TMDB_API(tmdb_id)
        if result_list is None or len(result_list) < 3:
            # Fall back to the names already stored in the "Actors" column and
            # look each one up by name.
            raw_actors = movies_df.loc[i, "Actors"]
            if isinstance(raw_actors, str):
                names = raw_actors.split(', ')
            elif isinstance(raw_actors, (list, tuple)):
                names = list(raw_actors)
            else:
                # missing value (e.g. NaN): nothing to look up
                names = []
            actors_list.append(names)
            for actor in names:
                actor_id = get_actor_id_TMDB_API(actor)
                # a failed lookup yields None, which must not become a key
                if actor_id is not None:
                    actors_dict.setdefault(actor_id, {})
            continue

        names = []
        for actor_details in result_list:
            names.append(actor_details["name"])
            actors_dict.setdefault(actor_details["id"], {})
        actors_list.append(names)

    movies_df["Actors"] = actors_list

    time_diff = datetime.now() - time_before_extraction
    print(f"DONE - {len(actors_dict.keys())} ACTOR IDs FETCHED -> TMDB API - TIME TAKEN: {time_diff}\n")

    # make sure the cache folder exists before writing either file
    os.makedirs(os.path.dirname(actors_id_file_name), exist_ok=True)
    export_dataframe_to_csv(movies_df, tmdb_file_name)
    with open(actors_id_file_name, 'w', encoding='utf-8-sig') as file:
        for actor_id in actors_dict:
            file.write(f"{actor_id}\n")

    return movies_df, actors_dict

In [63]:
def extract_actors_data_TMDB_API(actors_dict, use_file=False):
    """Fetch TMDB details and acting-credit counts for every actor ID.

    Args:
        actors_dict: mapping whose keys are TMDB person IDs. Its values are
            overwritten in place with the fetched detail dictionaries.
        use_file: when True and the cached CSV exists, it is loaded instead of
            querying the API.

    Returns:
        pandas.DataFrame: one row per actor, with a "num_of_acting_credits"
        column added; the frame is also exported to the cache CSV.
    """
    file_name = '../resources/actor_data_TMDB.csv'
    if use_file and os.path.isfile(file_name):
        print("TMDB DATA FILE EXISTS: READING...", end=' ')
        df = read_csv_to_dataframe(file_name)
        print(f"DONE - {len(df)} ACTOR DATA IMPORTED\n")
        return df

    print("EXTRACT: ACTOR DATA -> TMDB API")
    time_before_extraction = datetime.now()
    num_of_acting_credits_list = []
    # iterate over a snapshot of the keys because the values are reassigned in the loop
    for actor_id in list(actors_dict.keys()):
        actors_dict[actor_id] = get_actor_details_TMDB_API(actor_id)
        num_of_acting_credits_list.append(get_number_of_acting_credits_TMDB_API(actor_id))

    time_diff = datetime.now() - time_before_extraction
    print(f"DONE - {len(actors_dict.keys())} ACTOR DATA EXTRACTED -> TMDB API - TIME TAKEN: {time_diff}\n")
    df = pd.json_normalize(list(actors_dict.values()))
    # Column position 13 is kept when the frame is wide enough; clamping avoids
    # an IndexError when fewer columns came back (e.g. failed detail lookups).
    insert_at = min(13, len(df.columns))
    df.insert(insert_at, "num_of_acting_credits", num_of_acting_credits_list)

    export_dataframe_to_csv(df, file_name)
    return df

In [90]:
def extract_data(using_stored_file=True):
    """Run the whole extraction pipeline and return the movies and actors DataFrames.

    Args:
        using_stored_file: forwarded as ``use_file`` to every stage, so cached
            files are reused where they exist.

    Returns:
        tuple: ``(movies_df, actors_df)``.
    """
    titles_list = data_crawl_top_250_movie_titles_IMDB(use_file=using_stored_file)
    movies_df = gather_movies_data_OMDB_API(titles_list, use_file=using_stored_file)

    # When the TMDB stage loads its cache it returns a new frame and leaves
    # ``movies_df`` without the "tmdbID" column, so use the returned frame.
    # A None result (stage enriched ``movies_df`` in place) keeps the original.
    tmdb_movies_df = extract_movie_data_TMDB_API(movies_df, use_file=using_stored_file)
    if tmdb_movies_df is not None:
        movies_df = tmdb_movies_df

    movies_df, actors_dict = get_all_actors_id_TMDB_API(movies_df, use_file=using_stored_file)
    actors_df = extract_actors_data_TMDB_API(actors_dict, use_file=using_stored_file)
    return movies_df, actors_df