## this is the first file that should be ran. next would be relational notebook that uses pickle file

### 1. Setup imports and API key credentials

In [1]:
# imports and client instance creation with apikey
import os
import omdb
import pandas as pd
from omdb import OMDBClient
from datetime import datetime, timedelta
client = OMDBClient(apikey='e55a0e19')

# define input and output folder paths
input_folder = r'C:\My\Workspace\Python_Projects\Movies_Data\input_files'
output_folder = r'C:\My\Workspace\Python_Projects\Movies_Data\output_files'

### 2. Define common functions

In [2]:
# helper function to search omdb for each film and return the unique imdb ID
def get_film_object(row):
    film_title = row['Title']
    film_year_released = str(row['Year_Released'])

    try:
        film_object = client.search(film_title, year=film_year_released)
        return film_object[0]
    except Exception as e:
        print(f"Skipping row: {row} - Error: {e}")
        return None

# helper function to extract the unique imdb ids from the list
def extract_column_to_list(df, column_name):
    return df[column_name].tolist()

### 3. MovieData Operations

In [3]:
# 7/16/23 adding a copy of this here even though it needs work -its a good starting point. This should be modified as a check for changes area so the API is only hit for new entries

# first, check for current MovieData file so the API can avoid getting hit
file_name = 'movie_list_all_columns.csv'
output_file = os.path.join(output_folder, file_name)

try:
    if os.path.exists(output_file):
        file_time = datetime.fromtimestamp(os.path.getmtime(output_file))
        current_time = datetime.now()
        time_difference = current_time - file_time
        if time_difference.days > 2:
            # File is more than 2 days old, perform necessary actions
            print("Current File is outdated. Please create a new one")
        else:
            # File is less than or equal to 2 days old, no action needed
            print("DO NOT RUN A NEW FILE! DataFrame will be loaded with current file")
            data_list_df = pd.read_csv(output_file)
    else:
        # File does not exist, handle accordingly
        print("FILE DOES NOT EXIST! Please create one")
except Exception as e:
    # Exception occurred, handle the error
    print("An error occurred:", str(e))

DO NOT RUN A NEW FILE! DataFrame will be loaded with current file
An error occurred: Error tokenizing data. C error: Expected 22 fields in line 8, saw 25



In [4]:
# create path and name variable
file_name = 'film_list.csv'
input_file = os.path.join(input_folder, file_name)
# read csv
csv_data = pd.read_csv(input_file)

In [5]:
# uses get_film_object helper function to find movie match
csv_data['film_object'] = csv_data.apply(get_film_object, axis=1)

In [6]:
# 7/16/23 adding this code from movies_data_nb_dev1a, but it could still use editing to keep only the essential parts

# would like to make a non-key version of movie-list.csv so it can keep a working movie_list.csv->Movie_Data_DW.movies_staging->power bi report for the time being
# continuation of above...this creates a list of dictionaries with 5 entries each (key, title, year, imdb_id, type, poster)
film_data = [value for _, value in csv_data['film_object'].items()]
# creates a dataframe with five columns (title, year, imdb_id, type, poster)
full_df = pd.DataFrame(film_data)

column_list = ["title", "year", "imdb_id", "type", "poster"]

full_df = full_df[column_list]

# create path and name variable
file_name = 'movie_list.csv'
output_file = os.path.join(output_folder, file_name)

# this is to persist the dataframe, may not be necessary in databricks
full_df.to_csv(output_file, sep='|', index=False)

In [7]:
# uses extract_column_to_list helper function to isolate the imdb_id column
imdb_id_list = extract_column_to_list(full_df, 'imdb_id')

In [8]:
# create list of API endpoints for further lookups, example URL: https://www.omdbapi.com/?i=tt2382320&apikey=e55a0e19
url_list = []
import urllib.parse
for imdbId in imdb_id_list:
    url = 'https://www.omdbapi.com/?'
    params = {'i': imdbId, 'apikey': 'e55a0e19'}
    temp_url = str(url + urllib.parse.urlencode(params))
    url_list.append(temp_url)

In [9]:
# create csv of endpoints...can do single list or can apply to an existing dataframe
s_obj = pd.Series(url_list)
# create path and name variable
file_name = 'endpoint_list.csv'
output_file = os.path.join(input_folder, file_name)
# Save the DataFrame to a CSV file with no leading 0 and no quoting of fields
s_obj.to_csv(output_file, header=False, index=False)

In [10]:
# create a list where each row is all the movie data found from each key
import requests
# import json
data_list = []
for api_url in url_list:
    resp = requests.get(api_url)
    resp.raise_for_status()
    data = resp.json()
    for d in data["Ratings"]:
        temp_dict = {d["Source"].replace(" ", ""):d["Value"]}
        data.update(temp_dict.items()) # data is a dict object with 28 entries each pass through the loop
    data_list.append(data) #data_list is a list of dictionaries that are created each pass

#create a dataframe from the pre-loaded list
data_list_df = pd.DataFrame(data_list)

### 4. Export dataframe as is

In [11]:
import pickle

# save dictionary to data_list.pkl file
file_name = 'data_list.pkl'
output_file = os.path.join(input_folder, file_name)
with open(output_file, 'wb') as fp:
    pickle.dump(data_list, fp)

In [12]:
#save the entire set to file
file_name = 'movie_list_all_columns.json'
output_file = os.path.join(output_folder, file_name)
data_list_df.to_json(output_file, orient="table")

In [13]:
#save the entire set to file
file_name = 'movie_list_all_columns.csv'
output_file = os.path.join(output_folder, file_name)
data_list_df.to_csv(output_file,sep='|', index=False)

### 5. Clean and conform the DataFrame

In [14]:
column_list=["Title","Year","Rated","Released","Runtime","Genre","Director","Writer","Actors","Plot","Language","Country","Awards","Poster", "Ratings", "Metascore","imdbRating","imdbVotes","InternetMovieDatabase","RottenTomatoes","Metacritic","imdbID","Type","DVD","BoxOffice","Production","Website"]
# create a dataframe object based on the previous one where only the relevant columns are used
data_list_df_partial = pd.DataFrame(data_list_df, columns=column_list)

In [15]:
file_name = 'movie_list_detail.csv'
output_file = os.path.join(output_folder, file_name)
data_list_df_partial.to_csv(output_file,sep='|', index=False)