In [1]:
# Standard Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Additional Imports
import os, json, math, time
from yelpapi import YelpAPI
from tqdm.notebook import tqdm_notebook

# BEFORE THE LOOPS

## Designate a folder

You will save API call data in the data folder you created for project part 1

In [2]:
import os
FOLDER = "Data/"
os.makedirs(FOLDER, exist_ok=True)
os.listdir(FOLDER)

['akas.csv.gz', 'basics.csv.gz', 'ratings.csv.gz']

# Start INNER Loop

### Use a Function to append new results to the existing JSON file

This is a function adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/

Make sure you define and run any functions you plan to use in your for-loop prior to running the loop!

In [14]:
def write_json(new_data, filename): 
    """Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/"""    
    
    with open(filename,'r+') as file:
        # First we load existing data into a dict.
        file_data = json.load(file)
        ## Choose extend or append
        if (type(new_data) == list) & (type(file_data) == list):
            file_data.extend(new_data)
        else:
             file_data.append(new_data)
        # Sets file's current position at offset.
        file.seek(0)
        # convert back to json.
        json.dump(file_data, file)

In [9]:
# Load in the dataframe from project part 1 as basics:
basics = pd.read_csv('Data/basics.csv.gz')
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0011801,movie,Tötet nicht mehr,Tötet nicht mehr,0,2019,,,"Action,Crime"
1,tt0015414,movie,La tierra de los toros,La tierra de los toros,0,2000,,60.0,
2,tt0079644,movie,November 1828,November 1828,0,2001,,140.0,"Drama,War"
3,tt0083060,movie,The Drive to Win,Sha Ou,0,2019,,,"Drama,Sport"
4,tt0089067,movie,El día de los albañiles 2,El día de los albañiles 2,0,2001,,90.0,Comedy


In [3]:
YEARS_TO_GET = [2000,2001]

# Start OUTER loop

## Set up Progress Bar

We want to keep track of our progress and ensure our calls are working.  The progress bar works within the for statement of the for loop.  Note that this will iterate through each year that is defined in the YEARS_TO_GET variable.

In [4]:
YEAR = YEARS_TO_GET[0]
YEAR

2000

In [5]:
# Start of OUTaER loop
for YEAR in tqdm_notebook(YEARS_TO_GET,desc='YEARS',position=0):

SyntaxError: unexpected EOF while parsing (1380708193.py, line 2)

## Select a JSON_FILE filename to save the results in progress.

Check if the file exists.
- if no:

Create the empty JSON file with with open that just contains the key "imdb_id"

- if yes:
Do nothing.

First, define the file path and names:  We are going to have multiple files since we are creating a separate file for each year.  The code below will identify the folder in the FOLDER we just define above and will name the file based on the current year. 

In [6]:
#Defining the JSON file to store results for year
JSON_FILE = f'{FOLDER}tmdb_api_results_{YEAR}.json'

Check if that file already exists or not.

In [7]:
# Check if file exists
file_exists = os.path.isfile(JSON_FILE)

If you are going through this lesson for the first time, it is very unlikely that the file exists! But, if you are at a different point in the project, and it already exists, we don't need to do anything, but just make sure it is a file you want to add to! We

The code below will create the file and save an empty dictionary with just imdb_id.  We will be appending to this empty dictionary throughout our calls.

In [8]:
# If it does not exist: create it
if file_exists == False:
# save an empty dict with just "imdb_id" to the new json file.
    with open(JSON_FILE,'w') as f:
        json.dump([{'imdb_id':0}],f)


## Define/filter the IDs to call

We are going to break up our data by year, so we will define a new dataframe for each year.  Notice that which YEAR will depend on what we define YEAR as.  Leaving YEAR a variable allows the code to be easier to read and reproduce.

In [11]:
#Saving new year as the current df
df = basics.loc[ basics['startYear']==YEAR].copy()
# saving movie ids to list
movie_ids = df['tconst'].copy()#.to_list()
movie_ids

1        tt0015414
7        tt0102362
8        tt0107706
12       tt0113086
15       tt0116748
           ...    
91467    tt9465882
91514    tt9486348
91552    tt9501764
92472    tt9797592
92578    tt9833388
Name: tconst, Length: 2292, dtype: object

## Check for and remove any previously downloaded Movie id's



In [12]:
# Load existing data from json into a dataframe called "previous_df"
previous_df = pd.read_json(JSON_FILE)

### Check for and filter out movie IDs that already exist

The next line of code will prevent you from wasting API calls on data you already have.  Note that it is defining the ids you are calling in such a way that it excludes any ids that are already present in the previous_df. You may recall that this will also allow you to "pick up where you left off" if your API call gets interrupted.

In [13]:
# filter out any ids that are already in the JSON_FILE
movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]

Now we have defined the "movie_ids_to_get". It includes the ids from our dataframe in the year we are seeking, and it excludes any that we have already made calls for. 

We will use this list for our inner loop of API calls.

# Iterate through the list of Movie IDs and make the calls

The code below relies on the function you wrote in the previous lesson that made API calls and added the certification to the .info results. Here this function is named "get_movie_with_rating".  Make sure you the function from the earlier lesson in the code file before you plan to call on it!  This loop also uses the function above (write_json)  to extend/append the results to the .json file. Make sure both functions are defined in your code file before you try to call them!

In [16]:
#Get index and movie id from list
# INNER Loop
for movie_id in tqdm_notebook(movie_ids_to_get,
                                  desc=f'Movies from {YEAR}',
                                  position=1,
                                  leave=True):
        # Attempt to retrieve then data for the movie id
        try:
            temp = get_movie_with_rating(movie_id)  #This uses your pre-made function
            # Append/extend results to existing file using a pre-made function
            write_json(temp,JSON_FILE)
            # Short 20 ms sleep to prevent overwhelming server
            time.sleep(0.02)
            
        # If it fails,  make a dict with just the id and None for certification.
        except Exception as e:
            continue

Movies from 2000:   0%|          | 0/2292 [00:00<?, ?it/s]

### Save the year's results as csv.gz file

Once all of the API calls for the current year are made, you should open your .json file with pd.read_json and convert each to a compressed csv (".csv.gz") to save space. This is done after the inner loop, but within the outer loop.

In [17]:
final_year_df = pd.read_json(JSON_FILE)
final_year_df.to_csv(f"{FOLDER}final_tmdb_data_{YEAR}.csv.gz", compression="gzip", index=False)