### Import Required Libraries and Set Up Environment Variables

In [1]:
# Dependencies
import requests
import time
from dotenv import load_dotenv
import os
import pandas as pd
import json

In [2]:
# Set environment variables from the .env in the local environment
load_dotenv()

nyt_api_key = os.getenv("NYT_API_KEY")
tmdb_api_key = os.getenv("TMDB_API_KEY")

In [3]:

type(tmdb_api_key)

str

### Access the New York Times API

In [4]:
# Set the base URL
url = "https://api.nytimes.com/svc/search/v2/articlesearch.json?"

# Filter for movie reviews with "love" in the headline
# section_name should be "Movies"
# type_of_material should be "Review"
filter_query = 'section_name:"Movies" AND type_of_material:"Review" AND headline:"love"'

# Use a sort filter, sort by newest
sort = "newest"

# Select the following fields to return:
# headline, web_url, snippet, source, keywords, pub_date, byline, word_count
field_list = "headline,web_url,snippet,source,keywords,pub_date,byline,word_count"

# Search for reviews published between a begin and end date
begin_date = "20130101"
end_date = "20230331"

# Build URL
request_url = (
    f"{url}api-key={nyt_api_key}&begin_date={begin_date}&end_date={end_date}"
    f"&fq={filter_query}&sort={sort}&fl={field_list}"
)



In [5]:
response_data = requests.get(request_url).json()


In [6]:
print(json.dumps(response_data, indent=4))

{
    "status": "OK",
    "copyright": "Copyright (c) 2024 The New York Times Company. All Rights Reserved.",
    "response": {
        "docs": [
            {
                "web_url": "https://www.nytimes.com/2023/03/30/movies/a-thousand-and-one-review.html",
                "snippet": "A mesmerizing Teyana Taylor stars in A.V. Rockwell\u2019s feature directing debut, about motherhood and survival in a fast-changing city.",
                "source": "The New York Times",
                "headline": {
                    "main": "\u2018A Thousand and One\u2019 Review: A New York Love Story",
                    "kicker": "Critic\u2019s Pick",
                    "content_kicker": null,
                    "print_headline": "An Unbending Will Meets a Shifting City",
                    "name": null,
                    "seo": null,
                    "sub": null
                },
                "keywords": [
                    {
                        "name": "subject",
         

In [14]:
# Create an empty list to store the reviews
review_list = []

# loop through pages 0-19
for page in range (20):
    try:
        request_url = (
            f"{url}api-key={nyt_api_key}&begin_date={begin_date}&end_date={end_date}"
            f"&fq={filter_query}&sort={sort}&fl={field_list}&page={page}"

    )
# create query with a page number

    try:
    
        response = requests.get(request_url)
        response_data = response.json()


        print("Response Data:", response_data)



    # API results show 10 articles at a time

    
    # Make a "GET" request and retrieve the JSON
        if "response" in response_data and "docs" in response_data["response"]:
            articles = response_data["response"]["docs"]
            review_list.extend(articles)
            if articles:
                print(f"Retrieved {len(articles)} articles for page {page}")
            

            else:
                print(f"no reuslts for page {page}")

        else:
            print(f"No results for page {page}")
    
    # Add a twelve second interval between queries to stay within API query limits
        time.sleep(12)
    
    except Exception as e:

        print(f"Error occurred while fetching data for page {page}: {e}")
    

print(f"Total articles retrieved: {len(review_list)}")
    # Try and save the reviews to the reviews_list

        # loop through _the reviews["response"]["docs"] and append each review to the list


       


Response Data: {'status': 'OK', 'copyright': 'Copyright (c) 2024 The New York Times Company. All Rights Reserved.', 'response': {'docs': [{'web_url': 'https://www.nytimes.com/interactive/2023/03/31/sports/ncaabasketball/locker-room-celebrations-water-pour.html', 'snippet': 'Hold the champagne. Water has become the go-to beverage for celebrating success in sports.', 'source': 'The New York Times', 'headline': {'main': 'When Every Win Means Water, Water Everywhere', 'kicker': None, 'content_kicker': None, 'print_headline': 'You’d Better Not Try This in Your Workplace!', 'name': None, 'seo': None, 'sub': None}, 'keywords': [{'name': 'subject', 'value': 'Water', 'rank': 1, 'major': 'N'}, {'name': 'subject', 'value': 'Athletics and Sports', 'rank': 2, 'major': 'N'}, {'name': 'subject', 'value': 'Basketball (College)', 'rank': 3, 'major': 'N'}, {'name': 'subject', 'value': 'NCAA Basketball Championships (Men)', 'rank': 4, 'major': 'N'}, {'name': 'subject', 'value': 'NCAA Basketball Champions

In [52]:
for review in review_list:
    url = review.get("web_url", "")
    if url:
        try:
            response = requests.get(url)

            if response.status_code ==200:
                if 'application/json' in response.headers.get('Content-Type', ''):
                    try:
                        review_data = response.json()
                        all_results.extend(review_data)
        # Print the page that was just retrieved
                        print("Retrieved data for page:", url)
                    except ValueError:
                        print("Response content is not valid JSON:", url)
                else:
                    print("Response content is not JSON:", url)
            else:
                print(f"Failed to retrieve data for review:", url)
        except requests.exceptions.RequestException as e:
            print(f"Error occurred while fetching data for review (url): (e)")
    else:
        print("No url found for reveiw.")


Response content is not JSON: https://www.nytimes.com/interactive/2023/03/31/sports/ncaabasketball/locker-room-celebrations-water-pour.html
Failed to retrieve data for review: https://www.nytimes.com/2023/03/31/us/minneapolis-police-reform-agreement.html
Failed to retrieve data for review: https://www.nytimes.com/2023/03/31/briefing/donald-trump-arraignment-russia-recruitment.html
Failed to retrieve data for review: https://www.nytimes.com/2023/03/31/technology/sam-altman-open-ai-chatgpt.html
Failed to retrieve data for review: https://www.nytimes.com/2023/03/31/opinion/ai-pause.html
Failed to retrieve data for review: https://www.nytimes.com/2023/03/31/technology/google-pichai-ai.html
Failed to retrieve data for review: https://www.nytimes.com/2023/03/31/science/astrolab-moon-rover-spacex.html
Failed to retrieve data for review: https://www.nytimes.com/2023/03/31/technology/facial-recognition-false-arrests.html
Failed to retrieve data for review: https://www.nytimes.com/2023/03/31/spo

In [53]:
# Preview the first 5 results in JSON format
# Use json.dumps with argument indent=4 to format data

print(json.dumps(response_data, indent=4))
    


{
    "status": "OK",
    "copyright": "Copyright (c) 2024 The New York Times Company. All Rights Reserved.",
    "response": {
        "docs": [
            {
                "web_url": "https://www.nytimes.com/2023/03/13/arts/fbi-art-crime-team.html",
                "snippet": "A raid of the Orlando Museum of Art, in which 25 works attributed to Jean-Michel Basquiat were seized, has placed renewed spotlight on the unit.",
                "source": "The New York Times",
                "headline": {
                    "main": "The F.B.I. Has an Art Crime Team. And These Days, It\u2019s Busy.",
                    "kicker": null,
                    "content_kicker": null,
                    "print_headline": "F.B.I. Team On the Trail Of Art Crime",
                    "name": null,
                    "seo": null,
                    "sub": null
                },
                "keywords": [
                    {
                        "name": "subject",
                        

In [54]:
# Convert reviews_list to a Pandas DataFrame using json_normalize()

reviews_list_df = pd.json_normalize(response_data['response']['docs'])
reviews_list_df

print(json.dumps(response_data, indent=4))

{
    "status": "OK",
    "copyright": "Copyright (c) 2024 The New York Times Company. All Rights Reserved.",
    "response": {
        "docs": [
            {
                "web_url": "https://www.nytimes.com/2023/03/13/arts/fbi-art-crime-team.html",
                "snippet": "A raid of the Orlando Museum of Art, in which 25 works attributed to Jean-Michel Basquiat were seized, has placed renewed spotlight on the unit.",
                "source": "The New York Times",
                "headline": {
                    "main": "The F.B.I. Has an Art Crime Team. And These Days, It\u2019s Busy.",
                    "kicker": null,
                    "content_kicker": null,
                    "print_headline": "F.B.I. Team On the Trail Of Art Crime",
                    "name": null,
                    "seo": null,
                    "sub": null
                },
                "keywords": [
                    {
                        "name": "subject",
                        

In [55]:
# Extract the title from the "headline.main" column and
# save it to a new column "title"
if 'headline.main' in reviews_list_df.columns:

    def extract_title(text):
        start_index = text.find("\u2018")
        end_index = text.find("\u2019 Review")
        if start_index != -1 and end_index != -1:
            return text [start_index + 1:end_index]
        else:
            return None

    reviews_list_df['title'] = reviews_list_df['headline.main'].apply(extract_title)
else:
    print("Column 'headline.main' does not exist in the DataFrame.")
# Title is between unicode characters \u2018 and \u2019. 
# End string should include " Review" to avoid cutting title early

In [56]:
# Extract 'name' and 'value' from items in "keywords" column
import ast

def extract_keywords(keyword_list):
    try:
   
        keyword_list = ast.literal_eval(keyword_list)
        extracted_keywords = ""
        for item in keyword_list:
            if 'name' in item:
                
        # Extract 'name' and 'value'
                extracted_keywords += f"{item['name']},"  
        # Append the keyword item to the extracted_keywords list
        
        return extracted_keywords.strip(",")
    except (SyntaxError, ValueError):
        return "Error extracting keywords"

# Fix the "keywords" column by converting cells from a list to a string
reviews_list_df['keywords'] = reviews_list_df['keywords'].apply(lambda x:extract_keywords(x))

print(reviews_list_df.head())

                                             web_url  \
0  https://www.nytimes.com/2023/03/13/arts/fbi-ar...   
1  https://www.nytimes.com/2023/03/13/travel/anse...   
2  https://www.nytimes.com/2023/03/13/books/revie...   
3  https://www.nytimes.com/2023/03/13/books/kenza...   
4  https://www.nytimes.com/2023/03/13/business/ai...   

                                             snippet              source  \
0  A raid of the Orlando Museum of Art, in which ...  The New York Times   
1  An enigmatic photograph by America’s most famo...  The New York Times   
2  The new book by the sociologist and author of ...  The New York Times   
3  With his powerful novels and essays, Mr. Oe tr...  The New York Times   
4  Using A.I. tools in hiring is one of the most ...  The New York Times   

                    keywords                  pub_date  word_count  \
0  Error extracting keywords  2023-03-13T15:00:12+0000        1597   
1  Error extracting keywords  2023-03-13T09:00:20+0000        3672

In [57]:
# Create a list from the "title" column using to_list()

title_list = reviews_list_df['title'].to_list()
# These titles will be used in the query for The Movie Database
print(title_list)

[None, None, None, None, None, None, None, None, None, None]


### Access The Movie Database API

In [58]:
# Prepare The Movie Database query
url = "https://api.themoviedb.org/3/search/movie?query="
tmdb_key_string = "&api_key=" + tmdb_api_key

In [92]:
# Create an empty list to store the results
results_list = []
movie_ids = ["movie_id", "genres.", "name"]


# Create a request counter to sleep the requests after a multiple
# of 50 requests
import requests
import time

request_counter = 0
request_limit = 50


# Loop through the titles
for movie_id in movie_ids:
    # Check if we need to sleep before making a request
    if request_counter > 0 and request_counter % request_limit == 0:
        print("Pausing requests...")
        time.sleep(5)  



    # Add 1 to the request counter
    request_counter += 1
    
    # Perform a "GET" request for The Movie Database
    movie_details_url = f"https://api.themoviedb.org/3/movie/{movie_id}"
    response_details = requests.get(movie_details_url)

    # Include a try clause to search for the full movie details.
    try:
        if response_details.status_code == 200:
        
    # Use the except clause to print out a statement if a movie
    # is not found.

        # Get movie id
                genres = [genre['name'] for genre in response_details.json()['genres']]


        # Make a request for a the full movie details
                spoken_languages = [lang['english_name'] for lang in response_details.json()['spoken_languages']]


        # Execute "GET" request with url
                response = requests.get(UnicodeTranslateError)

        
        # Extract the genre names into a list
                genres = [genre['name'] for genre in response_details.json()['genres']]

        # Extract the spoken_languages' English name into a list
                spoken_languages = [lang['english_name'] for lang in response_details.json()['spoken_languages']]


        # Extract the production_countries' name into a list

                production_countries = [country['name'] for country in response_details.json()['production_countries']]


        # Add the relevant data to a dictionary and
        # append it to the tmdb_movies_list list
                results_list.append({
                        'movie_id': movie_id,
                        'genres': genres,
                        'spoken_languages': spoken_languages,
                        'production_countries': production_countries
                })
        
        # Print out the title that was found
                print(f"Title found: {movie_id}")
        else:
                print(f"Movie ID not found: {movie_id}")
    except KeyError:
        print(f"Error occurred while processing movie ID: {movie_id}. Data not found.")
print(results_list) 


Movie ID not found: movie_id
Movie ID not found: genres.
Movie ID not found: name
[]


In [93]:
# Preview the first 5 results in JSON format
# Use json.dumps with argument indent=4 to format data


In [94]:
# Convert the results to a DataFrame


### Merge and Clean the Data for Export

In [95]:
# Merge the New York Times reviews and TMDB DataFrames on title


In [96]:
# Remove list brackets and quotation marks on the columns containing lists
# Create a list of the columns that need fixing


# Create a list of characters to remove


# Loop through the list of columns to fix

    # Convert the column to type 'str'


    # Loop through characters to remove


# Display the fixed DataFrame


In [97]:
# Drop "byline.person" column


In [98]:
# Delete duplicate rows and reset index


In [99]:
# Export data to CSV without the index
