### Import Required Libraries and Set Up Environment Variables

In [1]:
# Dependencies
import requests
import time
from dotenv import load_dotenv
import os
import pandas as pd
import json



In [2]:
# Set environment variables from the .env in the local environment
load_dotenv()

nyt_api_key = os.getenv("NYT_API_KEY")
tmdb_api_key = os.getenv("TMDB_API_KEY")

### Access the New York Times API

In [3]:
# Set the base URL
def get_url(ak, bd, ed, fq, fl, so):
    url = "https://api.nytimes.com/svc/search/v2/articlesearch.json?api-key="+ak+"&begin_date="+bd+"&end_date="+ed+"&fq="+fq+"so"+so+"&fl="+fl
    return url

In [4]:
# Set the base URL
#url = "https://api.nytimes.com/svc/search/v2/articlesearch.json?"

# Filter for movie reviews with "love" in the headline
# section_name should be "Movies"
# type_of_material should be "Review"
filter_query = 'section_name:"Movies" AND type_of_material:"Review" AND headline:"love"'
#filter_query = 'Movies'
# Use a sort filter, sort by newest
sort = "newest"

# Select the following fields to return:
# headline, web_url, snippet, source, keywords, pub_date, byline, word_count
field_list = "headline,web_url,snippet,source,keywords,pub_date,byline,word_count"

# Search for reviews published between a begin and end date
begin_date = "20130101"
end_date = "20230531"

url = get_url(nyt_api_key,begin_date,end_date,filter_query,field_list,sort)
#print(url)

In [5]:
# Create an empty list to store the review
reviews_list = []
#url = 'https://api.nytimes.com/svc/search/v2/articlesearch.json?api-key=MebDfltQAgwkMaU61WR9BEXzAUB6W3fq&begin_date=20130101&end_date=20230531&fq=section_name:"Movies" AND type_of_material:"Review" AND headline:"love"&sort=newest&fl=headline,web_url,snippet,source,keywords,pub_date,byline,word_count'
# loop through pages 0-19
for page in range(20):
    # create query with page number
    query_url = f"{url}&page={str(page)}"
    reviews = requests.get(query_url).json()
    time. sleep(12)
    for review in reviews["response"]["docs"]:
        # Try and save the reviews to the reviews_list
            try:
        # loop through the reviews["response"]["docs"] and append each review to the list
        # Print the page that was just retrieved
                print(f"Checked Page: {page}")
                #print(reviews)
                reviews_list.append(review)
            except:
        #Print the page number that had no results then break from the loop
                print(f"No Results found for page: {page}")
            break

Checked Page: 0
Checked Page: 1
Checked Page: 2
Checked Page: 3
Checked Page: 4
Checked Page: 5
Checked Page: 6
Checked Page: 7
Checked Page: 8
Checked Page: 9
Checked Page: 10
Checked Page: 11
Checked Page: 12
Checked Page: 13
Checked Page: 14
Checked Page: 15
Checked Page: 16
Checked Page: 17
Checked Page: 18
Checked Page: 19


In [6]:
for review in reviews_list:
    print(f'A snippet from the review: {review["snippet"]}')
    print('---------------------------')

A snippet from the review: The documentary looks at the mass killings of kangaroos for pet-food companies, leather processors and ranchers in Australia.
---------------------------
A snippet from the review: Morgan Neville’s movie tells the tumultuous behind-the-scenes story of the making and near-unmaking of “The Other Side of the Wind.”
---------------------------
A snippet from the review: Sharon Shattuck’s documentary tells her family story of growing up with a transgender parent.
---------------------------
A snippet from the review: Drawn from the plotline of a Todd Snider song, the film follows a pool shark and an escort, taking twists that are both violent and silly.
---------------------------
A snippet from the review: It’s hard to find a reliable, talented, reasonably priced, eco-friendly contractor these days.
---------------------------
A snippet from the review: This superficial take on the writing and initial staging of “Cyrano de Bergerac” is a whirlwind of soapy declar

In [7]:
# json.dumps with the argument indent=4 is used to preview the first five results

print(json.dumps(reviews_list, indent=4))



[
    {
        "web_url": "https://www.nytimes.com/2018/01/18/movies/kangaroo-a-love-hate-story-review.html",
        "snippet": "The documentary looks at the mass killings of kangaroos for pet-food companies, leather processors and ranchers in Australia.",
        "source": "The New York Times",
        "headline": {
            "main": "Review: \u2018Kangaroo: A Love-Hate Story\u2019 Exposes a Wildlife Massacre",
            "kicker": null,
            "content_kicker": null,
            "print_headline": "Kangaroo: A Love-Hate Story",
            "name": null,
            "seo": null,
            "sub": null
        },
        "keywords": [
            {
                "name": "creative_works",
                "value": "Kangaroo: A Love-Hate Story (Movie)",
                "rank": 1,
                "major": "N"
            },
            {
                "name": "subject",
                "value": "Kangaroos",
                "rank": 2,
                "major": "N"
            }

In [8]:
# Convert the results to a DataFrame, normalizing the JSON
nyt_df = pd.json_normalize(reviews_list)
nyt_df.head(5)

Unnamed: 0,web_url,snippet,source,keywords,pub_date,word_count,headline.main,headline.kicker,headline.content_kicker,headline.print_headline,headline.name,headline.seo,headline.sub,byline.original,byline.person,byline.organization
0,https://www.nytimes.com/2018/01/18/movies/kang...,The documentary looks at the mass killings of ...,The New York Times,"[{'name': 'creative_works', 'value': 'Kangaroo...",2018-01-18T12:00:23+0000,263,Review: ‘Kangaroo: A Love-Hate Story’ Exposes ...,,,Kangaroo: A Love-Hate Story,,,,By Ken Jaworowski,"[{'firstname': 'Ken', 'middlename': None, 'las...",
1,https://www.nytimes.com/2018/11/01/movies/they...,Morgan Neville’s movie tells the tumultuous be...,The New York Times,"[{'name': 'subject', 'value': 'Documentary Fil...",2018-11-01T13:56:39+0000,794,Review: ‘They’ll Love Me When I’m Dead’ Docume...,,,The Legend Behind The Scenes,,,,By Manohla Dargis,"[{'firstname': 'Manohla', 'middlename': None, ...",
2,https://www.nytimes.com/2016/06/24/movies/from...,Sharon Shattuck’s documentary tells her family...,The New York Times,"[{'name': 'subject', 'value': 'Documentary Fil...",2016-06-23T21:11:31+0000,252,Review: ‘From This Day Forward’ Attests to Lov...,,,Review: ‘From This Day Forward’ Attests to Lov...,,,,By Andy Webster,"[{'firstname': 'Andy', 'middlename': None, 'la...",
3,https://www.nytimes.com/2021/10/14/movies/hard...,"Drawn from the plotline of a Todd Snider song,...",The New York Times,"[{'name': 'subject', 'value': 'Movies', 'rank'...",2021-10-14T12:27:03+0000,320,‘Hard Luck Love Song’ Review: A Glossy Take on...,,,Hard Luck Love Song,,,,By Glenn Kenny,"[{'firstname': 'Glenn', 'middlename': None, 'l...",
4,https://www.nytimes.com/2019/08/29/movies/fall...,"It’s hard to find a reliable, talented, reason...",The New York Times,"[{'name': 'subject', 'value': 'Movies', 'rank'...",2019-08-29T14:56:44+0000,374,‘Falling Inn Love’ Review: A Prize That Needs ...,Streaming Movie Review,,A Fine Romance to Watch While Sorting Socks,,,,By Helen T. Verongos,"[{'firstname': 'Helen', 'middlename': 'T.', 'l...",


In [9]:
# Extract the title from the "headline.main" column and
# save it to a new column "title"
# Title is between unicode characters \u2018 and \u2019.
# End string should include " Review" to avoid cutting title early

nyt_df['title'] = nyt_df['headline.main'].apply(lambda st: st[st.find("\u2018")+1:st.find("\u2019 Review")])
nyt_df['title']




0     Kangaroo: A Love-Hate Story’ Exposes a Wildlif...
1     They’ll Love Me When I’m Dead’ Documents Orson...
2     From This Day Forward’ Attests to Love’s Adapt...
3                                   Hard Luck Love Song
4                                      Falling Inn Love
5                                       Cyrano, My Love
6                               Love Among the Gas Pump
7                                       Five Feet Apart
8                                         I Love My Dad
9                                          Asako I & II
10                                             Ammonite
11                                           About Fate
12                                  Love Wedding Repeat
13                                    God’s Own Country
14                 Dina,’ a Differently Abled Love Stor
15                                            Love Hard
16                      69: The Saga of Danny Hernandez
17                   Eva,’ Robot Designer Loves 

In [10]:
# Extract 'name' and 'value' from items in "keywords" column

def extract_keywords(keyword_list):
    extracted_keywords = ""
    for item in keyword_list:
        # Extract 'name' and 'value'
        keyword = f"{item['name']}: {item['value']};" 
        # Append the keyword item to the extracted_keywords list
        extracted_keywords += keyword
    return extracted_keywords

# Fix the "keywords" column by converting cells from a list to a string
nyt_df['keywords'] =  nyt_df['keywords'].apply(extract_keywords) 
nyt_df['keywords']

0     creative_works: Kangaroo: A Love-Hate Story (M...
1     subject: Documentary Films and Programs;creati...
2     subject: Documentary Films and Programs;person...
3     subject: Movies;persons: Dorman, Michael (1981...
4     subject: Movies;organizations: Netflix Inc;per...
5     subject: Movies;creative_works: Cyrano, My Lov...
6     subject: Movies;creative_works: Detroit Unlead...
7     subject: Movies;creative_works: Five Feet Apar...
8     subject: Movies;persons: Oswalt, Patton;creati...
9     subject: Movies;creative_works: Asako I & II (...
10    subject: Movies;persons: Ronan, Saoirse;person...
11    subject: Movies;creative_works: About Fate (Mo...
12    subject: Movies;creative_works: Love Wedding R...
13    subject: Movies;creative_works: God's Own Coun...
14    creative_works: Dina (Movie);subject: Document...
15    subject: Movies;creative_works: Love Hard (Mov...
16    subject: Documentary Films and Programs;subjec...
17    subject: Movies;persons: Maillo, Kike;crea

In [11]:
# Create a list from the "title" column using to_list()
title_list = []
title_list = nyt_df['title'].tolist()
# These titles will be used in the query for The Movie Database

title_list



['Kangaroo: A Love-Hate Story’ Exposes a Wildlife Massacr',
 'They’ll Love Me When I’m Dead’ Documents Orson Welles’s Last Fil',
 'From This Day Forward’ Attests to Love’s Adaptabilit',
 'Hard Luck Love Song',
 'Falling Inn Love',
 'Cyrano, My Love',
 'Love Among the Gas Pump',
 'Five Feet Apart',
 'I Love My Dad',
 'Asako I & II',
 'Ammonite',
 'About Fate',
 'Love Wedding Repeat',
 'God’s Own Country',
 'Dina,’ a Differently Abled Love Stor',
 'Love Hard',
 '69: The Saga of Danny Hernandez',
 'Eva,’ Robot Designer Loves Cute Ki',
 'A Tuba to Cuba',
 'Solution to His Love Problems?: Baby Formul']

### Access The Movie Database API

In [12]:
# Prepare The Movie Database query
url = "https://api.themoviedb.org/3/search/movie/11"
tmdb_key_string = "?api_key=" + tmdb_api_key

url = f"https://api.themoviedb.org/3/search/movie{tmdb_key_string}&include_adult=false&language=en-US&page=1"

print(url)

https://api.themoviedb.org/3/search/movie?api_key=edd86bc79f18410743b10d55284de412&include_adult=false&language=en-US&page=1


In [13]:
# Create an empty tmdb_movies_list list to store the results
# and Add the relevant data to it.
# Empty list for results
movie_id_list = []  
tmdb_movies_list = []
tmdb_movies_list_details = []
results_list = []
genre_list = []
genre_row_list = []
unique_genre_names = []
spoken_languages_list = []
unique_spoken_languages_names = []
production_countries_list = []
unique_production_countries_names = []
details_dictionary = {}
details_results_list = []
request_counter = 0
title_found_counter = 0
# loop through the title_list
    # Loop through the titles
for title in title_list:
    # Set up the query
    query_url = f"{url}&query={title}"
    #print(query_url)
    # Perform a "GET" request for The Movie Database
    results = requests.get(query_url).json()
    if results['total_results'] > 0:
        #print(json.dumps(results, indent=4))
        # Print out the title that was found
        title_found_counter += 1
        print(f"Found {results['total_results']} result(s) for {title} counter: {title_found_counter}")
        # add id and title to filtered_data
        for result in results["results"]:
            if result["id"] not in movie_id_list:
                movie_id_list.append(result["id"])
                tmdb_movies_list.append(result)
                print(f'Found id: {result["id"]} for {title}')
    else:
         print(f"NOT Found {results['total_results']} result(s) for {title}")
    # Check if we need to sleep before making a request 
    # Add 1 to the request counter
    request_counter += 1
    #print(f"Request Counter: {request_counter%50}")
    if request_counter%50 == 0:
        print("sleeping")
    # Add a 12 second interval between queries to stay within API query limits
        time.sleep(12)
# Include a try clause to search for the full movie details.
pop_movie_id_list = movie_id_list.copy()
total_pop= len(movie_id_list)
for x in range(0, len(movie_id_list)):
    try:
        movie_id = pop_movie_id_list.pop()
        movie_id_str = str(movie_id)
    # Make a request for a the full movie details
    # Execute "GET" request with url  
        details_url ='https://api.themoviedb.org/3/movie/'+movie_id_str+tmdb_key_string
        #print(details_url)
        details_results = requests.get(details_url).json()
        tmdb_movies_list_details.append(details_results)
        #print(details_results)
         # Check if we need to sleep 
        # Add 1 to the request counter
        request_counter += 1
        #print(f"Request Counter: {request_counter%50}")
        if request_counter%50 == 0:
           print('sleeping')
        # Add a 12 second interval between queries to stay within API query limits
           time.sleep(12)
        # Use the except clause to print out a statement if a movie
        # is not found.
    except:
        print(f"Full movie details not found for id: {movie_id_list[x]} for {title}")
        continue
print(f'finished {total_pop} details for movies')
for details_results in tmdb_movies_list_details:
    # Extract the genre names into a list
    genre_list.append(details_results['genres'])
    # Extract the spoken_languages' English name into a list
    spoken_languages_list.append(details_results['spoken_languages'])   
    # Extract the production_countries' name into a list
    production_countries_list.append(details_results['production_countries'])
    details_results_dictionary = {
    'title':details_results['title'],
    'details_original_title':details_results['original_title'],
    'details_budget':details_results['budget'],
    'details_original_language':details_results['original_language'],
    'details_homepage':details_results['homepage'],
    'details_overview':details_results['overview'],
    'details_popularity':details_results['popularity'],
    'details_runtime':details_results['runtime'],
    'details_revenue':details_results['revenue'],
    'details_release_date':details_results['release_date'],
    'details_vote_average':details_results['vote_average'],
    'details_vote_count':details_results['vote_count'],
    'details_genre':details_results['genres'],
    'details_spoken_languages':details_results['spoken_languages'],
    'details_production_countries':details_results['production_countries']
    }
    details_results_list.append(details_results_dictionary)
#print(genre_list)
#print(spoken_languages_list)
#print(production_countries_list)    
for genre in genre_list:
    for item in genre:
        if(item['name'] not in unique_genre_names):
            unique_genre_names.append(item['name'])
print(f'unique_genre_names = {unique_genre_names}')
for spoken_language in spoken_languages_list:
    for item in spoken_language:
        if(item['english_name'] not in unique_spoken_languages_names):
            unique_spoken_languages_names.append(item['english_name'])
print(f'unique_spoken_languages_names = {unique_spoken_languages_names}')
for production_countries in production_countries_list:
    for item in production_countries:
        if(item['name'] not in unique_production_countries_names):
            unique_production_countries_names.append(item['name'])
print(f'unique_production_countries_names = {unique_production_countries_names}')
#print(json.dumps(details_results, indent=4))
# Add the relevant data to a dictionary 
details_dictionary["unique_genre_names"] = unique_genre_names
details_dictionary["unique_spoken_languages_names"] = unique_spoken_languages_names
details_dictionary["unique_production_countries_names"] = unique_production_countries_names
details_dictionary['details_results'] = details_results_list
print(f'details_dictonary = {details_dictionary}')
# append it to the tmdb_movies_list list
#tmdb_movies_list.append(details_dictionary)


        
       



NOT Found 0 result(s) for Kangaroo: A Love-Hate Story’ Exposes a Wildlife Massacr
NOT Found 0 result(s) for They’ll Love Me When I’m Dead’ Documents Orson Welles’s Last Fil
NOT Found 0 result(s) for From This Day Forward’ Attests to Love’s Adaptabilit
Found 1 result(s) for Hard Luck Love Song counter: 1
Found id: 536208 for Hard Luck Love Song
Found 1 result(s) for Falling Inn Love counter: 2
Found id: 623195 for Falling Inn Love
Found 2 result(s) for Cyrano, My Love counter: 3
Found id: 544510 for Cyrano, My Love
Found id: 445179 for Cyrano, My Love
NOT Found 0 result(s) for Love Among the Gas Pump
Found 1 result(s) for Five Feet Apart counter: 4
Found id: 527641 for Five Feet Apart
Found 3 result(s) for I Love My Dad counter: 5
Found id: 876825 for I Love My Dad
Found id: 1152894 for I Love My Dad
Found id: 1239290 for I Love My Dad
Found 1 result(s) for Asako I & II counter: 6
Found id: 487850 for Asako I & II
Found 3 result(s) for Ammonite counter: 7
Found id: 568467 for Ammonite
F

In [14]:

# Use json.dumps with argument indent=4 to format data

print(json.dumps(tmdb_movies_list, indent=4))


[
    {
        "adult": false,
        "backdrop_path": "/l17qkQS5Goz1asb2Zs5LsopGIK3.jpg",
        "genre_ids": [
            10749,
            10402,
            18,
            53
        ],
        "id": 536208,
        "original_language": "en",
        "original_title": "Hard Luck Love Song",
        "overview": "Jesse, a charismatic but down on his luck troubadour, finds himself at an existential crossroads as bad choices catch up with him during an unexpected reunion with Carla, an old flame.",
        "popularity": 3.851,
        "poster_path": "/fWUbYGT0zTJoEZVyc2SvsenoHPF.jpg",
        "release_date": "2021-10-15",
        "title": "Hard Luck Love Song",
        "video": false,
        "vote_average": 4.9,
        "vote_count": 10
    },
    {
        "adult": false,
        "backdrop_path": "/lVJdFZc8hBJRVnOIay60tl04Qjv.jpg",
        "genre_ids": [
            10749,
            35
        ],
        "id": 623195,
        "original_language": "en",
        "original_title

In [15]:
# convert details_results_list to Dataframe
nyt_details_df = pd.DataFrame(details_results_list)
nyt_df.head(5)

Unnamed: 0,web_url,snippet,source,keywords,pub_date,word_count,headline.main,headline.kicker,headline.content_kicker,headline.print_headline,headline.name,headline.seo,headline.sub,byline.original,byline.person,byline.organization,title
0,https://www.nytimes.com/2018/01/18/movies/kang...,The documentary looks at the mass killings of ...,The New York Times,creative_works: Kangaroo: A Love-Hate Story (M...,2018-01-18T12:00:23+0000,263,Review: ‘Kangaroo: A Love-Hate Story’ Exposes ...,,,Kangaroo: A Love-Hate Story,,,,By Ken Jaworowski,"[{'firstname': 'Ken', 'middlename': None, 'las...",,Kangaroo: A Love-Hate Story’ Exposes a Wildlif...
1,https://www.nytimes.com/2018/11/01/movies/they...,Morgan Neville’s movie tells the tumultuous be...,The New York Times,subject: Documentary Films and Programs;creati...,2018-11-01T13:56:39+0000,794,Review: ‘They’ll Love Me When I’m Dead’ Docume...,,,The Legend Behind The Scenes,,,,By Manohla Dargis,"[{'firstname': 'Manohla', 'middlename': None, ...",,They’ll Love Me When I’m Dead’ Documents Orson...
2,https://www.nytimes.com/2016/06/24/movies/from...,Sharon Shattuck’s documentary tells her family...,The New York Times,subject: Documentary Films and Programs;person...,2016-06-23T21:11:31+0000,252,Review: ‘From This Day Forward’ Attests to Lov...,,,Review: ‘From This Day Forward’ Attests to Lov...,,,,By Andy Webster,"[{'firstname': 'Andy', 'middlename': None, 'la...",,From This Day Forward’ Attests to Love’s Adapt...
3,https://www.nytimes.com/2021/10/14/movies/hard...,"Drawn from the plotline of a Todd Snider song,...",The New York Times,"subject: Movies;persons: Dorman, Michael (1981...",2021-10-14T12:27:03+0000,320,‘Hard Luck Love Song’ Review: A Glossy Take on...,,,Hard Luck Love Song,,,,By Glenn Kenny,"[{'firstname': 'Glenn', 'middlename': None, 'l...",,Hard Luck Love Song
4,https://www.nytimes.com/2019/08/29/movies/fall...,"It’s hard to find a reliable, talented, reason...",The New York Times,subject: Movies;organizations: Netflix Inc;per...,2019-08-29T14:56:44+0000,374,‘Falling Inn Love’ Review: A Prize That Needs ...,Streaming Movie Review,,A Fine Romance to Watch While Sorting Socks,,,,By Helen T. Verongos,"[{'firstname': 'Helen', 'middlename': 'T.', 'l...",,Falling Inn Love


In [16]:
# Convert the results to a DataFrame
tmdb_df = pd.DataFrame(tmdb_movies_list)
tmdb_df.head(5)

Unnamed: 0,adult,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,title,video,vote_average,vote_count
0,False,/l17qkQS5Goz1asb2Zs5LsopGIK3.jpg,"[10749, 10402, 18, 53]",536208,en,Hard Luck Love Song,"Jesse, a charismatic but down on his luck trou...",3.851,/fWUbYGT0zTJoEZVyc2SvsenoHPF.jpg,2021-10-15,Hard Luck Love Song,False,4.9,10
1,False,/lVJdFZc8hBJRVnOIay60tl04Qjv.jpg,"[10749, 35]",623195,en,Falling Inn Love,When a San Francisco exec wins a New Zealand i...,17.92,/4bzu5k5vwLKgwap6pHXlCY8nlh3.jpg,2019-08-29,Falling Inn Love,False,6.5,1114
2,False,/dn7wTIud3Yroopm8bWN8WtidTBW.jpg,"[35, 36]",544510,fr,Edmond,"Paris, France, December 1897. The young playwr...",11.72,/9Y1VxSfJdu8fqQqgX0x6OObxvUw.jpg,2018-11-10,"Cyrano, My Love",False,7.5,476
3,False,/uRJb4Ukin4pQeTuyv07TqsBQt72.jpg,[10749],445179,ja,わたしに運命の恋なんてありえないって思ってた,"Riko is 27, working as a love simulation game ...",7.34,/nBrixx0zRnOufOaK9oVBUGlTS5f.jpg,2017-12-23,My Long Awaited Love Story,False,8.1,5
4,False,/27ZkYMWynuK2qiDP6awc3MsCaOs.jpg,"[10749, 18]",527641,en,Five Feet Apart,Seventeen-year-old Stella spends most of her t...,56.515,/kreTuJBkUjVWePRfhHZuYfhNE1T.jpg,2019-03-14,Five Feet Apart,False,8.271,5395


### Merge and Clean the Data for Export

In [17]:
# Merge the New York Times reviews and TMDB DataFrames on title

combined_df = pd.merge(nyt_df, tmdb_df, how='inner', on='title')
combined_df2 = pd.merge(combined_df, nyt_details_df, how='inner', on='title')

# Set index to "Country of Nationality"
combined_df2 = combined_df2.set_index("title")
combined_df2

Unnamed: 0_level_0,web_url,snippet,source,keywords,pub_date,word_count,headline.main,headline.kicker,headline.content_kicker,headline.print_headline,...,details_overview,details_popularity,details_runtime,details_revenue,details_release_date,details_vote_average,details_vote_count,details_genre,details_spoken_languages,details_production_countries
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Hard Luck Love Song,https://www.nytimes.com/2021/10/14/movies/hard...,"Drawn from the plotline of a Todd Snider song,...",The New York Times,"subject: Movies;persons: Dorman, Michael (1981...",2021-10-14T12:27:03+0000,320,‘Hard Luck Love Song’ Review: A Glossy Take on...,,,Hard Luck Love Song,...,"Jesse, a charismatic but down on his luck trou...",3.851,104,0,2021-10-15,4.9,10,"[{'id': 10749, 'name': 'Romance'}, {'id': 1040...","[{'english_name': 'English', 'iso_639_1': 'en'...","[{'iso_3166_1': 'US', 'name': 'United States o..."
Falling Inn Love,https://www.nytimes.com/2019/08/29/movies/fall...,"It’s hard to find a reliable, talented, reason...",The New York Times,subject: Movies;organizations: Netflix Inc;per...,2019-08-29T14:56:44+0000,374,‘Falling Inn Love’ Review: A Prize That Needs ...,Streaming Movie Review,,A Fine Romance to Watch While Sorting Socks,...,When a San Francisco exec wins a New Zealand i...,17.92,98,0,2019-08-29,6.5,1114,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...","[{'english_name': 'English', 'iso_639_1': 'en'...","[{'iso_3166_1': 'US', 'name': 'United States o..."
"Cyrano, My Love",https://www.nytimes.com/2019/10/17/movies/cyra...,This superficial take on the writing and initi...,The New York Times,"subject: Movies;creative_works: Cyrano, My Lov...",2019-10-17T11:00:03+0000,291,"‘Cyrano, My Love’ Review: A Nose for Romance",,,"Cyrano, My Love",...,"Paris, France, December 1897. The young playwr...",11.72,113,0,2018-11-10,7.5,476,"[{'id': 35, 'name': 'Comedy'}, {'id': 36, 'nam...","[{'english_name': 'French', 'iso_639_1': 'fr',...","[{'iso_3166_1': 'BE', 'name': 'Belgium'}, {'is..."
Five Feet Apart,https://www.nytimes.com/2019/03/14/movies/five...,Haley Lu Richardson and Cole Sprouse navigate ...,The New York Times,subject: Movies;creative_works: Five Feet Apar...,2019-03-14T13:00:07+0000,258,‘Five Feet Apart’ Review: Ailing Teenagers Liv...,,,Five Feet Apart,...,Seventeen-year-old Stella spends most of her t...,56.515,116,92600000,2019-03-14,8.271,5395,"[{'id': 10749, 'name': 'Romance'}, {'id': 18, ...","[{'english_name': 'Spanish', 'iso_639_1': 'es'...","[{'iso_3166_1': 'US', 'name': 'United States o..."
I Love My Dad,https://www.nytimes.com/2022/08/04/movies/i-lo...,This comedy is a daddy-issues movie with a que...,The New York Times,"subject: Movies;persons: Oswalt, Patton;creati...",2022-08-04T11:00:05+0000,418,‘I Love My Dad’ Review: A Father Catfishes His...,,,A Father Catfishes His Son. We Just Wait for t...,...,A hopelessly estranged father catfishes his so...,8.4,90,0,2022-08-05,5.8,42,"[{'id': 35, 'name': 'Comedy'}]","[{'english_name': 'English', 'iso_639_1': 'en'...","[{'iso_3166_1': 'US', 'name': 'United States o..."
Asako I & II,https://www.nytimes.com/2019/05/16/movies/asak...,The Japanese director Ryusuke Hamaguchi follow...,The New York Times,subject: Movies;creative_works: Asako I & II (...,2019-05-16T11:00:01+0000,547,‘Asako I & II’ Review: Double the Love (and th...,Critic’s Pick,,Falling For a Drifter And His Lookalike,...,College student Asako falls in love at first s...,11.186,119,0,2018-09-01,6.8,159,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...","[{'english_name': 'Japanese', 'iso_639_1': 'ja...","[{'iso_3166_1': 'JP', 'name': 'Japan'}]"
Ammonite,https://www.nytimes.com/2020/11/12/movies/ammo...,Kate Winslet’s fossil hunter and Saoirse Ronan...,The New York Times,"subject: Movies;persons: Ronan, Saoirse;person...",2020-11-12T12:00:04+0000,510,‘Ammonite’ Review: Love on the Rocks,Critic’s Pick,,Love on the Rocks,...,Upon discovering an SD card on a beach from 20...,0.6,7,0,2021-10-17,0.0,0,[],[],[]
Ammonite,https://www.nytimes.com/2020/11/12/movies/ammo...,Kate Winslet’s fossil hunter and Saoirse Ronan...,The New York Times,"subject: Movies;persons: Ronan, Saoirse;person...",2020-11-12T12:00:04+0000,510,‘Ammonite’ Review: Love on the Rocks,Critic’s Pick,,Love on the Rocks,...,"In 1840s England, palaeontologist Mary Anning ...",26.617,118,1396905,2020-11-13,7.0,532,"[{'id': 10749, 'name': 'Romance'}, {'id': 18, ...","[{'english_name': 'English', 'iso_639_1': 'en'...","[{'iso_3166_1': 'GB', 'name': 'United Kingdom'}]"
Ammonite,https://www.nytimes.com/2020/11/12/movies/ammo...,Kate Winslet’s fossil hunter and Saoirse Ronan...,The New York Times,"subject: Movies;persons: Ronan, Saoirse;person...",2020-11-12T12:00:04+0000,510,‘Ammonite’ Review: Love on the Rocks,Critic’s Pick,,Love on the Rocks,...,Upon discovering an SD card on a beach from 20...,0.6,7,0,2021-10-17,0.0,0,[],[],[]
Ammonite,https://www.nytimes.com/2020/11/12/movies/ammo...,Kate Winslet’s fossil hunter and Saoirse Ronan...,The New York Times,"subject: Movies;persons: Ronan, Saoirse;person...",2020-11-12T12:00:04+0000,510,‘Ammonite’ Review: Love on the Rocks,Critic’s Pick,,Love on the Rocks,...,"In 1840s England, palaeontologist Mary Anning ...",26.617,118,1396905,2020-11-13,7.0,532,"[{'id': 10749, 'name': 'Romance'}, {'id': 18, ...","[{'english_name': 'English', 'iso_639_1': 'en'...","[{'iso_3166_1': 'GB', 'name': 'United Kingdom'}]"


In [18]:
# Remove list brackets and quotation marks on the columns containing lists
# Create a list of the columns that need fixing
#print(combined_df2.dtypes)
combined_fixed_df = combined_df2.copy()
#The genres, spoken_languages, and production_countries columns were saved as lists, but we want the columns to be strings without the list characters ([, ], and '). To fix these columns, perform the following actions:
#Create a list of the columns that need fixing called columns_to_fix.
columns_to_fix = ['details_genre', 'details_spoken_languages', 'details_production_countries']
#Create a list of characters to remove called characters_to_remove.
characters_to_remove = ["[", "]", "'"]
#Loop through columns_to_fix and do the following:
#for fix_column in columns_to_fix:
def row_to_string(row,fix_column):  
    result_string = ''
    for dictionary in row[fix_column]:
        for key, value in dictionary.items():
            result_string += f" {value} "
            result_string = result_string.rstrip(', ').replace(characters_to_remove[0],'').replace('characters_to_remove[1]','').replace("characters_to_remove[3]",'')
    return result_string

combined_fixed_df['details_genre_fixed'] = combined_fixed_df.apply(lambda row: row_to_string(row,columns_to_fix[0] ), axis=1)
combined_fixed_df['details_spoken_languages_fixed'] = combined_fixed_df.apply(lambda row: row_to_string(row,columns_to_fix[1]), axis=1)
combined_fixed_df['details_production_countries_fixed'] = combined_fixed_df.apply(lambda row: row_to_string(row,columns_to_fix[2]), axis=1)
#print(combined_fixed_df['details_genre_fixed'])

#Use astype() to convert the column to a string.
#Loop through the characters_to_remove and use the Pandas str.replace() method to remove the character from the string.
#Print the head of the updated DataFrame to confirm the list characters were removed.
combined_fixed_df.head(5)
 


Unnamed: 0_level_0,web_url,snippet,source,keywords,pub_date,word_count,headline.main,headline.kicker,headline.content_kicker,headline.print_headline,...,details_revenue,details_release_date,details_vote_average,details_vote_count,details_genre,details_spoken_languages,details_production_countries,details_genre_fixed,details_spoken_languages_fixed,details_production_countries_fixed
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Hard Luck Love Song,https://www.nytimes.com/2021/10/14/movies/hard...,"Drawn from the plotline of a Todd Snider song,...",The New York Times,"subject: Movies;persons: Dorman, Michael (1981...",2021-10-14T12:27:03+0000,320,‘Hard Luck Love Song’ Review: A Glossy Take on...,,,Hard Luck Love Song,...,0,2021-10-15,4.9,10,"[{'id': 10749, 'name': 'Romance'}, {'id': 1040...","[{'english_name': 'English', 'iso_639_1': 'en'...","[{'iso_3166_1': 'US', 'name': 'United States o...",10749 Romance 10402 Music 18 Drama 53 Thriller,English en English Spanish es Español,US United States of America
Falling Inn Love,https://www.nytimes.com/2019/08/29/movies/fall...,"It’s hard to find a reliable, talented, reason...",The New York Times,subject: Movies;organizations: Netflix Inc;per...,2019-08-29T14:56:44+0000,374,‘Falling Inn Love’ Review: A Prize That Needs ...,Streaming Movie Review,,A Fine Romance to Watch While Sorting Socks,...,0,2019-08-29,6.5,1114,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...","[{'english_name': 'English', 'iso_639_1': 'en'...","[{'iso_3166_1': 'US', 'name': 'United States o...",10749 Romance 35 Comedy,English en English,US United States of America
"Cyrano, My Love",https://www.nytimes.com/2019/10/17/movies/cyra...,This superficial take on the writing and initi...,The New York Times,"subject: Movies;creative_works: Cyrano, My Lov...",2019-10-17T11:00:03+0000,291,"‘Cyrano, My Love’ Review: A Nose for Romance",,,"Cyrano, My Love",...,0,2018-11-10,7.5,476,"[{'id': 35, 'name': 'Comedy'}, {'id': 36, 'nam...","[{'english_name': 'French', 'iso_639_1': 'fr',...","[{'iso_3166_1': 'BE', 'name': 'Belgium'}, {'is...",35 Comedy 36 History,French fr Français Russian ru Pусский,BE Belgium FR France
Five Feet Apart,https://www.nytimes.com/2019/03/14/movies/five...,Haley Lu Richardson and Cole Sprouse navigate ...,The New York Times,subject: Movies;creative_works: Five Feet Apar...,2019-03-14T13:00:07+0000,258,‘Five Feet Apart’ Review: Ailing Teenagers Liv...,,,Five Feet Apart,...,92600000,2019-03-14,8.271,5395,"[{'id': 10749, 'name': 'Romance'}, {'id': 18, ...","[{'english_name': 'Spanish', 'iso_639_1': 'es'...","[{'iso_3166_1': 'US', 'name': 'United States o...",10749 Romance 18 Drama,Spanish es Español English en English French ...,US United States of America
I Love My Dad,https://www.nytimes.com/2022/08/04/movies/i-lo...,This comedy is a daddy-issues movie with a que...,The New York Times,"subject: Movies;persons: Oswalt, Patton;creati...",2022-08-04T11:00:05+0000,418,‘I Love My Dad’ Review: A Father Catfishes His...,,,A Father Catfishes His Son. We Just Wait for t...,...,0,2022-08-05,5.8,42,"[{'id': 35, 'name': 'Comedy'}]","[{'english_name': 'English', 'iso_639_1': 'en'...","[{'iso_3166_1': 'US', 'name': 'United States o...",35 Comedy,English en English,US United States of America


In [19]:
# Drop "byline.person" column

combined_cleanup_df = combined_df2.drop(columns='byline.person', inplace=False)
combined_cleanup_df.columns

Index(['web_url', 'snippet', 'source', 'keywords', 'pub_date', 'word_count',
       'headline.main', 'headline.kicker', 'headline.content_kicker',
       'headline.print_headline', 'headline.name', 'headline.seo',
       'headline.sub', 'byline.original', 'byline.organization', 'adult',
       'backdrop_path', 'genre_ids', 'id', 'original_language',
       'original_title', 'overview', 'popularity', 'poster_path',
       'release_date', 'video', 'vote_average', 'vote_count',
       'details_original_title', 'details_budget', 'details_original_language',
       'details_homepage', 'details_overview', 'details_popularity',
       'details_runtime', 'details_revenue', 'details_release_date',
       'details_vote_average', 'details_vote_count', 'details_genre',
       'details_spoken_languages', 'details_production_countries'],
      dtype='object')

In [20]:
# Delete duplicate rows and reset index

#combined_cleanup_duplicates_df = combined_cleanup_df.drop_duplicates(keep='first')

combined_cleanup_duplicates_df = combined_cleanup_df.drop_duplicates(subset=['web_url', 'snippet', 'source', 'pub_date', 'word_count'], keep='first')
combined_cleanup_duplicates_df.shape

(11, 42)

In [21]:
# Export data to CSV without the index
combined_cleanup_duplicates_df.to_csv('output\combined.csv', encoding='utf-8')