# The data

The data is a list of New York Times (NYT) article metadata covering every
January from 2013 through 2024. The data is stored in JSON format.

Our goal is to import all the saved data and create a pandas DataFrame with it.
This will let us analyze the data and answer questions like:

- Trends in article topics from 2013 to 2024
- Most popular authors
- Most popular sections
- Most popular keywords
- Most popular articles
- and so on...

In [4]:
import json
import requests
import time
import os
from dotenv import dotenv_values

# Read the key/value settings stored in the local .env file.
config = dotenv_values(".env")

# API key sent with every NYT API request; this lookup fails if the .env file
# has no API_KEY entry.
API_KEY = config['API_KEY']

In [None]:
def get_nyt_articles(year, month, timeout=30):
    """Fetch one month of article metadata from the NYT Archive API.

    Args:
        year: Year to request.
        month: Month number to request.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        The list stored under ``response -> docs`` in the JSON payload.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    url = f'https://api.nytimes.com/svc/archive/v1/{year}/{month}.json'
    # Let requests build the query string so the key is URL-encoded.
    response = requests.get(url, params={'api-key': API_KEY}, timeout=timeout)
    response.raise_for_status()
    # Return only the articles. The response object contains metadata as well.
    return response.json()['response']['docs']

# Single source of truth for the archive location, so the file that is
# written, the message that is printed, and the file that is later loaded
# cannot drift apart.
ARCHIVE_PATH = 'data/nyt_archive_2013_2024_jan.json'


def build_nyt_archive(start_year=2013, end_year=2024, month=1, pause_seconds=20):
    """Download one month of articles for every year in an inclusive range.

    Args:
        start_year: First year to fetch.
        end_year: Last year to fetch (inclusive).
        month: Month number fetched for each year.
        pause_seconds: Delay between consecutive requests (presumably to stay
            under the API rate limit - confirm against the API terms).

    Returns:
        A single list with the articles of all requested years, in year order.
    """
    articles = []
    for year in range(start_year, end_year + 1):
        articles.extend(get_nyt_articles(year, month))
        print(f'Fetched {len(articles)} articles total.')
        # No need to wait after the final request.
        if year != end_year:
            time.sleep(pause_seconds)
    return articles


def save_nyt_archive(articles, path=ARCHIVE_PATH):
    """Write the articles to ``path`` as JSON, creating its folder if needed.

    Args:
        articles: JSON-serialisable list of article dictionaries.
        path: Destination file; defaults to the shared archive path.
    """
    folder = os.path.dirname(path)
    if folder:
        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs(folder, exist_ok=True)
    with open(path, 'w') as f:
        json.dump(articles, f)


articles = build_nyt_archive()
save_nyt_archive(articles)
print(f'NYT archive saved to {ARCHIVE_PATH}')



Fetched 7337 articles total.


In [5]:
def load_json():
    """Read the saved January archive from disk and return the parsed JSON."""
    archive_path = 'data/nyt_archive_2013_2024_jan.json'
    with open(archive_path, mode='r') as archive_file:
        parsed = json.load(archive_file)
    return parsed
    
# Load the saved archive into memory.
articles = load_json()
# Show how many article records were loaded, then the first record as a sample.
print(len(articles))
print(articles[0])

65325
{'abstract': 'The Emancipation Proclamation evolved during the Civil War years, as did the thinking of its author.', 'web_url': 'https://opinionator.blogs.nytimes.com/2012/12/31/abraham-lincoln-and-the-emancipation-proclamation/', 'snippet': 'The Emancipation Proclamation evolved during the Civil War years, as did the thinking of its author.', 'lead_paragraph': 'In an op-ed, Eric Foner writes:', 'source': 'The New York Times', 'multimedia': [], 'headline': {'main': 'Abraham Lincoln and the Emancipation Proclamation', 'kicker': 'Opinionator', 'content_kicker': None, 'print_headline': '', 'name': None, 'seo': None, 'sub': None}, 'keywords': [{'name': 'subject', 'value': 'Civil War (US) (1861-65)', 'rank': 1, 'major': 'N'}, {'name': 'subject', 'value': 'Emancipation Proclamation (1863)', 'rank': 2, 'major': 'N'}, {'name': 'subject', 'value': 'Slavery', 'rank': 3, 'major': 'N'}, {'name': 'persons', 'value': 'Lincoln, Abraham', 'rank': 4, 'major': 'N'}], 'pub_date': '2013-01-01T00:05:

In [6]:
# Exploration

# Each article is a dictionary with multiple keys. Some of the values are
# nested structures: 'headline' holds a dictionary and 'keywords' holds a
# list of dictionaries.

# Top-level keys of the first article.
print(articles[0].keys())

# Keys of the nested headline dictionary.
print(articles[0]['headline'].keys())

# Keys of the first keyword dictionary.
print(articles[0]['keywords'][0].keys())

# 'multimedia' is a list describing the article's multimedia content;
# print how many entries the first article has.
print(len(articles[0]['multimedia']))

# Candidate fields to keep from each article:
# abstract, byline (author name), document_type, headline, keywords,
# news_desk, section_name, word_count

# Full contents of the keywords field.
print(articles[0]['keywords'])

# Full contents of the headline field.
print(articles[0]['headline'])

# Full contents of the byline field.
print(articles[0]['byline'])



dict_keys(['abstract', 'web_url', 'snippet', 'lead_paragraph', 'source', 'multimedia', 'headline', 'keywords', 'pub_date', 'document_type', 'news_desk', 'section_name', 'byline', 'type_of_material', '_id', 'word_count', 'uri'])
dict_keys(['main', 'kicker', 'content_kicker', 'print_headline', 'name', 'seo', 'sub'])
dict_keys(['name', 'value', 'rank', 'major'])
0
[{'name': 'subject', 'value': 'Civil War (US) (1861-65)', 'rank': 1, 'major': 'N'}, {'name': 'subject', 'value': 'Emancipation Proclamation (1863)', 'rank': 2, 'major': 'N'}, {'name': 'subject', 'value': 'Slavery', 'rank': 3, 'major': 'N'}, {'name': 'persons', 'value': 'Lincoln, Abraham', 'rank': 4, 'major': 'N'}]
{'main': 'Abraham Lincoln and the Emancipation Proclamation', 'kicker': 'Opinionator', 'content_kicker': None, 'print_headline': '', 'name': None, 'seo': None, 'sub': None}
{'original': 'By The Editors', 'person': [], 'organization': 'The Editors'}


# Creating dataframes

We will create a pandas DataFrame from the data based on the fields we've
decided to keep.

In [54]:
import pandas as pd
import numpy as np


def extract_main_author(byline):
    """Return the rank-1 person of a byline as 'Firstname Lastname'.

    Args:
        byline: The article's ``byline`` value. Expected to be a dict whose
            ``person`` entry is a list of dicts carrying ``rank``,
            ``firstname`` and ``lastname``. ``None`` and empty dicts are
            accepted, since a record may hold the key with a null value.

    Returns:
        The author's name, or ``np.nan`` when there is no byline, no rank-1
        person, or the rank-1 person has no usable name parts.
    """
    if not byline:
        return np.nan

    for person in byline.get('person') or []:
        if person.get('rank') == 1:
            # `or ''` also covers keys that are present but set to None, which
            # would otherwise be rendered as the literal text "None".
            first = person.get('firstname') or ''
            last = person.get('lastname') or ''
            name = f"{first} {last}".strip()
            # An empty name is as uninformative as a missing one.
            return name or np.nan

    # No rank-1 person found.
    return np.nan


def extract_keywords_with_subcategories(article):
    """Pick the first two keywords of each supported type from an article.

    Keywords are taken in list order; the first of a type becomes the main
    value and the second becomes its subcategory. (Assumes the list is already
    ordered by the API's ``rank`` field - TODO confirm.)

    Args:
        article: Article dictionary; its ``keywords`` entry is a list of dicts
            with ``name`` (keyword type) and ``value`` keys. A missing or
            null ``keywords`` entry is treated as no keywords.

    Returns:
        A dict that, for each type present in the article, contains
        ``<column>`` and ``<column>_subcategory`` where ``<column>`` is one of
        ``subject``, ``organization``, ``glocation`` or ``person``. The
        subcategory is ``np.nan`` when the article has only one keyword of
        that type. Types absent from the article produce no keys.
    """
    # Keyword type name used by the API -> output column prefix.
    column_for_type = {
        'subject': 'subject',
        'organizations': 'organization',
        'glocations': 'glocation',
        'persons': 'person',
    }
    values_by_column = {column: [] for column in column_for_type.values()}

    # Group keyword values by type, preserving their original order. Persons
    # are collected like every other type so that the first person is the main
    # value and the second one is the subcategory.
    for keyword in article.get('keywords') or []:
        column = column_for_type.get(keyword['name'])
        if column is not None:
            values_by_column[column].append(keyword['value'])

    keyword_dict = {}
    for column, values in values_by_column.items():
        if values:
            keyword_dict[column] = values[0]
            keyword_dict[f'{column}_subcategory'] = (
                values[1] if len(values) > 1 else np.nan
            )
    return keyword_dict

# Flatten every article into one row: scalar fields first, then the keyword
# columns produced by extract_keywords_with_subcategories.
articles_data = []
for article in articles:
    article_data = {
        'headline': article['headline']['main'],
        'pub_date': article['pub_date'],
        'document_type': article['document_type'],
        'word_count': article.get('word_count', 0),
        'news_desk': article.get('news_desk'),
        'section_name': article.get('section_name'),
        'type_of_material': article.get('type_of_material'),
        'multimedia_count': len(article.get('multimedia', [])),
        'author': extract_main_author(article.get('byline', {})),
        # Keyword columns are merged in last, exactly like a dict.update().
        **extract_keywords_with_subcategories(article),
    }
    articles_data.append(article_data)

# One DataFrame row per article; keep an untouched copy on disk.
df = pd.DataFrame(data=articles_data)

df.to_csv('data/raw.csv', index=False)

In [55]:
# Tag every row with the number of fields it is missing.
df['null_count'] = df.isna().sum(axis='columns')

# In one chain: order rows from most to least complete, title-case every text
# column, keep only rows with at most 2 missing fields, then discard the
# helper column.
df = (
    df.sort_values(by='null_count')
    .apply(lambda col: col.str.title() if col.dtype == "object" else col)
    .loc[lambda frame: frame['null_count'] <= 2]
    .drop('null_count', axis=1)
)

# Persist the cleaned rows to their own CSV file.
df.to_csv('data/cleaned.csv', index=False)

# Date cleaning

The date column is a string. We convert it to a pandas datetime and use the
`.dt` accessor to extract the year, month, and day into their own columns.

In [56]:
# Date cleaning: parse the publication timestamp, split it into calendar
# parts, and discard the original column.
df['pub_date'] = pd.to_datetime(df['pub_date'])

df = df.assign(
    year=df['pub_date'].dt.year,
    month=df['pub_date'].dt.month,
    day=df['pub_date'].dt.day,
).drop(columns=['pub_date'])

# Overwrite the cleaned CSV with the date-split version.
df.to_csv('data/cleaned.csv', index=False)

## Headline Apostrophe Cleaning

In [57]:
# Clean apostrophes in the 'headline' column.

# Normalise curly single quotes (U+2018 / U+2019) to a plain apostrophe.
df['headline'] = df['headline'].str.replace(r'[\u2018\u2019]', "'", regex=True)

# Undo an upper-cased possessive such as "Obama'S", which title-casing
# produces. Matching on a word boundary (instead of a trailing space) also
# fixes possessives at the end of a headline or before punctuation; the
# look-behind keeps quoted words such as 'Superman' untouched.
df['headline'] = df['headline'].str.replace(r"(?<=\w)'S\b", "'s", regex=True)

df.to_csv('data/cleaned.csv', encoding='utf-8', index=False)

## Persons cleaning

In [58]:
# Clean every person column that is present in the frame.
for person_column in ('person', 'person_subcategory'):
    if person_column not in df.columns:
        continue

    # Drop parenthetical qualifiers, e.g. "Lincoln, Abraham (1809-65)".
    without_parens = df[person_column].str.replace(r"\s*\(.*\)", "", regex=True)

    # Switch "Lastname, Given Names" to "Given Names Lastname" by moving
    # everything after the first comma to the front. Anchoring on the whole
    # value keeps multi-word surnames and full middle names intact (the old
    # word-by-word pattern turned "King, Martin Luther" into
    # "Martin L Kinguther"). Values without a comma are left unchanged.
    df[person_column] = without_parens.str.replace(
        r"^\s*([^,]+?)\s*,\s*(.+?)\s*$", r"\2 \1", regex=True)

## Keywords cleaning

In [59]:
# Strip parenthetical qualifiers (and the whitespace before them) from each
# of these text columns, one column at a time.
for column in (
    'subject',
    'subject_subcategory',
    'organization',
    'organization_subcategory',
    'type_of_material',
):
    df[column] = df[column].str.replace(r"\s*\(.*\)", "", regex=True)

# Overwrite the cleaned CSV with the final result.
df.to_csv('data/cleaned.csv', encoding='utf-8', index=False)