# The data

The data is a list of article metadata from the NYT, covering the month of
January for each year from 2013 to 2024. The data is stored in JSON format.

Our goal is to import all the saved data and create a pandas DataFrame with it.
This will let us analyze the data and answer questions like:

- Trends in article topics over the last 10 years
- Most popular authors
- Most popular sections
- Most popular keywords
- Most popular articles
- and so on...

In [17]:
import json

def load_json(path='data/nyt_archive_2013_2023_jan.json'):
    """Load previously saved article metadata from a JSON file.

    Args:
        path: Location of the JSON file to read. Defaults to the archive
            file this notebook works with.

    Returns:
        The deserialized JSON content. For the default archive the rest of
        the notebook treats it as a list of article dictionaries.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding; the article text contains non-ASCII characters.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)
    
# Load the archive once, then show how many articles it holds followed by
# the first record (one value per line).
articles = load_json()
print(len(articles), articles[0], sep='\n')

58117
{'abstract': 'The Emancipation Proclamation evolved during the Civil War years, as did the thinking of its author.', 'web_url': 'https://opinionator.blogs.nytimes.com/2012/12/31/abraham-lincoln-and-the-emancipation-proclamation/', 'snippet': 'The Emancipation Proclamation evolved during the Civil War years, as did the thinking of its author.', 'lead_paragraph': 'In an op-ed, Eric Foner writes:', 'source': 'The New York Times', 'multimedia': [], 'headline': {'main': 'Abraham Lincoln and the Emancipation Proclamation', 'kicker': 'Opinionator', 'content_kicker': None, 'print_headline': '', 'name': None, 'seo': None, 'sub': None}, 'keywords': [{'name': 'subject', 'value': 'Civil War (US) (1861-65)', 'rank': 1, 'major': 'N'}, {'name': 'subject', 'value': 'Emancipation Proclamation (1863)', 'rank': 2, 'major': 'N'}, {'name': 'subject', 'value': 'Slavery', 'rank': 3, 'major': 'N'}, {'name': 'persons', 'value': 'Lincoln, Abraham', 'rank': 4, 'major': 'N'}], 'pub_date': '2013-01-01T00:05:

In [22]:
# Exploration

# Every article is a dictionary. Some of its values are nested structures:
# 'headline' maps to a dictionary, and 'keywords' maps to a list of
# dictionaries.
first_article = articles[0]

# top-level keys of the first article
print(first_article.keys())

# keys of its headline dictionary
print(first_article['headline'].keys())

# keys of its first keyword dictionary
print(first_article['keywords'][0].keys())

# 'multimedia' is a list describing the article's multimedia content;
# print how many entries the first article has
print(len(first_article['multimedia']))

# Fields that look worth keeping from each article:
# abstract, byline (author name), document_type, headline, keywords,
# news_desk, section_name, word_count

# full contents of the keywords field
print(first_article['keywords'])

# full contents of the headline field
print(first_article['headline'])

# full contents of the byline field
print(first_article['byline'])



dict_keys(['abstract', 'web_url', 'snippet', 'lead_paragraph', 'source', 'multimedia', 'headline', 'keywords', 'pub_date', 'document_type', 'news_desk', 'section_name', 'byline', 'type_of_material', '_id', 'word_count', 'uri'])
dict_keys(['main', 'kicker', 'content_kicker', 'print_headline', 'name', 'seo', 'sub'])
dict_keys(['name', 'value', 'rank', 'major'])
0
[{'name': 'subject', 'value': 'Civil War (US) (1861-65)', 'rank': 1, 'major': 'N'}, {'name': 'subject', 'value': 'Emancipation Proclamation (1863)', 'rank': 2, 'major': 'N'}, {'name': 'subject', 'value': 'Slavery', 'rank': 3, 'major': 'N'}, {'name': 'persons', 'value': 'Lincoln, Abraham', 'rank': 4, 'major': 'N'}]
{'main': 'Abraham Lincoln and the Emancipation Proclamation', 'kicker': 'Opinionator', 'content_kicker': None, 'print_headline': '', 'name': None, 'seo': None, 'sub': None}
{'original': 'By The Editors', 'person': [], 'organization': 'The Editors'}


# Creating dataframes

We will create a pandas DataFrame from the data based on the fields we've
decided to keep.

In [32]:
import pandas as pd
import numpy as np

def extract_keywords_dynamic(article, separator=', '):
    """Group an article's keyword values by keyword type.

    Each entry in ``article['keywords']`` is a dictionary with at least a
    'name' (the keyword type, e.g. 'subject' or 'persons') and a 'value'.
    Values sharing a type are concatenated in their order of appearance.

    Args:
        article: Article dictionary. A missing 'keywords' key, or one set to
            ``None`` or an empty list, yields an empty result.
        separator: String placed between values of the same type. Values can
            themselves contain ', ' (e.g. 'Lincoln, Abraham'), so pass a
            different separator if the joined string must be split again.

    Returns:
        A dictionary mapping each keyword type to one joined string of its
        values.

    Raises:
        KeyError: If a keyword entry lacks 'name' or 'value'.
    """
    grouped = {}

    # `or []` also covers an explicit `'keywords': None`, which
    # `.get('keywords', [])` alone would hand to the loop and crash on.
    for keyword in article.get('keywords') or []:
        grouped.setdefault(keyword['name'], []).append(keyword['value'])

    return {name: separator.join(values) for name, values in grouped.items()}

# Flatten every article into one row: a fixed set of metadata columns plus
# one column per keyword type found on that article.
articles_data = []
for article in articles:
    article_data = {
        'headline': article['headline']['main'],
        'pub_date': article['pub_date'],
        'document_type': article['document_type'],
        'word_count': article.get('word_count', 0),
        'news_desk': article.get('news_desk'),
        'section_name': article.get('section_name'),
        'type_of_material': article.get('type_of_material'),
        'multimedia_count': len(article.get('multimedia', [])),
        # Keyword columns come last; a keyword type that shares a name with
        # a column above would replace that column's value.
        **extract_keywords_dynamic(article),
    }
    articles_data.append(article_data)

# Rows without a given keyword type get a missing value in that column.
df = pd.DataFrame(articles_data)

print(df.tail())

                                                headline  \
58112  Up Next for Rafael Nadal? His Specialty, the F...   
58113  Multilayered Paintings That Pay Tribute to Hip...   
58114  Biden Will Designate Qatar as Major Non-NATO Ally   
58115  Rachel Maddow, MSNBC’s No. 1 star, is taking a...   
58116  The White House threatens sanctions against Pu...   

                       pub_date document_type  word_count   news_desk  \
58112  2022-01-31T21:41:04+0000       article        1448      Sports   
58113  2022-01-31T21:53:31+0000       article        1548      TStyle   
58114  2022-01-31T21:55:26+0000       article         468  Washington   
58115  2022-01-31T21:59:59+0000       article         386    Business   
58116  2022-01-31T22:07:53+0000       article         435     Foreign   

       section_name type_of_material  multimedia_count  \
58112        Sports             News                 5   
58113    T Magazine             News                 5   
58114          U.S.       