# All Article Per Month

In [4]:
 # import needed libraries   
import requests 
import json
import pandas as pd
import datetime as dt  

In [5]:
#function to access private API key
def get_api_key(path):
    with open(path) as f:
        return json.load(f)

In [6]:
# function to get a list of all articles for provided months from API
def get_articles(year_month):
    articles_list = []
    
    # get API key from private folder in director out of repo 
    api_key = get_api_key("../.nyt_api.json")['api_key']
    
    # make API call for every month passed through 
    for date in year_month:
        year = str(date[0])
        month = str(date[1])
        url = f'https://api.nytimes.com/svc/archive/v1/{year}/{month}.json?api-key={api_key}'
        response = requests.get(url)
        response_json = response.json()
        articles = response_json['response']['docs']
        articles_list.extend(articles)
        
    return articles_list 

In [16]:
# function to extract only needed information and make strings lowercase 
def cleaned_articles(archive):
    cleaned = []
    
    # loop through every article and append to empty list 
    for article in archive:
        idx = article['uri']
        date_published = article['pub_date']
        headline = article['headline']['main'].lower()
        keywords = [x['value'].lower() for x in article['keywords']]
        section = article['section_name'].lower()
        word_count = article['word_count']
        cleaned.append([idx, date_published, headline, keywords, author, section, word_count])
        
    return cleaned

In [17]:
# make a list of months than pass to API call function 
article_dates = [(2021, 12), (2022, 1)]
articles = get_articles(article_dates)

In [18]:
# pass list of articles through cleaning function 
cleaned_articles = cleaned_articles(articles)

In [19]:
# put articles in dataframe and drop duplictes, if any 
df_articles = pd.DataFrame(cleaned_articles)
df_articles.drop_duplicates(0, inplace=True)

In [20]:
# rename columns to strings 
df_articles.columns = ['idx', 'date_published', 'headline', 'keywords', 'author', 'section', 'word_count']

In [21]:
df_articles.shape

(5011, 6)

In [22]:
df_articles.head()

Unnamed: 0,idx,headline,keywords,author,section,word_count
0,nyt://article/4e2abcbc-9563-56c0-bf18-c93ce4e7...,let’s end the covid blame games,"[coronavirus omicron variant, coronavirus (201...",bret stephens,opinion,918
1,nyt://article/f7a5e7ef-e4cd-575c-a0e1-ff4d18d5...,trump’s iran policy has become a disaster for ...,"[iran, nuclear weapons, embargoes and sanction...",thomas l. friedman,opinion,1560
2,nyt://article/778af86c-b600-5aeb-be98-0a9a1aa1...,this extinct eagle may have gulped guts like a...,"[eagles (birds), vultures (birds), condors, en...",sabrina imbler,science,913
3,nyt://article/fb7763c2-0f41-5c81-9ecf-17607ba5...,republican recriminations point to a rocky pat...,"[republican party, house of representatives, b...",jonathan weisman,u.s.,1172
4,nyt://article/ba0afd37-0c00-5535-8868-21130e28...,a times editor who attended oxford recalls a d...,[school shootings and armed attacks],karen workman,u.s.,309


In [24]:
df_articles.keywords[0]

['coronavirus omicron variant',
 'coronavirus (2019-ncov)',
 'politics and government',
 'vaccination and immunization',
 'rural areas',
 'rumors and misinformation',
 'deaths (fatalities)',
 'republican party',
 'democratic party']

In [285]:
date_sourced = pd.Timestamp("today").strftime("%m/%d/%Y")
date_sourced = date_sourced.replace('/','_')
date_sourced

'01_10_2022'

In [None]:
# df_articles.to_csv('data/article_archive.csv', index=False)