# All Articles Per Month

In [18]:
 # import required packages  
import requests 
import json
import pandas as pd 

In [19]:
#function to access private API key
def get_api_key(path):
    """Read the JSON file at ``path`` and return its parsed content.

    The whole decoded document is returned (not a single key), so callers
    pick the entry they need from the result.
    """
    with open(path) as key_file:
        raw_text = key_file.read()
    return json.loads(raw_text)

In [20]:
# function to get a list of all articles for provided months from API
def get_articles(year_month):
    """Download the NYT Archive API listing for each requested month.

    Parameters
    ----------
    year_month : iterable
        Items whose first two elements are the year and the month,
        e.g. ``[(2021, 12), (2022, 1)]``.

    Returns
    -------
    list
        The ``response.docs`` entries of every month, concatenated in the
        order the months were requested.

    Raises
    ------
    requests.HTTPError
        If the API answers any request with a 4xx/5xx status (bad key,
        rate limit, ...), instead of failing later on a missing JSON key.
    requests.Timeout
        If the server stops responding for longer than the timeout.
    """
    articles_list = []

    # get API key from private folder in a directory outside of the repo
    api_key = get_api_key("../.nyt_api.json")['api_key']

    # make one API call for every month passed in
    for date in year_month:
        year = str(date[0])
        month = str(date[1])
        url = f'https://api.nytimes.com/svc/archive/v1/{year}/{month}.json'
        # the key travels as a query parameter so it is URL-encoded by requests
        response = requests.get(url, params={'api-key': api_key}, timeout=60)
        # surface HTTP errors here rather than as a KeyError on 'response' below
        response.raise_for_status()
        articles_list.extend(response.json()['response']['docs'])

    return articles_list

In [21]:
def cleaned_articles(archive, keyword_sep=''):
    """Reduce raw archive records to the six fields kept for analysis.

    Parameters
    ----------
    archive : iterable of dict
        Article records; each must provide ``uri``, ``pub_date``,
        ``headline['main']``, ``keywords`` (a list of dicts with a ``value``
        key), ``snippet`` and ``word_count``.
    keyword_sep : str, optional
        Text placed between consecutive keyword values. The default ``''``
        reproduces the historical output, in which adjacent keywords run
        together (e.g. "game showsschneider, amy"); pass a delimiter such as
        ``'; '`` to keep them distinguishable.

    Returns
    -------
    list of list
        One ``[uri, date_published, headline, keywords, snippet, word_count]``
        row per article, in input order.

    Raises
    ------
    KeyError
        If an article lacks one of the fields listed above.
    """
    return [
        [
            article['uri'],
            article['pub_date'][:10],  # first 10 characters only
            article['headline']['main'].lower(),
            keyword_sep.join(kw['value'].lower() for kw in article['keywords']),
            article['snippet'],
            article['word_count'],
        ]
        for article in archive
    ]

In [22]:
# make a list of (year, month) tuples, then pass it to the API call function
article_dates_train = [(2021, 12), (2022, 1)]
# defined for the deployment run; not passed to get_articles in this cell
article_dates_deploy = [(2022, 1)]
articles = get_articles(article_dates_train)

In [23]:
# pass list of articles through cleaning function 
cleaned_articles = cleaned_articles(articles)

In [24]:
# put articles in dataframe and drop duplictes, if any 
df_articles = pd.DataFrame(cleaned_articles)
df_articles.dropna(inplace=True)

In [25]:
# replace the default integer column labels with descriptive names,
# listed in the same order the fields are appended inside cleaned_articles()
df_articles.columns = ['uri', 'date_published', 'headline', 'keywords', 'snippet', 'word_count']

In [26]:
# sanity check: (rows, columns) of the frame after dropping incomplete rows
df_articles.shape

(5291, 6)

In [27]:
"""uncomment and unindent next two lines to only accept articles with dates after training phase
"""
    # last_training_day = pd.to_datetime('2022/01/15').date()
    # df_articles = df_articles[df_articles.date_published > last_training_day]
    # NOTE(review): cleaned_articles() stores date_published as the first 10
    # characters of pub_date, so this comparison against a datetime.date presumably
    # relies on a conversion done in a cell not shown here — confirm. The saved
    # output below (2022-01-07) also differs from the '2022/01/15' literal above.

datetime.date(2022, 1, 7)

In [29]:
# preview the first rows of the frame
# NOTE(review): the saved output below shows an `idx` column and rows starting at
# index 4766, which does not match the columns assigned above (`uri`, ...) — it
# looks stale from a different run; re-run on a fresh kernel to confirm.
df_articles.head()

Unnamed: 0,idx,date_published,headline,keywords,snippet,word_count
4766,nyt://article/4cc6765c-3935-595b-a0c0-209c81d6...,2022-01-08,5 big questions for the political year ahead,"united states politics and governmentbiden, jo...",Inflation and the pandemic are hurting Preside...,1460
4767,nyt://article/b6d1a470-eb88-555b-ab9a-a0ccfd69...,2022-01-08,officers ran off after firing stun gun that se...,"video recordings, downloads and streamingstun ...",The footage was released by New York’s attorne...,753
4768,nyt://article/263fa419-e2af-5fdc-9c27-8db5a68a...,2022-01-08,amy schneider becomes first woman to surpass $...,"game showsschneider, amy (1979- )jeopardy! (tv...","Schneider, a software engineering manager, is ...",410
4769,nyt://video/070a5c45-58b5-5e17-90ca-f8b55e55cbb4,2022-01-08,"sidney poitier, pioneering actor, dies at 94",actors and actressesacademy awards (oscars)dea...,Sidney Poitier was the first Black actor to wi...,0
4770,nyt://article/7f82d598-ab33-5f2f-abc9-3e4d1f87...,2022-01-08,ex-sheriff’s deputy pleads guilty in killing o...,"police brutality, misconduct and shootingscali...","Aaron Russell, 25, faces up to 11 years in pri...",503


In [77]:
# NOTE(review): the saved output is a datetime.date, while cleaned_articles() stores
# date_published as a slice of the pub_date text; the cell that converts it is not
# visible here (execution counts jump from 29 to 77) — confirm before relying on Run All.
df_articles.date_published[0]

datetime.date(2021, 12, 1)

In [None]:
# df_articles.to_csv('data/article_archive.csv', index=False)

In [None]:
""" uncomment and unindent next line to save deployment archive data to a separate file.
    comment out the to_csv line in the cell above to prevent deployment data from leaking into the training file.
"""
    # df_articles.to_csv('data/archive_deploy.csv', index=False)