## Request, clean and save NYT archivre articles

In [91]:
 # import required packages  
import requests 
import json
import pandas as pd 

In [92]:
#function to access private API key
def get_api_key(path):
    with open(path) as f:
        return json.load(f)

In [93]:
# function to get a list of all articles for provided months from API
def get_articles(year_month):
    articles_list = []
    
    # get API key from private folder in director out of repo 
    api_key = get_api_key("../.nyt_api.json")['api_key']
    
    # make API call for every month passed through 
    for date in year_month:
        year = str(date[0])
        month = str(date[1])
        url = f'https://api.nytimes.com/svc/archive/v1/{year}/{month}.json?api-key={api_key}'
        response = requests.get(url)
        response_json = response.json()
        articles = response_json['response']['docs']
        articles_list.extend(articles)
        
    return articles_list 

In [94]:
# function to extract only needed information and make strings lowercase 
def cleaned_articles(archive):
    cleaned = []
    
    # loop through every article and append to empty list 
    for article in archive:
        uri = article['uri']
        date_published = article['pub_date'][:10]
        headline = article['headline']['main'].lower()
        keywords = ''.join(x['value'].lower() for x in article['keywords'])
        paragraph = article['lead_paragraph'].lower()
        word_count = article['word_count']
        cleaned.append([uri, date_published, headline, keywords, paragraph, word_count])
        
    return cleaned

In [95]:
# make a list of months than pass to API call function 
article_dates_train = [(2021, 12), (2022, 1)]
article_dates_deploy = [(2022, 1)]
articles = get_articles(article_dates_train)

In [96]:
# articles[0][]

In [97]:
# pass list of articles through cleaning function 
cleaned_articles = cleaned_articles(articles)

In [98]:
# put articles in dataframe and drop duplictes, if any 
df_articles = pd.DataFrame(cleaned_articles)
df_articles.dropna(inplace=True)

In [99]:
# rename columns to strings 
df_articles.columns = ['uri', 'date_published', 'headline', 'keywords', 'paragraph', 'word_count']

In [100]:
df_articles.shape

(5994, 6)

In [101]:
"""uncomment and unindent next two lines to only accept articles with dates after training phase
"""
    # last_training_day = pd.to_datetime('2022/01/15').date()
    # df_articles = df_articles[df_articles.date_published > last_training_day]

'uncomment and unindent next two lines to only accept articles with dates after training phase\n'

In [102]:
df_articles.head()

Unnamed: 0,uri,date_published,headline,keywords,paragraph,word_count
0,nyt://article/4e2abcbc-9563-56c0-bf18-c93ce4e7...,2021-12-01,let’s end the covid blame games,coronavirus omicron variantcoronavirus (2019-n...,"over the summer, as covid cases started rising...",918
1,nyt://article/f7a5e7ef-e4cd-575c-a0e1-ff4d18d5...,2021-12-01,trump’s iran policy has become a disaster for ...,irannuclear weaponsembargoes and sanctionsunit...,the judges have voted and the results are in: ...,1560
2,nyt://article/778af86c-b600-5aeb-be98-0a9a1aa1...,2021-12-01,this extinct eagle may have gulped guts like a...,eagles (birds)vultures (birds)condorsendangere...,"at craigmore station in canterbury, new zealan...",913
3,nyt://article/fb7763c2-0f41-5c81-9ecf-17607ba5...,2021-12-01,republican recriminations point to a rocky pat...,republican partyhouse of representativesboeber...,washington — hostilities between the republica...,1172
4,nyt://article/ba0afd37-0c00-5535-8868-21130e28...,2021-12-01,a times editor who attended oxford recalls a d...,school shootings and armed attacks,the last time i was inside the walls of oxford...,309


In [103]:
df_articles.date_published[0]

'2021-12-01'

In [104]:
# df_articles.to_csv('data/article_archive.csv', index=False)

In [105]:
""" uncoment and unindent next line to save delpoyment archive data to a seperate file. 
    comment line above to prevent delpoyment data from leaking into training file.
"""
# df_articles.to_csv('data/archive_deploy.csv', index=False)

' uncoment and unindent next line to save delpoyment archive data to a seperate file. \n    comment line above to prevent delpoyment data from leaking into training file.\n'

In [106]:
df_articles.to_csv('data/exp_archive.csv', index=False)