## Request, clean and save Top 20 most shared NYT articles

In [1]:
 # import required packages  
import requests 
import json
import pandas as pd

In [2]:
# function to access private API key
def get_api_key(path):
    """Load API credentials from a JSON file.

    Parameters
    ----------
    path : str or path-like
        Location of the JSON file holding the credentials.

    Returns
    -------
    object
        The parsed JSON content of the file, returned as-is.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file does not contain valid JSON.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's locale encoding
    with open(path, encoding="utf-8") as f:
        return json.load(f)

In [3]:
# function to get a list of most shared articles for provided time period (in days)
def get_most_shared_articles(search_period):
    """Fetch the NYT articles most shared on Facebook for a given period.

    Parameters
    ----------
    search_period : int or str
        Look-back window in days; it is converted to a string and placed in
        the request path.
        NOTE(review): the API presumably accepts only a fixed set of periods
        (e.g. 1, 7, 30) -- confirm against the NYT documentation.

    Returns
    -------
    list
        The value stored under the ``'results'`` key of the JSON response.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status (bad key, rate limit, ...).
    requests.Timeout
        If the server does not respond within the timeout.
    KeyError
        If the key file has no ``'api_key'`` entry or the response has no
        ``'results'`` entry.
    """
    period = str(search_period)

    # get API key from a private file kept in a directory outside the repo
    api_key = get_api_key("../../.nyt_api.json")['api_key']
    url = f'https://api.nytimes.com/svc/mostpopular/v2/shared/{period}/facebook.json'

    # pass the key as a query parameter so it is URL-encoded, and bound the wait
    # so a stalled connection cannot hang the notebook indefinitely
    response = requests.get(url, params={'api-key': api_key}, timeout=30)

    # fail loudly on HTTP errors instead of a confusing KeyError on 'results'
    response.raise_for_status()

    return response.json()['results']

In [4]:
# function to extract only the needed fields from each article
def cleaned_shared_articles(most_shared_articles):
    """Reduce raw article records to ``[date_sourced, uri, date_published]`` rows.

    Parameters
    ----------
    most_shared_articles : iterable of dict
        Article records; each must provide the keys ``'uri'`` and
        ``'published_date'``. Any other keys are ignored.

    Returns
    -------
    list of list
        One ``[date_sourced, uri, date_published]`` row per article, in input
        order. ``date_sourced`` is today's date at midnight and
        ``date_published`` is ``article['published_date']`` parsed with
        ``pd.to_datetime``; ``uri`` is passed through unchanged.

    Raises
    ------
    KeyError
        If an article lacks ``'uri'`` or ``'published_date'``.
    """
    # Computed once so every row carries the same sourcing date, even if the
    # call straddles midnight. The strftime round trip drops the time of day.
    date_sourced = pd.to_datetime(pd.Timestamp("today").strftime("%m/%d/%Y"))

    return [
        [date_sourced, article['uri'], pd.to_datetime(article['published_date'])]
        for article in most_shared_articles
    ]

In [5]:
# today's date as MM_DD_YYYY, used to label saved data with the day it was accessed
date_sourced = "_".join(pd.Timestamp("today").strftime("%m/%d/%Y").split("/"))

In [6]:
# uncomment to initialize some data from the past 30 days
# most_shared_before = get_most_shared_articles(30)

In [7]:
# cleaned_most_shared_before = cleaned_shared_articles(most_shared_before)

In [8]:
# df_shared_before = pd.DataFrame(cleaned_most_shared_before)

In [9]:
# df_shared_before.columns = ['date_sourced', 'uri', 'date_published']

In [10]:
# df_shared_before.head()

In [11]:
# df_shared_before.shape

In [12]:
# df_shared_before.to_csv(f'data/most_popular/most_shared_before{date_sourced}.csv', index=False)

In [13]:
# call the API helper with a 1-day search period to get the most shared articles
most_shared_articles = get_most_shared_articles(1)

In [14]:
# pass the list of articles through the cleaning function, which keeps
# only the sourcing date, uri and published date of each article
cleaned_most_shared = cleaned_shared_articles(most_shared_articles)

In [15]:
# put the cleaned articles in a dataframe, one row per article
df_shared = pd.DataFrame(cleaned_most_shared)

In [16]:
# give the columns descriptive string names, in the order the cleaning function emits them
df_shared.columns = ['date_sourced', 'uri', 'date_published']

In [17]:
# preview the first rows to sanity-check the result
df_shared.head()

Unnamed: 0,date_sourced,uri,date_published
0,2022-01-18,nyt://article/f0eae40a-c7c9-584f-a85d-d68544f5...,2022-01-17
1,2022-01-18,nyt://article/a8cd2b1d-beab-5c35-a403-6395feaa...,2022-01-16
2,2022-01-18,nyt://article/47ae1e8e-64e9-5d46-96df-5edfb972...,2022-01-16
3,2022-01-18,nyt://article/3e9f24a2-4313-57d0-bff3-e900f930...,2022-01-15
4,2022-01-18,nyt://article/32042f84-b000-5027-aadf-8d55047b...,2022-01-16


In [20]:
# (rows, columns) of the dataframe
df_shared.shape

(20, 4)

In [21]:
# df_shared.to_csv(f'../data/most_popular/most_shared_{date_sourced}.csv', index=False) 

In [22]:
""" uncomment and unindent next line to divert top 20 lists for deployment into a different folder.
    comment line above to prevent deployment data from leaking into training folder.
"""
    # df_shared.to_csv(f'../data/most_popular_deploy/most_shared_{date_sourced}.csv', index=False)

' uncomment and unindent next line to divert top 20 lists for deployment into a different folder.\n    comment line above to prevent deployment data from leaking into training folder.\n'