### Imports

In [2]:
# DJIA Stock Price Data
import yfinance as yf

# NYTimes News Data
import requests

# Utility
from os import path, getenv
from dotenv import load_dotenv
import datetime as dt
import pandas as pd
import json
from calendar import monthrange
from time import sleep

### List of Stocks
3M, American Express, Amgen, Amazon, Apple, Boeing, Caterpillar, Chevron, Cisco, Coca-Cola,<br>
Disney, Goldman Sachs, Home Depot, Honeywell, IBM, Johnson & Johnson, JPMorgan Chase, McDonald's, Merck, Microsoft,<br>
Nike, Nvidia, Procter & Gamble, Salesforce, Sherwin-Williams, Travelers, UnitedHealth Group, Verizon, Visa, Walmart<br>
###### The 30 constituent stocks of the Dow Jones Industrial Average (DJIA) as of 15 February 2025.<br>

In [None]:
# Ticker symbols of the 30 DJIA constituents, one text row per ten tickers.
stock_ticks_list = (
    "MMM AXP AMGN AMZN AAPL BA CAT CVX CSCO KO "
    "DIS GS HD HON IBM JNJ JPM MCD MRK MSFT "
    "NKE NVDA PG CRM SHW TRV UNH VZ V WMT"
).split()

# Date window requested for the price history.
djia_begin_date = dt.date(year=2015, month=1, day=1)
djia_end_date = dt.date(year=2024, month=12, day=31)

#### Stock Price Data using yfinance

In [None]:
# yfinance treats `end` as exclusive, so extend it by one day to keep
# djia_end_date itself inside the downloaded range.
df = yf.download(
    stock_ticks_list,
    start=djia_begin_date,
    end=djia_end_date + dt.timedelta(days=1),
)
df.head()

[*********************100%***********************]  30 of 30 completed


Price,Close,Close,Close,Close,Close,Close,Close,Close,Close,Close,...,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume
Ticker,AAPL,AMGN,AMZN,AXP,BA,CAT,CRM,CSCO,CVX,DIS,...,MSFT,NKE,NVDA,PG,SHW,TRV,UNH,V,VZ,WMT
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2015-01-02,24.320431,120.226555,15.426,80.133911,113.657204,70.110161,58.910957,20.189041,73.464218,86.310013,...,27913900,4985800,113680000,7251400,1700400,1270800,3060900,8389600,11421200,13505400
2015-01-05,23.635286,118.797874,15.1095,78.014648,112.870049,66.409294,57.846901,19.786871,70.527756,85.048729,...,39673900,6889200,197952000,8626100,2161800,1728700,4679000,12751200,18964500,20937000
2015-01-06,23.63751,114.970566,14.7645,76.352005,111.540627,65.981987,56.88229,19.779554,70.495132,84.597633,...,36447900,7576000,197764000,7791200,2133000,2350900,3468300,11070000,22950100,24615300
2015-01-07,23.96896,118.985886,14.921,78.019562,113.272377,67.004509,56.613789,19.962358,70.436417,85.46302,...,29114100,7256000,321808000,5986600,2487300,1558200,3225800,9346800,20793600,25495200
2015-01-08,24.8899,118.557289,15.023,79.125465,115.275307,67.691231,58.264572,20.115917,72.048203,86.346848,...,29645200,5978200,283780000,6823300,3236100,1941200,5346100,10443200,17617500,38140800


In [None]:
# Persist the downloaded frame as CSV text (the target file name has no extension).
output_path = path.abspath(path.join("..", "..", "data", "djia_stock_data"))
df.to_csv(output_path)

##### How to access and query DJIA stock data

In [None]:
# Read the file back: the first two rows form the two column levels
# (Price, Ticker) and the first column becomes the row index.
pd.read_csv(
    filepath_or_buffer=output_path,
    header=[0, 1],
    index_col=0,
)

Price,Close,Close,Close,Close,Close,Close,Close,Close,Close,Close,...,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume
Ticker,AAPL,AMGN,AMZN,AXP,BA,CAT,CRM,CSCO,CVX,DIS,...,MSFT,NKE,NVDA,PG,SHW,TRV,UNH,V,VZ,WMT
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2015-01-02,24.320431,120.226555,15.426000,80.133911,113.657204,70.110161,58.910957,20.189041,73.464218,86.310013,...,27913900,4985800,113680000,7251400,1700400,1270800,3060900,8389600,11421200,13505400
2015-01-05,23.635286,118.797874,15.109500,78.014648,112.870049,66.409294,57.846901,19.786871,70.527756,85.048729,...,39673900,6889200,197952000,8626100,2161800,1728700,4679000,12751200,18964500,20937000
2015-01-06,23.637510,114.970566,14.764500,76.352005,111.540627,65.981987,56.882290,19.779554,70.495132,84.597633,...,36447900,7576000,197764000,7791200,2133000,2350900,3468300,11070000,22950100,24615300
2015-01-07,23.968960,118.985886,14.921000,78.019562,113.272377,67.004509,56.613789,19.962358,70.436417,85.463020,...,29114100,7256000,321808000,5986600,2487300,1558200,3225800,9346800,20793600,25495200
2015-01-08,24.889900,118.557289,15.023000,79.125465,115.275307,67.691231,58.264572,20.115917,72.048203,86.346848,...,29645200,5978200,283780000,6823300,3236100,1941200,5346100,10443200,17617500,38140800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-12-23,254.989655,264.000000,225.059998,298.099121,177.690002,364.055389,342.899994,58.580814,142.970001,111.400002,...,19152500,14203800,176053500,6983900,1083300,1408300,4286000,7984600,21292300,26205400
2024-12-24,257.916443,264.489990,229.050003,302.748199,179.339996,366.227417,344.429993,59.444923,143.839996,112.559998,...,7164500,4919100,105157000,2460800,445900,329200,1824400,2684100,11712200,8992400
2024-12-26,258.735504,263.179993,227.050003,303.276947,180.380005,365.779053,341.720001,59.574043,143.979996,112.550003,...,8194200,6363500,116205600,3629400,750500,434100,3387000,2856000,12946400,10994000
2024-12-27,255.309296,262.649994,223.750000,300.343842,180.720001,363.527283,338.450012,59.206551,144.000000,111.550003,...,18117700,6334400,170582600,4367900,881700,631700,3471800,3489200,15168100,11384400


In [48]:
# Pick individual (Price, Ticker) columns out of the two-level column header.
df.loc[:, [(price_field, ticker)
           for price_field in ('Close', 'Volume')
           for ticker in ('AAPL', 'AMZN')]]

Price,Close,Close,Volume,Volume
Ticker,AAPL,AMZN,AAPL,AMZN
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2015-01-02,24.320431,15.426000,212818400,55664000
2015-01-05,23.635286,15.109500,257142000,55484000
2015-01-06,23.637510,14.764500,263188400,70380000
2015-01-07,23.968960,14.921000,160423600,52806000
2015-01-08,24.889900,15.023000,237458000,61768000
...,...,...,...,...
2024-12-23,254.989655,225.059998,40858800,28070000
2024-12-24,257.916443,229.050003,23234700,15007500
2024-12-26,258.735504,227.050003,27237100,16146700
2024-12-27,255.309296,223.750000,42355300,27367100


##### News Data using NY Times Article Search API
More information on the API [here](https://developer.nytimes.com/docs/articlesearch-product/1/overview).

In [10]:
class nytimes_news_data:
    '''
    Download New York Times Article Search results for one calendar year.

    Results are requested month by month and page by page. Every page that
    contains articles is written as a JSON file under ``../../data/raw/<year>/``
    (relative to the current working directory).

    Expected ``params`` keys:
        API_KEY: NYT developer API key (required).
        year:    calendar year as an int (required).
        fq:      Article Search filter query (optional, defaults to '').

    Note: a page holds 10 articles and paging stops after page ``MAX_PAGE``,
    so articles beyond that point within a month are not retrieved.
    '''

    BASE_URL = "https://api.nytimes.com/svc/search/v2/articlesearch.json"
    # NOTE(review): the API may cap paging lower than this — confirm against the API docs.
    MAX_PAGE = 200
    MAX_RETRIES = 3        # attempts per page before the page is skipped
    REQUEST_TIMEOUT = 30   # seconds; prevents a stalled connection from hanging the run
    # Pause between requests. NOTE(review): chosen to stay at or below
    # 5 requests per minute — confirm against the current NYT rate limits.
    REQUEST_INTERVAL = 12

    def __init__(self, params: dict):
        self.API_KEY = params['API_KEY']
        self.fq = params.get('fq', '')
        self.year = params['year']
        self.months = self.__generate_monthly_dates()

    def get(self):
        '''Download and export every page of every month of ``self.year``.'''
        for month in sorted(self.months):
            self.page = 0
            self.nyt_begin_date = self.months[month]['start']
            self.nyt_end_date = self.months[month]['end']

            self.__pages()

    def __pages(self):
        '''Walk the pages of the current month until an empty page or MAX_PAGE.'''
        is_empty = False

        while not is_empty and self.page <= self.MAX_PAGE:
            fetched, payload = self.__fetch_page()

            if not fetched:
                print(f"Max retries reached. Skipping this page. begin_date: {self.nyt_begin_date}, end_date: {self.nyt_end_date}, page: {self.page}")
                self.page += 1
            elif self.__has_no_docs(payload):
                is_empty = True
            else:
                self.__write_payload(payload)
                self.page += 1

            # Pace requests, including the one that opens the next month.
            sleep(self.REQUEST_INTERVAL)

    def __fetch_page(self):
        '''
        Request the current page, retrying up to MAX_RETRIES times.

        Network errors, undecodable bodies and API "fault" payloads all count
        as failed attempts, so a persistent failure can never loop forever.

        Returns:
            (True, payload) on success, (False, None) when every attempt failed.
        '''
        # Passing the query as `params` lets requests URL-encode the filter query.
        query = {
            'fq': self.fq,
            'begin_date': self.nyt_begin_date,
            'end_date': self.nyt_end_date,
            'api-key': self.API_KEY,
            'page': self.page,
        }

        for attempt in range(1, self.MAX_RETRIES + 1):
            try:
                response = requests.get(self.BASE_URL, params=query, timeout=self.REQUEST_TIMEOUT)
                payload = response.json()
            except (requests.RequestException, ValueError) as e:
                print(f"Network error: {e}. Attempt {attempt} of {self.MAX_RETRIES}.")
            else:
                if isinstance(payload, dict) and "fault" in payload:
                    print(f"API Fault encountered: {payload}")
                else:
                    return True, payload

            # Back off before the next attempt.
            sleep(self.REQUEST_INTERVAL)

        return False, None

    # Export functions
    def export(self, response):
        '''Write the JSON body of ``response`` to the file for the current month/page.'''
        self.__write_payload(response.json())

    def __write_payload(self, payload):
        '''Dump an already-parsed payload to disk, creating the year folder if needed.'''
        # Local import: only `path` and `getenv` are imported from `os` at file level.
        from os import makedirs

        output_path = self.__filepath()
        makedirs(path.dirname(output_path), exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as json_file:
            json.dump(payload, json_file, indent=4)

    def __filename(self):
        # Characters 4-5 of the YYYYMMDD begin date are the two-digit month.
        filename = f"{self.fq}_mth{self.nyt_begin_date[4:6]}_pg{self.page}"
        return filename

    def __filepath(self):
        filename = self.__filename()
        output_path = path.abspath(f"../../data/raw/{str(self.year)}/" + filename)
        return output_path

    # Helper functions
    def __generate_monthly_dates(self):
        '''Map each month number (1-12) to its first and last day as YYYYMMDD strings.'''
        months = {}

        for month in range(1, 13):
            last_day = monthrange(self.year, month)[1]  # number of days in the month
            months[month] = {
                "start": f"{self.year}{month:02d}01",
                "end": f"{self.year}{month:02d}{last_day}",
            }

        return months

    @staticmethod
    def __has_no_docs(payload):
        '''Return True when a parsed payload is malformed or holds no articles.'''
        if not isinstance(payload, dict) or 'response' not in payload:
            print(f"API Error: {payload}")  # Log the full payload
            return True  # Treat as empty so paging stops for this month

        try:
            return payload['response']['docs'] == []
        except (KeyError, TypeError) as e:
            print(f"Error parsing response: {e}")
            return True  # Assume empty in case of failure

    def __is_empty(self, response):
        '''Return True when ``response`` cannot be parsed or holds no articles.'''
        try:
            payload = response.json()
        except (ValueError, AttributeError) as e:
            print(f"Error parsing response: {e}")  # Body was not valid JSON
            return True  # Assume empty in case of failure

        return self.__has_no_docs(payload)

In [11]:
# Test: download one year of articles for a single organization filter.
load_dotenv()
API_KEY = getenv("NYT_API_KEY")
if not API_KEY:
    # Fail fast: getenv returns None when the variable is missing, and the
    # downloader would otherwise run through its retries without a usable key.
    raise RuntimeError("NYT_API_KEY is not set; add it to the environment or a .env file.")

for year in range(2024, 2025):
    params = {
        'API_KEY': API_KEY,
        'fq': 'organizations:("Apple Inc")',
        'year': year
        }

    test = nytimes_news_data(params)
    test.get()

API Fault encountered: {'fault': {'faultstring': 'Rate limit quota violation. Quota limit  exceeded. Identifier : 31f15a50-e029-49cd-ae87-8af8ebd54e52', 'detail': {'errorcode': 'policies.ratelimit.QuotaViolation'}}}


##### Troubleshooting

In [12]:
# Single ad-hoc request for inspecting the raw API response.
# API_KEY must already be defined, e.g. via load_dotenv() and getenv("NYT_API_KEY").
fq = 'organizations:("Apple Inc")'
nyt_begin_date = "20230101"
nyt_end_date = "20230131"

url = (
    "https://api.nytimes.com/svc/search/v2/articlesearch.json"
    f"?fq={fq}&begin_date={nyt_begin_date}&end_date={nyt_end_date}"
    f"&api-key={API_KEY}&page=0"
)
response = requests.get(url)
response.json()

{'status': 'OK',
 'copyright': 'Copyright (c) 2025 The New York Times Company. All Rights Reserved.',
 'response': {'docs': [{'abstract': 'The tech giant will assess its compliance with its official human rights policy, according to a federal filing.',
    'web_url': 'https://www.nytimes.com/2023/01/17/business/economy/apple-labor.html',
    'snippet': 'The tech giant will assess its compliance with its official human rights policy, according to a federal filing.',
    'lead_paragraph': 'Apple will conduct an assessment of its U.S. labor practices under an agreement with a coalition of investors that includes five New York City pension funds.',
    'print_section': 'B',
    'print_page': '3',
    'source': 'The New York Times',
    'multimedia': [{'rank': 0,
      'subtype': 'xlarge',
      'caption': None,
      'credit': None,
      'type': 'image',
      'url': 'images/2023/01/18/multimedia/17apple-labor-print-wtcg/17apple-labor1-wtcg-articleLarge.jpg',
      'height': 400,
      'w