# Stock and ETF History Download

In [None]:
#Code borrowed from https://www.kaggle.com/code/jacksoncrow/download-nasdaq-historical-data/notebook

In [1]:
import yfinance as yf
import numpy as np
import pandas as pd
import os, contextlib, shutil
from os.path import isfile, join

## Download all NASDAQ traded symbols

In [2]:
# Build the symbol table from the NASDAQ Trader symbol directory (pipe-delimited).
# Only empty fields are treated as missing: with pandas' default NA markers a
# ticker literally spelled "NA" would be parsed as NaN instead of a string.
data = pd.read_csv(
    "http://www.nasdaqtrader.com/dynamic/SymDir/nasdaqtraded.txt",
    sep='|',
    keep_default_na=False,
    na_values=[''],
)
# Keep only rows explicitly flagged as non-test issues ('Test Issue' == 'N').
data_clean = data[data['Test Issue'] == 'N']
# One entry per row of data_clean, in the same order.
symbols = data_clean['NASDAQ Symbol'].tolist()
print('Total number of symbols traded = {}'.format(len(symbols)))

Total number of symbols traded = 11207


In [3]:
# Preview the first rows of the raw symbol table read from the NASDAQ directory.
data.head()

Unnamed: 0,Nasdaq Traded,Symbol,Security Name,Listing Exchange,Market Category,ETF,Round Lot Size,Test Issue,Financial Status,CQS Symbol,NASDAQ Symbol,NextShares
0,Y,A,"Agilent Technologies, Inc. Common Stock",N,,N,100.0,N,,A,A,N
1,Y,AA,Alcoa Corporation Common Stock,N,,N,100.0,N,,AA,AA,N
2,Y,AAA,Alternative Access First Priority CLO Bond ETF,P,,Y,100.0,N,,AAA,AAA,N
3,Y,AAAU,Goldman Sachs Physical Gold ETF Shares,Z,,Y,100.0,N,,AAAU,AAAU,N
4,Y,AACBU,Artius II Acquisition Inc. - Units,Q,G,N,100.0,N,N,,AACBU,N


## Download Historical Data

In [4]:
# Batch window into the symbol list, plus the history span to request.
offset = 0       # index of the first symbol handled by this run
limit = 3_000    # maximum symbols per run; prevents getting rate limited
period = "max"   # data timeline period passed to the downloader

In [5]:
%%time

# Resolve the batch window without overwriting the `limit` setting, so the
# cell can be re-run with the same configuration. A falsy limit means
# "everything from offset onwards".
batch_size = limit if limit else len(symbols)
end = min(offset + batch_size, len(symbols))

# Positional mask over `symbols`: True once a symbol's history has been saved.
# It is rebuilt on every run, so it only reflects the current batch.
is_valid = [False] * len(symbols)

# DataFrame.to_csv does not create missing directories.
os.makedirs('hist', exist_ok=True)

# force silencing of verbose API
# NOTE: only stdout is redirected; the "Failed download" messages visible in
# the saved output of this cell are evidently not captured by this redirect.
with open(os.devnull, 'w') as devnull, contextlib.redirect_stdout(devnull):
    for i in range(offset, end):
        s = symbols[i]
        # Skip entries that are not usable tickers (e.g. NaN from a missing
        # or mis-parsed symbol field) instead of passing them to the API.
        if not isinstance(s, str) or not s:
            continue

        # Use a dedicated name: rebinding `data` here would clobber the
        # symbol table that earlier cells display.
        history = yf.download(s, period=period, progress=False)
        if len(history.index) == 0:
            continue

        is_valid[i] = True
        history.to_csv(join('hist', '{}.csv'.format(s)))

print('Total number of valid symbols downloaded = {}'.format(sum(is_valid)))


1 Failed download:
['AACT=']: YFTzMissingError('possibly delisted; no timezone found')

1 Failed download:
['AACT+']: YFTzMissingError('possibly delisted; no timezone found')

1 Failed download:
['AAM=']: YFTzMissingError('possibly delisted; no timezone found')

1 Failed download:
['AAM+']: YFTzMissingError('possibly delisted; no timezone found')

1 Failed download:
['ABLLW']: YFInvalidPeriodError("ABLLW: Period 'max' is invalid, must be of the format 1d, 5d, etc.")

1 Failed download:
['ABLVW']: YFInvalidPeriodError("ABLVW: Period 'max' is invalid, must be of the format 1d, 5d, etc.")

1 Failed download:
['ABPWW']: YFInvalidPeriodError("ABPWW: Period 'max' is invalid, must be of the format 1d, 5d, etc.")

1 Failed download:
['ABR-D']: YFTzMissingError('possibly delisted; no timezone found')

1 Failed download:
['ABR-E']: YFTzMissingError('possibly delisted; no timezone found')

1 Failed download:
['ABR-F']: YFTzMissingError('possibly delisted; no timezone found')

1 Failed download:


Total number of valid symbols downloaded = 2620
CPU times: user 3min 17s, sys: 23.4 s, total: 3min 40s
Wall time: 22min 29s


In [6]:
# Keep the metadata rows of the symbols whose history was saved, then write
# them to CSV. `is_valid` holds one boolean per row of data_clean, in order.
valid_data = data_clean.loc[is_valid]
valid_data.to_csv('symbols_valid_meta.csv', index=False)

## Separate ETFs and Stocks

In [8]:
# Split the downloaded symbols by the 'ETF' flag: 'Y' rows are ETFs, 'N' rows
# are stocks. Rows with any other flag value end up in neither list.
etf_mask = valid_data['ETF'] == 'Y'
stock_mask = valid_data['ETF'] == 'N'
etfs = valid_data.loc[etf_mask, 'NASDAQ Symbol'].tolist()
stocks = valid_data.loc[stock_mask, 'NASDAQ Symbol'].tolist()

In [9]:
# Separate ETF and stock data into separate folders
def move_symbols(symbols, dest, src='hist'):
    """Move each symbol's CSV file from ``src`` into the ``dest`` folder.

    Args:
        symbols: Iterable of ticker symbols; each maps to ``<symbol>.csv``.
        dest: Destination directory; created if it does not exist yet.
        src: Directory currently holding the CSV files (default ``'hist'``).

    Raises:
        Propagates whatever ``shutil.move`` raises, e.g. when a source file
        is missing.
    """
    # shutil.move does not create a missing destination directory.
    os.makedirs(dest, exist_ok=True)
    for s in symbols:
        filename = '{}.csv'.format(s)
        shutil.move(join(src, filename), join(dest, filename))
        
# Send each group's CSV files to its own folder: ETFs first, then stocks.
for group, folder in ((etfs, "etfs"), (stocks, "stocks")):
    move_symbols(group, folder)