In [1]:
import warnings
warnings.filterwarnings('ignore')

from pathlib import Path
import requests
from io import BytesIO
from zipfile import ZipFile, BadZipFile

import numpy as np
import pandas as pd
import pandas_datareader.data as web
from sklearn.datasets import fetch_openml
import yfinance as yf

In [2]:
pd.set_option('display.expand_frame_repr', False)

In [3]:
data_path = Path('/Volumes/My Book/Data_Analysis/Nasdaq') # set to e.g. external harddrive
DATA_STORE = data_path/'assets.h5'

In [4]:
df = (pd.read_csv(data_path/'wiki_prices.csv',
                 parse_dates=['date'],
                 index_col=['date', 'ticker'],
                 infer_datetime_format=True)
     .sort_index())

print(df.info(show_counts=True))
with pd.HDFStore(DATA_STORE) as store:
    store.put('quandl/wiki/prices', df)

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 15389314 entries, (Timestamp('1962-01-02 00:00:00'), 'ARNC') to (Timestamp('2018-03-27 00:00:00'), 'ZUMZ')
Data columns (total 12 columns):
 #   Column       Non-Null Count     Dtype  
---  ------       --------------     -----  
 0   open         15388776 non-null  float64
 1   high         15389259 non-null  float64
 2   low          15389259 non-null  float64
 3   close        15389313 non-null  float64
 4   volume       15389314 non-null  float64
 5   ex-dividend  15389314 non-null  float64
 6   split_ratio  15389313 non-null  float64
 7   adj_open     15388776 non-null  float64
 8   adj_high     15389259 non-null  float64
 9   adj_low      15389259 non-null  float64
 10  adj_close    15389313 non-null  float64
 11  adj_volume   15389314 non-null  float64
dtypes: float64(12)
memory usage: 1.4+ GB
None


In [5]:
df = pd.read_csv(data_path/'wiki_stocks.csv')
# no longer needed
# df = pd.concat([df.loc[:, 'code'].str.strip(),
#                 df.loc[:, 'name'].str.split('(', expand=True)[0].str.strip().to_frame('name')], axis=1)

print(df.info(show_counts=True))
with pd.HDFStore(DATA_STORE) as store:
    store.put('quandl/wiki/stocks', df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3199 entries, 0 to 3198
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   code    3199 non-null   object
 1   name    3199 non-null   object
dtypes: object(2)
memory usage: 50.1+ KB
None


In [28]:
"""
df = web.DataReader(name='SP500', data_source='fred', start=2009).squeeze().to_frame('close')
print(df.info())
with pd.HDFStore(DATA_STORE) as store:
    store.put('sp500/fred', df)


The below code needs to be taken to a separate notebook and used to source SP500 index data and save in store,
because the yf_override() call disables the web.DataReader function to be used for other data sources.

"""
yf.pdr_override()
df = pd.DataFrame(web.get_data_yahoo('^OEX', start='2000-01-01', end='2024-02-02'))
print(df.head(), df.tail())

with pd.HDFStore(DATA_STORE) as store:
    store.put('sp500/fred', df)

[*********************100%%**********************]  1 of 1 completed

                  Open        High         Low       Close   Adj Close      Volume
Date                                                                              
2000-01-03  792.830017  798.429993  778.200012  788.789978  788.789978   931800000
2000-01-04  788.789978  788.789978  757.750000  759.010010  759.010010  1009000000
2000-01-05  759.010010  768.229980  747.969971  761.520020  761.520020  1085500000
2000-01-06  761.520020  767.659973  756.559998  762.640015  762.640015  1092300000
2000-01-07  762.640015  783.510010  759.330017  783.489990  783.489990  1225200000                    Open         High          Low        Close    Adj Close      Volume
Date                                                                                   
2024-01-26  2313.659912  2324.020020  2311.610107  2316.320068  2316.320068  3353400000
2024-01-29  2318.689941  2335.350098  2314.699951  2334.239990  2334.239990  3525160000
2024-01-30  2335.310059  2336.050049  2326.370117  2330.679932  233




In [29]:
sp500_stooq = (pd.read_csv(data_path/'^spx_d.csv', index_col=0,
                     parse_dates=True).loc['1950':'2024'].rename(columns=str.lower))
print(sp500_stooq.info())
with pd.HDFStore(DATA_STORE) as store:
    store.put('sp500/stooq', sp500_stooq)

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 18734 entries, 1950-01-03 to 2024-02-09
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   open    18734 non-null  float64
 1   high    18734 non-null  float64
 2   low     18734 non-null  float64
 3   close   18734 non-null  float64
 4   volume  18734 non-null  float64
dtypes: float64(5)
memory usage: 878.2 KB
None


In [30]:
url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
df = pd.read_html(url, header=0)[0]
df.head()

Unnamed: 0,Symbol,Security,GICS Sector,GICS Sub-Industry,Headquarters Location,Date added,CIK,Founded
0,MMM,3M,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1957-03-04,66740,1902
1,AOS,A. O. Smith,Industrials,Building Products,"Milwaukee, Wisconsin",2017-07-26,91142,1916
2,ABT,Abbott,Health Care,Health Care Equipment,"North Chicago, Illinois",1957-03-04,1800,1888
3,ABBV,AbbVie,Health Care,Biotechnology,"North Chicago, Illinois",2012-12-31,1551152,2013 (1888)
4,ACN,Accenture,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989


In [31]:
df.columns = ['ticker', 'name', 'gics_sector', 'gics_sub_industry',
              'location', 'first_added', 'cik', 'founded']
df = df.set_index('ticker')

In [32]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 503 entries, MMM to ZTS
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   name               503 non-null    object
 1   gics_sector        503 non-null    object
 2   gics_sub_industry  503 non-null    object
 3   location           503 non-null    object
 4   first_added        503 non-null    object
 5   cik                503 non-null    int64 
 6   founded            503 non-null    object
dtypes: int64(1), object(6)
memory usage: 31.4+ KB
None


In [33]:
with pd.HDFStore(DATA_STORE) as store:
    store.put('sp500/stocks', df)

In [34]:
df = pd.read_csv(data_path/'us_equities_meta_data.csv')
df.columns = ['ticker', 'name', 'lastsale', 'netchange',
              'pctchange', 'marketcap', 'country', 'ipoyear', 'volume', 'sector', 'industry']
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7176 entries, 0 to 7175
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ticker     7175 non-null   object
 1   name       7176 non-null   object
 2   lastsale   7176 non-null   object
 3   netchange  7176 non-null   object
 4   pctchange  7175 non-null   object
 5   marketcap  6759 non-null   object
 6   country    6878 non-null   object
 7   ipoyear    4076 non-null   object
 8   volume     7176 non-null   object
 9   sector     6410 non-null   object
 10  industry   6410 non-null   object
dtypes: object(11)
memory usage: 616.8+ KB
None


In [35]:
with pd.HDFStore(DATA_STORE) as store:
    store.put('us_equities/stocks', df.set_index('ticker'))

In [36]:
# ML dataset downloaded from openML.org
mnist = fetch_openml('mnist_784', version=1)
print(mnist.DESCR)

**Author**: Yann LeCun, Corinna Cortes, Christopher J.C. Burges  
**Source**: [MNIST Website](http://yann.lecun.com/exdb/mnist/) - Date unknown  
**Please cite**:  

The MNIST database of handwritten digits with 784 features, raw data available at: http://yann.lecun.com/exdb/mnist/. It can be split in a training set of the first 60,000 examples, and a test set of 10,000 examples  

It is a subset of a larger set available from NIST. The digits have been size-normalized and centered in a fixed-size image. It is a good database for people who want to try learning techniques and pattern recognition methods on real-world data while spending minimal efforts on preprocessing and formatting. The original black and white (bilevel) images from NIST were size normalized to fit in a 20x20 pixel box while preserving their aspect ratio. The resulting images contain grey levels as a result of the anti-aliasing technique used by the normalization algorithm. the images were centered in a 28x28 image b

In [37]:
mnist.keys()

dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

In [38]:
mnist_path = Path(data_path/'mnist')
if not mnist_path.exists():
    mnist_path.mkdir()

In [39]:
np.save(mnist_path / 'data', mnist.data.astype(np.uint8))
np.save(mnist_path / 'labels', mnist.target.astype(np.uint8))

In [40]:
fashion_mnist = fetch_openml(name='Fashion-MNIST')

In [41]:
print(fashion_mnist.DESCR)

**Author**: Han Xiao, Kashif Rasul, Roland Vollgraf  
**Source**: [Zalando Research](https://github.com/zalandoresearch/fashion-mnist)  
**Please cite**: Han Xiao and Kashif Rasul and Roland Vollgraf, Fashion-MNIST: a Novel Image Dataset for Benchmarking Machine Learning Algorithms, arXiv, cs.LG/1708.07747  

Fashion-MNIST is a dataset of Zalando's article images, consisting of a training set of 60,000 examples and a test set of 10,000 examples. Each example is a 28x28 grayscale image, associated with a label from 10 classes. Fashion-MNIST is intended to serve as a direct drop-in replacement for the original MNIST dataset for benchmarking machine learning algorithms. It shares the same image size and structure of training and testing splits. 

Raw data available at: https://github.com/zalandoresearch/fashion-mnist

### Target classes
Each training and test example is assigned to one of the following labels:
Label  Description  
0  T-shirt/top  
1  Trouser  
2  Pullover  
3  Dress  
4  

In [42]:
label_dict = {0: 'T-shirt/top',
              1: 'Trouser',
              2: 'Pullover',
              3: 'Dress',
              4: 'Coat',
              5: 'Sandal',
              6: 'Shirt',
              7: 'Sneaker',
              8: 'Bag',
              9: 'Ankle boot'}

In [43]:
fashion_path = Path(data_path/'fashion_mnist')
if not fashion_path.exists():
    fashion_path.mkdir()

In [44]:
pd.Series(label_dict).to_csv(fashion_path / 'label_dict.csv', index=False, header=None)

In [45]:
np.save(fashion_path / 'data', fashion_mnist.data.astype(np.uint8))
np.save(fashion_path / 'labels', fashion_mnist.target.astype(np.uint8))