Download the Quandl Wiki stock prices and a few other datasets that will be used later in the project.

In [1]:
from pathlib import Path
import requests
from io import BytesIO
from zipfile import ZipFile, BadZipFile

import numpy as np
import pandas as pd
import pandas_datareader.data as web
from sklearn.datasets import fetch_openml

In [2]:
# HDF5 file (relative to the working directory) that every dataset in this notebook is written to.
DATA_STORE = Path("assets.h5")

Quandl Wiki Prices
Quandl provides stock prices for 3,000 US publicly traded companies, with data available through April 11, 2018. Here we load the data from a CSV file that we downloaded and saved as 'wiki_prices.csv'. The data can also be downloaded through the Quandl API using the API key that Quandl provides.

In [3]:
# Read the locally saved Quandl Wiki price file, parsing 'date' as datetimes and
# indexing rows by (date, ticker).
# NOTE(review): `infer_datetime_format` is deprecated in pandas >= 2.0; it is kept
# here so behaviour matches the original cell on older pandas versions.
wiki_prices = pd.read_csv(
    'wiki_prices.csv',
    parse_dates=['date'],
    index_col=['date', 'ticker'],
    infer_datetime_format=True,
)
# Sort on the (date, ticker) index.
wiki_prices = wiki_prices.sort_index()

In [6]:
# Preview the first rows of the sorted price frame.
wiki_prices.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,open,high,low,close,volume,ex-dividend,split_ratio,adj_open,adj_high,adj_low,adj_close,adj_volume
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1962-01-02,ARNC,65.56,65.75,65.38,65.38,5600.0,0.0,1.0,3.458163,3.468185,3.448668,3.448668,44800.0
1962-01-02,BA,50.88,50.88,50.0,50.0,11595.0,0.0,1.0,0.88716,0.88716,0.871816,0.871816,352198.125
1962-01-02,CAT,38.5,38.87,38.12,38.5,13600.0,0.0,1.0,1.57837,1.593539,1.562791,1.57837,163200.0
1962-01-02,DD,241.5,244.25,241.5,241.5,2000.0,0.0,1.0,4.718414,4.772144,4.718414,4.718414,36000.0
1962-01-02,DIS,37.25,38.5,37.25,37.25,2098.0,0.0,1.0,0.141259,0.145999,0.141259,0.141259,408858.24


In [33]:
# Persist the price frame in the HDF5 store under 'quandl/wiki/prices'.
with pd.HDFStore(DATA_STORE) as hdf:
    hdf.put('quandl/wiki/prices', wiki_prices)

In [7]:
# Preview the last rows of the sorted price frame.
wiki_prices.tail(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,open,high,low,close,volume,ex-dividend,split_ratio,adj_open,adj_high,adj_low,adj_close,adj_volume
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2018-03-27,ZIXI,4.43,4.45,4.35,4.39,207304.0,0.0,1.0,4.43,4.45,4.35,4.39,207304.0
2018-03-27,ZNGA,3.85,3.85,3.68,3.7,11026010.0,0.0,1.0,3.85,3.85,3.68,3.7,11026010.0
2018-03-27,ZOES,15.13,15.24,14.75,14.88,261242.0,0.0,1.0,15.13,15.24,14.75,14.88,261242.0
2018-03-27,ZTS,84.08,84.45,81.21,81.86,2676191.0,0.0,1.0,84.08,84.45,81.21,81.86,2676191.0
2018-03-27,ZUMZ,24.65,24.65,23.35,23.6,403884.0,0.0,1.0,24.65,24.65,23.35,23.6,403884.0


The following file contains the ticker code and company name of each of the 3,000 companies.

In [5]:
# Read the code/name listing from the local CSV and persist it in the HDF5 store
# under 'quandl/wiki/stocks'.
wiki_stocks = pd.read_csv('wiki_stocks.csv')

with pd.HDFStore(DATA_STORE) as hdf:
    hdf.put('quandl/wiki/stocks', wiki_stocks)

# Preview the first rows of the listing.
wiki_stocks.head()

Unnamed: 0,code,name
0,A,Agilent Technologies Inc.
1,AA,Alcoa Inc.
2,AAL,American Airlines Group Inc.
3,AAMC,Altisource Asset Management
4,AAN,Aaron's Inc.


Next we download historical S&P 500 data from FRED (Federal Reserve Economic Data); only 10 years of daily data are available.

In [35]:
# Request the 'SP500' series from the 'fred' data source, starting in 2011.
# squeeze() collapses the result (presumably a one-column frame - TODO confirm)
# so that to_frame('close') can rebuild it with a single 'close' column.
sp500_fred = (
    web.DataReader(name='SP500', data_source='fred', start=2011)
    .squeeze()
    .to_frame('close')
)

In [36]:
# Persist the FRED series in the HDF5 store under 'sp500/fred'.
with pd.HDFStore(DATA_STORE) as hdf:
    hdf.put('sp500/fred', sp500_fred)

Here we load S&P 500 data downloaded from Stooq; the stored data covers 1950 through 2021.

In [37]:
# Read the locally saved '^spx_d.csv' file, using its first column as a parsed
# datetime index and lower-casing the column names.
sp500_stooq = (pd.read_csv('^spx_d.csv', index_col=0, parse_dates=True)
               .rename(columns=str.lower))

# DataFrame.info() prints its summary itself and returns None, so it is not
# wrapped in print() (which would emit a stray "None" line).
sp500_stooq.info()

# Preview the frame loaded above. The original cell called `df.head()`, but `df`
# is not defined until a later cell and would raise NameError on a fresh kernel.
sp500_stooq.head()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 18151 entries, 1950-01-03 to 2021-10-14
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   open    18151 non-null  float64
 1   high    18151 non-null  float64
 2   low     18151 non-null  float64
 3   close   18151 non-null  float64
 4   volume  18151 non-null  int64  
dtypes: float64(4), int64(1)
memory usage: 850.8 KB
None


Unnamed: 0_level_0,open,high,low,close,volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1950-01-03,16.66,16.66,16.66,16.66,700000
1950-01-04,16.85,16.85,16.85,16.85,1050000
1950-01-05,16.93,16.93,16.93,16.93,1416667
1950-01-06,16.98,16.98,16.98,16.98,1116667
1950-01-07,17.09,17.09,17.09,17.09,1116667


In [38]:
# Persist the Stooq frame in the HDF5 store under 'sp500/stooq'.
with pd.HDFStore(DATA_STORE) as hdf:
    hdf.put('sp500/stooq', sp500_stooq)

S&P 500 Constituents Data Download

In [53]:
# Download the Wikipedia page listing the S&P 500 companies and parse its first
# HTML table, using the first row as the header.
#
# The page is fetched with `requests` (imported at the top of the notebook) so
# TLS certificates are verified. The original cell replaced
# `ssl._create_default_https_context` with an unverified context, which turns off
# certificate checking for every later HTTPS request in the kernel.
url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
response = requests.get(url, timeout=30)
response.raise_for_status()  # fail loudly on an HTTP error instead of parsing an error page

# read_html accepts a file-like object; wrap the downloaded bytes accordingly.
df = pd.read_html(BytesIO(response.content), header=0)[0]
df.head()

Unnamed: 0,Symbol,Security,SEC filings,GICS Sector,GICS Sub-Industry,Headquarters Location,Date first added,CIK,Founded
0,MMM,3M,reports,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1976-08-09,66740,1902
1,ABT,Abbott Laboratories,reports,Health Care,Health Care Equipment,"North Chicago, Illinois",1964-03-31,1800,1888
2,ABBV,AbbVie,reports,Health Care,Pharmaceuticals,"North Chicago, Illinois",2012-12-31,1551152,2013 (1888)
3,ABMD,Abiomed,reports,Health Care,Health Care Equipment,"Danvers, Massachusetts",2018-05-31,815094,1981
4,ACN,Accenture,reports,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989


In [54]:
# Discard the 'SEC filings' column and index the table by 'Symbol'.
# NOTE: this rebinds `df`, so the cell cannot be re-run without first re-running
# the cell that creates `df`.
df = (
    df
    .drop(columns='SEC filings')
    .set_index('Symbol')
)
df.head()

Unnamed: 0_level_0,Security,GICS Sector,GICS Sub-Industry,Headquarters Location,Date first added,CIK,Founded
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
MMM,3M,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1976-08-09,66740,1902
ABT,Abbott Laboratories,Health Care,Health Care Equipment,"North Chicago, Illinois",1964-03-31,1800,1888
ABBV,AbbVie,Health Care,Pharmaceuticals,"North Chicago, Illinois",2012-12-31,1551152,2013 (1888)
ABMD,Abiomed,Health Care,Health Care Equipment,"Danvers, Massachusetts",2018-05-31,815094,1981
ACN,Accenture,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989


In [55]:
# Persist the constituents table in the HDF5 store under 'sp500/stocks'.
with pd.HDFStore(DATA_STORE) as hdf:
    hdf.put('sp500/stocks', df)

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['Security', 'GICS Sector', 'GICS Sub-Industry', 'Headquarters Location',
       'Date first added', 'Founded'],
      dtype='object')]

  if (await self.run_code(code, result,  async_=asy)):


Metadata on US-Traded Stocks

In [2]:
# Read the equity metadata from the local CSV (this rebinds `df`), then print a
# summary of its columns, non-null counts and dtypes.
df = pd.read_csv('us_equities_meta_data.csv')
df.info(verbose=None)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6834 entries, 0 to 6833
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   ticker     6834 non-null   object 
 1   name       6834 non-null   object 
 2   lastsale   6718 non-null   float64
 3   marketcap  5766 non-null   float64
 4   ipoyear    3038 non-null   float64
 5   sector     5288 non-null   object 
 6   industry   5288 non-null   object 
dtypes: float64(3), object(4)
memory usage: 373.9+ KB


In [5]:
# Persist the metadata, indexed by 'ticker', in the HDF5 store under
# 'us_equities/stocks'.
with pd.HDFStore(DATA_STORE) as hdf:
    hdf.put('us_equities/stocks', df.set_index('ticker'))

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->Index(['name', 'sector', 'industry'], dtype='object')]

  if (await self.run_code(code, result,  async_=asy)):
