# Dataloader notebook
Loads the price data from disk and prepares a DataFrame that is ready for training.

In [1]:
# Import statements
import yfinance as yf
import pandas as pd
import os
import numpy as np
from datetime import date
from pathlib import Path
import pandas_ta as ta
from tqdm import tqdm
from price_data_pull_yfinance import gethistoricalOHLC, saveHistStockData, loadHistDataFromDisk

from global_vars import dataPathToOHLC, pathToMasterDF, float16Cols, float32Cols, dataPathToTechIndicators

In [2]:
# Allow up to 500 columns when pandas renders a DataFrame.
pd.options.display.max_columns = 500

## Load all important data

In [5]:
# Read the master table from disk; `mdf` is a short alias for the very same object.
masterDf = mdf = pd.read_csv(pathToMasterDF)
# Constituents table; its `Symbol` column is used in the sanity checks below.
tickerDF = pd.read_csv('../01_data/SnP500_constituents.csv')

## Data sanity checks
Let's check which tickers we actually have data for on disk and compare them with the tickers we expect to have.

In [28]:
# Every symbol listed in the constituents table.
symbol_list = tickerDF.Symbol.to_list()

# Tickers whose FILEPATH_TECH_IND_CALC entry in the master table is non-empty.
# pd.read_csv loads empty CSV fields as NaN, and `NaN != ''` evaluates to True,
# so comparing against '' alone would never filter out a single row. Missing
# values are therefore excluded explicitly; the '' check is kept for frames
# that really do contain empty strings.
ticker_list = mdf[
    mdf.FILEPATH_TECH_IND_CALC.notna() & (mdf.FILEPATH_TECH_IND_CALC != '')
].TICKER.to_list()

In [41]:
def getTickersOnDisk(path):
    """Return the ticker symbols of the entries stored in ``path``.

    The ticker is the part of each entry name before the first underscore
    (an entry name without an underscore is returned unchanged).

    Hidden entries (names starting with a dot, e.g. ``.DS_Store`` or
    ``.ipynb_checkpoints``) are skipped, because they would otherwise show up
    as bogus tickers. Entries are processed in sorted name order, since
    ``os.listdir`` itself returns them in arbitrary order.

    Parameters
    ----------
    path : str or os.PathLike
        Directory to scan.

    Returns
    -------
    list of str
        One ticker per non-hidden directory entry, ordered by entry name.
    """
    entry_names = sorted(os.listdir(path))
    return [name.split('_')[0] for name in entry_names if not name.startswith('.')]

In [43]:
# Collect the tickers found in the technical-indicator data folder and preview the first five.
tickers_on_disk = getTickersOnDisk('../01_data/01_price_data/01_price_data_w_tech_indicators/')
tickers_on_disk[0:5]

['AAL', 'AAPL', 'AAP', 'ABBV', 'ABC']

In [46]:
# Tickers from `ticker_list` for which nothing was found on disk.
# Membership is tested against a set so each lookup is O(1) instead of a scan
# over the whole `tickers_on_disk` list; the resulting list is unchanged.
tickers_on_disk_set = set(tickers_on_disk)
problematic_tickers = [ticker for ticker in ticker_list if ticker not in tickers_on_disk_set]

In [25]:
# Hand-maintained list of ticker symbols flagged as problematic.
problematicTickers = [
    "BRK.B",
    "KO",
    "DVN",
    "EMN",
    "ECL",
    "HES",
    "MSI",
    "NVDA",
    "PGR",
    "RF",
    "TXT",
    "MOS",
    "WHR",
    "WMB",
    "XRX",
]

In [22]:
# Write the master table back to its CSV file, leaving out the row index.
masterDf.to_csv(path_or_buf=pathToMasterDF, index=False)

### Check some of the problematic results


In [34]:
# Load the stored history for PGR (one of the flagged tickers) and preview its first five rows.
df = loadHistDataFromDisk('PGR')
df.head(n=5)

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
0,1990-01-02,0.572266,0.577637,0.568359,0.575684,889200.0,0.0,0.0
1,1990-01-03,0.577637,0.586914,0.577637,0.57959,1198800.0,0.0,0.0
2,1990-01-04,0.575684,0.575684,0.57373,0.575684,1760400.0,0.0,0.0
3,1990-01-05,0.575684,0.57959,0.575684,0.577637,100800.0,0.0,0.0
4,1990-01-08,0.575684,0.57959,0.575684,0.57959,644400.0,0.0,0.0


## Technical Indicator Strategies
Based on the implementation of https://github.com/twopirllc/pandas-ta

In [16]:
# Load the AAPL history and index it by a DatetimeIndex built from the 'Date'
# column; 'Date' is then removed as a regular column.
df = loadHistDataFromDisk('AAPL')
df = df.set_index(pd.DatetimeIndex(df['Date'])).drop(columns='Date')

In [73]:
# Keep only the first 1000 rows for the indicator experiments below.
# The explicit copy detaches the subset from the full frame: the following cells
# add columns to `df` in place, which should not happen on a slice of another frame.
df = df.iloc[:1000].copy()

#### Candles Technical Indicators

In [75]:
# Run the pandas-ta 'Candles' strategy on df and report how many columns it added.
before = df.shape[1]
df.ta.strategy('Candles')
after = df.shape[1]
print(f'There have been {after-before} features added')

There have been 0 features added


#### Cycles Technical Indicators

In [78]:
# Run the pandas-ta 'Cycles' strategy on df and report how many columns it added.
before = df.shape[1]
df.ta.strategy('Cycles')
after = df.shape[1]
print(f'There have been {after-before} features added')

There have been 1 features added


#### Overlap Technical Indicators

In [80]:
# Run the pandas-ta 'Overlap' strategy on df and report how many columns it added.
before = df.shape[1]
df.ta.strategy('Overlap')
after = df.shape[1]
print(f'There have been {after-before} features added')

There have been 40 features added


#### Momentum Technical Indicators

In [79]:
# Run the pandas-ta 'Momentum' strategy on df and report how many columns it added.
before = df.shape[1]
df.ta.strategy('Momentum')
after = df.shape[1]
print(f'There have been {after-before} features added')

There have been 60 features added


#### Statistics Technical Indicators

In [82]:
# Run the pandas-ta 'Statistics' strategy on df and report how many columns it added.
before = df.shape[1]
df.ta.strategy('Statistics')
after = df.shape[1]
print(f'There have been {after-before} features added')

There have been 9 features added


#### Trend Technical Indicators

In [83]:
# Run the pandas-ta 'Trend' strategy on df and report how many columns it added.
before = df.shape[1]
df.ta.strategy('Trend')
after = df.shape[1]
print(f'There have been {after-before} features added')

There have been 23 features added


#### Volatility Technical Indicators

In [85]:
# Run the pandas-ta 'Volatility' strategy on df and report how many columns it added.
before = df.shape[1]
df.ta.strategy('Volatility')
after = df.shape[1]
print(f'There have been {after-before} features added')

There have been 31 features added


#### Volume Technical Indicators

In [86]:
# Run the pandas-ta 'Volume' strategy on df and report how many columns it added.
before = df.shape[1]
df.ta.strategy('Volume')
after = df.shape[1]
print(f'There have been {after-before} features added')

There have been 18 features added
