In [1]:
import yfinance as yf
import pandas as pd
from numpy import where
from pandas import concat
from sklearn.utils import resample

In [2]:
# Function to download price-history tables with yfinance: https://pypi.org/project/yfinance/
# pip install yfinance (to install)
# pip install yfinance --upgrade (to upgrade; latest version was 0.1.62 as of 2021-07-09)

def get_tables(stock_list):
    """
    Download the price history of each ticker and pickle it for later use.

    Inputs: stock_list (python list object of stock ticker symbols; any case,
            the symbol is upper-cased for the download and lower-cased for the
            file name)
    Outputs: stores one ./data/<symbol>_df.pkl file per ticker (returns None)
    """
    # local import so this cell does not depend on an edit to the imports cell
    import os

    # to_pickle does not create missing folders; without this a fresh checkout
    # (no ./data directory yet) fails on the first ticker
    os.makedirs('./data', exist_ok=True)

    for stock in stock_list:
        # history is requested with period="max"
        stock_df = yf.Ticker(stock.upper()).history(period="max")
        stock_df.to_pickle(f'./data/{stock.lower()}_df.pkl')
        
# common stock ticker symbols (lower case) to download
stock_list = [
    'aapl',
    'ibm',
    'tsla',
    'vgt',
]

# download each ticker's history and pickle the resulting dataframes
get_tables(stock_list)

In [9]:
# write function to transform dataframe and split for train/test dataframe sets

def data(stock, days_ahead):
    """
    Load a pickled price history, engineer features and build train/test sets.

    Inputs: stock, string of stock symbol like 'aapl' or 'AAPL'
            days_ahead, int days prediction ahead, 1 for 1 day ahead, 2 for 2 days ahead, etc...
    Output: X_train, X_test, y_train, y_test (for modeling), stock_df (DataFrame)

    Notes:
        * 'direction' is 1 when the close-to-close % change `days_ahead` rows
          later is greater than the current row's % change, otherwise -1.
        * Rows whose future % change is not available (the last `days_ahead`
          rows) cannot be labelled. They stay in the returned stock_df, where
          their 'direction' is only a placeholder -1, but they are left out of
          the train and test sets.
        * The split is chronological (first 85% train, last 15% test). Only the
          train set is class-balanced by upsampling and then shuffled, both
          with fixed seeds so repeated calls return identical sets.
    """

    # load stock data pickle file from data folder
    # NOTE(review): unpickling can execute arbitrary code; only load files this
    # notebook wrote itself, never .pkl files from an untrusted source
    stock_df = pd.read_pickle(f'./data/{stock.lower()}_df.pkl')

    # some open values are 0.00, set as their close value, otherwise leave as open value
    # this is done so we don't divide by zero, see next line for 'oc' calculation
    stock_df['Open'] = where(stock_df['Open'] == 0, stock_df['Close'], stock_df['Open'])

    # calculate daily open close % difference
    stock_df['oc'] = (stock_df.Close - stock_df.Open) / stock_df.Open

    # calculate daily high low % difference
    stock_df['hl'] = (stock_df.High - stock_df.Low) / stock_df.Low

    # % change from previous day Close
    stock_df['prev_change'] = stock_df['Close'].pct_change()

    # 13 day rolling moving standard deviation of prev_change
    stock_df['13std'] = stock_df.prev_change.rolling(13).std()

    # 13 day rolling moving average of prev_change
    stock_df['13mva'] = stock_df.prev_change.rolling(13).mean()

    # Direction (target feature column)
    future_change = stock_df['prev_change'].shift(-days_ahead)
    stock_df['direction'] = where(future_change > stock_df['prev_change'], 1, -1)

    # a NaN future value compares as False and would silently be labelled -1,
    # so remember (by position) which rows really have a known future value
    has_label = future_change.notna().to_numpy()

    # Drop nulls generated from rolling columns (same rows as dropna(axis=0))
    complete = stock_df.notna().all(axis=1).to_numpy()
    stock_df = stock_df[complete]

    # only rows with a genuine label take part in modeling
    labelled = stock_df[has_label[complete]]

    # split labelled rows to train and test dataframes, keeping time order
    split = int(labelled.shape[0] * 0.85)
    train = labelled.iloc[:split]
    test = labelled.iloc[split:]

    # upsample target class imbalance for 'direction' column in train dataframe;
    # pick the majority by count rather than assuming it is always -1
    train_down = train[train['direction'] == -1]
    train_up = train[train['direction'] == 1]
    if train_down.shape[0] >= train_up.shape[0]:
        train_major, train_minor = train_down, train_up
    else:
        train_major, train_minor = train_up, train_down

    # resample cannot draw from an empty frame, so only upsample when the
    # minority class has at least one row
    if train_minor.shape[0] > 0:
        train_minor = resample(train_minor
                               , replace = True
                               , n_samples = train_major.shape[0]
                               , random_state = 42
                              )

    train_upsampled = concat([train_major, train_minor])

    # shuffle train dataframe (seeded so the result is reproducible)
    train = train_upsampled.sample(frac=1, random_state=42).reset_index(drop=True)

    # features of model
    features = ['oc'
                , 'hl'
                , '13std'
                , '13mva'
               ]

    # X_train, X_test, y_train, y_test
    X_train = train[features]
    y_train = train['direction']

    X_test = test[features]
    y_test = test['direction']

    return X_train, X_test, y_train, y_test, stock_df

In [10]:
# build modeling sets for AAPL with a 1-day-ahead target (reads ./data/aapl_df.pkl)
X_train, X_test, y_train, y_test, stock_df = data('aapl', 1)

In [11]:
# five random rows of the training features (index was reset after shuffling)
X_train.sample(5)

Unnamed: 0,oc,hl,13std,13mva
2939,0.008571,0.02312,0.016232,-0.002866
2708,-0.013224,0.030531,0.022421,-0.013756
4766,-0.005254,0.015443,0.009604,0.002449
3574,0.0,0.006849,0.029577,0.003639
4548,-0.034245,0.057557,0.056839,-0.019048


In [12]:
# five random rows of the test features (still indexed by Date)
X_test.sample(5)

Unnamed: 0_level_0,oc,hl,13std,13mva
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-04-19,-0.005525,0.015811,0.014958,0.002882
2015-12-30,-0.011604,0.014182,0.015155,-0.00597
2019-03-26,-0.02541,0.044967,0.016869,0.006271
2018-04-10,0.001445,0.0144,0.021376,0.001094
2015-11-11,-0.002234,0.019182,0.018993,-0.001447


In [13]:
# five random training labels (1 / -1)
y_train.sample(5)

7304    1
3034    1
2250   -1
1247   -1
6002   -1
Name: direction, dtype: int64

In [14]:
# five random test labels (1 / -1)
y_test.sample(5)

Date
2017-06-22    1
2016-07-29    1
2017-07-03    1
2016-05-19    1
2021-04-28    1
Name: direction, dtype: int64

In [16]:
# last five rows of the full feature frame; the final row has no later data,
# so its 'direction' value is not a real label
stock_df.tail(5)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,oc,hl,prev_change,13std,13mva,direction
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2021-07-06,140.070007,143.149994,140.070007,142.020004,108181800,0.0,0.0,0.013922,0.021989,0.014718,0.008923,0.006773,1
2021-07-07,143.539993,144.889999,142.660004,144.570007,104911600,0.0,0.0,0.007176,0.015632,0.017955,0.009328,0.007185,-1
2021-07-08,141.580002,144.059998,140.669998,143.240005,105575500,0.0,0.0,0.011725,0.024099,-0.0092,0.009193,0.007254,1
2021-07-09,142.75,145.649994,142.649994,145.110001,99788400,0.0,0.0,0.016532,0.02103,0.013055,0.009132,0.007173,-1
2021-07-12,146.210007,146.320007,144.0,144.5,76196700,0.0,0.0,-0.011696,0.016111,-0.004204,0.009477,0.005873,-1
