In [1]:
import yfinance as yf
import pandas as pd
from numpy import where
from pandas import concat
from sklearn.utils import resample

In [2]:
# write function to download table from yfinance https://pypi.org/project/yfinance/
# pip install yfinance (to pip install)
# pip install yfinance --upgrade (to upgrade, latest version 0.1.62 as of 2021-07-09)

def get_tables(stock_list):
    """
    Download the maximum available price history for every ticker and pickle
    each resulting dataframe for use later.

    Inputs: stock_list (python list object of stock ticker symbols, any letter case)
    Outputs: None; stores ./data/<lower case ticker>_df.pkl for every ticker
    """
    # function-scope import so the cell does not depend on another cell importing os
    import os

    # to_pickle does not create missing folders, so make sure the target
    # folder exists before the first write (no-op when it is already there)
    os.makedirs('./data', exist_ok=True)

    for stock in stock_list:
        # the ticker is looked up in upper case, the file name is always lower case
        stock_df = yf.Ticker(stock.upper()).history(period="max")
        stock_df.to_pickle(f'./data/{stock.lower()}_df.pkl')
        
# ticker symbols whose price history should be cached locally
stock_list = [
    'aapl',
    'ibm',
    'tsla',
    'vgt',
]

# download every listed ticker and pickle the resulting dataframes
get_tables(stock_list)

In [26]:
# write function to transform dataframe and split for train/test dataframe sets

def data(stock, days_ahead):
    """
    Load a pickled price history, engineer features and build train/test sets.

    Inputs: stock, string of stock symbol like 'aapl' or 'AAPL'
            days_ahead, int days prediction ahead, 1 for 1 day ahead, 2 for 2 days ahead, etc...
    Output: X_train, X_test, y_train, y_test (for modeling), stock_df (DataFrame)

    Notes:
        - the split is by row order: the first 85% of the labelled rows are the
          train set, the remaining rows are the test set
        - only the train set is upsampled (with replacement) to balance the two
          'direction' classes; it is then shuffled with a fixed seed and its
          index is reset, so the train set is reproducible between runs
        - the test set keeps its original index and row order
        - rows whose future value does not exist (the last `days_ahead` rows)
          have no real target, so they are left out of the train/test sets;
          they are still present in the returned stock_df, where their
          'direction' holds the -1 placeholder
    """

    # load stock data pickle file from data folder
    # NOTE: unpickling can run arbitrary code, only load files written by this notebook
    stock_df = pd.read_pickle(f'./data/{stock.lower()}_df.pkl')

    # create month and day of week column
    stock_df['month'] = stock_df.index.month
    stock_df['day'] = stock_df.index.dayofweek

    # some open values are 0.00, set as their close value, otherwise leave as open value
    # this is done so we don't divide by zero, see next line for 'oc' calculation
    stock_df['Open'] = where(stock_df['Open'] == 0, stock_df['Close'], stock_df['Open'])

    # calculate daily open close % difference
    stock_df['oc'] = (stock_df.Close - stock_df.Open) / stock_df.Open

    # calculate daily high low % difference
    # NOTE(review): Low has no zero guard like Open does - confirm Low is never 0
    stock_df['hl'] = (stock_df.High - stock_df.Low) / stock_df.Low

    # % change from previous day Close
    stock_df['prev_change'] = stock_df['Close'].pct_change()

    # 13 day rolling moving standard deviation of prev_change
    stock_df['13std'] = stock_df.prev_change.rolling(13).std()

    # 13 day rolling moving average of prev_change
    stock_df['13mva'] = stock_df.prev_change.rolling(13).mean()

    # Direction (target feature column)
    # 1 when the % change `days_ahead` rows later is greater than this row's % change, else -1
    future_change = stock_df['prev_change'].shift(-days_ahead)
    stock_df['direction'] = where(future_change > stock_df['prev_change'], 1, -1)

    # positional masks taken before any row is dropped:
    # complete_rows -> rows that survive the null drop below
    # has_target    -> rows whose future % change exists; a missing future value
    #                  compares as False above and would otherwise become a fake -1 label
    complete_rows = stock_df.notna().all(axis=1).to_numpy()
    has_target = future_change.notna().to_numpy()

    # Drop nulls generated from rolling columns
    stock_df = stock_df.dropna(axis=0)

    # keep only rows with a real target for modeling, stock_df itself stays complete
    labelled_df = stock_df[has_target[complete_rows]]

    # split labelled rows to train and test dataframes
    split = int(labelled_df.shape[0] * 0.85)
    train = labelled_df[:split]
    test = labelled_df[split:]

    # upsample target class imbalance for 'direction' column in train dataframe
    # the smaller class is detected instead of assumed; ties behave as before
    # (-1 is treated as the major class)
    train_down = train[train['direction'] == -1]
    train_up = train[train['direction'] == 1]
    if train_down.shape[0] >= train_up.shape[0]:
        train_major, train_minor = train_down, train_up
    else:
        train_major, train_minor = train_up, train_down

    train_minor_upsampled = resample(train_minor
                                     , replace = True
                                     , n_samples = train_major.shape[0]
                                     , random_state = 42
                                    )

    train_upsampled = concat([train_major, train_minor_upsampled])

    # shuffle train dataframe, seeded so repeated runs give the same train set
    train = train_upsampled.sample(frac=1, random_state=42).reset_index(drop=True)

    # features of model
    features = ['oc'
                , 'hl'
                , '13std'
                , '13mva'
                , 'month'
                , 'day'
               ]

    # X_train, X_test, y_train, y_test
    X_train = train[features]
    y_train = train['direction']

    X_test = test[features]
    y_test = test['direction']

    return X_train, X_test, y_train, y_test, stock_df

In [35]:
# build the modeling sets for Apple with a 1 day ahead target, plus the full feature dataframe
X_train, X_test, y_train, y_test, stock_df = data('aapl', 1)

In [36]:
# spot-check 5 random train rows (no random_state, so the rows differ between runs);
# the index is a plain integer because data() resets it after shuffling
X_train.sample(5)

Unnamed: 0,oc,hl,13std,13mva,month,day
8968,-0.03604751,0.075834,0.043568,0.013599,6,3
6416,0.0,0.005517,0.036453,-0.007032,11,1
849,-0.01440836,0.031784,0.018825,-0.0037,7,4
8604,-1.159267e-16,0.017699,0.015816,0.002528,3,1
3811,0.0,0.049181,0.030812,0.003886,11,3


In [37]:
# spot-check 5 random test rows; the test set keeps the original Date index
X_test.sample(5)

Unnamed: 0_level_0,oc,hl,13std,13mva,month,day
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-07-17,-0.006805,0.013643,0.011341,0.004918,7,4
2020-07-21,-0.021906,0.025919,0.013218,0.00498,7,1
2017-10-05,0.007848,0.009023,0.010616,-0.001553,10,3
2016-06-03,0.001329,0.008415,0.009941,0.003292,6,4
2019-06-11,-0.000257,0.012397,0.016566,0.005041,6,1


In [38]:
# spot-check 5 random train targets, values are 1 or -1 from the 'direction' column
y_train.sample(5)

5644   -1
1136    1
6102    1
8826   -1
7291   -1
Name: direction, dtype: int64

In [39]:
# spot-check 5 random test targets, indexed by Date like X_test
y_test.sample(5)

Date
2017-10-12    1
2016-12-05    1
2019-06-19    1
2020-08-17    1
2016-06-03    1
Name: direction, dtype: int64

In [40]:
# last 5 rows of the full dataframe returned by data(): price columns, engineered features and 'direction'
stock_df.tail(5)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,month,day,oc,hl,prev_change,13std,13mva,direction
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2021-12-06,164.289993,167.879898,164.279999,165.320007,107496982,0.0,0.0,12,0,0.006269,0.021913,0.021503,0.017943,0.007142,1
2021-12-07,169.080002,171.580002,168.339996,171.179993,120405352,0.0,0.0,12,1,0.01242,0.019247,0.035446,0.019471,0.0086,-1
2021-12-08,172.125,175.960007,170.699997,175.080002,116998901,0.0,0.0,12,2,0.017168,0.030814,0.022783,0.019041,0.008158,-1
2021-12-09,174.910004,176.75,173.919998,174.559998,108157647,0.0,0.0,12,3,-0.002001,0.016272,-0.00297,0.019074,0.006624,1
2021-12-10,175.205002,179.630005,174.690002,179.449997,110986702,0.0,0.0,12,4,0.024229,0.028279,0.028013,0.01992,0.008553,-1
