In [1]:
import yfinance as yf
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score

In [2]:
# aapl_day = yf.download("AAPL", start="1980-12-08", end=None)
# aapl_day.to_pickle('./data/aapl_day.pkl')
# amd_day = yf.download("AMD", start="1980-03-17", end=None)
# amd_day.to_pickle('./data/amd_day.pkl')

In [3]:
# aapl_hour = yf.download(tickers="AAPL",
#                        period="2y",
#                        interval="1h"
#                       )
# aapl_hour.to_pickle('./data/aapl_hr.pkl')

In [4]:
def compute_up_down(begin, end):
    if abs((begin - end) / begin) <= 0.007:
        return 0
    elif (begin - end) / begin > 0.007:
        return 1
    else:
        return 2
    
def compute_volume_per_dollar(vol, begin, end):
    if begin - end == 0:
        return vol
    else:
        return vol / abs(begin - end)
    
def compute_pct_change(num1, num2, dem):
    if dem == 0:
        return 0
    else:
        return (num2 - num1) / dem

In [5]:
a = pd.read_pickle('./data/amd_day.pkl')
a = a.rename(columns={'Adj Close': 'ac'
                      , 'Open': 'o'
                      , 'Close': 'c'
                      , 'High': 'h'
                      , 'Low': 'l'
                      , 'Volume': 'v'
                     })

In [6]:
a['dow'] = a.index.weekday

a['vpd_hlc'] = a.apply(lambda row: compute_volume_per_dollar(row['v'], row['h'], row['l']), axis=1)  # volume per dollar high low change
a['vpd_opac'] = a.apply(lambda row: compute_volume_per_dollar(row['v'], row['o'], row['c']), axis=1)  # volume per dollar open close absolute change

a['ac1'] = a['ac'].shift(1)
a['ac3'] = a['ac'].shift(3)
a['ac5'] = a['ac'].shift(5)
a['ac8'] = a['ac'].shift(8)
a['ac13'] = a['ac'].shift(13)
a['ac21'] = a['ac'].shift(21)

a['mu_rol3'] = a['ac'].rolling(window=3).mean()
a['mu_rol5'] = a['ac'].rolling(window=5).mean()
a['mu_rol8'] = a['ac'].rolling(window=8).mean()
a['mu_rol13'] = a['ac'].rolling(window=13).mean()
a['mu_rol21'] = a['ac'].rolling(window=21).mean()

a['std_rol3'] = a['ac'].rolling(window=3).std()
a['std_rol5'] = a['ac'].rolling(window=5).std()
a['std_rol8'] = a['ac'].rolling(window=8).std()
a['std_rol13'] = a['ac'].rolling(window=13).std()
a['std_rol21'] = a['ac'].rolling(window=21).std()

a['hl_pct'] = a.apply(lambda row: compute_pct_change(row['l'], row['h'], row['o']), axis=1)
a['oc_pct'] = a.apply(lambda row: compute_pct_change(row['o'], row['c'], row['o']), axis=1)

a['ac1_pct'] = a['ac1'] - a['ac'] / a['ac']
a['ac3_pct'] = a['ac3'] - a['ac'] / a['ac']
a['ac5_pct'] = a['ac5'] - a['ac'] / a['ac']
a['ac8_pct'] = a['ac8'] - a['ac'] / a['ac']
a['ac13_pct'] = a['ac13'] - a['ac'] / a['ac']
a['ac21_pct'] = a['ac21'] - a['ac'] / a['ac']

a.dropna(axis=0, inplace=True)

a['ac1_up_dwn'] = a.apply(lambda row: compute_up_down(row['ac1'], row['ac']), axis=1)
a['ac3_up_dwn'] = a.apply(lambda row: compute_up_down(row['ac3'], row['ac']), axis=1)
a['ac5_up_dwn'] = a.apply(lambda row: compute_up_down(row['ac5'], row['ac']), axis=1)
a['ac8_up_dwn'] = a.apply(lambda row: compute_up_down(row['ac8'], row['ac']), axis=1)
a['ac13_up_dwn'] = a.apply(lambda row: compute_up_down(row['ac13'], row['ac']), axis=1)
a['ac21_up_dwn'] = a.apply(lambda row: compute_up_down(row['ac21'], row['ac']), axis=1)

a['ac1_pred'] = a['ac1_up_dwn'].shift(-1)
a['ac3_pred'] = a['ac3_up_dwn'].shift(-3)
a['ac5_pred'] = a['ac5_up_dwn'].shift(-5)
a['ac8_pred'] = a['ac8_up_dwn'].shift(-8)
a['ac13_pred'] = a['ac13_up_dwn'].shift(-13)
a['ac21_pred'] = a['ac21_up_dwn'].shift(-21)

In [7]:
a.iloc[-1]

o              9.279000e+01
h              9.316000e+01
l              9.183000e+01
c              9.209000e+01
ac             9.209000e+01
v              4.052330e+07
dow            3.000000e+00
vpd_hlc        3.046860e+07
vpd_opac       5.789005e+07
ac1            9.233000e+01
ac3            9.548000e+01
ac5            9.256000e+01
ac8            9.801000e+01
ac13           9.795000e+01
ac21           8.745000e+01
mu_rol3        9.281667e+01
mu_rol5        9.328000e+01
mu_rol8        9.392375e+01
mu_rol13       9.496461e+01
mu_rol21       9.558143e+01
std_rol3       1.057607e+00
std_rol5       1.447171e+00
std_rol8       1.814497e+00
std_rol13      2.113948e+00
std_rol21      2.577292e+00
hl_pct         1.433346e-02
oc_pct        -7.543966e-03
ac1_pct        9.133000e+01
ac3_pct        9.448000e+01
ac5_pct        9.156000e+01
ac8_pct        9.701000e+01
ac13_pct       9.695000e+01
ac21_pct       8.645000e+01
ac1_up_dwn     0.000000e+00
ac3_up_dwn     1.000000e+00
ac5_up_dwn     0.000

In [8]:
a.columns

Index(['o', 'h', 'l', 'c', 'ac', 'v', 'dow', 'vpd_hlc', 'vpd_opac', 'ac1',
       'ac3', 'ac5', 'ac8', 'ac13', 'ac21', 'mu_rol3', 'mu_rol5', 'mu_rol8',
       'mu_rol13', 'mu_rol21', 'std_rol3', 'std_rol5', 'std_rol8', 'std_rol13',
       'std_rol21', 'hl_pct', 'oc_pct', 'ac1_pct', 'ac3_pct', 'ac5_pct',
       'ac8_pct', 'ac13_pct', 'ac21_pct', 'ac1_up_dwn', 'ac3_up_dwn',
       'ac5_up_dwn', 'ac8_up_dwn', 'ac13_up_dwn', 'ac21_up_dwn', 'ac1_pred',
       'ac3_pred', 'ac5_pred', 'ac8_pred', 'ac13_pred', 'ac21_pred'],
      dtype='object')

In [9]:
a1_col_list = ['o'
               # , 'h'
               # , 'l'
               , 'c'
               # ,
               , 'ac'
               # , 'v'
               , 'vpd_hlc'
               , 'vpd_opac'
               # , 'ac1'
               # , 'ac3'
               # , 'ac5'
               # , 'ac8'
               # , 'ac13'
               # , 'ac21'
               , 'mu_rol3'
               # , 'mu_rol5'
               , 'mu_rol8'
               # , 'mu_rol13'
               , 'mu_rol21'
               , 'std_rol3'
               # , 'std_rol5'
               , 'std_rol8'
               # , 'std_rol13'
               , 'std_rol21'
               , 'hl_pct'
               , 'oc_pct'
               , 'ac1_pct'
               # , 'ac3_pct'
               # , 'ac5_pct'
               , 'ac8_pct'
               # , 'ac13_pct'
               , 'ac21_pct'
               # , 'ac1_up_dwn'
               # , 'ac3_up_dwn'
               # , 'ac5_up_dwn'
               # , 'ac8_up_dwn'
               # , 'ac13_up_dwn'
               # , 'ac21_up_dwn'
               , 'ac1_pred'
              ]

a1_master = a[a1_col_list].copy()

a1_X_pred = a1_master[a1_col_list[:-1]].iloc[-1].copy()

a1_master0 = a1_master.iloc[:-1].sample(frac=1, random_state=21)
a1_master0 = a1_master0.sample(frac=1, random_state=42)

idx_split = int(a.shape[0] * 0.9)

a1_X_train = a1_master0.iloc[:idx_split].drop('ac1_pred', axis=1)
a1_y_train = a1_master0['ac1_pred'].iloc[:idx_split]
a1_X_test = a1_master0.iloc[idx_split:].drop('ac1_pred', axis=1)
a1_y_test = a1_master0['ac1_pred'].iloc[idx_split:]

In [10]:
a1_X_pred

o            9.279000e+01
c            9.209000e+01
ac           9.209000e+01
vpd_hlc      3.046860e+07
vpd_opac     5.789005e+07
mu_rol3      9.281667e+01
mu_rol8      9.392375e+01
mu_rol21     9.558143e+01
std_rol3     1.057607e+00
std_rol8     1.814497e+00
std_rol21    2.577292e+00
hl_pct       1.433346e-02
oc_pct      -7.543966e-03
ac1_pct      9.133000e+01
ac8_pct      9.701000e+01
ac21_pct     8.645000e+01
Name: 2023-04-13 00:00:00, dtype: float64

In [11]:
a.iloc[-1]

o              9.279000e+01
h              9.316000e+01
l              9.183000e+01
c              9.209000e+01
ac             9.209000e+01
v              4.052330e+07
dow            3.000000e+00
vpd_hlc        3.046860e+07
vpd_opac       5.789005e+07
ac1            9.233000e+01
ac3            9.548000e+01
ac5            9.256000e+01
ac8            9.801000e+01
ac13           9.795000e+01
ac21           8.745000e+01
mu_rol3        9.281667e+01
mu_rol5        9.328000e+01
mu_rol8        9.392375e+01
mu_rol13       9.496461e+01
mu_rol21       9.558143e+01
std_rol3       1.057607e+00
std_rol5       1.447171e+00
std_rol8       1.814497e+00
std_rol13      2.113948e+00
std_rol21      2.577292e+00
hl_pct         1.433346e-02
oc_pct        -7.543966e-03
ac1_pct        9.133000e+01
ac3_pct        9.448000e+01
ac5_pct        9.156000e+01
ac8_pct        9.701000e+01
ac13_pct       9.695000e+01
ac21_pct       8.645000e+01
ac1_up_dwn     0.000000e+00
ac3_up_dwn     1.000000e+00
ac5_up_dwn     0.000

In [12]:
print(a.shape)
print(idx_split)
print(a1_X_train.shape)
print(a1_y_train.shape)
print(a1_X_test.shape)
print(a1_y_test.shape)

(10840, 45)
9756
(9756, 16)
(9756,)
(1083, 16)
(1083,)


In [13]:
9583+1064

10647

In [14]:
a.shape

(10840, 45)

In [15]:
a1_X_train.iloc[0]

o            8.250000e+00
c            8.281250e+00
ac           8.281250e+00
vpd_hlc      2.245818e+06
vpd_opac     2.470400e+07
mu_rol3      8.468750e+00
mu_rol8      8.613281e+00
mu_rol21     8.617560e+00
std_rol3     2.480392e-01
std_rol8     2.171096e-01
std_rol21    2.484139e-01
hl_pct       4.166667e-02
oc_pct       3.787879e-03
ac1_pct      7.375000e+00
ac8_pct      7.500000e+00
ac21_pct     7.656250e+00
Name: 1983-04-04 00:00:00, dtype: float64

In [16]:
a1_y_train[0]

1.0

In [17]:
a1_X_test.iloc[0]

o            1.037500e+01
c            1.100000e+01
ac           1.100000e+01
vpd_hlc      3.687855e+06
vpd_opac     4.056640e+06
mu_rol3      1.085417e+01
mu_rol8      1.061719e+01
mu_rol21     1.015476e+01
std_rol3     1.572882e-01
std_rol8     4.070219e-01
std_rol21    5.041310e-01
hl_pct       6.626506e-02
oc_pct       6.024096e-02
ac1_pct      9.687500e+00
ac8_pct      9.500000e+00
ac21_pct     8.937500e+00
Name: 1993-02-24 00:00:00, dtype: float64

In [18]:
a1_y_test[0]

0.0

In [19]:
a1_master['ac1_pred'].value_counts()

1.0    4376
2.0    4322
0.0    2141
Name: ac1_pred, dtype: int64

In [20]:
a1_X_train.isin([np.inf, -np.inf]).sum()

o            0
c            0
ac           0
vpd_hlc      0
vpd_opac     0
mu_rol3      0
mu_rol8      0
mu_rol21     0
std_rol3     0
std_rol8     0
std_rol21    0
hl_pct       0
oc_pct       0
ac1_pct      0
ac8_pct      0
ac21_pct     0
dtype: int64

In [21]:
# modeling

xgb_model = xgb.XGBClassifier(objective='reg:squaredlogerror'
                              # , num_class=2
                              , random_state=42
                              # , max_depth=7
                              # , n_estimators=89
                              # , learning_rate=0.05
                              # , min_child_weight=4
                             )

xgb_model.fit(a1_X_train, a1_y_train)

y_pred = xgb_model.predict(a1_X_test)

y_pred_prob = xgb_model.predict_proba(a1_X_test)[:, 1]

acc = accuracy_score(a1_y_test, y_pred)

print(f'Accuracy: {acc}')

Accuracy: 0.41458910433979684


In [22]:
xgb_model.feature_importances_

array([0.05271924, 0.06967137, 0.        , 0.05985871, 0.06139939,
       0.07436428, 0.07765953, 0.0702377 , 0.0622757 , 0.06404307,
       0.06709889, 0.06735954, 0.06180384, 0.07564788, 0.07048443,
       0.06537647], dtype=float32)

In [23]:
a1_X_train.columns

Index(['o', 'c', 'ac', 'vpd_hlc', 'vpd_opac', 'mu_rol3', 'mu_rol8', 'mu_rol21',
       'std_rol3', 'std_rol8', 'std_rol21', 'hl_pct', 'oc_pct', 'ac1_pct',
       'ac8_pct', 'ac21_pct'],
      dtype='object')

In [24]:
xgb_model.predict(a1_X_pred.to_numpy().reshape(1,-1))[0]

2

In [25]:
xgb_model.predict_proba(a1_X_pred.to_numpy().reshape(1,-1))

array([[0.33997232, 0.24933687, 0.4106908 ]], dtype=float32)

In [26]:
type(a1_X_pred)

pandas.core.series.Series

In [27]:
a1_X_pred.shape

(16,)

In [28]:
# add predictions to X_train

a1_master1 = a[a1_col_list].copy()
a1_master1['pred0'] = xgb_model.predict(a1_master1[a1_X_train.columns])

In [30]:
a1_master1.tail(10)

Unnamed: 0_level_0,o,c,ac,vpd_hlc,vpd_opac,mu_rol3,mu_rol8,mu_rol21,std_rol3,std_rol8,std_rol21,hl_pct,oc_pct,ac1_pct,ac8_pct,ac21_pct,ac1_pred,pred0
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2023-03-30,98.0,97.879997,97.879997,26174790.0,497309500.0,96.176664,97.109999,90.693809,1.661696,1.72186,7.250574,0.023265,-0.001225,95.089996,95.809998,77.290001,0.0,0
2023-03-31,96.339996,98.010002,98.010002,18067160.0,33429580.0,97.326665,97.369999,91.530475,1.072958,1.674618,7.018199,0.032074,0.017335,96.879997,94.93,79.440002,1.0,1
2023-04-03,96.699997,96.559998,96.559998,25583440.0,361825100.0,97.483332,97.242498,92.246666,0.802269,1.69505,6.706032,0.020476,-0.001448,97.010002,96.580002,80.519997,1.0,1
2023-04-04,97.040001,95.870003,95.870003,20968910.0,36919630.0,96.813334,96.691249,92.947142,1.092261,1.215301,6.242306,0.021228,-0.012057,95.559998,99.279999,80.160004,1.0,1
2023-04-05,94.349998,92.559998,92.559998,16872390.0,29597360.0,94.996666,96.017499,93.444761,2.138232,1.780448,5.730767,0.03328,-0.018972,94.870003,96.949997,81.110001,0.0,1
2023-04-06,91.470001,92.470001,92.470001,20863920.0,47778400.0,93.633334,95.499999,93.782856,1.937535,2.147464,5.432227,0.025036,0.010933,91.559998,95.610001,84.370003,2.0,2
2023-04-10,91.32,95.480003,95.480003,10990840.0,12628910.0,93.503334,95.615,94.328094,1.712437,2.114312,4.958331,0.052343,0.045554,91.470001,93.559998,83.029999,1.0,1
2023-04-11,96.059998,94.029999,94.029999,15365630.0,25584150.0,93.993334,95.3575,94.869047,1.505336,2.172831,4.181706,0.035186,-0.021133,94.480003,95.089996,81.669998,1.0,1
2023-04-12,94.970001,92.330002,92.330002,16375170.0,20220880.0,93.946668,94.663751,95.360475,1.576653,2.138117,3.047569,0.034327,-0.027798,93.029999,96.879997,81.010002,0.0,0
2023-04-13,92.790001,92.089996,92.089996,30468600.0,57890050.0,92.816666,93.92375,95.581428,1.057607,1.814497,2.577292,0.014333,-0.007544,91.330002,97.010002,86.449997,,2
