In [1]:
import yfinance as yf
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score

In [2]:
# aapl_day = yf.download("AAPL", start="1980-12-08", end=None)
# aapl_day.to_pickle('./data/aapl_day.pkl')
# amd_day = yf.download("AMD", start="1980-03-17", end=None)
# amd_day.to_pickle('./data/amd_day.pkl')

In [3]:
# aapl_hour = yf.download(tickers="AAPL",
#                        period="2y",
#                        interval="1h"
#                       )
# aapl_hour.to_pickle('./data/aapl_hr.pkl')

In [4]:
def compute_up_down(begin, end):
    if abs((begin - end) / begin) <= 0.012:
        return 0
    elif (begin - end) / begin > 0.007:
        return 1
    else:
        return 2
    
def compute_volume_per_dollar(vol, begin, end):
    if begin - end == 0:
        return vol
    else:
        return vol / abs(begin - end)
    
def compute_pct_change(num1, num2, dem):
    if dem == 0:
        return 0
    else:
        return (num2 - num1) / dem

In [5]:
a = pd.read_pickle('./data/amd_day.pkl')
a = a.rename(columns={'Adj Close': 'ac'
                      , 'Open': 'o'
                      , 'Close': 'c'
                      , 'High': 'h'
                      , 'Low': 'l'
                      , 'Volume': 'v'
                     })

In [6]:
a['dow'] = a.index.weekday

a['vpd_hlc'] = a.apply(lambda row: compute_volume_per_dollar(row['v'], row['h'], row['l']), axis=1)  # volume per dollar high low change
a['vpd_opac'] = a.apply(lambda row: compute_volume_per_dollar(row['v'], row['o'], row['c']), axis=1)  # volume per dollar open close absolute change

a['ac1'] = a['ac'].shift(1)
a['ac3'] = a['ac'].shift(3)
a['ac5'] = a['ac'].shift(5)
a['ac8'] = a['ac'].shift(8)
a['ac13'] = a['ac'].shift(13)
a['ac21'] = a['ac'].shift(21)

a['mu_rol3'] = a['ac'].rolling(window=3).mean()
a['mu_rol5'] = a['ac'].rolling(window=5).mean()
a['mu_rol8'] = a['ac'].rolling(window=8).mean()
a['mu_rol13'] = a['ac'].rolling(window=13).mean()
a['mu_rol21'] = a['ac'].rolling(window=21).mean()

a['std_rol3'] = a['ac'].rolling(window=3).std()
a['std_rol5'] = a['ac'].rolling(window=5).std()
a['std_rol8'] = a['ac'].rolling(window=8).std()
a['std_rol13'] = a['ac'].rolling(window=13).std()
a['std_rol21'] = a['ac'].rolling(window=21).std()

a['hl_pct'] = a.apply(lambda row: compute_pct_change(row['l'], row['h'], row['o']), axis=1)
a['oc_pct'] = a.apply(lambda row: compute_pct_change(row['o'], row['c'], row['o']), axis=1)

a['ac1_pct'] = a['ac1'] - a['ac'] / a['ac']
a['ac3_pct'] = a['ac3'] - a['ac'] / a['ac']
a['ac5_pct'] = a['ac5'] - a['ac'] / a['ac']
a['ac8_pct'] = a['ac8'] - a['ac'] / a['ac']
a['ac13_pct'] = a['ac13'] - a['ac'] / a['ac']
a['ac21_pct'] = a['ac21'] - a['ac'] / a['ac']

a.dropna(axis=0, inplace=True)

a['ac1_up_dwn'] = a.apply(lambda row: compute_up_down(row['ac1'], row['ac']), axis=1)
a['ac3_up_dwn'] = a.apply(lambda row: compute_up_down(row['ac3'], row['ac']), axis=1)
a['ac5_up_dwn'] = a.apply(lambda row: compute_up_down(row['ac5'], row['ac']), axis=1)
a['ac8_up_dwn'] = a.apply(lambda row: compute_up_down(row['ac8'], row['ac']), axis=1)
a['ac13_up_dwn'] = a.apply(lambda row: compute_up_down(row['ac13'], row['ac']), axis=1)
a['ac21_up_dwn'] = a.apply(lambda row: compute_up_down(row['ac21'], row['ac']), axis=1)

a['ac1_pred'] = a['ac1_up_dwn'].shift(-1)
a['ac3_pred'] = a['ac3_up_dwn'].shift(-3)
a['ac5_pred'] = a['ac5_up_dwn'].shift(-5)
a['ac8_pred'] = a['ac8_up_dwn'].shift(-8)
a['ac13_pred'] = a['ac13_up_dwn'].shift(-13)
a['ac21_pred'] = a['ac21_up_dwn'].shift(-21)

In [7]:
a1_col_list = [# 'o'
               # , 'h'
               # , 'l'
                'c'
               # , 'ac'
               , 'v'
               # , 'vpd_hlc'
               # , 'vpd_opac'
               # , 'ac1'
               # , 'ac3'
               # , 'ac5'
               # , 'ac8'
               # , 'ac13'
               # , 'ac21'
               , 'mu_rol3'
               # , 'mu_rol5'
               # , 'mu_rol8'
               # , 'mu_rol13'
               , 'mu_rol21'
               # , 'std_rol3'
               # , 'std_rol5'
               # , 'std_rol8'
               , 'std_rol13'
               , 'std_rol21'
               # , 'hl_pct'
               , 'oc_pct'
               # , 'ac1_pct'
               , 'ac3_pct'
               # , 'ac5_pct'
               # , 'ac8_pct'
               , 'ac13_pct'
               , 'ac21_pct'
               # , 'ac1_up_dwn'
               # , 'ac3_up_dwn'
               # , 'ac5_up_dwn'
               # , 'ac8_up_dwn'
               # , 'ac13_up_dwn'
               # , 'ac21_up_dwn'
               , 'ac1_pred'
              ]

a1_master0 = a[a1_col_list].copy()

a1_X_pred = a1_master0[a1_col_list[:-1]].iloc[-1].copy()

a1_master0.drop(a1_master0.index[-1], axis=0, inplace=True)

a1_master0 = a1_master0.sample(frac=1, random_state=21)
a1_master0 = a1_master0.sample(frac=1, random_state=42)

idx_split2_9th = int(a.shape[0] * 2.0 / 9)
idx_split3_9th = int(a.shape[0] * 1.0 / 3)
idx_split5_9th = int(a.shape[0] * 5.0 / 9)
idx_split6_9th = int(a.shape[0] * 2.0 / 3)
idx_split8_9th = int(a.shape[0] * 8.0 / 9)

a1_X_train = a1_master0.iloc[:idx_split2_9th].drop('ac1_pred', axis=1)
a1_y_train = a1_master0['ac1_pred'].iloc[:idx_split2_9th]
a1_X_test = a1_master0.iloc[idx_split2_9th:idx_split3_9th].drop('ac1_pred', axis=1)
a1_y_test = a1_master0['ac1_pred'].iloc[idx_split2_9th:idx_split3_9th]

In [8]:
a1_master0['ac1_pred'].value_counts()

2.0    3642
1.0    3632
0.0    3567
Name: ac1_pred, dtype: int64

In [9]:
# sanity check first partition for training
print(a1_master0.shape)
print(a1_X_train.shape)
print(a1_y_train.shape)
print(a1_X_test.shape)
print(a1_y_test.shape)

(10841, 11)
(2409, 10)
(2409,)
(1205, 10)
(1205,)


In [10]:
# check for infinities
a1_X_train.isin([np.inf, -np.inf]).sum()

c            0
v            0
mu_rol3      0
mu_rol21     0
std_rol13    0
std_rol21    0
oc_pct       0
ac3_pct      0
ac13_pct     0
ac21_pct     0
dtype: int64

In [19]:
# modeling partition first 1/3

xgb_model0 = xgb.XGBClassifier(objective='multi:softmax'
                              , num_class=3
                              , random_state=42
                              , max_depth=7
                              , n_estimators=55
                              , learning_rate=0.0001
                              , min_child_weight=1
                             )

xgb_model0.fit(a1_X_train, a1_y_train)

y_pred = xgb_model0.predict(a1_X_test)

# y_pred_prob = xgb_model0.predict_proba(a1_X_test)[:, 1]

acc = accuracy_score(a1_y_test, y_pred)

print(f'Accuracy: {acc}')

Accuracy: 0.3360995850622407


In [12]:
xgb_model0.feature_importances_

array([0.06181573, 0.10500024, 0.09692524, 0.08994446, 0.09718166,
       0.10838262, 0.13061236, 0.14201309, 0.08128438, 0.08684026],
      dtype=float32)

In [None]:
a1_X_train.columns

In [None]:
xgb_model0.predict(a1_X_pred.to_numpy().reshape(1,-1))[0]

In [None]:
xgb_model0.predict_proba(a1_X_pred.to_numpy().reshape(1,-1))

In [None]:
# add predictions to X_train

# idx_split2_9th = int(a.shape[0] * 2.0 / 9)
# idx_split3_9th = int(a.shape[0] * 1.0 / 3)
# idx_split5_9th = int(a.shape[0] * 5.0 / 9)
# idx_split6_9th = int(a.shape[0] * 2.0 / 3)
# idx_split8_9th = int(a.shape[0] * 8.0 / 9)




a1_master1 = a1_master0[a1_col_list].iloc[idx_split3_9th:idx_split6_9th].copy()
a1_master1['pred0'] = xgb_model0.predict(a1_master1[a1_X_train.columns])

idx_2_3 = int(a1_master1.shape[0] * 2 / 3)

a1_X_train = a1_master1.iloc[:idx_2_3].drop('ac1_pred', axis=1)
a1_y_train = a1_master1['ac1_pred'].iloc[:idx_2_3]
a1_X_test = a1_master1.iloc[idx_2_3:].drop('ac1_pred', axis=1)
a1_y_test = a1_master1['ac1_pred'].iloc[idx_2_3:]

In [None]:
# sanity check first partition for training
print(a1_master1.shape)
print(a1_X_train.shape)
print(a1_y_train.shape)
print(a1_X_test.shape)
print(a1_y_test.shape)

In [None]:
# modeling partition 2nd 1/3

xgb_model1 = xgb.XGBClassifier(objective='multi:softmax'
                              , num_class=3
                              , random_state=42
                              , max_depth=7
                              , n_estimators=55
                              , learning_rate=0.01
                              , min_child_weight=3
                             )

xgb_model1.fit(a1_X_train, a1_y_train)

y_pred = xgb_model1.predict(a1_X_test)

# y_pred_prob = xgb_model0.predict_proba(a1_X_test)[:, 1]

acc = accuracy_score(a1_y_test, y_pred)

print(f'Accuracy: {acc}')

In [None]:
xgb_model1.feature_importances_

In [None]:
# 4/16/2023 --to do: need to put back last row to fill pred0 column

In [None]:
a1_master1.tail(10)

In [None]:
a1_master1.shape

In [None]:
len(a1_X_train.columns)

In [None]:
a1_X_columns = list(a1_X_train.columns)
a1_X_columns.append('pred0')
a1_X_columns

In [None]:
a1_X_pred1 = a1_master1[a1_X_columns].iloc[-1].copy()
a1_X_pred1

In [None]:
a1_master1_scrambled = a1_master1.iloc[:-1].copy()
a1_master1_scrambled.tail(10)

In [None]:
a1_master1_scrambled = a1_master1_scrambled.sample(frac=1)
a1_master1_scrambled = a1_master1_scrambled.sample(frac=1)

a1_master1_scrambled.tail(10)

In [None]:
a1_idx_tt_split = int(a1_master1_scrambled.shape[0] * 0.9)
a1_idx_tt_split

In [None]:
a1_X_train = a1_master1_scrambled[a1_X_columns].iloc[:a1_idx_tt_split]
a1_y_train = a1_master1_scrambled['ac1_pred'].iloc[:a1_idx_tt_split]

a1_X_test = a1_master1_scrambled[a1_X_columns].iloc[a1_idx_tt_split:]
a1_y_test = a1_master1_scrambled['ac1_pred'].iloc[a1_idx_tt_split:]

print(f'a1_master1 shape: {a1_master1.shape}')
print(f'X_train shape: {a1_X_train.shape}')
print(f'y_train shape: {a1_y_train.shape}')
print(f'X_test shape: {a1_X_test.shape}')
print(f'y_test shape: {a1_y_test.shape}')
print(f'a1_X_pred1 shape: {a1_X_pred1.shape}')

In [None]:
# 2nd modeling

xgb_model1 = xgb.XGBClassifier(objective='binary:logistic'
                              # , num_class=2
                              , random_state=42
                              # , max_depth=7
                              # , n_estimators=89
                              # , learning_rate=0.05
                              # , min_child_weight=4
                             )

xgb_model1.fit(a1_X_train, a1_y_train)

y_pred = xgb_model1.predict(a1_X_test)

y_pred_prob = xgb_model1.predict_proba(a1_X_test)[:, 1]

acc = accuracy_score(a1_y_test, y_pred)

print(f'Accuracy: {acc}')

In [None]:
xgb_model1.feature_importances_

In [None]:
a1_X_columns