In [33]:
import pandas as pd
import numpy as np
from pandas_datareader.data import DataReader
import talib as ta
from ta import add_all_ta_features
import yfinance

# statistics
from statsmodels.tsa.stattools import adfuller

# Unsupervised Machine Learning
from sklearn.decomposition import PCA

# Supervised Machine Learning
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

# Reporting
import matplotlib.pyplot as plt

### initial data extraction

In [34]:
# Data extraction: download price history for the chosen symbol via yfinance.
# The request window is [start_date, end_date) as passed to Ticker.history.
symbol = "^VIX"
start_date, end_date = "2017-01-01", "2022-06-01"

ticker = yfinance.Ticker(symbol)
df = ticker.history(
    start=start_date,
    end=end_date,
    auto_adjust=True,
)

In [35]:
# Display the downloaded frame (Open/High/Low/Close/Volume/Dividends/Stock Splits, indexed by Date)
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-01-03 00:00:00-05:00,14.070000,14.070000,12.850000,12.850000,0,0,0
2017-01-04 00:00:00-05:00,12.780000,12.800000,11.630000,11.850000,0,0,0
2017-01-05 00:00:00-05:00,11.960000,12.090000,11.400000,11.670000,0,0,0
2017-01-06 00:00:00-05:00,11.700000,11.740000,10.980000,11.320000,0,0,0
2017-01-09 00:00:00-05:00,11.710000,12.080000,11.460000,11.560000,0,0,0
...,...,...,...,...,...,...,...
2022-05-24 00:00:00-04:00,29.430000,31.070000,29.040001,29.450001,0,0,0
2022-05-25 00:00:00-04:00,29.330000,30.230000,28.160000,28.370001,0,0,0
2022-05-26 00:00:00-04:00,28.420000,28.459999,27.110001,27.500000,0,0,0
2022-05-27 00:00:00-04:00,27.500000,27.540001,25.570000,25.719999,0,0,0


In [48]:
# Append the `ta` library's indicator columns (prefixed volume_, volatility_,
# trend_, momentum_, others_) computed from the OHLCV columns named below.
# fillna=True is passed so the library fills missing indicator values itself.
df = add_all_ta_features(
    df,
    open="Open",
    high="High",
    low="Low",
    close="Close",
    volume="Volume",
    fillna=True,
)

  dip[idx] = 100 * (self._dip[idx] / value)
  din[idx] = 100 * (self._din[idx] / value)


In [50]:
# Show the first rows to confirm the indicator columns were appended
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,volume_adi,volume_obv,volume_cmf,...,momentum_ppo,momentum_ppo_signal,momentum_ppo_hist,momentum_pvo,momentum_pvo_signal,momentum_pvo_hist,momentum_kama,others_dr,others_dlr,others_cr
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-01-03 00:00:00-05:00,14.07,14.07,12.85,12.85,0,0,0,-0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,12.85,-32.349937,0.0,0.0
2017-01-04 00:00:00-05:00,12.78,12.8,11.63,11.85,0,0,0,-0.0,0,0.0,...,-0.624394,-0.124879,-0.499515,0.0,0.0,0.0,12.098375,-7.782101,-8.101594,-7.782101
2017-01-05 00:00:00-05:00,11.96,12.09,11.4,11.67,0,0,0,-0.0,0,0.0,...,-1.226732,-0.345249,-0.881483,0.0,0.0,0.0,11.82578,-1.51899,-1.530645,-9.182881
2017-01-06 00:00:00-05:00,11.7,11.74,10.98,11.32,0,0,0,-0.0,0,0.0,...,-1.916831,-0.659566,-1.257265,0.0,0.0,0.0,11.503803,-2.999146,-3.045041,-11.90662
2017-01-09 00:00:00-05:00,11.71,12.08,11.46,11.56,0,0,0,-0.0,0,0.0,...,-2.289756,-0.985604,-1.304152,0.0,0.0,0.0,11.534477,2.120148,2.097985,-10.03891


### Data preprocessing - stationarity

In mathematics and statistics, a stationary process (or a strict/strictly stationary process or strong/strongly stationary process) is a stochastic process whose unconditional joint probability distribution does not change when shifted in time. Consequently, parameters such as mean and variance also do not change over time. If you draw a line through the middle of a stationary process, it should be flat; the process may have 'seasonal' cycles, but overall it trends neither up nor down.

In [52]:
# Identify non-stationary columns with the Augmented Dickey-Fuller (ADF) test.
# A column counts as stationary only when BOTH hold at the 5% level:
#   * the ADF p-value is <= 0.05, and
#   * the test statistic is below the 5% critical value.
# Everything else is collected in `non_stationary`.
#
# Constant columns (e.g. an all-zero series) are flagged without running the
# test: the ADF regression is degenerate for them. Depending on the statsmodels
# version this either emits log/divide warnings and yields NaN statistics, or
# raises ValueError for constant input. Flagging them directly keeps the same
# result the NaN comparisons produced (the column ends up in the list).
non_stationary = []
for col in df.columns:
    values = df[col].to_numpy()

    if values.max() == values.min():
        non_stationary.append(col)
        continue

    adf_result = adfuller(values)
    adf_stat, p_value, critical_values = adf_result[0], adf_result[1], adf_result[4]
    below_critical = adf_stat < critical_values["5%"]  # True/False
    if p_value > 0.05 or not below_critical:
        non_stationary.append(col)
non_stationary

  llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2
  llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2
  llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2
  llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2
  llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2
  llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2
  llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2
  llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2
  llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2
  llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2
  llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2
  llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2
  llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2
  llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2
  llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2
  llf = -n

['Volume',
 'Dividends',
 'Stock Splits',
 'volume_adi',
 'volume_obv',
 'volume_cmf',
 'volume_fi',
 'volume_em',
 'volume_sma_em',
 'volume_vpt',
 'volume_vwap',
 'volume_mfi',
 'volume_nvi',
 'volatility_bbl',
 'volatility_dcl',
 'trend_sma_fast',
 'trend_sma_slow',
 'trend_ichimoku_b',
 'trend_visual_ichimoku_b',
 'momentum_pvo',
 'momentum_pvo_signal',
 'momentum_pvo_hist']

In [62]:
# Convert the flagged columns from levels to period-over-period percentage
# changes, working on a copy so `df` itself is left untouched.
df_stationary = df.copy()
pct_changes = df_stationary[non_stationary].pct_change()
df_stationary[non_stationary] = pct_changes

# The first row has no previous observation to compare with, so discard it.
df_stationary = df_stationary.iloc[1:]

In [63]:
# Find every column that contains at least one NaN and remove it
# (whole columns are dropped here, not rows).
has_nan = df_stationary.isna().any(axis=0)
na_list = df_stationary.columns[has_nan.to_numpy()]
df_stationary = df_stationary.drop(columns=na_list)

In [65]:
# Replace +inf / -inf with 0
# (presumably produced by percentage changes measured from a zero base).
df_stationary = df_stationary.replace(to_replace=[np.inf, -np.inf], value=0)

### Data preprocessing - scaling and target setting

In [68]:
# Set the classification target (for supervised ML later on):
#   +1 when the next row's Close is strictly higher than this row's Close,
#   -1 otherwise (lower or unchanged).
next_close = df_stationary["Close"].shift(-1)
df_stationary["target"] = -1
df_stationary.loc[next_close > df_stationary["Close"], "target"] = 1

# The final row has no next Close. Comparing against NaN is False, so it would
# silently be labelled -1 and survive dropna() because the target is already
# filled in. Drop rows whose next Close is unknown instead of inventing a label.
# Note: re-running only this cell trims one more row each time; re-run from the
# cell that builds df_stationary instead.
df_stationary = df_stationary.loc[next_close.notna()].dropna()

In [69]:
# Separate predictors from the label: the last column is the target,
# every column before it is a feature.
*feature_cols, target_col = df_stationary.columns
X = df_stationary[feature_cols]
y = df_stationary[target_col]

In [71]:
# Feature scaling: standardise each feature column of X.
# NOTE(review): the scaler is fitted on every row of X, i.e. before any
# train/test separation - confirm this is intended.
df_scaled = df_stationary.copy()  # copy of the frame; not used in the cells shown here
feature_scaler = StandardScaler()
X_fs = feature_scaler.fit_transform(X)

In [72]:
# Train-test split.
# The split is performed on the unscaled features and the scaler is fitted on
# the training rows only, so test-set statistics never leak into preprocessing.
# With the same random_state and row count, the same rows are selected as when
# splitting a pre-scaled matrix. X_train / X_test remain numpy arrays.
# NOTE(review): test_size=0.7 holds out 70% of the rows for testing, and
# train_test_split shuffles rows by default even though this is a time series -
# confirm both are intended.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.7, random_state=42
)
train_scaler = StandardScaler().fit(X_train_raw)
X_train = train_scaler.transform(X_train_raw)
X_test = train_scaler.transform(X_test_raw)

### Unsupervised ML - PCA Dimensionality Reduction