In [92]:
import pandas as pd
import yfinance as yf
from hmmlearn import hmm
from xgboost import XGBClassifier
from sklearn.mixture import GaussianMixture
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
import warnings
import numpy as np
import talib as ta
warnings.filterwarnings("ignore")

np.random.seed(42)

# Load data
data = yf.download("DTE", start="2023-04-01", end="2024-04-01", interval="1h")
data['RSI'] = ta.RSI(data['Close'], timeperiod=14)
data['SMA'] = ta.SMA(data['Close'], timeperiod=10)
data['EMA'] = ta.EMA(data['Close'], timeperiod=10)
data['WMA'] = ta.WMA(data['Close'], timeperiod=10)
#adding momentum indicators
data['ROC'] = ta.ROC(data['Close'], timeperiod=10)
data['MOM'] = ta.MOM(data['Close'], timeperiod=10)
scaler = StandardScaler()
data = pd.DataFrame(scaler.fit_transform(data), columns=data.columns, index=data.index)
data['NextDayClose'] = data['Close'].shift(-1)
data['NextDayMovement'] = (data['NextDayClose'] - data['Close']).apply(lambda x: 1 if x > 0 else 0)
data.drop('NextDayClose', axis=1, inplace=True)
data.dropna(inplace=True)  # Remove NA values due to shift

# Features for HMM and other models
features = ['Open', 'RSI', 'SMA', 'EMA', 'WMA', 'ROC', 'MOM']
data_features = data[features]

# Train Hidden Markov Model to infer hidden states
n_components = 4
hmm_model = hmm.GaussianHMM(n_components=n_components, covariance_type="full", n_iter=100000)
hmm_model.fit(data_features)
hidden_states = hmm_model.predict(data_features)

# Split data for training and testing, reset index to align indices
X_train, X_test, y_train, y_test = train_test_split(data_features, data['NextDayMovement'], test_size=0.45, random_state=42)
X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

hidden_states_train, hidden_states_test = train_test_split(hidden_states, test_size=0.45, random_state=42)
hidden_states_test = pd.Series(hidden_states_test).reset_index(drop=True)

# Train GMM
gmm = GaussianMixture(n_components=n_components, covariance_type='full', random_state=0)
gmm.fit(X_train)
gmm_probs = gmm.predict_proba(X_test)

# Train XGBoost
xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', objective='binary:logistic', 
                    n_estimators=1000, max_depth=3, learning_rate=0.01, random_state=42)
xgb.fit(X_train, y_train)
xgb_probs = xgb.predict_proba(X_test)

# Prepare logistic regression data
X_lr = pd.DataFrame({
    'Open': X_test['Open'],
    'RSI': X_test['RSI'],
    'SMA': X_test['SMA'],
    'EMA': X_test['EMA'],
    'WMA': X_test['WMA'],
    'ROC': X_test['ROC'],
    'MOM': X_test['MOM'],
    'Hidden_states': hidden_states_test,
    'GMM_prob': gmm_probs[:, 1],
    'XGB_prob': xgb_probs[:, 1],
    'Hidden_XGB_inter': hidden_states_test * xgb_probs[:, 1],
    'Hidden GMM inter': hidden_states_test * gmm_probs[:, 1]
})


# Statistical significance
X_lr = sm.add_constant(X_lr)  # Adding a constant for intercept
model = sm.Logit(y_test, X_lr,)
results = model.fit_regularized(method='l1', disp=False, alpha=0.01, L1_wt=3, full_output=True, maxiter=10000)
print(results.summary())
scoring = results.predict(X_lr)
accuracy = accuracy_score(y_test, scoring.round())
print(f'Accuracy of logistic regression model after regularization: {accuracy}')


[*********************100%%**********************]  1 of 1 completed


                           Logit Regression Results                           
Dep. Variable:        NextDayMovement   No. Observations:                  775
Model:                          Logit   Df Residuals:                      762
Method:                           MLE   Df Model:                           12
Date:                Wed, 15 May 2024   Pseudo R-squ.:                 0.02663
Time:                        13:12:22   Log-Likelihood:                -522.36
converged:                       True   LL-Null:                       -536.65
Covariance Type:            nonrobust   LLR p-value:                  0.004552
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
const                0.8771      0.724      1.211      0.226      -0.542       2.296
Open                -0.1864      1.220     -0.153      0.879      -2.578       2.205
RSI                  0.0160 