In [1]:
import numpy as np
import pandas as pd

from sklearn.decomposition import KernelPCA, PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

from src.features.build_features import StockTechnicals

In [2]:
# Load training data using MSFT
train_ticker = "MSFT"
all_msft_data = pd.read_csv(f'../data/{train_ticker}.csv')

# number of future days for SMA to rise
N = 26

In [3]:
technicals = StockTechnicals(all_msft_data)
X = technicals.features
y = technicals.future_sma_higher_than_current_price(N)

# we don't have the last N days of data (they're in the future)
y = y[~np.isnan(y)]
X = X[:len(y)]

  dip[i] = 100 * (self._dip[i]/self._trs[i])
  din[i] = 100 * (self._din[i]/self._trs[i])


In [4]:
# limit to top features as determined by a RandomForest in feature-selection.ipnyb
top_features = [
    'trend_visual_ichimoku_b',
    'volume_obv',
    'volatility_kcw',
    'volatility_atr',
    'trend_mass_index',
    'trend_kst_sig',
    'volume_cmf',
    'trend_adx',
    'trend_macd_signal',
    'volatility_bbw',
    'trend_kst_diff',
    'momentum_tsi',
    'trend_macd_diff',
]

In [5]:
# limit X to top features only
X_top = X[top_features]
X_train, X_test, y_train, y_test = train_test_split(X_top, y, random_state=2, stratify=y)
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

In [6]:
# run a logistic regression to get a benchmark performance on the top 13 features
for c in [0.01, 0.1, 1.0, 10, 100]:
    lr = LogisticRegression(C=c, random_state=2, solver='liblinear')
    lr.fit(X_train_std, y_train)
    y_pred = lr.predict(X_test_std)

    print(f"Accuracy score: {accuracy_score(y_test, y_pred)}\n"
          f"F1-Score:       {f1_score(y_test, y_pred)}")
    

Accuracy score: 0.6603518267929634
F1-Score:       0.7756925826630919
Accuracy score: 0.6495263870094723
F1-Score:       0.7612903225806451
Accuracy score: 0.6508795669824087
F1-Score:       0.7575187969924813
Accuracy score: 0.6549391069012178
F1-Score:       0.7592067988668555
Accuracy score: 0.6535859269282814
F1-Score:       0.7580340264650283


In [7]:
# Limiting to the top 13 features achieves an accuracy very close to using all 69 (~65% v ~69%).
# As an alternative, let's see what happens when reducing to 13 "fake" features using PCA 

In [8]:
# set up test and train datasets using full X
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2, stratify=y)
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

# fit a pca with 13 "fake" attributes
pca = PCA(13, random_state=1, tol=0.1)
X_train_std_pca = pca.fit_transform(X_train_std)
X_test_std_pca = pca.transform(X_test_std)

In [9]:
# run a logistic regression with the 13-feature pca dataset
for c in [0.01, 0.1, 1.0, 10, 100]:
    lr = LogisticRegression(C=c, random_state=2, solver='liblinear')
    lr.fit(X_train_std_pca, y_train)
    y_pred = lr.predict(X_test_std_pca)

    print(f"Accuracy score: {accuracy_score(y_test, y_pred)}\n"
          f"F1-Score:       {f1_score(y_test, y_pred)}")

Accuracy score: 0.6332882273342354
F1-Score:       0.7612334801762115
Accuracy score: 0.638700947225981
F1-Score:       0.7672188317349607
Accuracy score: 0.6373477672530447
F1-Score:       0.7665505226480837
Accuracy score: 0.6373477672530447
F1-Score:       0.7665505226480837
Accuracy score: 0.6373477672530447
F1-Score:       0.7665505226480837


In [10]:
# using a PCA with 13 "fake" attributes doesn't achieve better results than simply using
# our 13 top attributes as selected from the RandomForest. Therefore, we'll stick with those
# as they're more easily explained. Next we'll try using the kernel trick to see if we can
# improve classification by temporarily projecting data into a higher dimensional space

In [11]:
# trying out kpca

for k in ('poly', 'rbf', 'sigmoid', 'cosine', 'precomputed'):
    print(f"\nKernel: {k}")
    kpca = KernelPCA(n_components=13, random_state=1, kernel='poly', gamma=1.0)
    X_train_std_kpca = kpca.fit_transform(X_train_std)
    X_test_std_kpca = kpca.transform(X_test_std)
    
    # run a logistic regression
    for c in [0.01, 0.1, 1.0, 10, 100]:
        lr = LogisticRegression(C=c, random_state=2, solver='liblinear')
        lr.fit(X_train_std_kpca, y_train)
        y_pred = lr.predict(X_test_std_kpca)
    
        print(f"Accuracy score: {accuracy_score(y_test, y_pred)}\n"
              f"F1-Score:       {f1_score(y_test, y_pred)}")


Kernel: poly
Accuracy score: 0.6549391069012178
F1-Score:       0.7811158798283262
Accuracy score: 0.6535859269282814
F1-Score:       0.7811965811965811
Accuracy score: 0.6535859269282814
F1-Score:       0.7811965811965811
Accuracy score: 0.6535859269282814
F1-Score:       0.7811965811965811
Accuracy score: 0.6535859269282814
F1-Score:       0.7811965811965811

Kernel: rbf
Accuracy score: 0.6549391069012178
F1-Score:       0.7811158798283262
Accuracy score: 0.6535859269282814
F1-Score:       0.7811965811965811
Accuracy score: 0.6535859269282814
F1-Score:       0.7811965811965811
Accuracy score: 0.6535859269282814
F1-Score:       0.7811965811965811
Accuracy score: 0.6535859269282814
F1-Score:       0.7811965811965811

Kernel: sigmoid
Accuracy score: 0.6549391069012178
F1-Score:       0.7811158798283262
Accuracy score: 0.6535859269282814
F1-Score:       0.7811965811965811
Accuracy score: 0.6535859269282814
F1-Score:       0.7811965811965811
Accuracy score: 0.6535859269282814
F1-Score:  

In [12]:
# It doesn't seem like the kernel trick with KPCA is very effective at improving our accuracy either.
# Above is just a sample of the various hyperparameter combinations we attempted. It is our belief
# that we will need to engineer additional features or implement deeper neural networks to improve 
# our classification accuracy

In [13]:
# Stocks are inherently a time-series prediction, so let's see if adding lag values to  various
# metrics helps improve the prediction accuracy using a linear model at all

lag_days = [1, 2, 3, 4, 5, 10, 20]

# add lags is a convenience function I wrote to add trailing values to a list of metrics
all_msft_data = pd.read_csv(f'../data/{train_ticker}.csv')
technicals = StockTechnicals(all_msft_data)
technicals.add_lags(metrics=top_features, days=lag_days)
X = technicals.features
y = technicals.future_sma_higher_than_current_price(26)

# drop final 26 days which we don't have a y value
y = y[~np.isnan(y)]
X = X[:len(y)]

  dip[i] = 100 * (self._dip[i]/self._trs[i])
  din[i] = 100 * (self._din[i]/self._trs[i])


In [14]:
# limit to top features as determined by a RandomForest in feature-selection.ipnyb
top_features = [
    'trend_visual_ichimoku_b',
    'volume_obv',
    'volatility_kcw',
    'volatility_atr',
    'trend_mass_index',
    'trend_kst_sig',
    'volume_cmf',
    'trend_adx',
    'trend_macd_signal',
    'volatility_bbw',
    'trend_kst_diff',
    'momentum_tsi',
    'trend_macd_diff',
]

top_features.extend([f"{feat}_{n}_day_lag" for feat in top_features for n in lag_days])
X_top = X[top_features]
X_train, X_test, y_train, y_test = train_test_split(X_top, y, random_state=2, stratify=y)
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

# run a logistic regression
for c in [0.01, 0.1, 1.0, 10, 100]:
    lr = LogisticRegression(C=c, random_state=2, solver='liblinear')
    lr.fit(X_train_std, y_train)
    y_pred = lr.predict(X_test_std)

    print(f"Accuracy score: {accuracy_score(y_test, y_pred)}")

Accuracy score: 0.6603260869565217
Accuracy score: 0.6698369565217391
Accuracy score: 0.6766304347826086
Accuracy score: 0.6698369565217391
Accuracy score: 0.6589673913043478


In [15]:
# Even with lag variables it seems the data just aren't linearly separable.
# Let's try a non-linear classifier like a multi-layer perceptron

In [16]:
from sklearn.neural_network import MLPClassifier

for a in [0.001, 0.01, 0.1]:
    mlp = MLPClassifier(
        hidden_layer_sizes=(39, 39), 
        solver='lbfgs', 
        max_iter=5000, 
        random_state=2, 
        alpha=a, 
        activation='relu')
    mlp.fit(X_train_std, y_train)
    y_pred = mlp.predict(X_test_std)

    print(f"Accuracy score: {accuracy_score(y_test, y_pred)}")

Accuracy score: 0.9239130434782609
Accuracy score: 0.936141304347826
Accuracy score: 0.9307065217391305


In [17]:
# There are some early signs of optimism using the MLP classifier with
# predictions as high as 93.6% accuracy with an alpha of 0.01. 39 units
# two hidden layers was selected somewhat arbitrarily as 3x13 (# of impt features)
# Other combinations - larger and smaller - were only nominally different and
# often worse, so this is merely to demonstrate a more "successful" case. 
# Let's try some other hyperparameter combinations for fun

In [18]:
for solver in ['adam', 'sgd']:
    for activation in ['relu', 'tanh', 'logistic']:
        mlp = MLPClassifier(
            hidden_layer_sizes=(39, 39), 
            solver=solver,  
            max_iter=1000, 
            random_state=2, 
            alpha=0.01, 
            activation=activation,
            learning_rate='adaptive')
        mlp.fit(X_train_std, y_train)
        y_pred = mlp.predict(X_test_std)

        print(f"Accuracy score: {accuracy_score(y_test, y_pred)}")
        

Accuracy score: 0.9008152173913043
Accuracy score: 0.9184782608695652
Accuracy score: 0.9307065217391305
Accuracy score: 0.907608695652174
Accuracy score: 0.9184782608695652
Accuracy score: 0.6345108695652174




In [19]:
# Nothing in the above supercedes our initial MLP (this was an intentional demonstration)
# Perhaps there's a chance we can predict the general "trend" of a stock somewhat accurately 
# with an MLP classifier on a handful of technical indicators along with some lag values