# 1. Import libraries and data

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
import pickle
import os
from sklearn.metrics import accuracy_score, confusion_matrix, matthews_corrcoef, f1_score

# load model and input
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
model_path = os.path.join(parent_dir, 'rf.sav')
loaded_model = pickle.load(open(model_path, 'rb'))

model_input_data = os.path.join(parent_dir, 'model_input_nasdaq.csv')
model_input = pd.read_csv(model_input_data)

# select dataset
X = model_input[['call_put_ratio_200', # call put raio over the past 200 days
                'SQZ', 
                'MACD',
                'vix_fix_gauge', # fear index
                'Index', # representing the time series
                'Greater_than_MA125', 
                'Ticker_label', # An arbitary label for ticker, as no categorial/string feature allowed when training
                'Close', # closing price
                'market_sum' # the sum of the close price for the whole market on the day
                                ]]
y = [True if model_input['if_profit_21'][i]==1 else False for i in range(len(model_input))]

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# 2. prediction

In [18]:
# accuracy
y_pred = loaded_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

0.9714543573145779


In [13]:
# Calculate MCC
mcc = matthews_corrcoef(y_test, y_pred)
print(f'Matthews Correlation Coefficient: {mcc}')

Matthews Correlation Coefficient: 0.9402894030600676


In [14]:
# Calculate F1 Score
f1 = f1_score(y_test, y_pred)
print(f'F1 Score: {f1}')

F1 Score: 0.9758592791711251


In [15]:
# Generate the confusion matrix
cm = confusion_matrix(y_test, y_pred)
# Calculate the True Positive Rate (TPR)
# cm[1,1] is the number of true positives
# cm[1,:].sum() is the number of actual positives (true positives + false negatives)
TPR = cm[1, 1] / cm[1, :].sum()
print(f'True Positive Rate: {TPR}')

True Positive Rate: 0.9808509522060752


# Appendix: Training Process

In [4]:
import numpy as np
import pandas as pd
from filterpy.kalman import KalmanFilter
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split,TimeSeriesSplit, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import numpy as np
import pandas as pd
from filterpy.kalman import KalmanFilter
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split,TimeSeriesSplit, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.metrics import confusion_matrix, matthews_corrcoef, f1_score
from imblearn.over_sampling import SMOTE
from collections import Counter
import os

model_input_data = os.path.join(parent_dir, 'model_input_nasdaq.csv')
model_input = pd.read_csv(model_input_data)

# Select features and target
features = ['call_put_ratio_200', # call put raio over the past 200 days
                'SQZ', 
                'MACD',
                'vix_fix_gauge', # fear index
                'Index', # representing the time series
                'Greater_than_MA125', 
                'Ticker_label', # An arbitary label for ticker, as no categorial/string feature allowed when training
                'Close', # closing price
                'market_sum' # the sum of the close price for the whole market on the day
                                ]
target = 'if_profit_21'


X = model_input[features]
y = [True if model_input['if_profit_21'][i]==1 else False for i in range(len(model_input))]

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
# fix the data imbalance
X_train, y_train = SMOTE().fit_resample(X_train, y_train)
print(sorted(Counter(y_train).items())) # verify the balanced data

[(False, 76175), (True, 76175)]


## Logistics

In [6]:
# LogisticRegression
linear_model = LogisticRegression()

linear_model.fit(X_train, y_train)

# make prediction
y_pred = linear_model.predict(X_test)


# testing accuracy on training and testing set
y_pred_train = linear_model.predict(X_train)
y_pred_test = linear_model.predict(X_test)

training_accuracy = accuracy_score(y_train, y_pred_train)
print(f'Training Accuracy: {training_accuracy}')
testing_accuracy = accuracy_score(y_test, y_pred_test)
print(f'Testing Accuracy: {testing_accuracy}')

# Calculate MCC
mcc = matthews_corrcoef(y_test, y_pred)
# Calculate F1 Score
f1 = f1_score(y_test, y_pred)
print(f'Matthews Correlation Coefficient: {mcc}')
print(f'F1 Score: {f1}')


Training Accuracy: 0.5603019363308172
Testing Accuracy: 0.5599850620857063
Matthews Correlation Coefficient: 0.10581967747224609
F1 Score: 0.6123646333104866


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Decision Tree

In [9]:
decision_tree_model = DecisionTreeClassifier(class_weight = None, criterion= 'gini', 
                                            max_depth= 10, max_features= 'sqrt',
                                            min_samples_leaf= 1, min_samples_split= 5, splitter= 'random')
decision_tree_model.fit(X_train, y_train)

# make prediction
y_pred = decision_tree_model.predict(X_test)


# testing accuracy on training and testing set
y_pred_train = decision_tree_model.predict(X_train)
y_pred_test = decision_tree_model.predict(X_test)

training_accuracy = accuracy_score(y_train, y_pred_train)
print(f'Training Accuracy: {training_accuracy}')
testing_accuracy = accuracy_score(y_test, y_pred_test)
print(f'Testing Accuracy: {testing_accuracy}')

# Calculate MCC
mcc = matthews_corrcoef(y_test, y_pred)
# Calculate F1 Score
f1 = f1_score(y_test, y_pred)
print(f'Matthews Correlation Coefficient: {mcc}')
print(f'F1 Score: {f1}')


Training Accuracy: 0.5628487036429275
Testing Accuracy: 0.5454828369588897
Matthews Correlation Coefficient: 0.10132847737977792
F1 Score: 0.5735143816615564


## Random Forest

In [12]:
random_forest_model = RandomForestClassifier(criterion= 'gini',
                                            max_depth= None,
                                            max_features= 'sqrt',
                                            min_samples_leaf= 1,
                                            min_samples_split= 2,
                                            min_weight_fraction_leaf= 0.0,
                                            monotonic_cst= None,
                                            n_estimators= 100,
                                            oob_score= False,)
random_forest_model.fit(X_train, y_train)

# make prediction
y_pred = random_forest_model.predict(X_test)


# testing accuracy on training and testing set
y_pred_train = random_forest_model.predict(X_train)
y_pred_test = random_forest_model.predict(X_test)

training_accuracy = accuracy_score(y_train, y_pred_train)
print(f'Training Accuracy: {training_accuracy}')
testing_accuracy = accuracy_score(y_test, y_pred_test)
print(f'Testing Accuracy: {testing_accuracy}')

# Calculate MCC
mcc = matthews_corrcoef(y_test, y_pred)
# Calculate F1 Score
f1 = f1_score(y_test, y_pred)
print(f'Matthews Correlation Coefficient: {mcc}')
print(f'F1 Score: {f1}')


Training Accuracy: 0.9999934361667213
Testing Accuracy: 0.8545109389101546
Matthews Correlation Coefficient: 0.6991445554932794
F1 Score: 0.8768343125115262


## XGBoost

In [13]:
xgboost_model = XGBClassifier(colsample_bytree=0.8, gamma= 0, learning_rate=0.01, max_depth= 3, n_estimators= 100, 
                            reg_alpha= 0.1, reg_lambda= 1.5, subsample= 1.0)
xgboost_model.fit(X_train, y_train)

# make prediction
y_pred = xgboost_model.predict(X_test)


# testing accuracy on training and testing set
y_pred_train = xgboost_model.predict(X_train)
y_pred_test = xgboost_model.predict(X_test)

training_accuracy = accuracy_score(y_train, y_pred_train)
print(f'Training Accuracy: {training_accuracy}')
testing_accuracy = accuracy_score(y_test, y_pred_test)
print(f'Testing Accuracy: {testing_accuracy}')

# Calculate MCC
mcc = matthews_corrcoef(y_test, y_pred)
# Calculate F1 Score
f1 = f1_score(y_test, y_pred)
print(f'Matthews Correlation Coefficient: {mcc}')
print(f'F1 Score: {f1}')


Training Accuracy: 0.6113160485723662
Testing Accuracy: 0.6190831855102231
Matthews Correlation Coefficient: 0.21488019887530396
F1 Score: 0.6751075011944577
