In [16]:
!pip install tensorflow-addons



In [17]:
import pandas as pd
import numpy as np
from sklearn.metrics import *
import matplotlib.pyplot as plt
from sklearn import tree, ensemble, model_selection, metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

# NN
import tensorflow as tf
import tensorflow_hub as hub

from keras.callbacks import EarlyStopping


In [18]:
from google.colab import drive

# Mount at an absolute path so the mount point always matches the absolute
# '/content/drive/MyDrive/...' paths used by the read_csv cells, independent
# of the kernel's current working directory.
drive.mount('/content/drive')

Drive already mounted at drive; to attempt to forcibly remount, call drive.mount("drive", force_remount=True).


In [19]:
# Load the training split; "change_lead" is the label, every other column is a feature.
train = pd.read_csv('/content/drive/MyDrive/Data Modelling/Stock Prices Prediction/train.csv')

X_train = train.drop("change_lead", axis = 1)
y_train = train["change_lead"]

In [20]:
# Load the test split and separate the "change_lead" label from the feature columns.
test = pd.read_csv(
    '/content/drive/MyDrive/Data Modelling/Stock Prices Prediction/test.csv'
)
y_test = test["change_lead"]
X_test = test.drop(columns=["change_lead"])

# Helper Function

In [21]:
# Metric registries keyed by run title. "agg" receives every run; the
# train / val / test registries receive the run that matches the title.
# Each name is bound to its own, independent dictionary.
agg_accuracy_score_dict, agg_f1_score_dict, agg_roc_auc_dict = {}, {}, {}

train_accuracy_score_dict, train_f1_score_dict, train_roc_auc_dict = {}, {}, {}

val_accuracy_score_dict, val_f1_score_dict, val_roc_auc_dict = {}, {}, {}

test_accuracy_score_dict, test_f1_score_dict, test_roc_auc_dict = {}, {}, {}

def evaluation(true, pred, title, scores=None):
    """Print binary-classification metrics for one run and record them by title.

    Args:
        true: Ground-truth labels.
        pred: Predicted hard class labels, in the same order as ``true``.
        title: Name of the run; used as the key in the module-level metric
            dictionaries. A title containing "train dataset" is also recorded
            in the ``train_*`` dictionaries, one containing "test dataset" in
            the ``test_*`` dictionaries, and any other title in the ``val_*``
            dictionaries. Re-using a title overwrites the earlier entry.
        scores: Optional continuous scores (e.g. predicted probabilities of
            the positive class). When given, the ROC curve is built from
            these instead of ``pred``; when omitted nothing changes.

    Returns:
        None. Everything is printed and stored in the metric dictionaries.
    """
    print("================== Evaluation on {} ==================".format(title))
    # accuracy
    acc = accuracy_score(true, pred)
    print("accuracy: {}\n".format(acc))
    agg_accuracy_score_dict[title] = acc
    # f1 score
    # zero_division=0 keeps the reported value (0.0) for classes that are never
    # predicted but stops the report from emitting undefined-metric warnings.
    print("classification report: \n{}\n".format(
        classification_report(true, pred, zero_division=0)))
    # computed once, then both printed and stored
    f1 = f1_score(true, pred)
    print("F1 score: \n{}\n".format(f1))
    agg_f1_score_dict[title] = f1
    # confusion matrix
    print("Confusion matrix: \n{}\n".format(confusion_matrix(true, pred)))
    # roc auc result
    # With hard 0/1 labels the curve has a single operating point; pass
    # `scores` to get a threshold-swept ROC AUC instead.
    roc_input = pred if scores is None else scores
    fpr, tpr, thresholds = roc_curve(true, roc_input)
    roc_auc = auc(fpr, tpr)
    print("ROC AUC: {}\n".format(roc_auc))
    agg_roc_auc_dict[title] = roc_auc
    print("================== End of Evaluation on {} ==================".format(title))

    # File the same three numbers under the split named in the title.
    if "train dataset" in title:
        train_accuracy_score_dict[title] = acc
        train_f1_score_dict[title] = f1
        train_roc_auc_dict[title] = roc_auc
    elif "test dataset" in title:
        test_accuracy_score_dict[title] = acc
        test_f1_score_dict[title] = f1
        test_roc_auc_dict[title] = roc_auc
    else:
        val_accuracy_score_dict[title] = acc
        val_f1_score_dict[title] = f1
        val_roc_auc_dict[title] = roc_auc

# NN for all companies

## Baseline

In [7]:
# baseline model
def create_baseline():
    """Build and compile the fixed-size baseline network.

    Layout: 13 inputs -> Dense(60, relu) -> Dense(24, relu) -> Dense(1, sigmoid),
    compiled with binary cross-entropy, the 'SGD' optimizer and accuracy tracking.
    """
    layers = tf.keras.layers
    network = tf.keras.Sequential([
        layers.Dense(60, input_shape=(13,), activation='relu'),
        layers.Dense(24, activation='relu'),
        layers.Dense(1, activation='sigmoid'),
    ])
    network.compile(loss='binary_crossentropy', optimizer='SGD', metrics=['accuracy'])
    return network

# Seed TensorFlow before the model is built so that weight initialisation and
# batch shuffling are repeatable between runs (as far as the global seed
# controls them on the current hardware).
tf.random.set_seed(42)
baseline_tf = create_baseline()

# Stop once val_loss has not improved for 5 consecutive epochs and roll the
# model back to the weights of the best epoch.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience = 5, restore_best_weights = True)
baseline_tf.fit(X_train, y_train, batch_size = 50,
                epochs = 100, validation_split = 0.1,
                callbacks = [es])
baseline_ypred = baseline_tf.predict(X_test)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 6: early stopping


In [8]:
# Turn the sigmoid outputs into hard 0/1 labels using a 0.5 cut-off.
yprednn = baseline_ypred.copy()
yprednn = pd.DataFrame(yprednn, columns=['y_pred_nn'])
yprednn['y_pred_nn'] = yprednn['y_pred_nn'].apply(lambda x: 0 if x < 0.5 else 1)
print(f1_score(y_test, yprednn['y_pred_nn']))
# evaluation() prints its own report and returns None, so it is called
# directly instead of being wrapped in print(). The title is unique to this
# run (so it is not overwritten by another evaluation) and contains
# "test dataset" so the scores are filed under the test dictionaries.
evaluation(y_test, yprednn['y_pred_nn'], "NN baseline (all companies) test dataset")

0.5185185185185185
accuracy: 0.5289855072463768

classification report: 
              precision    recall  f1-score   support

           0       0.56      0.52      0.54       147
           1       0.50      0.54      0.52       129

    accuracy                           0.53       276
   macro avg       0.53      0.53      0.53       276
weighted avg       0.53      0.53      0.53       276


F1 score: 
0.5185185185185185

Confusion matrix: 
[[76 71]
 [59 70]]

ROC AUC: 0.5298212308179086

None


## Hyperparameter tuning

In [9]:
import tensorflow as tf
import tensorflow_hub as hub

from keras.callbacks import EarlyStopping
import tensorflow_addons as tfa

# baseline model
def create_baseline(hiddenLayerOne=60, hiddenLayerTwo=24,
    dropout=0.2, learnRate=0.01):
    """Build and compile a two-hidden-layer network with tunable sizes.

    Args:
        hiddenLayerOne: Number of units in the first hidden Dense layer.
        hiddenLayerTwo: Number of units in the second hidden Dense layer.
        dropout: Accepted for signature compatibility; this architecture has
            no Dropout layer, so the value is not used.
        learnRate: Learning rate handed to the SGD optimizer.

    Returns:
        A compiled ``tf.keras.Sequential`` model expecting 13 input features.
    """
    # create model
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Dense(hiddenLayerOne, input_shape=(13,), activation='relu'))
    model.add(tf.keras.layers.Dense(hiddenLayerTwo, activation='relu'))
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
    # Compile model
    # `learning_rate` replaces the deprecated `lr` keyword, which triggers a
    # warning on every model build.
    model.compile(loss='binary_crossentropy',
                  optimizer=tf.keras.optimizers.SGD(learning_rate=learnRate),
                  metrics=['mse'])
    return model

In [10]:
# candidate values for every hyperparameter that could be tuned
hiddenLayerOne = [60, 32, 24]
hiddenLayerTwo = [32, 24, 12]
learnRate = [0.01, 0.001, 0.0001]
dropout = [0.3, 0.4, 0.5]
batchSize = [4, 8, 16, 32]
epochs = [10, 20, 30, 40]

# search space handed to the random search: only the two layer widths and the
# learning rate are active; dropout, batch size and epochs are left out
grid = {
    "hiddenLayerOne": hiddenLayerOne,
    "learnRate": learnRate,
    "hiddenLayerTwo": hiddenLayerTwo,
}

In [11]:
from sklearn.model_selection import RandomizedSearchCV
from keras.wrappers.scikit_learn import KerasClassifier

# initialize a random search with a 3-fold cross-validation and then
# start the hyperparameter search process
print("[INFO] performing random search...")

model = KerasClassifier(build_fn=create_baseline, epochs=6, batch_size=20, verbose=1)

# random_state pins which parameter combinations are sampled from the grid,
# so repeated runs compare the same candidates.
searcher = RandomizedSearchCV(estimator=model, n_jobs=-1, cv=3,
    param_distributions=grid, scoring="f1", random_state=42)
searchResults = searcher.fit(X_train, y_train)

# summarize grid search information
bestScore = searchResults.best_score_
bestParams = searchResults.best_params_

print("bestParams is",
    bestParams)

[INFO] performing random search...


  


Epoch 1/6


  super(SGD, self).__init__(name, **kwargs)


Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
bestParams is {'learnRate': 0.01, 'hiddenLayerTwo': 24, 'hiddenLayerOne': 60}


In [12]:
print("[INFO] evaluating the best model...")
# take the estimator selected by the search and score its test predictions
bestModel = searchResults.best_estimator_
ypredz = bestModel.predict(X_test)
tuned_f1 = f1_score(y_test, ypredz)
print(tuned_f1)

[INFO] evaluating the best model...
0.6331658291457287


In [14]:
# evaluation() prints its own report and returns None, so wrapping it in
# print() only added a stray "None". The title is unique to this run (so it
# does not overwrite another evaluation's entry) and contains "test dataset"
# so the scores are filed under the test dictionaries.
evaluation(y_test, ypredz, "NN tuned (all companies) test dataset")

accuracy: 0.47101449275362317

classification report: 
              precision    recall  f1-score   support

           0       0.57      0.03      0.05       147
           1       0.47      0.98      0.63       129

    accuracy                           0.47       276
   macro avg       0.52      0.50      0.34       276
weighted avg       0.52      0.47      0.32       276


F1 score: 
0.6331658291457287

Confusion matrix: 
[[  4 143]
 [  3 126]]

ROC AUC: 0.5019775352001266

None


In [15]:
# precision of the `ypredz` predictions against the test labels
tuned_precision = precision_score(y_test, ypredz)
print(tuned_precision)

0.4684014869888476


# NN for AAPL

## Baseline model

In [22]:
# Keep only the AAPL rows of the training file, then drop the ticker column
# that was used for filtering.
train_wx = pd.read_csv(
    '/content/drive/MyDrive/Data Modelling/Stock Prices Prediction/train_wx.csv'
)
train_wx = train_wx.loc[train_wx['symbol'] == "AAPL"].drop(columns=['symbol'])

# split into label ("change_lead") and feature columns
y_train_wx = train_wx["change_lead"]
X_train_wx = train_wx.drop(columns=["change_lead"])

In [23]:
# Keep only the AAPL rows of the test file, then drop the ticker column
# that was used for filtering.
test_wx = pd.read_csv(
    '/content/drive/MyDrive/Data Modelling/Stock Prices Prediction/test_wx.csv'
)
is_aapl = test_wx['symbol'].eq('AAPL')
test_wx = test_wx[is_aapl].drop(columns=['symbol'])

# split into label ("change_lead") and feature columns
y_test_wx = test_wx["change_lead"]
X_test_wx = test_wx.drop(columns=["change_lead"])

In [24]:
# baseline model
def create_baseline():
    """Return the compiled fixed-size baseline classifier.

    Three Dense layers (60 relu, 24 relu, 1 sigmoid) on 13 input features,
    trained with binary cross-entropy and the 'SGD' optimizer; accuracy is
    the tracked metric.
    """
    dense = tf.keras.layers.Dense
    stack = [
        dense(60, input_shape=(13,), activation='relu'),
        dense(24, activation='relu'),
        dense(1, activation='sigmoid'),
    ]
    net = tf.keras.Sequential(stack)
    net.compile(loss='binary_crossentropy', optimizer='SGD', metrics=['accuracy'])
    return net

# Seed TensorFlow before the model is built so that weight initialisation and
# batch shuffling are repeatable between runs (as far as the global seed
# controls them on the current hardware).
tf.random.set_seed(42)
baseline_tf = create_baseline()

# Stop once val_loss has not improved for 5 consecutive epochs and roll the
# model back to the weights of the best epoch.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience = 5, restore_best_weights = True)
baseline_tf.fit(X_train_wx, y_train_wx, batch_size = 50,
                epochs = 100, validation_split = 0.1,
                callbacks = [es])
baseline_ypred = baseline_tf.predict(X_test_wx)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
 1/11 [=>............................] - ETA: 0s - loss: 0.6989 - accuracy: 0.4800Restoring model weights from the end of the best epoch: 1.
Epoch 6: early stopping


In [25]:
# binarise the sigmoid outputs: scores below 0.5 become 0, everything else 1
yprednn = pd.DataFrame(baseline_ypred.copy(), columns=['y_pred_nn'])
yprednn['y_pred_nn'] = np.where(yprednn['y_pred_nn'] < 0.5, 0, 1)
aapl_f1 = f1_score(y_test_wx, yprednn['y_pred_nn'])
print(aapl_f1)

0.725


In [26]:
# The labels come from the test split, so the title carries "test dataset";
# without it evaluation() files the scores under the validation dictionaries.
evaluation(y_test_wx, yprednn['y_pred_nn'], "NN FOR AAPL test dataset")

accuracy: 0.5686274509803921

classification report: 
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        22
           1       0.57      1.00      0.72        29

    accuracy                           0.57        51
   macro avg       0.28      0.50      0.36        51
weighted avg       0.32      0.57      0.41        51


F1 score: 
0.725

Confusion matrix: 
[[ 0 22]
 [ 0 29]]

ROC AUC: 0.5



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Hyperparameter tuning

In [27]:
import tensorflow as tf
import tensorflow_hub as hub

from keras.callbacks import EarlyStopping
import tensorflow_addons as tfa

# baseline model
def create_baseline(hiddenLayerOne=60, hiddenLayerTwo=24,
    dropout=0.2, learnRate=0.01):
    """Build and compile a two-hidden-layer network with tunable sizes.

    Args:
        hiddenLayerOne: Number of units in the first hidden Dense layer.
        hiddenLayerTwo: Number of units in the second hidden Dense layer.
        dropout: Accepted for signature compatibility; this architecture has
            no Dropout layer, so the value is not used.
        learnRate: Learning rate handed to the SGD optimizer.

    Returns:
        A compiled ``tf.keras.Sequential`` model expecting 13 input features.
    """
    # create model
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Dense(hiddenLayerOne, input_shape=(13,), activation='relu'))
    model.add(tf.keras.layers.Dense(hiddenLayerTwo, activation='relu'))
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
    # Compile model
    # `learning_rate` replaces the deprecated `lr` keyword, which triggers a
    # warning on every model build.
    model.compile(loss='binary_crossentropy',
                  optimizer=tf.keras.optimizers.SGD(learning_rate=learnRate),
                  metrics=['mse'])
    return model

In [28]:
# candidate values for every hyperparameter that could be tuned
hiddenLayerOne = [60, 32, 24]
hiddenLayerTwo = [32, 24, 12]
learnRate = [0.01, 0.001, 0.0001]
dropout = [0.3, 0.4, 0.5]
batchSize = [4, 8, 16, 32]
epochs = [10, 20, 30, 40]

# search space handed to the random search: only the two layer widths and the
# learning rate are active; dropout, batch size and epochs are left out
grid = {
    "hiddenLayerOne": hiddenLayerOne,
    "learnRate": learnRate,
    "hiddenLayerTwo": hiddenLayerTwo,
}

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from keras.wrappers.scikit_learn import KerasClassifier

# initialize a random search with a 3-fold cross-validation and then
# start the hyperparameter search process
print("[INFO] performing random search...")

model = KerasClassifier(build_fn=create_baseline, epochs=6, batch_size=20, verbose=1)

# random_state pins which parameter combinations are sampled from the grid,
# so repeated runs compare the same candidates.
searcher = RandomizedSearchCV(estimator=model, n_jobs=-1, cv=3,
    param_distributions=grid, scoring="f1", random_state=42)
searchResults = searcher.fit(X_train_wx, y_train_wx)

# summarize grid search information
bestScore = searchResults.best_score_
bestParams = searchResults.best_params_

print("bestParams is",
    bestParams)

[INFO] performing random search...


  


Epoch 1/6


  super(SGD, self).__init__(name, **kwargs)


Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
bestParams is {'learnRate': 0.001, 'hiddenLayerTwo': 32, 'hiddenLayerOne': 60}


In [None]:
print("[INFO] evaluating the best model...")
# take the estimator selected by the search and score its test predictions
bestModel = searchResults.best_estimator_
ypredz = bestModel.predict(X_test_wx)
tuned_f1 = f1_score(y_test_wx, ypredz)
print(tuned_f1)

[INFO] evaluating the best model...
0.0


# NN for MSFT

## Baseline model

In [None]:
# Keep only the MSFT rows of the training file, then drop the ticker column
# that was used for filtering.
train_wx = pd.read_csv(
    '/content/drive/MyDrive/Data Modelling/Stock Prices Prediction/train_wx.csv'
)
train_wx = train_wx.loc[train_wx['symbol'] == "MSFT"].drop(columns=['symbol'])

# split into label ("change_lead") and feature columns
y_train_wx = train_wx["change_lead"]
X_train_wx = train_wx.drop(columns=["change_lead"])

In [None]:
# Keep only the MSFT rows of the test file, then drop the ticker column
# that was used for filtering.
test_wx = pd.read_csv(
    '/content/drive/MyDrive/Data Modelling/Stock Prices Prediction/test_wx.csv'
)
is_msft = test_wx['symbol'].eq('MSFT')
test_wx = test_wx[is_msft].drop(columns=['symbol'])

# split into label ("change_lead") and feature columns
y_test_wx = test_wx["change_lead"]
X_test_wx = test_wx.drop(columns=["change_lead"])

In [None]:
# baseline model
def create_baseline():
    """Assemble the fixed baseline network and compile it.

    Dense(60, relu) -> Dense(24, relu) -> Dense(1, sigmoid) on 13 inputs;
    loss is binary cross-entropy, optimizer is 'SGD', metric is accuracy.
    """
    classifier = tf.keras.Sequential()
    # (units, activation, extra constructor arguments) for each Dense layer, in order
    layer_specs = (
        (60, 'relu', {'input_shape': (13,)}),
        (24, 'relu', {}),
        (1, 'sigmoid', {}),
    )
    for units, activation, extra in layer_specs:
        classifier.add(tf.keras.layers.Dense(units, activation=activation, **extra))
    classifier.compile(loss='binary_crossentropy', optimizer='SGD', metrics=['accuracy'])
    return classifier

# Seed TensorFlow before the model is built so that weight initialisation and
# batch shuffling are repeatable between runs (as far as the global seed
# controls them on the current hardware).
tf.random.set_seed(42)
baseline_tf = create_baseline()

# Stop once val_loss has not improved for 5 consecutive epochs and roll the
# model back to the weights of the best epoch.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience = 5, restore_best_weights = True)
baseline_tf.fit(X_train_wx, y_train_wx, batch_size = 50,
                epochs = 100, validation_split = 0.1,
                callbacks = [es])
baseline_ypred = baseline_tf.predict(X_test_wx)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
 1/15 [=>............................] - ETA: 0s - loss: 0.6952 - accuracy: 0.4600Restoring model weights from the end of the best epoch: 23.
Epoch 28: early stopping


In [None]:
# binarise the sigmoid outputs: anything that is not below 0.5 is labelled 1
yprednn = pd.DataFrame(baseline_ypred.copy(), columns=['y_pred_nn'])
below_cutoff = yprednn['y_pred_nn'].lt(0.5)
yprednn['y_pred_nn'] = (~below_cutoff).astype('int64')
msft_f1 = f1_score(y_test_wx, yprednn['y_pred_nn'])
print(msft_f1)

0.6236559139784946


In [None]:
# The data in this section is filtered to MSFT, so the run is recorded under
# an MSFT title (it was previously labelled "GOOG"). "test dataset" makes
# evaluation() file the scores under the test dictionaries.
evaluation(y_test_wx, yprednn['y_pred_nn'], "NN FOR MSFT test dataset")

accuracy: 0.453125

classification report: 
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        35
           1       0.45      1.00      0.62        29

    accuracy                           0.45        64
   macro avg       0.23      0.50      0.31        64
weighted avg       0.21      0.45      0.28        64


F1 score: 
0.6236559139784946

Confusion matrix: 
[[ 0 35]
 [ 0 29]]

ROC AUC: 0.5



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Hyperparameter tuning

In [None]:
import tensorflow as tf
import tensorflow_hub as hub

from keras.callbacks import EarlyStopping
import tensorflow_addons as tfa

# baseline model
def create_baseline(hiddenLayerOne=60, hiddenLayerTwo=24,
    dropout=0.2, learnRate=0.01):
    """Build and compile a two-hidden-layer network with dropout.

    Args:
        hiddenLayerOne: Number of units in the first hidden Dense layer.
        hiddenLayerTwo: Number of units in the second hidden Dense layer.
        dropout: Drop rate of the Dropout layer placed after the first hidden
            layer. The rate used to be hard-coded to 0.25, which silently
            ignored this argument; the layer now follows it (default 0.2).
        learnRate: Learning rate handed to the SGD optimizer.

    Returns:
        A compiled ``tf.keras.Sequential`` model expecting 13 input features.
    """
    # create model
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Dense(hiddenLayerOne, input_shape=(13,), activation='relu'))
    model.add(tf.keras.layers.Dropout(rate=dropout))
    model.add(tf.keras.layers.Dense(hiddenLayerTwo, activation='relu'))
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
    # Compile model
    # `learning_rate` replaces the deprecated `lr` keyword, which triggers a
    # warning on every model build.
    model.compile(loss='binary_crossentropy',
                  optimizer=tf.keras.optimizers.SGD(learning_rate=learnRate),
                  metrics=['mse'])
    return model

In [None]:
# candidate values for every hyperparameter that could be tuned
hiddenLayerOne = [120, 72, 52]
hiddenLayerTwo = [32, 24, 12]
learnRate = [0.01, 0.001, 0.0001]
dropout = [0.3, 0.4, 0.5]
batchSize = [4, 8, 16, 32]
epochs = [10, 20, 30, 40]

# search space handed to the random search: only the two layer widths and the
# learning rate are active; dropout, batch size and epochs are left out
grid = {
    "hiddenLayerOne": hiddenLayerOne,
    "learnRate": learnRate,
    "hiddenLayerTwo": hiddenLayerTwo,
}

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from keras.wrappers.scikit_learn import KerasClassifier

# initialize a random search with a 3-fold cross-validation and then
# start the hyperparameter search process
print("[INFO] performing random search...")

model = KerasClassifier(build_fn=create_baseline, epochs=6, batch_size=20, verbose=1)

# random_state pins which parameter combinations are sampled from the grid,
# so repeated runs compare the same candidates.
searcher = RandomizedSearchCV(estimator=model, n_jobs=-1, cv=3,
    param_distributions=grid, scoring="f1", random_state=42)
searchResults = searcher.fit(X_train_wx, y_train_wx)

# summarize grid search information
bestScore = searchResults.best_score_
bestParams = searchResults.best_params_

print("bestParams is",
    bestParams)

[INFO] performing random search...


  


Epoch 1/6


  super(SGD, self).__init__(name, **kwargs)


Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
bestParams is {'learnRate': 0.01, 'hiddenLayerTwo': 24, 'hiddenLayerOne': 72}


In [None]:
print("[INFO] evaluating the best model...")
# take the estimator selected by the search and score its test predictions
bestModel = searchResults.best_estimator_
ypredz = bestModel.predict(X_test_wx)
tuned_f1 = f1_score(y_test_wx, ypredz)
print(tuned_f1)

[INFO] evaluating the best model...
0.6236559139784946
