# **Machine Learning Training**

### Initial Imports:

In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
pd.set_option("display.max_colwidth", 300)

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, GRU

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

from joblib import dump, load # Save Pre-Trained Model Using Joblib
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Set Initial Random State
from numpy.random import seed
seed(1)
from tensorflow import random
random.set_seed(2)

#### Read in CSV as Pandas DataFrame:

In [3]:
# Set path to CSV and read in CSV
csv_path = Path('Returns and Signals.csv')
return_signal_df=pd.read_csv(csv_path)
# Set index as datetime object and drop extraneous columns
return_signal_df.set_index(pd.to_datetime(return_signal_df['Date'], infer_datetime_format=True), inplace=True)
return_signal_df.drop(columns=['Date'], inplace=True)
return_signal_df.head()

Unnamed: 0_level_0,Adj Close,Volume,Returns,ema_crossover_signal,vol_trend_signal,bollinger_signal,obv_crossover_signal,sentiment_signal
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2019-01-02,27.929413,11190300.0,0.027788,1.0,-1.0,0.0,1.0,0.0
2019-01-03,27.416327,10803900.0,-0.018371,1.0,1.0,0.0,-1.0,0.0
2019-01-04,28.045584,14593700.0,0.022952,1.0,1.0,0.0,1.0,0.0
2019-01-07,28.529629,13419400.0,0.017259,1.0,-1.0,0.0,1.0,0.0
2019-01-08,29.042719,8179700.0,0.017984,1.0,-1.0,0.0,1.0,0.0


#### Check Data Quality:

In [4]:
return_signal_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 504 entries, 2019-01-02 to 2020-12-30
Data columns (total 8 columns):
Adj Close               504 non-null float64
Volume                  504 non-null float64
Returns                 504 non-null float64
ema_crossover_signal    504 non-null float64
vol_trend_signal        504 non-null float64
bollinger_signal        504 non-null float64
obv_crossover_signal    504 non-null float64
sentiment_signal        504 non-null float64
dtypes: float64(8)
memory usage: 35.4 KB


### Creating X and Dependent Variables:

In [5]:
def variables(return_signal_df):
    '''
    this func takes in return_signal_df dataframe,
    then shiting X (singnals) variables down (to the future) by 1,
    (assuming intraday trading, the position auto closed at T+1),
    then constructing a new column for the dependent variable ("positive return"),
    return a modified return_signal_df.
    '''
    # Set X Variables/Features 
    global x_var_list
    x_var_list = ['ema_crossover_signal','vol_trend_signal','bollinger_signal','obv_crossover_signal','sentiment_signal'] # as a filter
    # Shift 1 down as signals comes after prices 
    return_signal_df[x_var_list] = return_signal_df[x_var_list].shift(1)
    # Construct the dependent variable where if daily return is greater than 0, then 1, else, 0.
    return_signal_df['Positive Return'] = np.where(return_signal_df['Returns'] > 0, 1.0, 0.0)
    
    return_signal_df.dropna(inplace=True)
    return return_signal_df

In [6]:
return_signal_df_modified = variables(return_signal_df)
return_signal_df_modified.head()

Unnamed: 0_level_0,Adj Close,Volume,Returns,ema_crossover_signal,vol_trend_signal,bollinger_signal,obv_crossover_signal,sentiment_signal,Positive Return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2019-01-03,27.416327,10803900.0,-0.018371,1.0,-1.0,0.0,1.0,0.0,0.0
2019-01-04,28.045584,14593700.0,0.022952,1.0,1.0,0.0,-1.0,0.0,1.0
2019-01-07,28.529629,13419400.0,0.017259,1.0,1.0,0.0,1.0,0.0,1.0
2019-01-08,29.042719,8179700.0,0.017984,1.0,-1.0,0.0,1.0,0.0,1.0
2019-01-09,29.323465,10607500.0,0.009667,1.0,-1.0,0.0,1.0,0.0,1.0


### Separate X and Y Training Datasets:
80% (training) 20% (testing) split

In [7]:
def separate(return_signal_df_modified):
    '''
    thif func takes in return_signal_df_modified,
    split the data into trian (75%) and test (25%),
    return trian and test data sets for the X and dependent variables.
    '''
    split = int(0.75 * len(return_signal_df_modified))
    # X Variable
    x_train = return_signal_df_modified[x_var_list][:split]
    x_test = return_signal_df_modified[x_var_list][split:]
    # Dependent Variable
    y_train = return_signal_df_modified['Positive Return'][:split]
    y_test = return_signal_df_modified['Positive Return'][split:]
    return x_train, x_test, y_train, y_test

In [8]:
x_train, x_test, y_train, y_test = separate(return_signal_df_modified)

In [9]:
x_train.head(2)

Unnamed: 0_level_0,ema_crossover_signal,vol_trend_signal,bollinger_signal,obv_crossover_signal,sentiment_signal
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-01-03,1.0,-1.0,0.0,1.0,0.0
2019-01-04,1.0,1.0,0.0,-1.0,0.0


In [10]:
x_test.head(2)

Unnamed: 0_level_0,ema_crossover_signal,vol_trend_signal,bollinger_signal,obv_crossover_signal,sentiment_signal
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-07-02,1.0,1.0,0.0,1.0,1.0
2020-07-06,1.0,1.0,0.0,1.0,1.0


In [11]:
y_train.head(2)

Date
2019-01-03    0.0
2019-01-04    1.0
Name: Positive Return, dtype: float64

In [12]:
y_test.head(3)

Date
2020-07-02    1.0
2020-07-06    1.0
2020-07-07    1.0
Name: Positive Return, dtype: float64

In [13]:
x_train_reshape = x_train.values.reshape((x_train.shape[0], x_train.shape[1],1))
x_test_reshape = x_test.values.reshape((x_test.shape[0], x_test.shape[1], 1))

### Machine Learning Models Training:

In [14]:
# Helper Function:
def analyse_model(model, predictions):
    '''
    this helper func takes in model, and predictions,
    to print accuracy score for traning and testing,
    to print the classification report for the test data,
    return a result dataframe which contains actual vs. predicted y data.
    '''
    # Assemble actual y data (Y_test) with predicted y data (from just above) into two columns in a dataframe:
    result = y_test.to_frame().drop(columns=["Positive Return"])
    result["Predicted Value"] = predictions
    
    # Print performance metrics:
    print("Accuracy score (training): {0:.4f}".format(accuracy_score(y_train, model.predict(x_train))))
    print("Accuracy score (testing): {0:.4f}".format(accuracy_score(y_test, predictions)))
    print(" ")
    print("Classification Report for the test data:")
    print(classification_report(y_test, predictions))
    return result

In [15]:
# Helper Function:
def analyse_RNN_model(RNN_model, predictions):
    '''
    this helper func takes in model, and predictions,
    to print accuracy score for traning and testing,
    to print the classification report for the test data,
    return a result dataframe which contains actual vs. predicted y data.
    '''
    # Assemble actual y data (Y_test) with predicted y data (from just above) into two columns in a dataframe:
    result = y_test.to_frame().drop(columns=["Positive Return"])
    result["Predicted Value"] = predictions
    
    # Print performance metrics:
    print("Accuracy score (training): {0:.4f}".format(accuracy_score(y_train, RNN_model.predict_classes(x_train_reshape))))
    print("Accuracy score (testing): {0:.4f}".format(accuracy_score(y_test, predictions)))
    print(" ")
    print("Classification Report for the test data:")
    print(classification_report(y_test, predictions))
    return result

#### 1. Random Forest Model:

**Hyperparameters:**

In [142]:
# trees and 3 steps
n_estimators=15
max_depth=5

In [143]:
def RandomForest(x_train, y_train, x_test, y_test):
    '''
    thif func takes in x_train, y_train, x_test, y_test,
    runing Random Forest Classification model,
    return the RF_model result, and its predicted values (RF_predicitons)
    '''
    # Fit the model using just the training set:
    RF_model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=0)
    RF_model.fit(x_train, y_train)
    # Make a prediction of "y" values from the x test dataset
    RF_predictions = RF_model.predict(x_test)
    
    return RF_model, RF_predictions

In [144]:
RF_model, RF_predictions = RandomForest(x_train, y_train, x_test, y_test)
RF_result = analyse_model(RF_model, RF_predictions)
RF_result.sample(5)

Accuracy score (training): 0.5756
Accuracy score (testing): 0.5317
 
Classification Report for the test data:
              precision    recall  f1-score   support

         0.0       0.54      0.43      0.48        63
         1.0       0.53      0.63      0.58        63

    accuracy                           0.53       126
   macro avg       0.53      0.53      0.53       126
weighted avg       0.53      0.53      0.53       126



Unnamed: 0_level_0,Predicted Value
Date,Unnamed: 1_level_1
2020-07-15,1.0
2020-09-25,0.0
2020-12-03,1.0
2020-12-30,0.0
2020-08-07,1.0


#### 2. Gradient Boosting Model:

**Hyperparameters:**

In [145]:
# small learning rate means each tree learns a little from previous tree, 
# and learn more information from the new data.
learning_rate=0.5
n_estimators=15
max_depth=5

In [146]:
def GradientBoost(x_train, y_train, x_test, y_test):
    '''
    thif func takes in x_train, y_train, x_test, y_test,
    runing Gradient Boost Trees Classification model,
    return the GB_model result, and its predicted values (GB_predicitons)
    '''
    # Fit the model using just the training set:
    GB_model = GradientBoostingClassifier(n_estimators=n_estimators,
                                        learning_rate=learning_rate,
                                        max_features=5, # number of signals,
                                        max_depth=max_depth,
                                        random_state=0)
    GB_model.fit(x_train, y_train)
    # Make a prediction of "y" values from the x test dataset
    GB_predictions = GB_model.predict(x_test)   
    
    return GB_model, GB_predictions

In [141]:
GB_model, GB_predictions = GradientBoost(x_train, y_train, x_test, y_test)
GB_result = analyse_model(GB_model, GB_predictions)
GB_result.sample(5)

Accuracy score (training): 0.5782
Accuracy score (testing): 0.5317
 
Classification Report for the test data:
              precision    recall  f1-score   support

         0.0       0.54      0.40      0.46        63
         1.0       0.53      0.67      0.59        63

    accuracy                           0.53       126
   macro avg       0.53      0.53      0.52       126
weighted avg       0.53      0.53      0.52       126



Unnamed: 0_level_0,Predicted Value
Date,Unnamed: 1_level_1
2020-10-21,1.0
2020-11-13,1.0
2020-08-13,1.0
2020-11-16,0.0
2020-11-18,1.0


#### 3. LSTM RNN Model:

**Hyperparameters:**

In [155]:
# Number of neuron per layer
units = 5
# Dropout helps to prevent overfitting, it will randomly set some output to zero.
dropout_rate = 0.2 # will randomly drop 10% of the units
# smaller batch size is recommended
batch_size = 2
# Use at least 10 epochs
epochs = 15

In [156]:
# Mean Absolute Error (MAE)  mae
# categorical_crossentropy
# binary_crossentropy
# mean_squared_error

In [157]:
def LSTM(x_train_reshape, y_train, x_test_reshape, y_test):
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import LSTM, Dense, Dropout
    '''
    thif func takes in x_train_reshape, y_train, x_test_reshape, y_test,
    runing LSTM RNN model,
    return the LSTM_model and its predicted values (LSTM_predictions).
    '''
    # STEP I: Define the LSTM RNN model
    LSTM_model = Sequential()
    # Layer 1
    LSTM_model.add(LSTM(
        units=units,
        return_sequences=True,
    input_shape=(x_train_reshape.shape[1], x_train_reshape.shape[2]))
        )
    LSTM_model.add(Dropout(dropout_rate))
    # Layer 2
    LSTM_model.add(LSTM(units=units, return_sequences=True))
    LSTM_model.add(Dropout(dropout_rate))
    # Layer 3
    LSTM_model.add(LSTM(units=units))
    LSTM_model.add(Dropout(dropout_rate))
    # Output layer
    LSTM_model.add(Dense(1))
    
    # Step II: Compile the model
    LSTM_model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"]
    )
    # Summarize the model
    print(LSTM_model.summary())
    
    # Step III: Training the model
    LSTM_model.fit(
        x_train_reshape,
        y_train,
        epochs=epochs,
        shuffle=False, # to keep the sequential order of the data
        batch_size=batch_size,
        verbose=1,
    )
    # Step IV: Make a prediction of "y" values from the x test dataset
    LSTM_predictions = LSTM_model.predict_classes(x_test_reshape)
    
    return LSTM_model, LSTM_predictions

In [158]:
LSTM_model, LSTM_predicted = LSTM(x_train_reshape, y_train, x_test_reshape, y_test)
LSTM_result=analyse_RNN_model(LSTM_model,LSTM_predicted)
LSTM_result.sample(5)

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_11 (LSTM)               (None, 5, 5)              140       
_________________________________________________________________
dropout_12 (Dropout)         (None, 5, 5)              0         
_________________________________________________________________
lstm_12 (LSTM)               (None, 5, 5)              220       
_________________________________________________________________
dropout_13 (Dropout)         (None, 5, 5)              0         
_________________________________________________________________
lstm_13 (LSTM)               (None, 5)                 220       
_________________________________________________________________
dropout_14 (Dropout)         (None, 5)                 0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                

Unnamed: 0_level_0,Predicted Value
Date,Unnamed: 1_level_1
2020-10-29,0
2020-10-08,1
2020-12-22,1
2020-12-10,1
2020-12-21,1


#### 4. GRU RNN Model:

In [159]:
def GRU(x_train_reshape, y_train, x_test_reshape, y_test):
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import LSTM, Dense, Dropout, GRU
    '''
    thif func takes in x_train_reshape, y_train, x_test_reshape, y_test,
    runing GRU RNN model,
    return the GRU_model and its predicted values (GRU_predictions).
    '''
    # STEP I: Define the LSTM RNN model
    GRU_model = Sequential()
    # Layer 1
    GRU_model.add(GRU(
        units=units,
        return_sequences=True,
        input_shape=(x_train_reshape.shape[1], x_train_reshape.shape[2]))
        )
    GRU_model.add(Dropout(dropout_rate))
    # Layer 2
    GRU_model.add(LSTM(units=units, return_sequences=True))
    GRU_model.add(Dropout(dropout_rate))
    # Layer 3
    GRU_model.add(LSTM(units=units))
    GRU_model.add(Dropout(dropout_rate))
    # Output layer
    GRU_model.add(Dense(1))
    
    # Step II: Compile the model
    GRU_model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"]
    )
    # Summarize the model
    print(GRU_model.summary())
    
    # Step III: Training the model
    GRU_model.fit(
        x_train_reshape,
        y_train,
        epochs=epochs,
        shuffle=False, # to keep the sequential order of the data
        batch_size=batch_size,
        verbose=1,
    )
    # Step IV: Make a prediction of "y" values from the x test dataset
    GRU_predictions = GRU_model.predict_classes(x_test_reshape)
    
    return GRU_model, GRU_predictions

In [160]:
GRU_model, GRU_predictions = GRU(x_train_reshape, y_train, x_test_reshape, y_test)
GRU_result=analyse_RNN_model(GRU_model, GRU_predictions)
GRU_result.sample(5)

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_1 (GRU)                  (None, 5, 5)              120       
_________________________________________________________________
dropout_15 (Dropout)         (None, 5, 5)              0         
_________________________________________________________________
lstm_14 (LSTM)               (None, 5, 5)              220       
_________________________________________________________________
dropout_16 (Dropout)         (None, 5, 5)              0         
_________________________________________________________________
lstm_15 (LSTM)               (None, 5)                 220       
_________________________________________________________________
dropout_17 (Dropout)         (None, 5)                 0         
_________________________________________________________________
dense_5 (Dense)              (None, 1)                

Unnamed: 0_level_0,Predicted Value
Date,Unnamed: 1_level_1
2020-10-26,1
2020-08-28,1
2020-11-16,1
2020-08-05,0
2020-09-16,1


In [164]:
combined_result = pd.concat([y_test.to_frame(),RF_result,GB_result,LSTM_result,GRU_result],axis="columns",join="outer")
combined_result.columns=["Actual","RF","GB","LSTM","GRU"]
combined_result.sample(10)

Unnamed: 0_level_0,Actual,RF,GB,LSTM,GRU
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-09-30,0.0,1.0,1.0,1,1
2020-08-14,1.0,1.0,1.0,0,1
2020-09-09,1.0,0.0,0.0,0,0
2020-09-18,0.0,0.0,0.0,1,1
2020-11-24,1.0,1.0,1.0,1,1
2020-12-21,0.0,0.0,1.0,1,1
2020-10-27,1.0,1.0,1.0,1,1
2020-08-07,1.0,1.0,1.0,0,1
2020-09-21,1.0,0.0,0.0,1,1
2020-10-12,0.0,0.0,0.0,0,0


In [162]:
combined_result.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 126 entries, 2020-07-02 to 2020-12-30
Data columns (total 5 columns):
Actual    126 non-null float64
RF        126 non-null float64
GB        126 non-null float64
LSTM      126 non-null int32
GRU       126 non-null int32
dtypes: float64(3), int32(2)
memory usage: 4.9 KB
