### Import Package

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from stockstats import StockDataFrame

from sklearn.model_selection import train_test_split
from sklearn import linear_model

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import datasets, layers, models
from sklearn.model_selection import GridSearchCV

### Set the data source path

In [2]:
# Set the data source path
interval = "daily"
region = "us"
ex_product = "nasdaq stocks"
section = "1"
stock = "aapl"
data_path = "test_data/"+interval+"/"+region+"/"+ex_product+"/"+section+"/"+stock+"."+region+".txt"

# Use Apple .Inc stock for training

# Extract only the OLHC
column_to_use = ["OPEN","LOW","HIGH","CLOSE"]


### Load the stock data

In [3]:
# Load the data
ori_data = pd.read_csv(data_path, sep=",")

# Rename the column names
ori_data.columns = [colname[1:-1] for colname in ori_data.columns]

# Drop the unnecessary
ori_data.index = ori_data["DATE"]
ori_data = ori_data.drop(columns=['DATE','PER','TIME', 'TICKER', 'OPENINT'])
ori_data.columns = ["open","high","low","close","volume"]

In [4]:
ori_data

Unnamed: 0_level_0,open,high,low,close,volume
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
19840907,0.10150,0.10274,0.10028,0.10150,96970899
19840910,0.10150,0.10181,0.09905,0.10090,75265237
19840911,0.10181,0.10456,0.10181,0.10274,177479896
19840912,0.10274,0.10334,0.09966,0.09966,155043826
19840913,0.10518,0.10548,0.10518,0.10518,241475025
...,...,...,...,...,...
20211021,148.81000,149.64000,147.87000,149.48000,61420990
20211022,149.69000,150.18000,148.64000,148.69000,58883443
20211025,148.68000,149.37000,147.62110,148.64000,50720556
20211026,149.33000,150.84000,149.01010,149.32000,60893395


In [5]:
# Use online package to generate additional features
x = StockDataFrame(ori_data)
data = x[['open','high','low','close','volume',
          'boll', 'boll_ub', 'boll_lb',
          'macd', 'macdh', 'macds',
          'rsi_11', 'rsi_14', 'rsi_21']]


### Split the train and test data

In [6]:
def custom_split(data,start,end):
    train = (data.index >= start) & (data.index <= end)
    train_X = data[train]
    
    return train_X

In [7]:
train_X = custom_split(data,start = 20130101,end = 20171031)
valid_X = custom_split(data,start = 20171101,end = 20181231)
test_X = custom_split(data,start = 20190101,end = 20201231)

### Label the target result

In [8]:
# Assume we use 10 days price data to predict opening price of the 11th day
num_day_to_predict = 10


In [9]:
def produce_result_target_price(X,num_day,result_col_name = "result_price"):
    y = pd.DataFrame(np.nan, index=X.index, columns=[result_col_name])
    for i in range(len(X)-num_day):
        y.iloc[i+num_day_to_predict,0] = X.iloc[i+num_day,0]
    return y

In [10]:
train_y = produce_result_target_price(train_X,num_day_to_predict)
valid_y = produce_result_target_price(valid_X,num_day_to_predict)
test_y = produce_result_target_price(test_X,num_day_to_predict)

In [11]:
valid_y

Unnamed: 0_level_0,result_price
DATE,Unnamed: 1_level_1
20171101,
20171102,
20171103,
20171106,
20171107,
...,...
20181224,36.007
20181226,36.044
20181227,37.876
20181228,38.280


### Transform the X, y data into tensor

In [12]:
def transform_data_to_tensor(X,y,num_day):
    # Initiate tensor for X
    x_first = X.iloc[0:num_day,:]
    x_mean = x_first.mean(axis=0) # Get the mean of the 10-day frame
    x_std = x_first.std(axis=0) # Get the std of the 10-day frame
    x_first = x_first.sub(x_mean, axis=1).div(x_std, axis=1) # Normalize the 10-day frame here
    
    # Initiate tensor for y
    x_open = X.iloc[0:num_day,0]
    y_val = y.iloc[num_day,:] # Get the corresponding y
    y_val = y_val.sub(x_open.mean(axis=0)).div(x_open.std(axis=0)) # Normalize the y
    
    x_tf_data = [tf.convert_to_tensor(np.array(x_first),dtype = tf.float32)]
    y_tf_data = [tf.convert_to_tensor(np.array(y_val),dtype = tf.float32)]
    
    for i in range(1,len(X)-num_day):   
        x_window = X.iloc[i:i+num_day,:] # Set the window as a 10-day frame 
        x_mean = x_window.mean(axis=0) # Get the mean of the 10-day frame
        x_std = x_window.std(axis=0) # Get the std of the 10-day frame
        x_window = x_window.sub(x_mean, axis=1).div(x_std, axis=1) # Normalize the 10-day frame here
        
        x_open = X.iloc[i:i+num_day,0] # Get the opening price of the 10-day frame
        y_val = y.iloc[i+num_day,:] # Get the corresponding y
        y_val = y_val.sub(x_open.mean(axis=0)).div(x_open.std(axis=0)) # Normalize the y
        
        x_next_tf = tf.convert_to_tensor(np.array(x_window),dtype = tf.float32)
        x_tf_data = tf.concat([x_tf_data, [x_next_tf]], 0)
        
        y_next_tf = tf.convert_to_tensor(np.array(y_val),dtype = tf.float32)
        y_tf_data = tf.concat([y_tf_data, [y_next_tf]], 0)
    return (tf.reshape(x_tf_data,(-1,10,14,1)),y_tf_data)


In [13]:
tf_train_X,tf_train_y = transform_data_to_tensor(train_X,train_y,num_day_to_predict)
tf_valid_X,tf_valid_y = transform_data_to_tensor(valid_X,valid_y,num_day_to_predict)
tf_test_X,tf_test_y = transform_data_to_tensor(test_X,test_y,num_day_to_predict)


2022-03-09 19:43:12.523098: I tensorflow/core/platform/cpu_feature_guard.cc:145] This TensorFlow binary is optimized with Intel(R) MKL-DNN to use the following CPU instructions in performance critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in non-MKL-DNN operations, rebuild TensorFlow with the appropriate compiler flags.
2022-03-09 19:43:12.523598: I tensorflow/core/common_runtime/process_util.cc:115] Creating new thread pool with default inter op setting: 12. Tune using inter_op_parallelism_threads for best performance.


In [14]:
print(tf_train_X.shape)
print(tf_train_y.shape)
print(tf_train_X.dtype)
print(tf_train_y.dtype)

print(tf_valid_X.shape)
print(tf_valid_y.shape)
print(tf_valid_X.dtype)
print(tf_valid_y.dtype)

print(tf_test_X.shape)
print(tf_test_y.shape)
print(tf_test_X.dtype)
print(tf_test_y.dtype)

(1208, 10, 14, 1)
(1208, 1)
<dtype: 'float32'>
<dtype: 'float32'>
(282, 10, 14, 1)
(282, 1)
<dtype: 'float32'>
<dtype: 'float32'>
(495, 10, 14, 1)
(495, 1)
<dtype: 'float32'>
<dtype: 'float32'>


### Build the Model

In [15]:

def myModel(input_shape,
            encoder_unit = 100,
            repeat_vector_n = 10):
    
    inputs = layers.Input(input_shape)
    
    print("Input: ",inputs.shape)
    
    # First Convolution + MaxPooling + Dropout
    x = layers.Conv2D(filters = 64,kernel_size=(3,3), strides = (1,1), activation='relu', padding='valid')(inputs)
    x = layers.MaxPooling2D(pool_size=(2,2),strides=(2,1), padding='valid')(x)
    x = layers.Dropout(rate = 0.01)(x)
    print("1 Cov: ",x.shape)
    
    # Second Convolution + MaxPooling + Dropout
    x = layers.Conv2D(filters = 16,kernel_size=(3,3), strides = (1,1), activation='relu', padding='valid')(x)
    x = layers.MaxPooling2D(pool_size=(2,2),strides=(2,1), padding='valid')(x)
    x = layers.Dropout(rate = 0.01)(x)
    print("2 Cov: ",x.shape)
    
    # Flatten Layer
    x = layers.Flatten()(x)
    print("Flatten: ",x.shape)
    
    # Repeat Vector Layer
    x = layers.RepeatVector(n = repeat_vector_n)(x)
    print("RepeatVector: ",x.shape)
    
    # Connect to LSTM
    x = layers.LSTM(units = encoder_unit, input_shape=(5,1))(x)
    print("LSTM: ",x.shape)
    
    # Add the Dense Layer with relu activation
    x = layers.Dense(units = int(encoder_unit/2),activation = "relu")(x)
    print("1 Dense: ",x.shape)
    
    # Add the last Dense Layer with sigmoid activation
    outputs = layers.Dense(units = 1,activation = "sigmoid")(x)
    print("Output: ",outputs.shape)
    
    return keras.Model(inputs=inputs, outputs=outputs)


### Model Training and Fitting and Validation


In [16]:
loss_list = ["MAE"]
metric_list = ["MAE"]
optimizer_list = ["Adam"]
epoch_list = [30,50]
batch_list = [100]
encoder_list = [100]
lr_list = [0.005]
train_df = pd.DataFrame(columns = ["Epoch","Batch","Optimizer","LR","Encoder Unit","Loss","Metrics","Validation"])
best_model = ""
best_valid = 99999
metrics = [tf.keras.metrics.Accuracy()]

for los in loss_list:
    for met in metric_list:
        for opti in optimizer_list:
            for epochs in epoch_list:
                for batchs in batch_list:
                    for lr in lr_list:
                        for encoder_u in encoder_list:

                            model = myModel(input_shape=(num_day_to_predict,train_X.shape[1],1),
                                            encoder_unit = encoder_u,
                                            repeat_vector_n = 50
                                           )

                            if opti == "Adam":
                                optimizer = keras.optimizers.Adam(learning_rate=lr)
                                
                            if los == "MAE":
                                loss = keras.losses.MeanAbsoluteError()
                            elif los == "MSE":
                                loss = keras.losses.MeanSquaredError()
                                
                            if met == "MAE":
                                metrics.append(keras.metrics.MeanAbsoluteError())
                            elif met == "MSE":
                                metrics.append(keras.metrics.MeanSquaredError())
                                
                            model.compile(
                                optimizer=optimizer,
                                loss=loss,
                                metrics=metrics,
                            )

                            history = model.fit(
                                    tf_train_X,
                                    tf_train_y,
                                    epochs = epochs,
                                    steps_per_epoch = batchs,
                                )

                            results = model.evaluate(tf_valid_X, tf_valid_y, batch_size=batchs)
                            print(results)
                            print("===== Summary =====")
                            print("Epoch: ",epochs)
                            print("Batch Size: ",batchs)
                            print("Optimizer: ",opti)
                            print("Learning Rate: ",lr)
                            print("Encoder Units: ",encoder_u)
                            print("Loss Function: ", los)
                            print("Metrics: ", met)
                            print("Validation: ",results)
                            if results < best_valid:
                                best_valid = results
                                best_model = model
                            train_df = train_df.append({"Epoch": epochs,
                                                        "Batch": batchs,
                                                        "Optimizer": opti,
                                                        "LR": lr,
                                                        "Encoder Unit": encoder_u,
                                                        "Loss": los,
                                                        "Metrics": met,
                                                        "Validation":results}, ignore_index=True)
best_model.save("model/cnn_lstm_best")

Input:  (None, 10, 14, 1)
1 Cov:  (None, 4, 11, 64)
2 Cov:  (None, 1, 8, 16)
Flatten:  (None, 128)
RepeatVector:  (None, 50, 128)
LSTM:  (None, 100)
1 Dense:  (None, 50)
Output:  (None, 1)
Train on 1208 samples
Epoch 1/30


2022-03-09 19:43:30.398207: W tensorflow/core/grappler/optimizers/implementation_selector.cc:310] Skipping optimization due to error while loading function libraries: Invalid argument: Functions '__inference___backward_standard_lstm_18955_19440' and '__inference___backward_standard_lstm_18955_19440_specialized_for_StatefulPartitionedCall_at___inference_distributed_function_19627' both implement 'lstm_7dca6800-06c8-486b-bfeb-b710db9c3913' but their signatures do not match.


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


2022-03-09 19:48:49.223654: W tensorflow/core/grappler/optimizers/implementation_selector.cc:310] Skipping optimization due to error while loading function libraries: Invalid argument: Functions '__inference_standard_lstm_32014_specialized_for_model_lstm_StatefulPartitionedCall_at___inference_distributed_function_32379' and '__inference_standard_lstm_32014' both implement 'lstm_c22aa3fd-2533-4344-869a-3890225ce6f3' but their signatures do not match.




[1.0466524802201183, 0.0, 1.0466526]
===== Summary =====
Epoch:  30
Batch Size:  100
Optimizer:  Adam
Learning Rate:  0.005
Encoder Units:  100
Loss Function:  MAE
Metrics:  MAE
Validation:  [1.0466524802201183, 0.0, 1.0466526]


TypeError: '<' not supported between instances of 'list' and 'int'

In [None]:
train_df.sort_values(by=["Validation"], inplace = True)

In [None]:
print(train_df)

### Model Testing

In [17]:
loaded_model = tf.keras.models.load_model('model/cnn_lstm_best')

predictions = loaded_model.predict(tf_test_X)
print("predictions shape:", predictions.shape)
print(predictions)

2022-03-09 19:54:25.620411: W tensorflow/core/grappler/optimizers/implementation_selector.cc:310] Skipping optimization due to error while loading function libraries: Invalid argument: Functions '__inference_cudnn_lstm_with_fallback_32763' and '__inference_standard_lstm_32652_specialized_for_model_lstm_StatefulPartitionedCall_at___inference_distributed_function_32962' both implement 'lstm_1fd2bea7-b47a-4e50-8364-97206a7cabc7' but their signatures do not match.


predictions shape: (495, 1)
[[9.96850729e-01]
 [9.99966264e-01]
 [9.99988079e-01]
 [9.99998629e-01]
 [9.99381721e-01]
 [3.37434918e-01]
 [1.94761038e-01]
 [3.90087605e-01]
 [6.92700088e-01]
 [9.99821782e-01]
 [9.99999642e-01]
 [9.99999166e-01]
 [9.99994099e-01]
 [9.99997675e-01]
 [9.99997199e-01]
 [9.99991596e-01]
 [9.00944948e-01]
 [2.66435653e-01]
 [4.79991138e-01]
 [5.13820231e-01]
 [5.30334055e-01]
 [2.09321260e-01]
 [2.26758868e-01]
 [1.70374095e-01]
 [9.99958873e-01]
 [9.99999046e-01]
 [9.99996781e-01]
 [9.99999762e-01]
 [9.99998093e-01]
 [9.99999404e-01]
 [9.99983251e-01]
 [9.36651051e-01]
 [9.99983668e-01]
 [9.95517254e-01]
 [7.37708807e-02]
 [3.57925892e-05]
 [1.10268593e-06]
 [9.74969506e-01]
 [9.94908333e-01]
 [9.99997735e-01]
 [9.99991596e-01]
 [9.99999523e-01]
 [9.99999166e-01]
 [9.99998569e-01]
 [9.99996781e-01]
 [9.99999523e-01]
 [9.99999404e-01]
 [4.20003682e-01]
 [1.59379303e-01]
 [1.68055296e-04]
 [3.03685665e-05]
 [4.37172353e-02]
 [1.33652478e-01]
 [9.17213440e-01]


In [18]:
def getMeanAndStd(X,y,num_day):
    mean_list = []
    std_list = []
    for i in range(0,len(X)-num_day): 
        x_open = X.iloc[i:i+num_day,0]
        mean_list.append(x_open.mean(axis=0))
        std_list.append(x_open.std(axis=0))
    mean_df = pd.DataFrame(mean_list, columns = ["mean"])
    std_df = pd.DataFrame(std_list, columns = ["std"])
    return (mean_df,std_df)

In [None]:
test_mean, test_std = getMeanAndStd(test_X, test_y, num_day_to_predict)
final_pred = predictions*np.array(test_std) + np.array(test_mean)
final_test_y = test_y.iloc[num_day_to_predict: , :]


In [None]:
final_test_y

### Plot the graph

In [None]:
plot_start_date = 20190101
plot_end_date = 20201231
keep = (final_test_y.index >= plot_start_date) & (final_test_y.index <= plot_end_date)
final_pred = pd.DataFrame(data=final_pred,index = final_test_y.index, columns = ["Predicted"])
plot_test_y = final_test_y[keep]
plot_pred = final_pred[keep]

string_index =  plot_test_y.index.map(str)

plt.plot(string_index, plot_test_y["result_price"], label = "Actual", color = 'Black')
plt.plot(string_index, plot_pred["Predicted"], label = "Predicted", color = 'Orange')
plt.xlabel("timestamp")
plt.ylabel("Price (USD)")
plt.title("Prediction of "+stock.upper()+" using CNN-LSTM")

plt.legend()
plt.savefig("plot/CNN_LSTM/"+stock.upper()+"-day("+str(num_day_to_predict)+").jpg",
            dpi=600)
plt.show()







In [None]:
abc = pd.concat([plot_test_y,plot_pred], ignore_index=True, sort=False,axis=1)
abc.columns = ["Actual","Predicted"]
abc

In [None]:
plot_pred

In [None]:
plot_test_y