In [102]:
import os

import yfinance as yf
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Tensorflow
from tensorflow.keras.layers import Input, Dense, LSTM, Dropout
from tensorflow.keras.models import Model, load_model
from keras.callbacks import ModelCheckpoint

# Display
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px

# One stock

In [103]:
# Ticker symbol passed to download_stock_data and used as the candlestick trace name
STOCK_SYMBOL = 'GE'

## Parameters

In [104]:
# History length requested from download_stock_data
period = '10y'
# Number of consecutive samples in each input window of the model
time_step = 10
# Share of the samples (taken from the start) used for training
split_ratio = 0.8
# Column used as prediction target; index 3 is 'Close' in the
# ['Open', 'High', 'Low', 'Close', 'Volume'] order used by download_stock_data
output_index = 3  # Close

## Data

### Download

In [105]:
def download_stock_data(stock_symbol, indicators=None, period='max'):
    """
    Download the history of one stock and return the selected columns as an array
    :param stock_symbol: Name of wanted stock, string
    :param indicators: Indicators to extract (available indicators: 'Open', 'High', 'Low', 'Close', 'Volume', 'max'), list of strings.
        'max' selects all five indicators, in the order listed above; omitting the argument does the same
    :param period: Data period to download ('1d', '5d', '1mo', '3mo', '6mo', '1y', '2y', '5y', '10y', 'ytd', 'max')
    :return:
        - stocks (nb samples, features)
    :raises ValueError: if the download returned no sample
    """
    # None stands for the former ['max'] default without sharing a mutable list between calls
    if indicators is None or 'max' in indicators:
        indicators = ['Open', 'High', 'Low', 'Close', 'Volume']

    ticker = yf.Ticker(stock_symbol)
    history = ticker.history(period=period)

    # Fail here with a clear message instead of letting an empty dataset reach the
    # scaler / model (presumably what an unknown symbol produces - verify against yfinance)
    if history.empty:
        raise ValueError('No data downloaded for symbol %r (period=%r)' % (stock_symbol, period))

    stock_data = history[indicators].to_numpy()

    return stock_data

In [106]:
# 'max' selects all five indicator columns (Open, High, Low, Close, Volume)
stock = download_stock_data(STOCK_SYMBOL, indicators=['max'], period=period)

print('Number of samples:', stock.shape[0])

Number of samples: 2517


### Split

In [107]:
def split_train_test(x, split_ratio):
    """
    Cut a dataset in two along its first axis, keeping the original order
    :param x: data, ndarray
    :param split_ratio: share of the samples that goes to the first part, float [0-1]
    :return:
        - x_train: leading part of x
        - x_test: remaining part of x
    """
    # The boundary is truncated to an integer, so any fractional sample goes to the test part
    boundary = int(split_ratio * x.shape[0])

    return x[:boundary], x[boundary:]

In [108]:
# Ordered split (no shuffling): the leading rows are for training, the trailing rows for testing
x_train, x_test = split_train_test(stock, split_ratio)

print('TRAIN: Shape of input:', x_train.shape)
print('TEST: Shape of input:', x_test.shape)

TRAIN: Shape of input: (2013, 5)
TEST: Shape of input: (504, 5)


### Log returns

In [109]:
def log_return(data):
    """
    Compute the log returns between consecutive samples
    :param data: values ordered along the first axis, ndarray
    :return:
        - difference of the natural logarithm between each sample and the previous one
          (one sample fewer than data along the first axis)
    """
    log_values = np.log(data)
    return np.diff(log_values, axis=0)

In [110]:
# x_train = log_return(x_train)
# x_test = log_return(x_test)

In [111]:
# display_curves([x_train[:,0], x_train[:,1], x_train[:,2], x_train[:,3]])

In [112]:
# np.std(x_train, axis=0)

In [113]:
# x = np.divide(x_train, np.std(x_train, axis=0))
# x.shape

In [114]:
# display_curves([np.divide(x_train, np.std(x_train, axis=0))[:,0]])

### Normalization

In [115]:
# The scaler is fitted on the train part only and then reused, unchanged, on the test part
scaler = MinMaxScaler()

# NOTE: x_train / x_test are overwritten with their normalized version, so re-running
# this cell without re-running the split would normalize already-normalized data
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [116]:
def inverse_transform(vec, scaler, nb_features, index_column):
    """
    Undo the normalization of one column that was scaled together with other features
    :param vec: normalized values of the column, ndarray with one value per sample
    :param scaler: scaler used to normalize the data
    :param nb_features: number of feature columns handed to the scaler
    :param index_column: position of the column to restore among those features
    :return:
        - de-normalized values of the column, ndarray (nb samples, 1)
    """
    nb_samples = vec.shape[0]

    # scaler.inverse_transform is called on a full (nb_samples, nb_features) matrix:
    # the other columns are zero-filled here and thrown away afterwards
    padded = np.zeros((nb_samples, nb_features))
    padded[:, index_column] = vec.reshape(nb_samples)

    restored = scaler.inverse_transform(padded)

    return restored[:, index_column].reshape((nb_samples, 1))

In [117]:
# Unit test: normalizing a column and calling inverse_transform must give the raw values back
data_raw = np.asarray([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]]).transpose()  # shape (5, 2)

scalerTest = MinMaxScaler()
data_in = scalerTest.fit_transform(data_raw)

data_out = data_in[:, 1].reshape((5, 1))

data_transform = inverse_transform(data_out, scalerTest, nb_features=2, index_column=1)

assert data_out.shape == data_transform.shape
# A shape check alone would still pass if the values were restored incorrectly
np.testing.assert_allclose(data_transform, data_raw[:, 1].reshape((5, 1)))

del data_raw, data_in, data_out, scalerTest, data_transform

### Build time series

In [118]:
def build_timeserie(data, time_step):
    """
    Slice a sequence of samples into overlapping windows of fixed length
    :param data: stock data, ndarray (nb_sample, features)
    :param time_step: number of consecutive samples in each window
    :return:
        - windows, ndarray (nb_sample - time_step, time_step, features);
          window i holds samples i to i + time_step - 1
    """
    nb_windows = data.shape[0] - time_step
    windows = np.zeros((nb_windows, time_step, data.shape[1]))

    # Each pre-allocated slot is filled in place with the samples starting at its own index
    for start, slot in enumerate(windows):
        slot[:] = data[start:start + time_step, :]

    return windows

In [119]:
# Unit test: 5 samples x 2 features with time_step=2 must give 3 windows of 2 samples each
vec = np.asarray([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]]).reshape((2,-1)).transpose()

x = build_timeserie(vec, 2)

assert x.shape == (3, 2, 2)

# First sample of the first window and last sample of the last window
np.testing.assert_array_equal(x[0][0], [1., 6.])
np.testing.assert_array_equal(x[2][1], [4., 9.])

del x, vec

In [120]:
def build_dataset(x_train, x_test, time_step):
    """
    Build time series dataset input and output
    :param x_train: train data, ndarray (nb_sample, features)
    :param x_test: test data, ndarray (nb_sample, features)
    :param time_step: number of consecutive samples in each input window
    :return:
        - x_train_timeseries: train windows, ndarray (nb_sample - time_step, time_step, features)
        - y_train: target of each train window, ndarray (nb_sample - time_step, 1)
        - x_test_timeseries: test windows, same layout as the train windows
        - y_test: target of each test window, same layout as the train targets

    The target column is read from the notebook-level variable `output_index`.
    """

    x_train_timeseries = build_timeserie(x_train, time_step)
    x_test_timeseries = build_timeserie(x_test, time_step)

    nb_samples_timeseries_train = x_train_timeseries.shape[0]
    nb_samples_timeseries_test = x_test_timeseries.shape[0]

    # Window i holds samples i .. i + time_step - 1, so its target is sample i + time_step.
    # Slicing from time_step (instead of taking the last n rows) also works when there are
    # zero windows: x[-0:] would select every row instead of none.
    y_train = x_train[time_step:, output_index].reshape((nb_samples_timeseries_train, 1))
    y_test = x_test[time_step:, output_index].reshape((nb_samples_timeseries_test, 1))

    return x_train_timeseries, y_train, x_test_timeseries, y_test

In [121]:
# Windows of time_step samples as inputs, the sample following each window as target
x_train_timeseries, y_train, x_test_timeseries, y_test = build_dataset(x_train, x_test, time_step)

In [122]:
nb_samples_timeseries_train = x_train_timeseries.shape[0]
nb_samples_timeseries_test = x_test_timeseries.shape[0]

print('TRAIN: Shape of input:', x_train_timeseries.shape)
print('TEST: Shape of input:', x_test_timeseries.shape)

# y_train / y_test already come from build_dataset: check them instead of rebuilding them here
assert y_train.shape == (nb_samples_timeseries_train, 1)
assert y_test.shape == (nb_samples_timeseries_test, 1)

print('TRAIN: Shape of output:', y_train.shape)
print('TEST: Shape of output:', y_test.shape)

TRAIN: Shape of input: (2003, 10, 5)
TEST: Shape of input: (494, 10, 5)
TRAIN: Shape of output: (2003, 1)
TEST: Shape of output: (494, 1)


## Visualization

In [123]:
def display_stock_prediction(reality, prediction):
    """
    Draw the real prices as a candlestick chart with the predicted curve on top
    :param reality: stock data whose columns 0 to 3 are Open, High, Low, Close, ndarray
    :param prediction: values drawn as a line over the candlesticks, ndarray with one value per sample
    """
    # Columns 0..3 are read as Open, High, Low, Close
    candles = go.Candlestick(open=reality[:, 0],
                             high=reality[:, 1],
                             low=reality[:, 2],
                             close=reality[:, 3],
                             name=STOCK_SYMBOL)

    # Flatten the prediction to a 1-D series, one value per sample
    predicted_values = prediction.reshape(prediction.shape[0])
    curve = go.Scatter(y=predicted_values, line_color='blue', name='Prediction',
                       mode='lines+markers', marker=dict(size=4), line=dict(width=1))

    fig = go.Figure()
    fig.add_trace(candles)
    fig.add_trace(curve)

    # Layout
    fig.update_layout(title='Prediction vs Reality', xaxis_rangeslider_visible=False)
    fig.update_yaxes(title_text='Stock')

    fig.show()

In [124]:
# Display check on the downloaded data: the High column (index 1) is drawn in place of a prediction
display_stock_prediction(stock, stock[:, 1])

In [125]:
def display_history(history, losses=('loss',), accuracies=()):
    """
    Display history of learning step
    :param history: history from fit function
    :param losses: keys of history.history drawn in the first subplot ('Loss'), sequence of strings
    :param accuracies: keys of history.history drawn in a second subplot ('Accuracy'), sequence of strings;
        when empty, only the loss subplot is created
    """
    nb_rows = 2 if len(accuracies) > 0 else 1
    titles = ('Loss', 'Accuracy')[:nb_rows]
    fig = make_subplots(rows=nb_rows, cols=1, shared_xaxes=True, subplot_titles=titles)

    # Loss curves
    for name in losses:
        fig.add_trace(go.Scatter(y=history.history[name], name=name, mode='lines+markers'), row=1, col=1)

    # Accuracy curves
    for name in accuracies:
        fig.add_trace(go.Scatter(y=history.history[name], name=name, mode='lines+markers'), row=2, col=1)

    # Parameters
    fig.update_layout(title='Loss during training')
    fig.update_xaxes(title='Epochs')

    fig.show()

In [126]:
def display_curves(list_data):
    """
    Draw several series as lines with markers on one shared figure
    :param list_data: series to draw, iterable of ndarray (one value per sample)
    """
    fig = go.Figure()

    for position, series in enumerate(list_data):
        # Flatten each series to 1-D before plotting
        flat = series.reshape(series.shape[0])
        fig.add_trace(go.Scatter(y=flat, name='data ' + str(position), mode='lines+markers'))

    fig.show()

## Metrics

In [127]:
def rmse(reality, prediction):
    """
    Root Mean Squared Error between two signals
    :param reality: target signal, ndarray
    :param prediction: predicted signal, ndarray
    :return:
        - rmse, float
    """
    residual = reality - prediction
    mean_squared_error = (residual * residual).mean()
    return np.sqrt(mean_squared_error)

## Model

In [128]:
def lstm(num_timesteps, num_features, num_outputs=1, nb_layers=4, units=50, dropout=0.2, activation=None):
    """
    Build lstm model: nb_layers stacked LSTM layers, each followed by a Dropout layer,
    then one Dense output layer
    :param num_timesteps: number of timesteps in the input
    :param num_features: number of features in the input
    :param num_outputs: number of outputs
    :param nb_layers: number of LSTM layers, at least 1
    :param units: numbers of lstm neurons
    :param dropout: fraction of the input units to drop
    :param activation: activation function of the last (Dense) layer
    :return:
        - model: lstm model (not compiled)
    :raises ValueError: if nb_layers is lower than 1
    """
    # A value below 1 used to build a one-layer model silently
    if nb_layers < 1:
        raise ValueError('nb_layers must be at least 1, got %r' % (nb_layers,))

    # Specify the input shape
    inputs = Input(shape=(num_timesteps, num_features))

    # Build the layers: every LSTM but the last returns the full sequence so that the
    # next LSTM receives one vector per timestep; the last one returns only its final output
    tensor = inputs
    for i_layer in range(nb_layers):
        is_last_layer = i_layer == nb_layers - 1
        tensor = LSTM(units=units, return_sequences=not is_last_layer)(tensor)
        tensor = Dropout(dropout)(tensor)

    # Specify the output shape
    outputs = Dense(units=num_outputs, activation=activation)(tensor)

    # Create the model with defined inputs and outputs
    model = Model(inputs=inputs, outputs=outputs)

    return model

In [139]:
# Single LSTM layer of 40 units; stock.shape[1] is the number of feature columns
# NOTE(review): the sigmoid output bounds the predictions, while x_test is normalized with the
# scaler fitted on x_train only and may fall outside the train range - confirm this is intended
my_lstm = lstm(time_step, stock.shape[1], num_outputs=1, nb_layers=1, units=40, dropout=0.2, activation='sigmoid')
my_lstm.summary()

Model: "functional_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         [(None, 10, 5)]           0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 40)                7360      
_________________________________________________________________
dropout_4 (Dropout)          (None, 40)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 41        
Total params: 7,401
Trainable params: 7,401
Non-trainable params: 0
_________________________________________________________________


### Learning

In [140]:
filepath = 'models/lstm/my_best_model.hdf5'
# Make sure the target folder exists before the first checkpoint is written
os.makedirs(os.path.dirname(filepath), exist_ok=True)
# Save the model only when the validation loss reaches a new minimum
checkpoint = ModelCheckpoint(filepath=filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

In [141]:
# NOTE(review): the test windows are used as validation data and the checkpoint keeps the epoch
# with the lowest val_loss, so the model reloaded afterwards is selected on the same data it is
# later scored on - a separate validation split would give an unbiased test error
my_lstm.compile(optimizer='adam', loss='mean_squared_error')
history = my_lstm.fit(x_train_timeseries, y_train,
                      validation_data=(x_test_timeseries, y_test),
                      epochs=150, batch_size=8, shuffle=True,
                      callbacks=[checkpoint])

Epoch 1/150
Epoch 00001: val_loss improved from inf to 0.00784, saving model to models/lstm\my_best_model.hdf5
Epoch 2/150
Epoch 00002: val_loss improved from 0.00784 to 0.00558, saving model to models/lstm\my_best_model.hdf5
Epoch 3/150
Epoch 00003: val_loss improved from 0.00558 to 0.00265, saving model to models/lstm\my_best_model.hdf5
Epoch 4/150
Epoch 00004: val_loss improved from 0.00265 to 0.00238, saving model to models/lstm\my_best_model.hdf5
Epoch 5/150
Epoch 00005: val_loss improved from 0.00238 to 0.00147, saving model to models/lstm\my_best_model.hdf5
Epoch 6/150
Epoch 00006: val_loss did not improve from 0.00147
Epoch 7/150
Epoch 00007: val_loss did not improve from 0.00147
Epoch 8/150
Epoch 00008: val_loss did not improve from 0.00147
Epoch 9/150
Epoch 00009: val_loss improved from 0.00147 to 0.00146, saving model to models/lstm\my_best_model.hdf5
Epoch 10/150
Epoch 00010: val_loss improved from 0.00146 to 0.00120, saving model to models/lstm\my_best_model.hdf5
Epoch 11/

Epoch 00033: val_loss did not improve from 0.00065
Epoch 34/150
Epoch 00034: val_loss did not improve from 0.00065
Epoch 35/150
Epoch 00035: val_loss did not improve from 0.00065
Epoch 36/150
Epoch 00036: val_loss improved from 0.00065 to 0.00064, saving model to models/lstm\my_best_model.hdf5
Epoch 37/150
Epoch 00037: val_loss did not improve from 0.00064
Epoch 38/150
Epoch 00038: val_loss improved from 0.00064 to 0.00057, saving model to models/lstm\my_best_model.hdf5
Epoch 39/150
Epoch 00039: val_loss did not improve from 0.00057
Epoch 40/150
Epoch 00040: val_loss did not improve from 0.00057
Epoch 41/150
Epoch 00041: val_loss did not improve from 0.00057
Epoch 42/150
Epoch 00042: val_loss improved from 0.00057 to 0.00056, saving model to models/lstm\my_best_model.hdf5
Epoch 43/150
Epoch 00043: val_loss did not improve from 0.00056
Epoch 44/150
Epoch 00044: val_loss did not improve from 0.00056
Epoch 45/150
Epoch 00045: val_loss improved from 0.00056 to 0.00055, saving model to mode

Epoch 67/150
Epoch 00067: val_loss did not improve from 0.00042
Epoch 68/150
Epoch 00068: val_loss did not improve from 0.00042
Epoch 69/150
Epoch 00069: val_loss did not improve from 0.00042
Epoch 70/150
Epoch 00070: val_loss did not improve from 0.00042
Epoch 71/150
Epoch 00071: val_loss improved from 0.00042 to 0.00038, saving model to models/lstm\my_best_model.hdf5
Epoch 72/150
Epoch 00072: val_loss did not improve from 0.00038
Epoch 73/150
Epoch 00073: val_loss improved from 0.00038 to 0.00037, saving model to models/lstm\my_best_model.hdf5
Epoch 74/150
Epoch 00074: val_loss did not improve from 0.00037
Epoch 75/150
Epoch 00075: val_loss did not improve from 0.00037
Epoch 76/150
Epoch 00076: val_loss did not improve from 0.00037
Epoch 77/150
Epoch 00077: val_loss did not improve from 0.00037
Epoch 78/150
Epoch 00078: val_loss did not improve from 0.00037
Epoch 79/150
Epoch 00079: val_loss did not improve from 0.00037
Epoch 80/150
Epoch 00080: val_loss improved from 0.00037 to 0.00

Epoch 101/150
Epoch 00101: val_loss did not improve from 0.00031
Epoch 102/150
Epoch 00102: val_loss did not improve from 0.00031
Epoch 103/150
Epoch 00103: val_loss did not improve from 0.00031
Epoch 104/150
Epoch 00104: val_loss did not improve from 0.00031
Epoch 105/150
Epoch 00105: val_loss did not improve from 0.00031
Epoch 106/150
Epoch 00106: val_loss did not improve from 0.00031
Epoch 107/150
Epoch 00107: val_loss did not improve from 0.00031
Epoch 108/150
Epoch 00108: val_loss improved from 0.00031 to 0.00031, saving model to models/lstm\my_best_model.hdf5
Epoch 109/150
Epoch 00109: val_loss did not improve from 0.00031
Epoch 110/150
Epoch 00110: val_loss did not improve from 0.00031
Epoch 111/150
Epoch 00111: val_loss did not improve from 0.00031
Epoch 112/150
Epoch 00112: val_loss improved from 0.00031 to 0.00028, saving model to models/lstm\my_best_model.hdf5
Epoch 113/150
Epoch 00113: val_loss did not improve from 0.00028
Epoch 114/150
Epoch 00114: val_loss did not improve

Epoch 135/150
Epoch 00135: val_loss improved from 0.00027 to 0.00026, saving model to models/lstm\my_best_model.hdf5
Epoch 136/150
Epoch 00136: val_loss did not improve from 0.00026
Epoch 137/150
Epoch 00137: val_loss did not improve from 0.00026
Epoch 138/150
Epoch 00138: val_loss did not improve from 0.00026
Epoch 139/150
Epoch 00139: val_loss did not improve from 0.00026
Epoch 140/150
Epoch 00140: val_loss did not improve from 0.00026
Epoch 141/150
Epoch 00141: val_loss did not improve from 0.00026
Epoch 142/150
Epoch 00142: val_loss did not improve from 0.00026
Epoch 143/150
Epoch 00143: val_loss did not improve from 0.00026
Epoch 144/150
Epoch 00144: val_loss did not improve from 0.00026
Epoch 145/150
Epoch 00145: val_loss did not improve from 0.00026
Epoch 146/150
Epoch 00146: val_loss did not improve from 0.00026
Epoch 147/150
Epoch 00147: val_loss did not improve from 0.00026
Epoch 148/150
Epoch 00148: val_loss did not improve from 0.00026
Epoch 149/150
Epoch 00149: val_loss di

In [142]:
# Training and validation loss per epoch, drawn in the same subplot
display_history(history, losses=['loss', 'val_loss'])

### Prediction

In [143]:
# Replace the last-epoch model with the checkpoint saved at the lowest validation loss
my_lstm = load_model(filepath)

In [144]:
# Predictions are in the normalized scale, like the targets the model was trained on
prediction = my_lstm.predict(x_test_timeseries)

In [145]:
# The first time_step test samples have no prediction (they only feed the first window),
# so they are dropped to line the candlesticks up with the predicted values
display_stock_prediction(x_test[time_step:, ], prediction)

### KPI

In [146]:
# Bring targets and predictions back to the un-normalized scale of the target column
y_true = inverse_transform(y_test, scaler, nb_features=5, index_column=output_index)
y_pred = inverse_transform(prediction, scaler, nb_features=5, index_column=output_index)

In [147]:
# RMSE between real and predicted values, computed on the un-normalized scale
error_lstm = rmse(y_true, y_pred)
print('LSTM error:', error_lstm)

LSTM error: 0.36626092094121737
