## Import Libraries

In [1]:
from keras.models import Sequential
from keras.layers import Dense, LSTM
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

Using TensorFlow backend.


## Read Data

In [2]:
_open = pd.read_csv("data/open.csv", index_col="DATE", parse_dates=["DATE"])
high = pd.read_csv("data/high.csv", index_col="DATE", parse_dates=["DATE"])
low = pd.read_csv("data/low.csv", index_col="DATE", parse_dates=["DATE"])
close = pd.read_csv("data/close.csv", index_col="DATE", parse_dates=["DATE"])
vol = pd.read_csv("data/vol.csv", index_col="DATE", parse_dates=["DATE"])

In [3]:
close.head()

Unnamed: 0_level_0,CASH,F_AD,F_BO,F_BP,F_C,F_CC,F_CD,F_CL,F_CT,F_DX,...,F_RY,F_SH,F_SX,F_TR,F_EB,F_VF,F_VT,F_VW,F_GD,F_F
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-02,,78440.0,20130.0,85406.25,17662.5,19360.0,80065.0,60370.0,38750.0,91143.0,...,16936250.0,25820.0,92690.0,39312.5,250837.5,127890.0,148370.0,112210.0,110675.0,251825.0
2018-01-03,,78510.0,20340.0,84931.25,17650.0,19070.0,79860.0,61970.0,39055.0,91466.0,...,16923750.0,26130.0,93810.0,39587.5,250837.5,127970.0,148600.0,112215.0,111812.5,251825.0
2018-01-04,,78780.0,20322.0,85137.5,17550.0,19050.0,80215.0,61890.0,39625.0,91151.0,...,17035000.0,26290.0,94290.0,39775.0,250837.5,127940.0,148680.0,112210.0,111850.0,251825.0
2018-01-05,,78760.0,20256.0,85231.25,17562.5,18950.0,80685.0,61590.0,39005.0,91243.0,...,17062500.0,26520.0,94440.0,39737.5,250837.5,127960.0,148660.0,112210.0,110925.0,251825.0
2018-01-08,,78530.0,20124.0,85237.5,17362.5,19140.0,80620.0,61900.0,39070.0,91670.0,...,16931250.0,26630.0,94470.0,39475.0,250837.5,128040.0,148870.0,112225.0,111012.5,251825.0


In [4]:
# Extract one column and convert it into a series
def convert_to_series(data: pd.DataFrame, column: str):
    col = data[column]
    return pd.Series(col)

# Take out one column, in this case the E-mini S&P 500 Index, just for example
open_data = convert_to_series(_open, "F_ES")
high_data = convert_to_series(high, "F_ES")
low_data = convert_to_series(low, "F_ES")
close_data = convert_to_series(close, "F_ES")
vol_data = convert_to_series(vol, "F_ES")

close_data.head()

DATE
2018-01-02    134700.0
2018-01-03    135462.5
2018-01-04    136250.0
2018-01-05    137125.0
2018-01-08    137350.0
Name: F_ES, dtype: float64

## Generate Indicators

In [5]:
indicators = pd.DataFrame(index=close_data.index)

##### Daily Returns

In [6]:
indicators["DAILY RETURNS"] = close_data.pct_change()
indicators.head()

Unnamed: 0_level_0,DAILY RETURNS
DATE,Unnamed: 1_level_1
2018-01-02,
2018-01-03,0.005661
2018-01-04,0.005813
2018-01-05,0.006422
2018-01-08,0.001641


##### Spread

In [7]:
indicators["SPREAD"] = high_data - low_data
indicators.head()

Unnamed: 0_level_0,DAILY RETURNS,SPREAD
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-02,,1075.0
2018-01-03,0.005661,1100.0
2018-01-04,0.005813,1025.0
2018-01-05,0.006422,975.0
2018-01-08,0.001641,600.0


##### Volume

In [8]:
indicators["VOLUME"] = vol_data
indicators.head()

Unnamed: 0_level_0,DAILY RETURNS,SPREAD,VOLUME
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-01-02,,1075.0,966768.0
2018-01-03,0.005661,1100.0,1077020.0
2018-01-04,0.005813,1025.0,1139096.0
2018-01-05,0.006422,975.0,1088652.0
2018-01-08,0.001641,600.0,872508.0


##### Volatility

In [9]:
def volatility(close: pd.Series, lookback: int):
    vol = close.rolling(lookback).std(ddof=1)
    return vol * np.sqrt(252)

indicators["VOLATILITY"] = volatility(close=close_data, lookback=21)
indicators.head()

Unnamed: 0_level_0,DAILY RETURNS,SPREAD,VOLUME,VOLATILITY
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-01-02,,1075.0,966768.0,
2018-01-03,0.005661,1100.0,1077020.0,
2018-01-04,0.005813,1025.0,1139096.0,
2018-01-05,0.006422,975.0,1088652.0,
2018-01-08,0.001641,600.0,872508.0,


##### Moving Average Crossover

In [10]:
def mac(close: pd.Series, slow_periods: int, fast_periods: int):
    data = pd.DataFrame(index=close.index)
    
    data["slow_sma"] = close.rolling(slow_periods).mean()
    data["fast_sma"] = close.rolling(fast_periods).mean()
    data["fast_sma_delta"] = data["fast_sma"].pct_change()
 
    # Returns 1 for golden cross, -1 for death cross, and 0 otherwise
    def crossover(slow_sma, fast_sma, fast_sma_delta):
        if fast_sma > slow_sma and fast_sma_delta > 0:
            return 1
        elif fast_sma < slow_sma and fast_sma_delta < 0:
            return -1
        else:
            return 0
        
    return data.apply(lambda row: crossover(row["slow_sma"], row["fast_sma"], row["fast_sma_delta"]),
                      axis=1)

indicators["MAC"] = mac(close=close_data, slow_periods=50, fast_periods=15)
indicators.head()

Unnamed: 0_level_0,DAILY RETURNS,SPREAD,VOLUME,VOLATILITY,MAC
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-01-02,,1075.0,966768.0,,0
2018-01-03,0.005661,1100.0,1077020.0,,0
2018-01-04,0.005813,1025.0,1139096.0,,0
2018-01-05,0.006422,975.0,1088652.0,,0
2018-01-08,0.001641,600.0,872508.0,,0


##### Relative Strength Index

In [11]:
def rsi(close: pd.Series, lookback: int):
    
    returns = close.pct_change()
    
    assert lookback <= len(returns), f"Not enough rows to calculate RSI with lookback of {lookback} periods"
    
    # Set initial average gains and losses
    first_date = returns.index[lookback]
    initial_returns = returns[:first_date]
    initial_average_gain = initial_returns[initial_returns >= 0].mean()
    initial_average_loss = -initial_returns[initial_returns < 0].mean()
    average_gains = pd.Series(data=[initial_average_gain], index=[first_date], dtype=float)
    average_losses = pd.Series(data=[initial_average_loss], index=[first_date], dtype=float)
    
    # Set subsequent average gains and losses
    for date in returns.index[lookback + 1:]:
        current_returns = returns[date]        
    
        average_gains[date] = (average_gains[-1] * 13 + max(current_returns, 0)) / 14
        average_losses[date] = (average_losses[-1] * 13 - min(current_returns, 0)) / 14
        
    # Compute RSI
    rsi = pd.Series(index=average_gains.index, dtype=float)
    for date in rsi.index:
        try:
            ratio = average_gains[date] / average_losses[date]
            rsi[date] = 100 - 100 / (1 + ratio)
        except ZeroDivisionError:
            rsi[date] = 0
            
    return rsi

indicators["RSI"] = rsi(close=close_data, lookback=14)
indicators.head()

Unnamed: 0_level_0,DAILY RETURNS,SPREAD,VOLUME,VOLATILITY,MAC,RSI
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-02,,1075.0,966768.0,,0,
2018-01-03,0.005661,1100.0,1077020.0,,0,
2018-01-04,0.005813,1025.0,1139096.0,,0,
2018-01-05,0.006422,975.0,1088652.0,,0,
2018-01-08,0.001641,600.0,872508.0,,0,


In [12]:
indicators = indicators.dropna()
indicators.head()

Unnamed: 0_level_0,DAILY RETURNS,SPREAD,VOLUME,VOLATILITY,MAC,RSI
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-30,-0.010514,1975.0,2051677.0,39837.589818,0,58.722466
2018-01-31,0.001063,1337.5,1757674.0,36326.630211,0,59.235711
2018-02-01,-0.000973,1387.5,1594493.0,33035.625656,0,58.518186
2018-02-02,-0.023464,3787.5,2427988.0,31121.189526,0,44.515161
2018-02-05,-0.049597,8362.5,3865420.0,43301.48886,0,28.817731


## Build Predictors and Labels Data Structures

In [13]:
daily_returns_lagged = indicators["DAILY RETURNS"].shift(-1).dropna()

indicators["LABEL"] = daily_returns_lagged.map(lambda returns: 1 if returns >= 0 else 0)
indicators = indicators.dropna()

indicators.head()

Unnamed: 0_level_0,DAILY RETURNS,SPREAD,VOLUME,VOLATILITY,MAC,RSI,LABEL
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-01-30,-0.010514,1975.0,2051677.0,39837.589818,0,58.722466,1.0
2018-01-31,0.001063,1337.5,1757674.0,36326.630211,0,59.235711,0.0
2018-02-01,-0.000973,1387.5,1594493.0,33035.625656,0,58.518186,0.0
2018-02-02,-0.023464,3787.5,2427988.0,31121.189526,0,44.515161,0.0
2018-02-05,-0.049597,8362.5,3865420.0,43301.48886,0,28.817731,1.0


In [14]:
labels = indicators["LABEL"].to_numpy()
predictors = indicators.drop("LABEL", axis=1).to_numpy()

In [15]:
scaler = MinMaxScaler()
predictors = scaler.fit_transform(predictors)

In [16]:
# Function adds an additional dimension to the input array
# The new dimension would be the second one
def reshape_2d_to_3d(data: np.ndarray, size: int):
    assert len(predictors.shape) == 2, "Input array is not 2-dimensional"
    assert len(predictors) >= size, \
        f"There are not enough rows to introduce an additional dimension of size {size}"
    
    output = None
    for index in range(size, len(data) + 1):
        row = data[index - size: index]
        row = row.reshape(1, size, row.shape[1])
        output = row if output is None else np.concatenate((output, row), axis=0)
    return output

# Reshape predictors from a 2D array into a 3D array with the dimensions representing
# [samples, time steps, features]
def reshape_predictors_and_labels(predictors: np.ndarray, labels: np.ndarray, time_step: int):
    return reshape_2d_to_3d(data=predictors, size=time_step), labels[time_step - 1:]

predictors, labels = reshape_predictors_and_labels(predictors=predictors, labels=labels, time_step=5)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(predictors, labels, shuffle=False, test_size=0.2)

## Build LSTM Model

In [28]:
model = Sequential()

model.add(LSTM(100, activation="relu", input_shape=(predictors.shape[1], predictors.shape[2])))
model.add(Dense(1, activation="sigmoid"))

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_3 (LSTM)                (None, 100)               42800     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 101       
Total params: 42,901
Trainable params: 42,901
Non-trainable params: 0
_________________________________________________________________


In [57]:
model.fit(X_train, y_train, epochs=50, shuffle=False, verbose=0)

<keras.callbacks.callbacks.History at 0x7fe3dec40090>

In [58]:
model.evaluate(X_test, y_test, verbose=0)

[1.4596291780471802, 0.4791666567325592]

## Make Prediction

In [59]:
y_pred = model.predict(X_test)
y_pred = y_pred.reshape(y_pred.shape[0],)
y_pred = np.where(y_pred >= 0.5, 1, 0)

confusion_matrix(y_test, y_pred)

array([[17, 24],
       [26, 29]])

In [60]:
accuracy_score(y_test, y_pred)

0.4791666666666667