<img src='http://hilpisch.com/taim_logo.png' width="350px" align="right">

# AI-First Finance

**Features for Market Prediction**

Dr Yves J Hilpisch | The AI Machine

http://aimachine.io | http://twitter.com/dyjh

## Imports

For the `tpqoa` package see http://github.com/yhilpisch/tpqoa.

In [None]:
!git clone https://github.com/tpq-classes/ai_in_finance.git
import sys
sys.path.append('ai_in_finance')


In [None]:
!pip install git+https://github.com/yhilpisch/tpqoa

In [None]:
import math
import tpqoa
import numpy as np
import pandas as pd
from pylab import plt
plt.style.use('seaborn-v0_8')

In [None]:
import warnings
warnings.simplefilter('ignore')

## Data

In [None]:
# Parameters identifying the data set; they are passed to the history
# request and are also encoded in the name of the local cache file.
symbol, granularity, price = 'EUR_USD', 'M10', 'A'
start, end = '2019-01-01', '2019-03-27'
# adjust path
fn = f'/content/ai_in_finance/oanda_{symbol}_{start}_{end}_{granularity}_{price}.csv'
fn

In [None]:
%%time
try:
    # Prefer the locally cached CSV file.
    raw = pd.read_csv(fn, index_col=0, parse_dates=True)
except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError):
    # Cache file missing or unreadable: retrieve the data via the API and
    # cache it. Only file-access/parsing errors trigger this fallback, so a
    # keyboard interrupt or a programming error is no longer swallowed.
    api = tpqoa.tpqoa('../oanda.cfg')   # adapt path
    raw = api.get_history(symbol, start, end, granularity, price)
    raw.to_csv(fn)

In [None]:
raw.info()

In [None]:
# Work on a copy so that `raw` stays untouched.
# NOTE: the order in which columns are created here defines the order of
# `features` and of the lagged columns derived from it further below.
data = raw.copy()
# log return of the 'c' column (presumably the bar's close price -- confirm)
data['r'] = np.log(data['c'] / data['c'].shift(1))
# returns standardized with the mean/std of the complete sample
# NOTE(review): these statistics include rows that later end up in the
# validation and test sets -- confirm that this is intended
data['rs'] = (data['r'] - data['r'].mean()) / data['r'].std()
# binary direction of the return: 1 if positive, 0 otherwise
data['d'] = np.where(data['r'] > 0, 1, 0)
# differences between the 'o', 'h', 'l' and 'c' columns of the same row
data['c-o'] = data['c'] - data['o']
# 1 if 'c' is above 'o', 0 otherwise
data['u-d'] = np.where(data['c'] - data['o'] > 0, 1, 0)
data['h-l'] = data['h'] - data['l']
data['h-o'] = data['h'] - data['o']
data['o-l'] = data['o'] - data['l']
data['h-c'] = data['h'] - data['c']
data['c-l'] = data['c'] - data['l']
# rolling standard deviation of the returns (20 and 100 rows)
data['v1'] = data['r'].rolling(20).std()
data['v2'] = data['r'].rolling(100).std()
# rolling mean of 'c' (20 and 100 rows)
data['sma1'] = data['c'].rolling(20).mean()
data['sma2'] = data['c'].rolling(100).mean()
# rolling mean of the returns (5 and 20 rows)
data['mom1'] = data['r'].rolling(5).mean()
data['mom2'] = data['r'].rolling(20).mean()
# drop the rows left incomplete by the shift and the rolling windows
data.dropna(inplace=True)

In [None]:
# Feature candidates: every column of `data` except 'complete'.
features = data.columns.tolist()
features.remove('complete')
# features

In [None]:
ld = len(data)
ld

In [None]:
# row position separating the first 60% of the data (training plus
# validation) from the remaining rows (test)
split = int(len(data) * 0.6)
# number of rows at the end of the first part that are used for validation
val_size = int(split * 0.15)

In [None]:
# Chronological three-way split: the first `split` rows are divided into
# training rows and the trailing `val_size` validation rows; everything from
# `split` onwards is the test set.
# Explicit bounds are used instead of negative slices so that the split stays
# correct for val_size == 0 (where [-0:] would select everything and [:-0]
# nothing). Each part is an independent copy because lag columns are added
# to all three frames in place later on.
train = data.iloc[:split - val_size].copy()
val = data.iloc[split - val_size:split].copy()
test = data.iloc[split:].copy()

In [None]:
lags = 10

In [None]:
def gaussian(x):
    """Standardize ``x`` with its own mean and standard deviation.

    Returns a tuple ``(standardized, mean, std)`` so that the same moments
    can be re-applied to other data.
    """
    mu, sigma = x.mean(), x.std()
    standardized = (x - mu) / sigma
    return standardized, mu, sigma

In [None]:
def normalize_and_lag():
    """Add lagged feature columns to ``train``, ``val`` and ``test`` in place.

    For every name in ``features`` and every lag from 1 to ``lags`` a column
    ``<feature>_lag_<lag>`` is created. The features 'r', 'rs', 'd' and 'u-d'
    are only shifted; all others are additionally standardized with the mean
    and standard deviation of the shifted *training* column, which are then
    re-used for the validation and test columns. The created column names
    are collected, in creation order, in the global list ``cols``. Rows that
    contain NaN values afterwards are dropped from all three frames.
    """
    global cols
    cols = []
    shift_only = {'r', 'rs', 'd', 'u-d'}
    frames = (train, val, test)
    for feature in features:
        keep_raw = feature in shift_only
        for k in range(1, lags + 1):
            name = f'{feature}_lag_{k}'
            if keep_raw:
                for frame in frames:
                    frame[name] = frame[feature].shift(k)
            else:
                # moments come from the training data only
                train[name], mu, sigma = gaussian(train[feature].shift(k))
                for frame in (val, test):
                    frame[name] = (frame[feature].shift(k) - mu) / sigma
            cols.append(name)
    for frame in frames:
        frame.dropna(inplace=True)

In [None]:
normalize_and_lag()

In [None]:
len(cols)

In [None]:
train.head(5)

## Estimation

### Scikit-Learn

In [None]:
from sklearn.neural_network import MLPRegressor

In [None]:
# Regressor with two hidden layers of 128 units each and ReLU activation.
# Early stopping is switched on with a validation fraction of 0.1;
# random_state is fixed for reproducible results.
model = MLPRegressor(hidden_layer_sizes=(128, 128),
                      activation='relu',
                      learning_rate_init=0.001,
                      random_state=100,
                      max_iter=500,
                      validation_fraction=0.1,
                      shuffle=False,
                      early_stopping=True,
                      verbose=False)

In [None]:
%time model.fit(train[cols], train['r'])

In [None]:
# signal derived from the predicted value: +1 if it is positive, -1 otherwise
test['p'] = np.where(model.predict(test[cols]) > 0, 1, -1)

In [None]:
test['s'] = test['p'] * test['r']

In [None]:
test[['r', 's']].sum().apply(np.exp)

In [None]:
sum(test['p'].diff() != 0)

In [None]:
test['p'].value_counts()

In [None]:
test[['r', 's']].cumsum().apply(np.exp).plot(figsize=(10, 6));

### Keras

In [None]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l1, l2
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [None]:
np.random.seed(100)
tf.random.set_seed(100)

In [None]:
# Regression network: two hidden ReLU layers with 256 units, each followed
# by a dropout layer with rate 0.3, and a single linear output unit.
model = Sequential()

# first hidden layer; the input width is the number of lagged feature columns
model.add(Dense(256, activation='relu',
               # kernel_regularizer=l2(0.001),
                input_shape=(len(cols),)))
model.add(Dropout(0.3, seed=100))
model.add(Dense(256, activation='relu',
               # kernel_regularizer=l2(0.001)
               ))
model.add(Dropout(0.3, seed=100))
model.add(Dense(1, activation='linear'))

# mean absolute error is used both as the loss and as the reported metric
model.compile(optimizer='rmsprop', loss='mae', metrics=['mae'])

In [None]:
model.summary()

In [None]:
callbacks = [EarlyStopping(monitor='val_mae', patience=75)]

In [None]:
%%time
# Fit on the lagged training features. The return targets of both the
# training and the validation data are multiplied by 5000
# (presumably to bring the small log returns to a scale better suited for
# training -- confirm); model.evaluate below applies the same factor.
# Training stops early according to `callbacks`.
history = model.fit(train[cols], train['r'] * 5000,
                    epochs=250,
                    batch_size=32,
                    verbose=False,
                    validation_data=(val[cols], val['r'] * 5000),
                    callbacks=callbacks)

In [None]:
res = pd.DataFrame(history.history)

In [None]:
res.tail(3)

In [None]:
# training and validation MAE per epoch
to_plot = ['mae', 'val_mae']
ax = res[to_plot].plot(figsize=(10, 6), style=['--', '--', '-', '-'])
# quadratic least-squares fit through the validation MAE, drawn in red
x = np.arange(len(res))
reg = np.polyfit(x, res['val_mae'], deg=2)
plt.plot(x, np.polyval(reg, x), 'r');

In [None]:
model.evaluate(test[cols], test['r'] * 5000)

In [None]:
test['p'] = model(tf.convert_to_tensor(test[cols], dtype=tf.float32), training=False).numpy()
test['p'] = np.where(test['p'] > 0, 1, -1)

In [None]:
test['s'] = test['p'] * test['r']

In [None]:
test[['r', 's']].sum().apply(np.exp)

In [None]:
sum(test['p'].diff() != 0)

In [None]:
test['p'].value_counts()

In [None]:
test[['r', 's']].cumsum().apply(np.exp).plot(figsize=(10, 6));

## Classification

### Scikit-Learn

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
# Classifier with two hidden layers of 256 units each and ReLU activation.
# Early stopping is switched off here.
# NOTE(review): per the scikit-learn documentation validation_fraction is
# only used when early_stopping=True, so it should have no effect -- confirm.
model = MLPClassifier(hidden_layer_sizes=(256, 256),
                      activation='relu',
                      alpha=0.0001,
                      random_state=100,
                      max_iter=200,
                      validation_fraction=0.1,
                      shuffle=False,
                      early_stopping=False,
                      verbose=False)

In [None]:
%time model.fit(train[cols], train['d'])

In [None]:
test['p'] = model.predict(test[cols])
test['p'] = np.where(test['p'] > 0, 1, -1)

In [None]:
test['s'] = test['p'] * test['r']

In [None]:
test[['r', 's']].sum().apply(np.exp)

In [None]:
sum(test['p'].diff() != 0)

In [None]:
test['p'].value_counts()

In [None]:
test[['r', 's']].cumsum().apply(np.exp).plot(figsize=(10, 6));

### Keras

In [None]:
np.random.seed(100)
tf.random.set_seed(100)

In [None]:
# Classification network: two hidden ReLU layers with 128 units, each
# followed by a dropout layer with rate 0.3, and a single sigmoid output unit.
model = Sequential()

# first hidden layer; the input width is the number of lagged feature columns
model.add(Dense(128, activation='relu',
                # kernel_regularizer=l2(0.001),
                input_shape=(len(cols),)
               )
         )
model.add(Dropout(0.3, seed=100))
model.add(Dense(128, activation='relu',
                # kernel_regularizer=l2(0.001)
               )
         )
model.add(Dropout(0.3, seed=100))
model.add(Dense(1, activation='sigmoid'))

# binary cross-entropy loss; accuracy is reported as the metric
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [None]:
model.summary()

In [None]:
callbacks = [EarlyStopping(monitor='val_accuracy', patience=25)]

In [None]:
%%time
model.fit(train[cols], train['d'],
          epochs=250, batch_size=32, verbose=False,
          validation_data=(val[cols], val['d']),
          callbacks=callbacks);

In [None]:
res = pd.DataFrame(model.history.history)

In [None]:
res.tail(3)

In [None]:
res.plot(figsize=(10, 6), style=['--', '--', '-', '-']);

In [None]:
model.evaluate(test[cols], test['d'])

In [None]:
# model output above 0.5 becomes +1, everything else -1
# (the forward pass is run with training=False)
scores = model(tf.convert_to_tensor(test[cols], dtype=tf.float32), training=False).numpy()
test['p'] = np.where(scores > 0.5, 1, -1)

In [None]:
test['s'] = test['p'] * test['r']

In [None]:
test[['r', 's']].sum().apply(np.exp)

In [None]:
sum(test['p'].diff() != 0)

In [None]:
test['p'].value_counts()

In [None]:
test[['r', 's']].cumsum().apply(np.exp).plot(figsize=(10, 6));

## Feature Selection

### Selection

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

In [None]:
selector = SelectKBest(f_classif, k=50)

In [None]:
# Fit the selector on the lagged training features with the direction label
# 'd' as target. The result holds integer positions of the selected
# features; they refer to the columns of train[cols], i.e. to `cols`.
cols_sel = selector.fit(train[cols], train['d']).get_support(indices=True)

In [None]:
cols_sel

In [None]:
# `cols_sel` contains positions within `cols` (the selector was fitted on
# train[cols]). The names therefore have to be looked up in `cols`:
# train.columns starts with the un-lagged base columns, so indexing it with
# these positions would return shifted, wrong names -- possibly including
# same-bar columns such as 'r' or the label 'd' itself.
csel = pd.Index(cols)[cols_sel]
csel

### Scikit-Learn

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
# Classifier for the selected feature subset: two hidden layers of 128 units
# each with ReLU activation; early stopping is switched off.
# NOTE(review): per the scikit-learn documentation validation_fraction is
# only used when early_stopping=True, so it should have no effect -- confirm.
model = MLPClassifier(hidden_layer_sizes=(128, 128),
                      activation='relu',
                      learning_rate_init=0.001,
                      random_state=100,
                      max_iter=500,
                      validation_fraction=0.1,
                      shuffle=False,
                      early_stopping=False,
                      verbose=False)

In [None]:
%time model.fit(train[csel], train['d'])

In [None]:
test['p'] = model.predict(test[csel])
test['p'] = np.where(test['p'] > 0, 1, -1)

In [None]:
test['s'] = test['p'] * test['r']

In [None]:
test[['r', 's']].sum().apply(np.exp)

In [None]:
sum(test['p'].diff() != 0)

In [None]:
test['p'].value_counts()

In [None]:
test[['r', 's']].cumsum().apply(np.exp).plot(figsize=(10, 6));

### Keras

In [None]:
np.random.seed(100)
tf.random.set_seed(100)

In [None]:
# Classification network for the selected feature subset: two hidden ReLU
# layers with 128 units and a single sigmoid output unit. The dropout layers
# are commented out in this variant.
model = Sequential()

# first hidden layer; the input width is the number of selected features
model.add(Dense(128, activation='relu',
                input_shape=(len(csel),)
               )
         )
#model.add(Dropout(0.3, seed=100))
model.add(Dense(128, activation='relu',
               )
         )
#model.add(Dropout(0.3, seed=100))
model.add(Dense(1, activation='sigmoid'))

# binary cross-entropy loss; accuracy is reported as the metric
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [None]:
# ModelCheckpoint?

In [None]:
# Instead of early stopping, checkpoint the model during training:
# with save_best_only=True and save_weights_only=True only the weights are
# written, and only when the monitored 'val_accuracy' improves. The file is
# loaded again further below via model.load_weights().
# callbacks = [EarlyStopping(monitor='val_acc', patience=50)]
callbacks = [ModelCheckpoint(filepath='/content/ai_in_finance/.weights.h5',
                             monitor='val_accuracy',
                             verbose=0,
                             save_best_only=True,
                             save_weights_only=True,
                             mode='auto')]

In [None]:
%%time
model.fit(train[csel], train['d'],
          epochs=125, batch_size=32, verbose=False,
          validation_data=(val[csel], val['d']),
          callbacks=callbacks
         );

### Regular Results

In [None]:
res = pd.DataFrame(model.history.history)

In [None]:
res.tail(3)

In [None]:
res[['accuracy', 'val_accuracy']].plot(figsize=(10, 6), style=['--', '--', '-', '-']);

In [None]:

model.evaluate(test[csel], test['d'])

In [None]:
test['p'] = np.where(model.predict(test[csel]) > 0.5, 1, 0)
test['p'] = np.where(test['p'] > 0, 1, -1)

In [None]:
test['s'] = test['p'] * test['r']

In [None]:
test[['r', 's']].sum().apply(np.exp)

In [None]:
sum(test['p'].diff() != 0)

In [None]:
test['p'].value_counts()

In [None]:
test[['r', 's']].cumsum().apply(np.exp).plot(figsize=(10, 6));

### Best Weights (Validation)

In [None]:
model.load_weights('/content/ai_in_finance/.weights.h5')

In [None]:

model.evaluate(test[csel], test['d'])

In [None]:
test['p'] = np.where(model.predict(test[csel]) > 0.5, 1, 0)
test['p'] = np.where(test['p'] > 0, 1, -1)

In [None]:
test['s'] = test['p'] * test['r']

In [None]:
test[['r', 's']].sum().apply(np.exp)

In [None]:
sum(test['p'].diff() != 0)

In [None]:
test['p'].value_counts()

In [None]:
test[['r', 's']].cumsum().apply(np.exp).plot(figsize=(10, 6));

<img src='http://hilpisch.com/taim_logo.png' width="350px" align="right">