<img src="https://hilpisch.com/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>

# Reinforcement Learning for Finance

**Chapter 09 &mdash; Optimal Execution**

&copy; Dr. Yves J. Hilpisch

<a href="https://tpq.io" target="_blank">https://tpq.io</a> | <a href="https://twitter.com/dyjh" target="_blank">@dyjh</a> | <a href="mailto:team@tpq.io">team@tpq.io</a>

## The Imports

In [None]:
!git clone https://github.com/tpq-classes/rl_4_finance.git
import sys
sys.path.append('rl_4_finance')


In [None]:
import math
import random
import numpy as np
import pandas as pd
from pylab import plt, mpl

In [None]:
from numpy.random import default_rng

## Execution Environment

In [None]:
class observation_space:
    """Minimal observation-space stand-in that only exposes ``shape``."""

    def __init__(self, n):
        """Store a one-dimensional shape tuple with ``n`` entries."""
        dims = (n,)
        self.shape = dims

In [None]:
class action_space:
    """Minimal action-space stand-in backed by Python's ``random`` module."""

    def __init__(self, n):
        """Remember the size ``n`` passed by the environment."""
        self.n = n

    def seed(self, seed):
        """Seed the global ``random`` generator that ``sample`` draws from."""
        random.seed(seed)

    def sample(self):
        """Return one draw from ``random.random()``, i.e. a float in [0, 1)."""
        draw = random.random()
        return draw

In [None]:
class Execution:
    """Environment for an order-execution problem on a discrete time grid.

    The constructor only stores the parameters and creates the
    observation/action space helpers; price simulation and episode
    state are set up by methods added to the class later in the file.
    """

    def __init__(self, T, N, S0, sigma, X, eta, gamma, lamb):
        """Store the model parameters.

        Parameters
        ----------
        T : total time horizon
        N : number of time steps (``dt`` is ``T / N``)
        S0 : initial price of the simulated path
        sigma : volatility used in the simulation and in the ``er`` term
        X : total position to be traded
        eta : coefficient of the ``pc`` cost term
        gamma : coefficient of the ``tc`` cost term
        lamb : coefficient of the ``er`` term
        """
        # Time grid.
        self.T, self.N = T, N
        self.dt = T / N
        # Price process parameters.
        self.S0, self.sigma = S0, sigma
        # Position size and cost coefficients.
        self.X = X
        self.eta, self.gamma, self.lamb = eta, gamma, lamb
        # Episode counter, incremented on every reset.
        self.episode = 0
        # NOTE(review): the state built in _get_state has 4 entries while
        # the observation space is declared with 5 -- confirm which is meant.
        self.observation_space = observation_space(5)
        self.osn = self.observation_space.shape[0]
        self.action_space = action_space(1)

In [None]:
class Execution(Execution):
    def _simulate_data(self, seed=None):
        """Simulate one price path of ``N + 1`` points and store it in ``self.S``.

        Parameters
        ----------
        seed : int or None
            Seed for ``numpy.random.default_rng``. If the instance has a
            ``seed_`` attribute, that value overrides this argument.

        Returns
        -------
        numpy.ndarray
            The simulated path; ``S[0]`` equals ``self.S0``.
        """
        # Use the instance's own horizon and step count; the original read
        # the module-level globals T and N, ignoring the constructor values.
        self.dt = self.T / self.N
        if hasattr(self, 'seed_'):
            seed = self.seed_
        rng = default_rng(seed=seed)
        self.S = np.zeros(self.N + 1)
        self.S[0] = self.S0
        # Loop-invariant pieces of the log-return.
        # NOTE(review): the drift term carries a positive sign
        # (+sigma**2 / 2); confirm this is intended rather than -sigma**2 / 2.
        drift = (self.sigma ** 2 / 2) * self.dt
        vol = np.sqrt(self.dt)
        for t in range(1, self.N + 1):
            dZ = rng.normal(0, vol)
            self.S[t] = self.S[t - 1] * np.exp(drift + self.sigma * dZ)
        return self.S

In [None]:
class Execution(Execution):
    def _get_state(self):
        """Return the current state array and an empty info dict.

        The state is ``[S[bar], X, X_, x]``: current price, total position,
        position still to be traded, and the size of the last trade.
        """
        St = self.S[self.bar]
        return np.array([St, self.X, self.X_, self.x]), {}

    def seed(self, seed=None):
        """Fix the seed used by ``_simulate_data`` for subsequent resets.

        The value is stored as ``seed_`` (the attribute ``_simulate_data``
        looks for). Storing it as ``seed`` would overwrite this method on
        the instance and leave the simulation unseeded.
        """
        self.seed_ = seed

    def reset(self):
        """Start a new episode and return ``(state, info)``.

        Resets the bar counter, the remaining position and the cost
        bookkeeping, simulates a fresh price path and builds the state.
        """
        self.bar = 0
        self.x = 0
        self.treward = 0
        self.episode += 1
        self.X_ = self.X
        # Size the trade schedule from the instance, not the global N.
        self.xt = np.zeros(self.N + 1)
        self.tc, self.pc, self.er = 0, 0, 0
        # One zero row so the first diff() in step() has a baseline.
        self.tec = pd.DataFrame(
            {'tc': 0, 'pc': 0, 'er': 0}, index=[0])
        self._simulate_data()
        self.state, info = self._get_state()
        return self.state, info

In [None]:
class Execution(Execution):
    def step(self, action):
        """Trade ``action`` units in the next bar and return the transition.

        The three cost terms (``tc``, ``pc``, ``er``) are recomputed from
        the whole trade schedule ``xt`` and appended to ``self.tec``; the
        per-step cost is the difference to the previous row.

        Returns
        -------
        tuple
            ``(state, reward, done, False, {})`` where ``reward`` is the
            negative of the incremental cost plus the position still
            to be traded, and ``done`` is True once bar ``N`` is reached.
        """
        # Advance time and book the trade.
        self.bar += 1
        self.X_ -= action
        self.xt[self.bar] = action
        self.x = action

        # Cumulative cost terms over the schedule traded so far.
        rates = self.xt / self.dt
        tc = np.sum(self.gamma * rates ** 2 * self.dt)
        filled = np.cumsum(self.xt)
        pc = np.sum(self.eta * filled * self.xt)
        tail_sums = np.cumsum(self.xt[::-1])[::-1]
        er = self.lamb * self.sigma ** 2 * np.sum(
            (tail_sums / self.dt) ** 2 * self.dt)

        # Append the new totals and take the change versus the last row.
        row = pd.DataFrame({'tc': tc, 'pc': pc, 'er': er}, index=[0])
        self.tec = pd.concat([self.tec, row])
        delta = self.tec.diff().fillna(0).iloc[-1]
        self.tc = delta['tc']
        self.pc = delta['pc']
        self.er = delta['er']

        self.state, _ = self._get_state()
        cost = delta.sum()
        # The penalty is the remaining position on every step, final or not.
        pen = self.X_
        done = self.bar == self.N
        return self.state, -(cost + pen), done, False, {}

In [None]:
# Model parameters passed to the Execution constructor below.
T = 10  # total time horizon
N = 10  # number of time steps (dt = T / N)
S0 = 100  # initial price of the simulated path
sigma = 0.25  # volatility used in the simulation and in the `er` term
X = 10000  # total position to be traded
eta = 0.001  # coefficient of the `pc` cost term in step()
gamma = 0.01  # coefficient of the `tc` cost term in step()
lamb = 2e-2  # coefficient of the `er` term in step()

In [None]:
# Build the environment from the parameters defined above.
execution = Execution(T, N, S0, sigma, X, eta, gamma, lamb)

In [None]:
# Start an episode; the state is [price, X, remaining position, last trade].
s0, _ = execution.reset()
s0

In [None]:
# Trade 1,000 units; step() returns (state, reward, done, False, {}).
execution.step(1000)

In [None]:
# Trade another 2,000 units in the next bar.
execution.step(2000)

In [None]:
# Trade a further 500 units in the following bar.
execution.step(500)

## Example Strategies

### High Risk Aversion Case

In [None]:
# Fresh environment using the lamb value defined earlier (2e-2).
execution = Execution(T, N, S0, sigma, X, eta, gamma, lamb=lamb)

In [None]:
# Front-loaded trade schedule; index 0 is the start bar with no trade.
# The trades sum to 9,999 units.
xt = np.array([   0., 2984., 2098., 1477., 1042.,  739.,  530.,  387.,  294.,
        237.,  211.])
xt

In [None]:
# Begin a new episode before replaying the schedule.
execution.reset()

In [None]:
# Replay the schedule, skipping the leading zero at index 0.
for x in xt[1:]:
    execution.step(x)
    # print(execution.step(x))

In [None]:
# Last row of tec: the tc, pc and er totals after the final step.
execution.tec.iloc[-1]

In [None]:
# Sum of the three terms for this schedule.
execution.tec.iloc[-1].sum()

### Low Risk Aversion Case

In [None]:
# Same setup, but with a much smaller lamb (1e-4 instead of 2e-2).
execution = Execution(T, N, S0, sigma, X, eta, gamma, lamb=1e-4)

In [None]:
# Nearly uniform trade schedule; index 0 is the start bar with no trade.
# The trades sum to 10,000 units.
xt_ = np.array([   0., 1018., 1012., 1007., 1003.,  999.,  996.,  993.,  992.,
        990.,  990.])
xt_

In [None]:
# Begin a new episode before replaying the schedule.
execution.reset()

In [None]:
# Replay the schedule, skipping the leading zero at index 0.
for x in xt_[1:]:
    execution.step(x)
    # print(execution.step(x))

In [None]:
# Last row of tec: the tc, pc and er totals after the final step.
execution.tec.iloc[-1]

In [None]:
# Sum of the three terms for this schedule.
execution.tec.iloc[-1].sum()

<img src="https://hilpisch.com/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>

<a href="https://tpq.io" target="_blank">https://tpq.io</a> | <a href="https://twitter.com/dyjh" target="_blank">@dyjh</a> | <a href="mailto:team@tpq.io">team@tpq.io</a>