<img src="http://hilpisch.com/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>

# Reinforcement Learning for Finance

**Chapter 02 &mdash; Deep Q-Learning**

&copy; Dr. Yves J. Hilpisch

<a href="http://tpq.io" target="_blank">http://tpq.io</a> | <a href="http://twitter.com/dyjh" target="_blank">@dyjh</a> | <a href="mailto:team@tpq.io">team@tpq.io</a>

### Please use the "Python 3.10, Tensorflow 2.10" kernel.

## CartPole

### The Game Environment 

In [None]:
# Fetch the course repository into the working directory (shell command).
!git clone https://github.com/tpq-classes/rl_4_finance.git
import sys
# Append the cloned directory to the module search path so its modules can be imported.
sys.path.append('rl_4_finance')


In [1]:
import gymnasium as gym

In [2]:
# Create the CartPole-v1 environment that the following cells inspect.
env = gym.make('CartPole-v1')

In [3]:
# Show the action space of the environment (a discrete space, per the output below).
env.action_space

Discrete(2)

In [4]:
# Number of distinct actions available in the action space.
env.action_space.n  # <1>

2

In [5]:
# Draw ten random actions from the action space.
[env.action_space.sample() for _ in range(10)]  # <1>

[0, 1, 1, 0, 0, 1, 0, 1, 1, 1]

In [6]:
# Show the observation space: lower bounds, upper bounds, shape and dtype of a state.
env.observation_space

Box([-4.8               -inf -0.41887903        -inf], [4.8               inf 0.41887903        inf], (4,), float32)

In [7]:
# Shape of a single observation (state vector).
env.observation_space.shape  # <2>

(4,)

In [8]:
# Reset the environment with a fixed seed; returns the initial observation and an info dict.
env.reset(seed=100)  # <1>

(array([ 0.03349816,  0.0096554 , -0.02111368, -0.04570484], dtype=float32),
 {})

In [9]:
# Take action 0; the result is a 5-tuple that RandomAgent.play unpacks as
# (state, reward, done, trunc, info).
env.step(0)  # <2>

(array([ 0.03369127, -0.18515752, -0.02202777,  0.24024247], dtype=float32),
 1.0,
 False,
 False,
 {})

In [10]:
# Take action 1 from the state reached by the previous step.
env.step(1)  # <2>

(array([ 0.02998812,  0.01027205, -0.01722292, -0.05930644], dtype=float32),
 1.0,
 False,
 False,
 {})

In [11]:
class RandomAgent:
    """Baseline agent that plays CartPole with randomly sampled actions.

    Attributes:
        env: The ``CartPole-v1`` environment the agent interacts with.
        trewards: Number of steps of each episode played by the most recent
            call to :meth:`play` (one entry per episode).
    """

    def __init__(self):
        self.env = gym.make('CartPole-v1')

    def play(self, episodes=1, max_steps=99):
        """Play episodes with random actions and record their lengths.

        Args:
            episodes: Number of episodes to play.
            max_steps: Upper bound on the number of steps per episode. The
                default of 99 matches the previously hard-coded limit.

        Side effects:
            Replaces ``self.trewards`` with a fresh list holding one step
            count per episode.
        """
        self.trewards = list()
        for _ in range(episodes):
            self.env.reset()
            steps = 0  # stays 0 only if max_steps < 1
            for steps in range(1, max_steps + 1):
                action = self.env.action_space.sample()
                state, reward, done, trunc, info = self.env.step(action)
                # An episode is over when the environment reports either
                # termination or truncation.
                if done or trunc:
                    break
            # Record every episode, including those cut off by the step cap;
            # dropping them would bias the average and could leave the list
            # empty.
            self.trewards.append(steps)

In [12]:
# Instantiate the random-action baseline agent.
ra = RandomAgent()

In [13]:
# Play 15 episodes with random actions.
ra.play(15)

In [14]:
# Step counts recorded by the play() call above.
ra.trewards

[32, 16, 15, 21, 40, 25, 28, 21, 14, 12, 18, 17, 11, 9, 15]

In [15]:
# Average of the recorded step counts, rounded to two decimals.
round(sum(ra.trewards) / len(ra.trewards), 2)  # <1>

19.6

In [16]:
import os
import random
import warnings
import numpy as np
warnings.simplefilter('ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
from tensorflow import keras
from collections import deque
from keras.layers import Dense
from keras.models import Sequential

In [17]:
# Show the TensorFlow version in use (the notebook header asks for the 2.10 kernel).
tf.__version__

'2.10.0'

In [18]:
# Turn off TensorFlow's eager execution before any model is created below.
# NOTE(review): this imports from a private module path; the public equivalent
# appears to be tf.compat.v1.disable_eager_execution() — confirm before switching.
from tensorflow.python.framework.ops import disable_eager_execution
disable_eager_execution()  # <1>

In [19]:
# Module-level Adam optimizer (legacy implementation) with learning rate 0.0005;
# DQLAgent._create_model compiles the Q-network with this object.
opt = keras.optimizers.legacy.Adam(learning_rate=0.0005)  # <2>

In [20]:
# Seed every random number generator the notebook code draws from, not only
# TensorFlow: DQLAgent.act and DQLAgent.replay use Python's `random` module
# (random.random / random.sample).
random.seed(100)
# NumPy's global generator is seeded as well for any NumPy-based randomness.
np.random.seed(100)
tf.random.set_seed(100)
# NOTE(review): the environment's own randomness is presumably seeded
# separately (e.g. env.reset(seed=...)) — confirm if full reproducibility is needed.

In [21]:
# deque?

In [22]:
class DQLAgent:
    """Deep Q-learning agent for CartPole: hyperparameters, memory and Q-network."""

    def __init__(self):
        # Exploration schedule: initial rate, multiplicative decay factor and floor.
        self.epsilon = 1.0
        self.epsilon_decay = 0.9975
        self.epsilon_min = 0.1
        # Replay memory for past transitions, kept as a plain, unbounded list.
        self.memory = []
        # Number of transitions sampled per replay step.
        self.batch_size = 32
        # Discount factor applied to the estimated future reward.
        self.gamma = 0.9
        # Bookkeeping: per-episode totals (bounded) and the best total so far.
        self.trewards = deque(maxlen=2000)
        self.max_treward = 0
        # Build the network first, then the environment (same order as before).
        self._create_model()
        self.env = gym.make('CartPole-v1')

    def _create_model(self):
        """Build and compile the Q-network and store it in ``self.model``.

        The network maps a 4-dimensional input to 2 linear outputs through
        two hidden ReLU layers of 24 units each. It is compiled with MSE loss
        and the module-level optimizer ``opt``.
        """
        layers = [
            Dense(24, activation='relu', input_dim=4),
            Dense(24, activation='relu'),
            Dense(2, activation='linear'),
        ]
        self.model = Sequential(layers)
        self.model.compile(loss='mse', optimizer=opt)

In [23]:
class DQLAgent(DQLAgent):
    """Extends the agent with an epsilon-greedy policy and experience replay."""

    def act(self, state):
        """Choose an action for ``state`` with an epsilon-greedy policy.

        Args:
            state: Current state, passed to ``self.model.predict`` as is
                (``learn`` supplies it reshaped to ``[1, 4]``).

        Returns:
            A random action sampled from the environment's action space with
            probability ``self.epsilon``; otherwise the index of the largest
            model output for ``state``.
        """
        if random.random() < self.epsilon:
            return self.env.action_space.sample()  # explore
        return np.argmax(self.model.predict(state)[0])  # exploit

    def replay(self):
        """Update the model on a random batch of stored transitions.

        For every sampled transition the target for the taken action is the
        observed reward, plus the discounted maximum predicted value of the
        next state unless the transition was terminal. Afterwards epsilon is
        decayed, never dropping below ``self.epsilon_min``.

        Raises:
            ValueError: From ``random.sample`` if fewer than
                ``self.batch_size`` transitions are stored.
        """
        batch = random.sample(self.memory, self.batch_size)
        for state, action, next_state, reward, done in batch:
            if not done:
                # Bootstrap with the discounted best predicted value of the next state.
                reward += self.gamma * np.amax(
                    self.model.predict(next_state)[0])
            # Keep the current predictions and overwrite only the taken action's target.
            target = self.model.predict(state)
            target[0, action] = reward
            self.model.fit(state, target, epochs=2, verbose=False)
        if self.epsilon > self.epsilon_min:
            # Clamp so the final decay step cannot undershoot the configured floor.
            self.epsilon = max(self.epsilon_min,
                               self.epsilon * self.epsilon_decay)

In [24]:
class DQLAgent(DQLAgent):
    """Extends the agent with the training loop."""

    def learn(self, episodes):
        """Interact with the environment for ``episodes`` episodes and train.

        Each transition is appended to ``self.memory``. When an episode ends
        (terminated or truncated), its step count is recorded and a progress
        line is printed. After each episode ``replay`` is called once, as
        soon as the memory holds more than ``self.batch_size`` transitions.

        Args:
            episodes: Number of training episodes.
        """
        for episode in range(1, episodes + 1):
            observation, _ = self.env.reset()
            state = np.reshape(observation, [1, 4])
            for step in range(1, 5000):
                action = self.act(state)
                observation, reward, done, trunc, _ = self.env.step(action)
                next_state = np.reshape(observation, [1, 4])
                # Only `done` is stored, so truncated transitions still bootstrap in replay().
                transition = [state, action, next_state, reward, done]
                self.memory.append(transition)
                state = next_state
                if not (done or trunc):
                    continue
                # Episode finished: update the statistics and report progress.
                self.trewards.append(step)
                self.max_treward = max(self.max_treward, step)
                message = (f'episode={episode:4d} | treward={step:4d}'
                           f' | max={self.max_treward:4d}')
                print(message, end='\r')
                break
            if len(self.memory) > self.batch_size:
                self.replay()
        print()

In [25]:
class DQLAgent(DQLAgent):
    """Extends the agent with a greedy evaluation loop."""

    def test(self, episodes):
        """Play ``episodes`` episodes greedily and print each episode's length.

        Actions are always the argmax of the model output (no exploration).
        The step count of every finished episode is printed on one line,
        separated by spaces.

        Args:
            episodes: Number of test episodes.
        """
        for _ in range(episodes):
            observation, _info = self.env.reset()
            state = np.reshape(observation, [1, 4])
            for step in range(1, 5001):
                q_values = self.model.predict(state)[0]
                action = np.argmax(q_values)
                observation, reward, done, trunc, _info = self.env.step(action)
                state = np.reshape(observation, [1, 4])
                if not (done or trunc):
                    continue
                print(step, end=' ')
                break

In [26]:
# Instantiate the deep Q-learning agent (builds the model and the environment).
agent = DQLAgent()

In [27]:
# Train the agent for up to 1,000 episodes and report the wall-clock time.
# NOTE(review): the saved output below ends in a KeyboardInterrupt around
# episode 613, so the run was not completed in this saved state.
%time agent.learn(1000)

episode= 613 | treward= 233 | max= 500

KeyboardInterrupt: 

In [None]:
# Current exploration rate of the agent after training.
agent.epsilon

In [None]:
# Run 15 greedy test episodes; the length of each episode is printed.
agent.test(15)

<img src="http://hilpisch.com/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>

<a href="http://tpq.io" target="_blank">http://tpq.io</a> | <a href="http://twitter.com/dyjh" target="_blank">@dyjh</a> | <a href="mailto:team@tpq.io">team@tpq.io</a>