<img src="http://hilpisch.com/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>

# Python for Finance Basics

&copy; Dr. Yves J. Hilpisch | The Python Quants GmbH

http://tpq.io | [training@tpq.io](mailto:training@tpq.io) | [@dyjh](http://twitter.com/dyjh)

## Reinforcement Learning

In [None]:
!git clone https://github.com/tpq-classes/pff_basics.git
import sys
sys.path.append('pff_basics')


In [None]:
import numpy as np
import pandas as pd
from pylab import plt
np.set_printoptions(suppress=True)
plt.style.use('seaborn-v0_8')
%config InlineBackend.figure_format = 'svg'

## `CartPole` Game

**Environment & Agents**

Topics:

**Reinforcement Learning**

* environment
* state
* agent
* action
* step
* reward/penalty
* objective
* policy
* episode


**Deep Q-Learning**

* reward function
* action policy
* representation
* deep neural network
* exploration/exploitation
* replay & policy update


## Gym Environment

In [None]:
import gymnasium as gym

In [None]:
# Create the CartPole-v1 environment. It is bound to the global name `env`,
# which the later cells and the DQLAgent class below read directly.
env = gym.make('CartPole-v1')

In [None]:
# Display the observation space. The agent below hard-codes 4-dimensional
# observations (input_dim=4 and reshapes to [1, 4]).
env.observation_space

In [None]:
# Display the action space the agent can choose from.
env.action_space

In [None]:
# Number of discrete actions; the agent's network below has 2 output units.
env.action_space.n

In [None]:
# Check whether 0 is a valid action in this action space.
env.action_space.contains(0)

In [None]:
# Check whether 1 is a valid action in this action space.
env.action_space.contains(1)

In [None]:
# Draw ten random actions from the action space and show them on one line,
# each followed by a single space.
print(*(env.action_space.sample() for _draw in range(10)), end=' ')

In [None]:
# Sample one random action, keep it in `a` for the step call further down,
# and display it.
a = env.action_space.sample()
a

In [None]:
# Start a new episode. The result is a two-element tuple whose first entry is
# the initial observation (it is unpacked as `state, _` in DQLAgent.learn).
env.reset()

In [None]:
# Apply the previously sampled action `a` for one step. The result is a
# five-element tuple (unpacked in DQLAgent.learn as
# next_state, reward, done, trunc, _).
env.step(a)

In [None]:
# Take one more step, this time with the fixed action 0.
env.step(0)

## DQL Agent

In [None]:
import random
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

In [None]:
class DQLAgent:
    """Deep Q-learning agent for the module-level ``env`` (CartPole).

    The agent keeps a replay memory of transitions, approximates the
    action-value function with a small dense network (4 inputs, 2 outputs)
    and follows an epsilon-greedy policy whose exploration rate decays
    after every replay step.

    Note: ``act`` and ``learn`` read the global ``env``; it must exist
    before they are called.
    """

    def __init__(self):
        """Set the hyperparameters and build the Q-network."""
        self.epsilon = 1.0           # probability of taking a random action
        self.epsilon_decay = 0.99    # multiplicative decay applied per replay
        self.epsilon_min = 0.1       # floor for the exploration rate
        # Transitions stored as [state, action, next_state, reward, done].
        self.memory = list()
        self.batch_size = 32         # transitions sampled per replay step
        self.gamma = 0.95            # discount factor for future rewards
        self._create_model()

    def _create_model(self):
        """Build and compile the Q-network: 4 -> 24 -> 24 -> 2 (linear)."""
        self.model = Sequential()
        self.model.add(Dense(24, activation='relu', input_dim=4))
        self.model.add(Dense(24, activation='relu'))
        self.model.add(Dense(2, activation='linear'))
        self.model.compile(loss='mse', optimizer='adam')

    def act(self, state):
        """Choose an action for ``state`` with an epsilon-greedy policy.

        Parameters
        ----------
        state : array-like of shape (1, 4)
            Current observation, already reshaped to a batch of one.

        Returns
        -------
        int
            A random action with probability ``epsilon``, otherwise the
            action with the highest predicted Q-value.
        """
        if random.random() < self.epsilon:
            return env.action_space.sample()
        q = self.model(tf.convert_to_tensor(state, dtype=tf.float32),
                       training=False)
        return int(tf.argmax(q[0]).numpy())

    def replay(self):
        """Train the Q-network on one random mini-batch from memory.

        Does nothing when fewer than ``batch_size`` transitions are stored
        (``random.sample`` would otherwise raise ``ValueError``). After a
        training step, ``epsilon`` is decayed but never below
        ``epsilon_min``.
        """
        if len(self.memory) < self.batch_size:
            return

        batch = random.sample(self.memory, self.batch_size)

        # Memory layout per transition: [state, action, next_state, reward, done]
        states = np.vstack([b[0] for b in batch]).astype(np.float32)       # (B, 4)
        actions = np.array([b[1] for b in batch], dtype=np.int32)          # (B,)
        next_states = np.vstack([b[2] for b in batch]).astype(np.float32)  # (B, 4)
        rewards = np.array([b[3] for b in batch], dtype=np.float32)        # (B,)
        dones = np.array([b[4] for b in batch], dtype=np.bool_)            # (B,)

        # Q(s, :) and Q(s', :) from the current network.
        q_states = self.model(states, training=False).numpy()      # (B, 2)
        q_next = self.model(next_states, training=False).numpy()   # (B, 2)

        # Best attainable value in the next state, per sample.
        max_q_next = np.max(q_next, axis=1)                        # (B,)

        # Only the Q-value of the action actually taken is moved towards
        # the target; terminal transitions get no bootstrapped future value.
        targets = q_states.copy()
        not_done = 1.0 - dones.astype(np.float32)
        updated = rewards + not_done * (self.gamma * max_q_next)
        targets[np.arange(self.batch_size), actions] = updated

        self.model.train_on_batch(states, targets)

        if self.epsilon > self.epsilon_min:
            # Clamp so the decay cannot undershoot the configured floor.
            self.epsilon = max(self.epsilon_min,
                               self.epsilon * self.epsilon_decay)

    def learn(self, episodes):
        """Interact with ``env`` for ``episodes`` episodes and train.

        Each episode runs for at most 200 steps. Every transition is stored
        in memory; when an episode ends early its length is printed. One
        replay step is performed after each episode once memory holds more
        than ``batch_size`` transitions.

        Parameters
        ----------
        episodes : int
            Number of episodes to play.
        """
        for _episode in range(1, episodes + 1):
            state, _ = env.reset()
            state = np.reshape(state, [1, 4])
            for f in range(1, 201):
                action = self.act(state)
                next_state, reward, done, trunc, _ = env.step(action)
                next_state = np.reshape(next_state, [1, 4])
                # Store `done` (termination) only, so a time-limit cut-off
                # is not treated as a terminal state in replay().
                self.memory.append([state, action, next_state, reward, done])
                state = next_state
                # Stop on truncation too: the environment must be reset
                # before it can be stepped again.
                if done or trunc:
                    print(f, end=' ')
                    break
            if len(self.memory) > self.batch_size:
                self.replay()

In [None]:
# Instantiate the agent; __init__ sets the hyperparameters and builds and
# compiles the Q-network.
agent = DQLAgent()

In [None]:
# agent.act()

In [None]:
# Train for 50 episodes. The step count of each episode that ends within the
# 200-step cap is printed on one line.
agent.learn(50)

In [None]:
# Inspect the exploration rate after training; replay() decays it from 1.0
# towards epsilon_min.
agent.epsilon

In [None]:
# agent.memory

In [None]:
# agent.replay()

<img src="http://hilpisch.com/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>

<a href="http://tpq.io" target="_blank">http://tpq.io</a> | <a href="mailto:training@tpq.io">training@tpq.io</a> | <a href="http://twitter.com/dyjh" target="_blank">@dyjh</a>