<img src='http://hilpisch.com/taim_logo.png' width="350px" align="right">

# Reinforcement Learning

## OpenAI Gym

Dr Yves J Hilpisch | The AI Machine

http://aimachine.io | http://twitter.com/dyjh

## Lunar Lander

In [None]:
!git clone https://github.com/tpq-classes/rl_for_finance.git
import sys
sys.path.append('rl_for_finance')


In [None]:
pip install pyvirtualdisplay

In [None]:
import gymnasium as gym
import random
import numpy as np
import pandas as pd
from pylab import plt

from pyvirtualdisplay import Display
# Keep the virtual display under its own name: the IPython import below binds
# the name `display`, which would otherwise overwrite the only reference to
# the started Display instance.
virtual_display = Display(visible=0, size=(1400, 900))
virtual_display.start()
from IPython import display
plt.ion()
%matplotlib inline

from collections import deque
plt.style.use('seaborn-v0_8')
import warnings; warnings.simplefilter('ignore')

In [None]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

## Environment

See https://gymnasium.farama.org/environments/box2d/lunar_lander/ (the code below uses the Gymnasium `LunarLander-v3` environment).

In [None]:
pip install swig gymnasium[box2d]

In [None]:
env = gym.make('LunarLander-v3', render_mode='rgb_array')

## Action Space

In [None]:
env.action_space  # type of action space

In [None]:
env.action_space.n  # number of actions

In [None]:
# actions: 0 = do nothing, 1 = fire left orientation engine, 2 = fire main engine, 3 = fire right orientation engine

In [None]:
env.action_space.sample()  # sample action

In [None]:
env.action_space.sample()  # sample action

In [None]:
[env.action_space.sample() for _ in range(10)]

## Observation Space

In [None]:
np.set_printoptions(precision=4, suppress=True)

In [None]:
env.observation_space  # type of observation space

In [None]:
env.observation_space.high.astype(np.float16) # upper bounds for observations

In [None]:
env.observation_space.low.astype(np.float16)  # lower bounds for observations

In [None]:
# Gymnasium's reset() returns an (observation, info) pair; unpack it so that
# `o` holds only the initial observation vector.
o, info = env.reset()
o

## Taking Action

The following visualizes the effect of a number of random actions taken. See https://gist.github.com/thomelane/79e97630ba46c45985a946cae4805885

In [None]:
a = env.action_space.sample()  # random action
a

In [None]:
r = env.step(a)  # taking action, capturing new observations
r  # (observation, reward, terminated, truncated, info)

In [None]:
# Render one episode driven by random actions, redrawing the frame in place.
env.reset()
img = plt.imshow(env.render())  # initialize bitmap embedding
for e in range(201):
    img.set_data(env.render())  # updating the data
    display.display(plt.gcf())
    display.clear_output(wait=True)
    a = env.action_space.sample()  # random action choice
    # a = 3  # constant action choice
    obs, rew, done, trunc, info = env.step(a)  # taking action
    if done or trunc:
        # The episode is over; stop here instead of stepping a finished
        # environment. The message is kept for early terminations only.
        if done and (e + 1) < 200:
            print('*** FAILED ***')
        break

## Setting Seeds

In [None]:
def set_seeds(seed=100):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator, TensorFlow,
    the sampler of ``env.action_space`` and the environment itself.

    Parameters
    ----------
    seed : int
        Seed value passed to all generators.

    Notes
    -----
    The environment's own generator is seeded through ``env.reset(seed=...)``,
    so calling this function also resets the module-level ``env``.
    """
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    env.action_space.seed(seed)  # affects env.action_space.sample() only
    # Seeding the action space does not seed the environment; in Gymnasium
    # that is done by passing the seed to reset().
    env.reset(seed=seed)

## Q Learning

In [None]:
class DQLAgent:
    """Deep Q-learning agent acting on the module-level environment ``env``.

    A network with one hidden layer maps an observation to one Q-value
    estimate per discrete action. Transitions are collected in a bounded
    replay memory, and after each training episode the network is refit on a
    random sample drawn from that memory.
    """

    def __init__(self):
        self.epsilon = 1.0  # initial exploration rate
        self.epsilon_min = 0.01  # lower bound for the exploration rate
        self.epsilon_decay = 0.995  # multiplicative decay per replay
        self.gamma = 0.95  # discount factor
        self.batch_size = 128  # number of transitions per replay
        self.max_treward = -1e6  # best total episode reward seen so far
        self.averages = list()  # moving average of total rewards
        self.memory = deque(maxlen=2000)  # bounded replay memory
        self.osn = env.observation_space.shape[0]  # observation size
        self.model = self._build_model()

    def _build_model(self):
        """Build and compile the Q-value network.

        Returns
        -------
        Sequential
            Compiled model with ``self.osn`` inputs and one linear output
            per action in ``env.action_space``.
        """
        model = Sequential()
        model.add(Dense(1024, input_dim=self.osn,
                        activation='relu'))
        # one output per discrete action; linear activation because the
        # outputs are value estimates (regression), not probabilities
        model.add(Dense(env.action_space.n, activation='linear'))
        model.compile(loss='mse',
                      optimizer=keras.optimizers.Adam(learning_rate=0.001))
        return model

    @staticmethod
    def _average(trewards, window=25):
        """Return the mean of the last ``window`` entries of ``trewards``.

        Divides by the number of entries actually available, so the value is
        a true mean even before ``window`` episodes have been recorded.
        Returns 0.0 for an empty sequence.
        """
        recent = trewards[-window:]
        if not recent:
            return 0.0
        return sum(recent) / len(recent)

    def _q_values(self, state):
        """Return the network's Q-value estimates for ``state`` as an array.

        ``state`` is expected in the ``(1, self.osn)`` layout produced by the
        ``np.reshape`` calls in ``learn`` and ``test``.
        """
        tensor = tf.convert_to_tensor(state, dtype=tf.float32)
        return self.model(tensor, training=False).numpy()

    def _greedy_action(self, state):
        """Return the index of the action with the highest estimated value."""
        return np.argmax(self._q_values(state))

    def act(self, state):
        """Choose an action epsilon-greedily.

        With probability ``self.epsilon`` a random action is sampled from
        ``env.action_space``; otherwise the greedy action is returned.
        """
        if random.random() <= self.epsilon:
            return env.action_space.sample()
        return self._greedy_action(state)

    def replay(self):
        """Refit the network on a random sample from the replay memory.

        Draws ``self.batch_size`` transitions, fits the model once per
        transition on its updated target and afterwards decays
        ``self.epsilon`` as long as it is above ``self.epsilon_min``.

        Raises
        ------
        ValueError
            Raised by ``random.sample`` if the memory holds fewer than
            ``self.batch_size`` transitions.
        """
        batch = random.sample(self.memory, self.batch_size)
        for state, action, reward, next_state, done in batch:
            # Reward shaping: subtract a quadratic penalty on the state.
            # NOTE(review): states are stored with shape (1, osn), so
            # state[:1] selects the entire row and the penalty is the mean
            # square of ALL observation components, not only the position
            # ("distance to origin"). Confirm whether a position-only slice
            # such as state[0, :2] was intended.
            reward -= np.mean(state[:1] ** 2)
            if not done:
                # bootstrap with the discounted best next-state value
                reward += self.gamma * np.amax(self._q_values(next_state))
            target = self._q_values(state)
            target[0, action] = reward  # only the taken action is updated
            self.model.fit(state, target, epochs=1,
                           verbose=False)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def learn(self, episodes):
        """Train the agent for at most ``episodes`` episodes.

        Each episode runs until the environment reports termination or
        truncation (or 5000 steps have been taken). Afterwards the episode
        statistics are recorded and printed, and a replay step is performed
        once the memory holds more than ``self.batch_size`` transitions.
        Training stops early after an episode whose total reward exceeds 200.

        Parameters
        ----------
        episodes : int
            Maximum number of training episodes.
        """
        trewards = []
        for e in range(1, episodes + 1):
            state, _ = env.reset()
            state = np.reshape(state, [1, self.osn])
            treward = 0
            for _ in range(5000):
                action = self.act(state)
                next_state, reward, done, trunc, info = env.step(action)
                next_state = np.reshape(next_state,
                                        [1, self.osn])
                # Only `done` (termination) is stored: a truncated episode
                # has no terminal state, so replay still bootstraps from it.
                self.memory.append([state, action, reward,
                                    next_state, done])
                state = next_state
                treward += float(reward)
                if done or trunc:
                    # stop at termination AND truncation; the environment
                    # must not be stepped again before the next reset
                    break
            trewards.append(treward)
            av = self._average(trewards)
            self.averages.append(av)
            self.max_treward = max(self.max_treward, treward)
            templ = 'episode: {:4d}/{} | treward: {:7.1f} | '
            templ += 'av: {:7.1f} | max: {:7.1f}'
            print(templ.format(e, episodes, treward, av,
                               self.max_treward), end='\r')
            if len(self.memory) > self.batch_size:
                self.replay()
            if treward > 200:
                break
        print()

    def test(self, episodes):
        """Run ``episodes`` evaluation episodes with the greedy policy.

        Actions are always the argmax of the network output, so the result
        does not depend on the current value of ``self.epsilon``.

        Parameters
        ----------
        episodes : int
            Number of evaluation episodes.

        Returns
        -------
        list of float
            Total reward of every evaluation episode.
        """
        trewards = []
        for e in range(1, episodes + 1):
            state, _ = env.reset()
            treward = 0
            for _ in range(1001):
                state = np.reshape(state, [1, self.osn])
                action = self._greedy_action(state)
                state, reward, done, trunc, info = env.step(action)
                treward += float(reward)
                if done or trunc:
                    break
            trewards.append(treward)
            print('episode: {:4d}/{} | treward: {:7.1f}'
                  .format(e, episodes, treward), end='\r')
        return trewards

In [None]:
set_seeds(100)
agent = DQLAgent()

In [None]:
# agent.epsilon = 0.1

In [None]:
episodes = 50 # 1000

In [None]:
%time agent.learn(episodes)

In [None]:
agent.epsilon

In [None]:
plt.figure(figsize=(10, 6))
x = range(len(agent.averages))
y = np.polyval(np.polyfit(x, agent.averages, deg=3), x)
plt.plot(agent.averages, label='moving average')
plt.plot(x, y, 'r--', label='regression')
plt.xlabel('episodes')
plt.ylabel('total reward')
plt.legend();

## Testing the Agent

In [None]:
trewards = agent.test(20)

In [None]:
sum(trewards) / len(trewards)

In [None]:
# Scan a range of seeds and record those for which the greedy policy ends the
# episode with the +100 terminal reward.
seeds = list()
for seed in range(10000, 15001, 100):
    env.action_space.seed(seed)
    # The greedy rollout never samples from the action space, so the seed has
    # to be given to the environment itself for the episode to be tied to it.
    state, _ = env.reset(seed=seed)
    for e in range(501):
        state = np.reshape(state, [1, agent.osn])
        a = np.argmax(agent.model(tf.convert_to_tensor(state, dtype=tf.float32), training=False).numpy())  # learned action
        state, reward, done, trunc, info = env.step(a)
        if done and reward == 100:
            print(f'*** FINISHED *** ({seed} | {e})'  + 20 * ' ', end='\r')
            seeds.append((seed, e))
            break
        elif done and reward == -100:
            print(f'*** FAILED *** ({e})' + 20 * ' ', end='\r')
            break
        elif done or trunc:
            # episode ended some other way; do not step a finished env
            break
    if not done:
        print(f'*** REACHED ITERATION MAX ({seed}) ***', end='\r')

In [None]:
seeds

In [None]:
# Render one greedy episode for a fixed seed, redrawing the frame in place.
env.action_space.seed(11500)
# Seed the environment itself; seeding only the action space has no effect on
# a greedy rollout, which never samples an action.
state, _ = env.reset(seed=11500)
img = plt.imshow(env.render())  # initialize bitmap embedding
for e in range(501):
    img.set_data(env.render())  # updating the data
    display.display(plt.gcf())
    display.clear_output(wait=True)
    state = np.reshape(state, [1, agent.osn])
    a = np.argmax(agent.model(tf.convert_to_tensor(state, dtype=tf.float32), training=False).numpy())  # learned action
    state, reward, done, trunc, info = env.step(a)
    if done and reward == 100:
        print(f'*** FINISHED ({e}) ***')
        break
    elif done and reward == -100:
        print(f'*** FAILED *** ({e})')
        break
    elif done or trunc:
        # episode ended some other way; do not step a finished env
        break
if not done:
    print('*** REACHED ITERATION MAX ***')

<img src='http://hilpisch.com/taim_logo.png' width="350px" align="right">

<br><br><br><a href="http://tpq.io" target="_blank">http://tpq.io</a> | <a href="http://twitter.com/dyjh" target="_blank">@dyjh</a> | <a href="mailto:ai@tpq.io">ai@tpq.io</a>