<img src='http://hilpisch.com/taim_logo.png' width="350px" align="right">

# Reinforcement Learning

## OpenAI Gym (Gymnasium)

Dr Yves J Hilpisch | The AI Machine

http://aimachine.io | http://twitter.com/dyjh

## CartPole

In [None]:
!git clone https://github.com/tpq-classes/rl_for_finance.git
import sys
sys.path.append('rl_for_finance')


In [None]:
import warnings; warnings.simplefilter('ignore')

In [None]:
import gymnasium as gym
import random
import numpy as np
import pandas as pd
from pylab import plt
from IPython import display
plt.style.use('seaborn-v0_8')
np.random.seed(100)

In [None]:
import tensorflow.compat.v1 as tf
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import Adam
from sklearn.metrics import accuracy_score
tf.random.set_random_seed(100)
tf.logging.set_verbosity(tf.logging.ERROR)

In [None]:
from tensorflow.python.framework.ops import disable_eager_execution
disable_eager_execution()

In [None]:
# Create the CartPole-v1 environment once; it is kept in the module-level
# name ``env`` that the agent class defined further down reads directly.
env = gym.make('CartPole-v1')

## Setting Seeds

In [None]:
def set_seeds(seed=100):
    """Seed the random number sources used by the environment and agent.

    Seeds Python's ``random`` module, NumPy's global generator, the
    action space of the global ``env`` (used for exploratory
    ``env.action_space.sample()`` calls) and the environment's own
    generator (used to draw initial states on ``env.reset()``).

    Parameters
    ----------
    seed : int, default 100
        Seed value applied to every generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    # Gymnasium removed ``env.seed``. The action space has its own
    # generator, and the environment generator is seeded through
    # ``reset``; later ``reset()`` calls without a seed keep using it.
    env.action_space.seed(seed)
    env.reset(seed=seed)

## Q Learning

See https://keon.io/deep-q-learning/ for background on deep Q-learning.

In [None]:
from collections import deque

In [None]:
class DQLAgent:
    """Deep Q-learning agent with experience replay.

    The agent interacts with the module-level ``env`` object. It keeps
    a bounded replay memory of transitions and a small dense network
    that maps a state to one estimated action value per action.

    Parameters
    ----------
    finish : bool, default False
        If True, ``learn`` stops as soon as the running average reward
        exceeds 197.5.
    """

    def __init__(self, finish=False):
        self.finish = finish
        self.epsilon = 1.0  # initial epsilon
        self.epsilon_min = 0.01  # minimal epsilon
        self.epsilon_decay = 0.995  # epsilon decay
        self.gamma = 0.95  # discount factor
        self.batch_size = 32  # batch size for replay
        self.max_treward = 0
        self.averages = list()
        self.memory = deque(maxlen=2000)  # fixed memory
        self.osn = env.observation_space.shape[0]
        self.model = self._build_model()

    def _build_model(self):
        """Build and compile the network that estimates action values.

        Two hidden layers of 24 ReLU units are followed by a linear
        output layer with one unit per action of ``env``.
        """
        model = Sequential()
        model.add(Dense(24, input_dim=self.osn,
                        activation='relu'))
        model.add(Dense(24, activation='relu'))
        # one output per action; linear activation because the network
        # estimates values (regression), not class probabilities
        model.add(Dense(env.action_space.n, activation='linear'))
        model.compile(loss='mse', optimizer='adam')
        return model

    def act(self, state):
        """Choose an action epsilon-greedily for ``state``.

        With probability ``self.epsilon`` a random action is sampled
        from the action space; otherwise the action with the highest
        predicted value is returned.
        """
        if random.random() <= self.epsilon:
            return env.action_space.sample()
        action = self.model.predict(state)
        return np.argmax(action)  # choose action with highest value

    def replay(self):
        """Fit the model on a random batch of stored transitions.

        For every sampled transition the target for the taken action is
        the reward, plus the discounted maximum predicted value of the
        next state unless the transition ended the episode. Epsilon is
        decayed once per call while it is above ``epsilon_min``.
        """
        batch = random.sample(self.memory, self.batch_size)
        for state, action, reward, next_state, done in batch:
            if not done:
                reward += self.gamma * np.amax(
                    self.model.predict(next_state)[0])
            target = self.model.predict(state)
            # only the value of the action actually taken is updated
            target[0, action] = reward
            self.model.fit(state, target, epochs=1,
                           verbose=False)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def learn(self, episodes):
        """Train the agent for at most ``episodes`` episodes.

        After every finished episode the total reward, the running
        average and the maximum so far are printed in place, and the
        average is appended to ``self.averages``. One replay step is
        run per episode once the memory holds more than one batch.
        """
        trewards = []
        # Defined up front so the early-stop check below cannot fail
        # when an episode hits the step cap without ending.
        av = 0.0
        for e in range(1, episodes + 1):
            state, info = env.reset()
            state = np.reshape(state, [1, self.osn])
            for step in range(5000):
                action = self.act(state)
                next_state, reward, done, trunc, info = env.step(action)
                next_state = np.reshape(next_state,
                                        [1, self.osn])
                # Only ``done`` (termination) is stored: a truncated
                # episode did not reach a terminal state, so replay
                # should still bootstrap from ``next_state``.
                self.memory.append([state, action, reward,
                                    next_state, done])
                state = next_state
                # The episode is over on termination *or* truncation;
                # stepping past a truncation would inflate the reward.
                if done or trunc:
                    treward = step + 1
                    trewards.append(treward)
                    # Fixed divisor of 25: the value understates the
                    # average until 25 episodes have been recorded.
                    av = sum(trewards[-25:]) / 25
                    self.averages.append(av)
                    self.max_treward = max(self.max_treward, treward)
                    templ = 'episode: {:4d}/{} | treward: {:4d} | '
                    templ += 'av: {:5.1f} | max: {:4d}'
                    print(templ.format(e, episodes, treward, av,
                                       self.max_treward), end='\r')
                    break
            if av > 197.5 and self.finish:
                break
            if len(self.memory) > self.batch_size:
                self.replay()

    def test(self, episodes):
        """Run ``episodes`` evaluation episodes with the greedy policy.

        Actions are always the ones with the highest predicted value,
        so the result is not distorted by leftover exploration.

        Returns
        -------
        list of int
            Total reward (number of steps) of every finished episode.
        """
        trewards = []
        for e in range(1, episodes + 1):
            state, info = env.reset()
            for step in range(1001):
                state = np.reshape(state, [1, self.osn])
                action = np.argmax(self.model.predict(state)[0])
                next_state, reward, done, trunc, info = env.step(action)
                state = next_state
                # Treat truncation as the end of the episode as well,
                # so every episode contributes exactly one result.
                if done or trunc:
                    treward = step + 1
                    trewards.append(treward)
                    print('episode: {:4d}/{} | treward: {:4d}'
                          .format(e, episodes, treward), end='\r')
                    break
        return trewards

In [None]:
# Seed the random number generators before the agent (and its model) is
# created. finish=True makes learn() stop early once the running average
# reward exceeds its threshold.
set_seeds(100)
agent = DQLAgent(finish=True)

In [None]:
# Upper bound on the number of training episodes.
episodes = 1000

In [None]:
# Train the agent; per-episode progress is printed in place on one line.
agent.learn(episodes)

In [None]:
# Plot the running average reward recorded during training together
# with a cubic polynomial fit that shows the overall trend.
fig, ax = plt.subplots(figsize=(10, 6))
x = range(len(agent.averages))
coeffs = np.polyfit(x, agent.averages, deg=3)
y = np.polyval(coeffs, x)
ax.plot(agent.averages, label='moving average')
ax.plot(x, y, 'r--', label='regression')
ax.set_xlabel('episodes')
ax.set_ylabel('total reward')
ax.legend();

In [None]:
# Evaluate the trained agent over 100 episodes and keep the total rewards.
trewards = agent.test(100)

In [None]:
# Average total reward over the recorded test episodes.
sum(trewards) / len(trewards)

<img src='http://hilpisch.com/taim_logo.png' width="350px" align="right">

<br><br><br><a href="http://tpq.io" target="_blank">http://tpq.io</a> | <a href="http://twitter.com/dyjh" target="_blank">@dyjh</a> | <a href="mailto:ai@tpq.io">ai@tpq.io</a>