<img src='http://hilpisch.com/taim_logo.png' width="350px" align="right">

# Reinforcement Learning

## OpenAI Gym

Dr Yves J Hilpisch | The AI Machine

http://aimachine.io | http://twitter.com/dyjh

## CartPole

In [None]:
!git clone https://github.com/tpq-classes/rl_for_finance.git
import sys
sys.path.append('rl_for_finance')


In [None]:
import warnings; warnings.simplefilter('ignore')

In [None]:
import gymnasium as gym
import random
import numpy as np
import pandas as pd
from pylab import plt
from IPython import display
# Use the seaborn-v0_8 matplotlib style sheet for all plots.
plt.style.use('seaborn-v0_8')
# Seed NumPy's global random number generator.
np.random.seed(100)

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score

In [None]:
# Module-level environment; NNAgent accesses it directly as the global `env`
# (action sampling, reset and step).
env = gym.make('CartPole-v1')

## Setting Seeds

In [None]:
def set_seeds(seed=100):
    """Seed the random number generators the notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator,
    TensorFlow's global seed and the module-level Gymnasium ``env``
    (both the environment itself and its action space, which
    ``NNAgent.act`` samples from).

    Parameters
    ----------
    seed : int
        Seed value applied to every generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    # Replacement for the former ``env.seed(seed)`` call: Gymnasium seeds
    # the environment through ``reset`` and the action space separately.
    env.reset(seed=seed)
    env.action_space.seed(seed)

## Neural Network Agent

In [None]:
class NNAgent:
    """Epsilon-greedy agent backed by a small binary classifier.

    The network maps a 4-element state to a single sigmoid output that is
    thresholded at 0.5 to pick the action.  While learning, the network is
    fitted one sample at a time on the (state, action) pair of every step
    that did not end the episode.

    The class relies on the module-level ``env`` for action sampling,
    resetting and stepping.
    """

    def __init__(self):
        self.max = 0  # best episode score seen so far
        self.scores = []  # one score (number of steps) per episode
        self.memory = []  # (state, action) pairs of all trained steps
        self.epsilon = 0.5  # probability of taking a random action
        self.model = self._build_model()

    def _build_model(self):
        """Build and compile the 4 -> 64 -> 1 dense network."""
        model = Sequential()
        model.add(Dense(64, input_dim=4, activation='relu'))  # was 1024
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy',
                      optimizer=Adam(learning_rate=0.001))
        return model

    def act(self, state):
        """Choose an action for ``state``.

        With probability ``self.epsilon`` an action is sampled from
        ``env.action_space``; otherwise the model output is thresholded
        at 0.5 and returned as 0 or 1.
        """
        if random.random() <= self.epsilon:
            return env.action_space.sample()
        p = self.model(tf.convert_to_tensor(state, dtype=tf.float32), training=False)
        return int(p[0, 0] > 0.5)

    def train_model(self, state, action):
        """Run one training step on a single (state, action) sample."""
        x = state.astype(np.float32)
        y = np.array([[action]], dtype=np.float32)
        self.model.train_on_batch(x, y)

    def learn(self, episodes, max_steps=201):
        """Interact with ``env`` for ``episodes`` episodes and train online.

        Parameters
        ----------
        episodes : int
            Number of episodes to run.
        max_steps : int, optional
            Maximum number of steps per episode (default 201, the cap that
            was previously hard-coded).

        Every episode contributes exactly one entry to ``self.scores``:
        the number of steps taken until the environment reported
        termination or truncation, or ``max_steps`` if the episode was
        still running when the step cap was reached.  The step that ends
        an episode is not used for training.
        """
        for e in range(1, episodes + 1):
            state, info = env.reset()
            # Default outcome: the episode survived all ``max_steps`` steps.
            # Without this, such episodes would never be scored.
            score = max_steps
            for t in range(max_steps):
                state = np.reshape(state, [1, 4])
                action = self.act(state)
                next_state, reward, done, trunc, info = env.step(action)

                # ``env.step`` returns two separate end-of-episode flags;
                # either one finishes the episode.
                if done or trunc:
                    score = t + 1
                    break

                self.train_model(state, action)
                self.memory.append((state, action))
                state = next_state

            self.scores.append(score)
            self.max = max(score, self.max)
            print(f'episode: {e:4d}/{episodes} | score: {score:3d} | max: {self.max:3d}', end='\r')


In [None]:
# Seed the generators first: NNAgent() builds its model on construction.
set_seeds(100)
agent = NNAgent()

In [None]:
episodes = 10000  # number of episodes passed to agent.learn()

In [None]:
%time agent.learn(episodes)

In [None]:
# Mean of the recorded episode scores (a score is the number of steps
# taken in an episode).
sum(agent.scores) / len(agent.scores)  # average score

## Accuracy Score

In [None]:
# Each memory entry is (state, action) with the state stored in shape (1, 4);
# m[0][0] drops the leading axis so every row of f is one 4-element state.
f = np.array([m[0][0] for m in agent.memory])  # features
f

In [None]:
# The action the agent took in each stored state (including the randomly
# sampled exploratory actions), aligned row by row with f.
l = np.array([m[1] for m in agent.memory])  # labels
l

In [None]:
# Share of stored steps for which the model output, thresholded at 0.5,
# equals the action recorded in l.
accuracy_score(np.where(agent.model.predict(f) > 0.5, 1, 0), l)  # prediction accuracy

In [None]:
# Same thresholded-accuracy computation, but calling the model directly on a
# float32 tensor (training=False) instead of going through model.predict().
accuracy_score(np.where(agent.model(tf.convert_to_tensor(f, dtype=tf.float32), training=False).numpy() > 0.5, 1, 0), l)  # prediction accuracy

<img src='http://hilpisch.com/taim_logo.png' width="350px" align="right">

<br><br><br><a href="http://tpq.io" target="_blank">http://tpq.io</a> | <a href="http://twitter.com/dyjh" target="_blank">@dyjh</a> | <a href="mailto:ai@tpq.io">ai@tpq.io</a>