# Basic Concepts

## Reward Function
The reward function $R$ assigns a numerical reward to each state-action $(S, A)$ pair.

$$
R: S \times A  \to \mathbb{R}.
$$

## Action Policy

An action policy $Q$ assigns to each state $S$ and allowed action $A$ a numerical value. The numerical value is composed of the *immediate reward* of taking
action $A$ and the *discounted delayed reward* - given an optimal action taken in the subsequent state.

$$
Q: S \times A  \to \mathbb{R},
$$
$$
Q(S_t, A_t) = R(S_t, A_t) + \gamma\cdot \underset{a}{\text{max}}\ Q(S_{t+1},a).
$$

## Representation
In general, the optimal action policy $Q$ can not be specified in closed form (e.g. in the form of a table). Therefore, $Q$-learning relies in general on approximate representations for the optimal policy $Q$.

## Neural Network
Due to the approximation capabilities of neural networks ("Universal Approximation Theorems"), neural networks are typically used to represent optimal action policies $Q$. Features are the parameters that describe the state of the environment. Labels are values attached to each allowed action.

## Exploration
This refers to actions taken by an agent that are random in nature. The purpose is to explore random actions and their associated values beyond what the current optimal policy would dictate.

## Exploitation
This refers to actions taken in accordance with the current optimal policy.

## Replay
This refers to the (regular) updating of the optimal action policy given past and memorized experiences (by re-training the neural network).

## Important Variables

### `gamma`
The parameter gamma represents the discount factor by which delayed
rewards are taken into account.

### `epsilon`
The parameter epsilon defines the ratio with which the algorithm relies
on exploration as compared to exploitation.

### `epsilon_decay`
The parameter epsilon_decay specifies the rate at which epsilon is
reduced.

In [10]:
import warnings
import os
import math
import random
import numpy as np
import pandas as pd
from pylab import plt, mpl
plt.style.use('seaborn-v0_8')
mpl.rcParams['savefig.dpi'] = 300
mpl.rcParams['font.family'] = 'serif'
np.set_printoptions(precision=4, suppress=True)
os.environ['PYTHONHASHSEED'] = '0'
warnings.simplefilter('ignore')

# OpaAI Gym CartPole Game

In [11]:
import gym
env = gym.make('CartPole-v0')
# env.seed(100)
env.action_space.seed(100)

[100]

In [12]:
env.observation_space

Box([-4.8000e+00 -3.4028e+38 -4.1888e-01 -3.4028e+38], [4.8000e+00 3.4028e+38 4.1888e-01 3.4028e+38], (4,), float32)

In [13]:
env.observation_space.low.astype(np.float16)

array([-4.8  ,   -inf, -0.419,   -inf], dtype=float16)

In [14]:
env.observation_space.high.astype(np.float16)

array([4.8  ,   inf, 0.419,   inf], dtype=float16)

In [15]:
state = env.reset()
state

(array([-0.0375,  0.0237,  0.0234, -0.0275], dtype=float32), {})

# Deep Learning Approach

In [16]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '5'

In [17]:
import tensorflow as tf
from tensorflow import keras

In [18]:
from tensorflow.python.framework.ops import disable_eager_execution
disable_eager_execution()

In [19]:
from keras.layers import Dense, Dropout
from keras.models import Sequential
from sklearn.metrics import accuracy_score
import random
import numpy as np

In [20]:

def set_seeds(seed=100):
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    # env.seed(seed)
    env.action_space.seed(seed)


set_seeds(100)

In [21]:
class NNAgent:
    def __init__(self):
        self.max = 0
        self.scores = list()
        self.memory = list()
        self.model = self._build_model()

    def _build_model(self):
        model = Sequential()
        model.add(Dense(24, input_dim=4, activation='relu'))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy',
                      optimizer=keras.optimizers.legacy.RMSprop(learning_rate=0.001))

        return model

    def act(self, state):
        if random.random() <= 0.5:
            return env.action_space.sample()
        action = np.where(self.model.predict(state, batch_size=None)[0, 0] > 0.5, 1, 0)
        return action

    def train_model(self, state, action):
        self.model.fit(state, epochs=1, verbose=False)

    def learn(self, episodes):
        for e in range(1, episodes + 1):
            state = env.reset()[0]
            for _ in range(201):
                state = np.reshape(state, [1, 4])
                action = self.act(state)
                next_state, reward, done, trunc, info = env.step(action)
                if done:
                    score = _ + 1
                    self.scores.append(score)
                    self.max = max(score, self.max)
                    print('episode: {:4d}/{} | score: {:3d} | max: {:3d}'.format(e, episodes, score, self.max), end='\r')
                    break
                self.memory.append((state, action))
                self.train_model(state, action)
                state = next_state


agent = NNAgent()

In [22]:
episodes = 1000
agent.learn(episodes)

episode:   34/1000 | score:  17 | max:  22

episode: 1000/1000 | score:  16 | max:  45

In [23]:
accuracy_score(np.where(agent.model.predict(f) > 0.5, 1, 0), l)

NameError: name 'f' is not defined

In [None]:
plt.figure(figsize=(10, 6))
x = range(len(agent.averages))
y = np.polyval(np.polyfit(x, agent.averages, deg=3), x)
plt.plot(agent.averages, label='moving average')
plt.plot(x, y, 'r--', label='regression')
plt.xlabel('episodes')
plt.ylabel('total reward')
plt.legend()

NameError: name 'plt' is not defined

# Q Learning

In [24]:
from collections import deque


class DQLAgent:
    def __init__(self, gamma=0.95, hu=24, opt=keras.optimizers.legacy.Adam,
                 lr=0.001, finish=False):
        self.finish = finish
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.gamma = gamma
        self.batch_size = 32
        self.max_treward = 0
        self.averages = list()
        self.memory = deque(maxlen=2000)
        self.osn = env.observation_space.shape[0]
        self.model = self._build_model(hu, opt, lr)

    def _build_model(self, hu, opt, lr):
        model = Sequential()
        model.add(Dense(hu, input_dim=self.osn,
                        activation='relu'))
        model.add(Dense(hu, activation='relu'))
        model.add(Dense(env.action_space.n, activation='linear'))
        model.compile(loss='mse', optimizer=opt(learning_rate=lr))
        return model

    def act(self, state):
        if random.random() <= self.epsilon:
            return env.action_space.sample()
        action = self.model.predict(state)[0]
        return np.argmax(action)

    def replay(self):
        batch = random.sample(self.memory, self.batch_size)
        for state, action, reward, next_state, done in batch:
            if not done:
                reward += self.gamma * np.amax(
                    self.model.predict(next_state)[0])
            target = self.model.predict(state)
            target[0, action] = reward
            self.model.fit(state, target, epochs=1,
                           verbose=False)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def learn(self, episodes):
        trewards = []
        for e in range(1, episodes + 1):
            state = env.reset()[0]
            state = np.reshape(state, [1, self.osn])
            for _ in range(5000):
                action = self.act(state)
                next_state, reward, done, trunc, info = env.step(action)
                next_state = np.reshape(next_state,
                                        [1, self.osn])
                self.memory.append([state, action, reward,
                                    next_state, done])
                state = next_state
                if trunc:
                    treward = _ + 1
                    trewards.append(treward)
                    av = sum(trewards[-25:]) / 25
                    self.averages.append(av)
                    self.max_treward = max(self.max_treward, treward)
                    templ = 'episode: {:4d}/{} | treward: {:4d} | '
                    templ += 'av: {:6.1f} | max: {:4d}'
                    print(templ.format(e, episodes, treward, av,
                                       self.max_treward), end='\r')
                    break
            if av > 195 and self.finish:
                print()
                break
            if len(self.memory) > self.batch_size:
                self.replay()

    def test(self, episodes):
        trewards = []
        for e in range(1, episodes + 1):
            state = env.reset()[0]
            for _ in range(5001):
                state = np.reshape(state, [1, self.osn])
                action = np.argmax(self.model.predict(state)[0])
                next_state, reward, done, trunc, info = env.step(action)
                state = next_state
                if trunc:
                    treward = _ + 1
                    trewards.append(treward)
                    print('episode: {:4d}/{} | treward: {:4d}'
                          .format(e, episodes, treward), end='\r')
                    break
        return trewards