In [None]:
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import backend as K
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Input, Dense
import tensorflow.keras.utils as utils
import numpy as np
import math
import gym
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib nbagg

In [None]:
def create_model():
    """Build and compile the policy network.

    The network takes a 4-value observation, passes it through two
    32-unit ReLU layers and ends in a 2-way softmax, i.e. one probability
    per action. It is compiled with the Adam optimizer and categorical
    cross-entropy loss.

    Returns:
        The compiled ``Sequential`` model.
    """
    policy_net = Sequential([
        Input(shape=(4,)),
        Dense(32, activation='relu'),
        Dense(32, activation='relu'),
        Dense(2, activation='softmax'),
    ])
    policy_net.compile(optimizer='adam', loss='categorical_crossentropy')
    return policy_net

In [None]:
def take_probabilistic_action(model, state):
    """Sample an action from the policy's predicted probability distribution.

    Args:
        model: Object whose ``predict`` accepts a ``(1, n_features)`` batch
            (and a ``verbose`` keyword) and returns one row of action
            probabilities.
        state: Observation for the current step; it is flattened into a
            single-row batch before prediction.

    Returns:
        The index of the sampled action (a numpy integer).
    """
    batch = np.asarray(state).reshape(1, -1)
    # verbose=0: this runs once per environment step, so any per-call
    # progress output from Keras would flood the notebook.
    probabilities = model.predict(batch, verbose=0)[0]
    # Size the action space from the model output instead of hard-coding 2,
    # so the helper keeps working if the policy head changes width.
    return np.random.choice(len(probabilities), p=probabilities)

In [None]:
def play_and_display(gym_env, model):
    """Play one rendered episode, sampling actions from ``model``.

    Args:
        gym_env: Environment exposing ``reset``, ``render``, ``step`` and
            ``close``; ``step`` is expected to return a 4-tuple
            ``(state, reward, done, info)``.
        model: Policy passed to ``take_probabilistic_action``.

    The environment is closed when the episode ends, including when an
    error interrupts it, so the render window is not left open.
    """
    state = gym_env.reset()
    done = False
    try:
        while not done:
            gym_env.render()
            action = take_probabilistic_action(model, state)
            state, _reward, done, _info = gym_env.step(action)
    finally:
        # Always release the renderer, even if predict/step raised.
        gym_env.close()

In [None]:
def compute_discounted_R(R, discount_rate=.99):
    """Compute normalized discounted returns for one episode.

    For every step ``t`` the discounted return is
    ``R[t] + discount_rate * R[t+1] + discount_rate**2 * R[t+2] + ...``.
    The returns are then centered on their mean and scaled by their
    standard deviation (a simple baseline).

    Args:
        R: Sequence of per-step rewards.
        discount_rate: Multiplicative discount applied per step.

    Returns:
        A float32 array with the same length as ``R``. An empty ``R`` gives
        an empty array. When all discounted returns are equal (for example
        a one-step episode) the result is all zeros rather than NaN.
    """
    discounted_r = np.zeros_like(R, dtype=np.float32)
    if discounted_r.size == 0:
        # Nothing to normalize; mean/std of an empty array would be NaN.
        return discounted_r

    running_add = 0
    # Walk backwards so each step reuses the return of the step after it.
    for t in reversed(range(len(R))):
        running_add = running_add * discount_rate + R[t]
        discounted_r[t] = running_add

    # use simple Baseline
    mean = discounted_r.mean()
    std = discounted_r.std()
    centered = discounted_r - mean
    if std > 0:
        return centered / std
    # Zero spread: dividing would yield NaN weights, so return the centered
    # (all-zero) values instead.
    return centered

In [None]:
def train(gym_env, model, duration=100, render=False):
    """Play one episode with the current policy, then run one training step.

    The episode's states are the inputs, the one-hot encoded sampled
    actions are the targets, and the normalized discounted returns are the
    per-sample weights.

    Args:
        gym_env: Environment exposing ``reset``, ``step``, ``render`` and
            ``close``; ``step`` is expected to return a 4-tuple
            ``(state, reward, done, info)``.
        model: Compiled Keras model used both to sample actions and to be
            trained; its last output dimension is taken as the number of
            actions.
        duration: Currently unused; kept so existing callers keep working.
        render: When true, render the environment at every step.

    Returns:
        A tuple ``(loss, total_reward)``: the value returned by
        ``train_on_batch`` and the sum of the episode's rewards.
    """
    state = gym_env.reset()
    state_history = []
    action_history = []
    reward_history = []

    done = False
    # simulation
    while not done:
        if render:
            gym_env.render()
        action = take_probabilistic_action(model, state)
        action_history.append(action)
        # Record the state the action was chosen in, before stepping.
        state_history.append(state)
        state, reward, done, _info = gym_env.step(action)
        reward_history.append(reward)
    gym_env.close()

    # Pin the one-hot width to the model's output size. Without
    # num_classes, to_categorical infers it from the largest action seen,
    # so an episode that never samples the last action would produce
    # targets with too few columns.
    targets = utils.to_categorical(
        action_history, num_classes=model.output_shape[-1])

    # update weights
    loss = model.train_on_batch(
        x = np.array(state_history),
        y = targets,
        sample_weight = compute_discounted_R(reward_history))

    return (loss, np.sum(reward_history))

In [None]:
# Shared notebook state used by the training and playback cells below.
env = gym.make('CartPole-v0')
# Fresh policy network built by create_model().
model = create_model()
# One total-reward entry is appended per training episode.
total_reward_history = []

In [None]:
# Run settings, named so the loop length and axis limits stay in sync.
N_EPISODES = 500
MOVING_AVG_WINDOW = 10
REWARD_AXIS_MAX = 250

fig = plt.figure()
ax = fig.add_subplot(111)
line1, = ax.plot(total_reward_history, color="blue", label="Total")
line2, = ax.plot([], color="red", label="Moving average")
ax.set_xlim(0, N_EPISODES)
ax.set_ylim(0, REWARD_AXIS_MAX)
ax.set_xlabel("Simulations")
ax.set_ylabel("Reward")
ax.legend()

for i in range(N_EPISODES):
    # One episode plus one weight update per iteration.
    loss, total_reward = train(env, model)
    total_reward_history.append(total_reward)

    # draw dynamic plot just because we can
    episodes = np.arange(len(total_reward_history))
    # 1-D rolling mean; the first MOVING_AVG_WINDOW - 1 entries are NaN
    # and are therefore not drawn.
    moving_average = (
        pd.Series(total_reward_history)
        .rolling(window=MOVING_AVG_WINDOW)
        .mean()
        .to_numpy()
    )
    line1.set_xdata(episodes)
    line1.set_ydata(total_reward_history)
    line2.set_xdata(episodes)
    line2.set_ydata(moving_average)
    fig.canvas.draw()
    fig.canvas.flush_events()

In [None]:
# Render one episode played by the current model.
play_and_display(env, model)

In [None]:
# Save the model under the relative path 'model'.
model.save('model')

## Load Model

In [None]:
# Replace the in-memory model with one loaded from disk.
# NOTE(review): this path ('01_CartpolePOC/model') differs from the path
# used by model.save('model') earlier in the notebook; whether both point
# to the same location depends on the working directory. TODO confirm.
model = keras.models.load_model('01_CartpolePOC/model')