In [None]:
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import backend as K
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Input, Dense
import tensorflow.keras.utils as utils
import numpy as np
import pandas as pd
import math
import gym
import matplotlib.pyplot as plt
from multiprocessing import Process
from threading import Thread

%matplotlib nbagg

In [None]:
def custom_loss_function(reward, action_prob):
    """Policy-gradient loss: negative sum of reward-weighted log-probabilities.

    Keras calls a loss as ``loss(y_true, y_pred)``. Here the ``y_true`` slot
    carries the per-action reward/advantage weights and the ``y_pred`` slot
    carries the probabilities output by the network.

    Args:
        reward: Tensor of weights, expected to be broadcastable against
            ``action_prob``.
        action_prob: Tensor of action probabilities.

    Returns:
        Scalar tensor equal to ``-sum(log(action_prob) * reward)``.
    """
    # Keep probabilities away from exactly 0: log(0) is -inf, and -inf * 0
    # (a zero weight) evaluates to NaN, which would poison every gradient.
    safe_prob = K.clip(action_prob, K.epsilon(), 1.0)
    weighted_log_prob = K.log(safe_prob) * reward
    return -K.sum(weighted_log_prob)

In [None]:
def create_actor_model():
    """Build and compile the policy (actor) network.

    The network takes a 4-value input, passes it through two 32-unit ReLU
    layers and ends in a 2-way softmax. It is compiled with the Adam
    optimizer and ``custom_loss_function``.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    actor_net = Sequential([
        Input(shape=(4,)),
        Dense(32, activation='relu'),
        Dense(32, activation='relu'),
        Dense(2, activation='softmax'),
    ])
    actor_net.compile(optimizer='adam', loss=custom_loss_function)
    return actor_net

In [None]:
def create_critic_model():
    """Build and compile the value (critic) network.

    The network takes a 4-value input, passes it through two 32-unit ReLU
    layers and ends in a single linear output. It is compiled with the Adam
    optimizer and the Huber loss.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    critic_net = Sequential([
        Input(shape=(4,)),
        Dense(32, activation='relu'),
        Dense(32, activation='relu'),
        Dense(1),
    ])
    critic_net.compile(optimizer='adam', loss="Huber")
    return critic_net

In [None]:
def take_probabilistic_action(model, state):
    """Sample an action from the model's predicted distribution for one state.

    Args:
        model: Object exposing ``predict``; it must return a batch whose first
            row holds one probability per action (e.g. the actor network).
        state: A single observation; it is reshaped to a batch of one before
            being passed to ``model.predict``.

    Returns:
        Tuple ``(action, probabilities)`` where ``action`` is the sampled
        action index and ``probabilities`` is the unmodified ``predict``
        output.
    """
    probabilities = model.predict(state.reshape(1, -1))

    # Network outputs are typically float32 and can miss summing to 1 by more
    # than np.random.choice tolerates, which raises ValueError. Renormalise
    # in float64 before sampling.
    distribution = np.asarray(probabilities[0], dtype=np.float64)
    distribution = distribution / distribution.sum()

    # Take the number of actions from the prediction rather than hard-coding 2.
    action = np.random.choice(len(distribution), p=distribution)
    return (action, probabilities)

In [None]:
def play_and_display(gym_env, model):
    """Play one episode with ``model`` while rendering every step.

    Args:
        gym_env: Environment providing ``reset``, ``render``, ``step`` and
            ``close``; ``step`` is unpacked as
            ``(state, reward, done, info)``.
        model: Policy model passed to ``take_probabilistic_action``.

    The environment is always closed when the function exits, including when
    an exception interrupts the episode.
    """
    try:
        state = gym_env.reset()
        done = False
        while not done:
            gym_env.render()
            action, _ = take_probabilistic_action(model, state)
            state, _reward, done, _info = gym_env.step(action)
    finally:
        # Release the render window even if the episode fails mid-way.
        gym_env.close()

In [None]:
def compute_discounted_reward(reward_history, discount_factor=0.99):
    """Compute normalised discounted returns for one episode.

    For each step ``t`` the return is
    ``G_t = r_t + discount_factor * G_{t+1}``, accumulated from the last step
    backwards. The returns are then shifted to zero mean and scaled to unit
    standard deviation.

    Args:
        reward_history: Sequence of per-step rewards in chronological order.
        discount_factor: Multiplier applied to the future return at each step.

    Returns:
        1-D ``numpy.ndarray`` (float64) with one normalised return per step.
        An empty input gives an empty array. If every return is identical
        (for example a single-step episode) the result is all zeros rather
        than NaN.
    """
    rewards = np.asarray(reward_history, dtype=np.float64)
    if rewards.size == 0:
        return rewards

    discounted = np.empty_like(rewards)
    running_sum = 0.0
    # Walk backwards so each step can reuse the return of the step after it.
    for step in range(rewards.size - 1, -1, -1):
        running_sum = rewards[step] + discount_factor * running_sum
        discounted[step] = running_sum

    centered = discounted - discounted.mean()
    spread = discounted.std()
    # Dividing by a zero standard deviation would turn every entry into NaN.
    return centered / spread if spread > 0 else centered

In [None]:
def format_rewards(action_history, reward_history, action_space=2):
    """Spread each step's reward onto the column of the action that was taken.

    Args:
        action_history: Action indices, one per step.
        reward_history: Array of per-step rewards; its ``shape[0]`` gives the
            number of steps.
        action_space: Number of possible actions (columns of the result).

    Returns:
        Array with one row per step and ``action_space`` columns: the step's
        reward in the column of the chosen action, zero elsewhere.
    """
    one_hot_actions = utils.to_categorical(action_history, num_classes=action_space)
    n_steps = reward_history.shape[0]
    # Repeat the reward vector once per action, then transpose to
    # (steps, actions) so it lines up with the one-hot matrix.
    rewards_per_action = np.full((action_space, n_steps), reward_history).T
    return one_hot_actions * rewards_per_action

In [None]:
def run_simulation(gym_env, actor, critic):
    """Play one full episode and record what happened at every step.

    Args:
        gym_env: Environment providing ``reset`` and ``step``; ``step`` is
            unpacked as ``(state, reward, done, info)``.
        actor: Policy model used to sample actions.
        critic: Value model whose ``predict`` output is recorded per state.

    Returns:
        Tuple ``(state_history, action_history, critic_history,
        reward_history)`` of equally long lists, one entry per step.
    """
    states, actions, value_estimates, rewards = [], [], [], []

    observation = gym_env.reset()
    finished = False
    while not finished:
        chosen_action, _ = take_probabilistic_action(actor, observation)
        # Record the critic's estimate for the state the action was taken in.
        value_estimates.append(critic.predict(observation.reshape(1, -1)))
        actions.append(chosen_action)
        states.append(observation)
        observation, step_reward, finished, _info = gym_env.step(chosen_action)
        rewards.append(step_reward)

    return (states, actions, value_estimates, rewards)

In [None]:
def train(gym_env, actor, critic):
    """Run one episode and apply one update to both the critic and the actor.

    Args:
        gym_env: Environment handed to ``run_simulation``.
        actor: Policy model, updated with advantage-weighted action targets.
        critic: Value model, updated towards the discounted returns.

    Returns:
        Tuple ``(actor_loss, critic_loss, total_reward)`` where
        ``total_reward`` is the sum of the episode's raw rewards.
    """
    states, actions, value_estimates, rewards = run_simulation(gym_env, actor, critic)
    returns = compute_discounted_reward(rewards)
    state_batch = np.array(states)

    # Critic: regress the state values towards the discounted returns.
    critic_loss = critic.train_on_batch(x=state_batch, y=returns)

    # Advantage: discounted return minus the estimate recorded during the
    # rollout (i.e. from before the critic update just above).
    baseline = np.array(value_estimates).flatten()
    advantage = returns - baseline

    # Actor: the advantage sits in the column of the action actually taken.
    actor_targets = format_rewards(actions, advantage)
    actor_loss = actor.train_on_batch(x=state_batch, y=actor_targets)

    return actor_loss, critic_loss, np.sum(rewards)

## Train the agent

In [None]:
# Build the CartPole environment and fresh actor / critic networks.
env = gym.make('CartPole-v0')
actor = create_actor_model()
critic = create_critic_model()
# Total reward of each training episode; the training loop appends to it.
total_reward_history = []

In [None]:
# Number of training episodes; also used as the x-axis extent of the live plot.
NUM_EPISODES = 500
# Window, in episodes, of the moving average drawn over the raw totals.
MOVING_AVERAGE_WINDOW = 10

fig = plt.figure()
ax = fig.add_subplot(111)
line1, = ax.plot(total_reward_history, color="blue", label="Total")
line2, = ax.plot([], color="red", label="Moving average")
ax.set_xlabel("Simulations")
ax.set_ylabel("Reward")
ax.set_xlim(0, NUM_EPISODES)
ax.set_ylim(0, 250)
ax.legend()

for i in range(NUM_EPISODES):
    actor_loss, critic_loss, total_reward = train(env, actor, critic)
    total_reward_history.append(total_reward)

    # Redraw both curves after every episode so progress is visible live.
    episodes = np.arange(len(total_reward_history))
    # A Series keeps the rolling mean 1-D, matching the x data.
    moving_average = pd.Series(total_reward_history).rolling(window=MOVING_AVERAGE_WINDOW).mean()
    line1.set_xdata(episodes)
    line1.set_ydata(total_reward_history)
    line2.set_xdata(episodes)
    line2.set_ydata(moving_average)
    fig.canvas.draw()
    fig.canvas.flush_events()

In [None]:
# Persist both networks so the load cell below can restore the actor.
actor.save("actor_model")
critic.save("critic_model")

## Load / Display

In [None]:
# Recreate the environment and reload the saved actor.
env = gym.make('CartPole-v0')
# Load without compiling, then compile explicitly with the custom loss
# (presumably so the custom loss need not be resolved at load time).
actor = keras.models.load_model("actor_model", compile=False)
actor.compile(optimizer='adam', loss=custom_loss_function)

In [None]:
# Render one episode played by the loaded actor.
play_and_display(env, actor)