In [None]:
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import backend as K
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Input, Dense
import tensorflow.keras.utils as utils
import numpy as np
import pandas as pd
import math
import gym
import matplotlib.pyplot as plt
import ipywidgets as widgets
from multiprocessing import Process
from threading import Thread

%matplotlib nbagg

Normalerweise sieht die Loss-Funktion so aus:
```
def custom_loss_function(y_true, y_pred):
```
In unserem Fall verwenden wir y_true, um den Reward an die Trainingsroutine zu übergeben.

y_pred ist die Action-Probability, welche während des Trainings aus dem State berechnet wird.

In [None]:
def custom_loss_function(reward, action_prob):
    """Policy-gradient loss: the mean of ``-log(action_prob) * reward``.

    Keras invokes a loss as ``loss(y_true, y_pred)``. Here the ``y_true`` slot
    carries the reward signal into the training step and ``y_pred`` is the
    action-probability output of the network.

    Args:
        reward: Tensor of reward weights, multiplied element-wise with the
            negative log-probabilities.
        action_prob: Action probabilities predicted by the model.

    Returns:
        Scalar tensor holding the mean of ``-log(action_prob) * reward``.
    """
    # Clip exact zeros away before taking the log: log(0) is -inf, and
    # -inf * 0 (the reward entry of an action that was not taken) is NaN,
    # which would propagate into every weight on the next update.
    safe_prob = K.clip(action_prob, K.epsilon(), 1.0)
    return K.mean(-K.log(safe_prob) * reward)

In [None]:
def create_model():
    """Build and compile the policy network.

    Layout: 4 input features -> Dense(32, relu) -> Dense(32, relu)
    -> Dense(2, softmax). The model is compiled with the Adam optimizer and
    ``custom_loss_function`` as its loss.

    Returns:
        The compiled ``Sequential`` model.
    """
    policy_net = Sequential([
        Input(shape=(4,)),
        Dense(32, activation='relu'),
        Dense(32, activation='relu'),
        Dense(2, activation='softmax'),
    ])
    policy_net.compile(loss=custom_loss_function, optimizer='adam')
    return policy_net

In [None]:
def take_probabilistic_action(model, state):
    """Sample an action from the policy distribution predicted for ``state``.

    Args:
        model: Object with a ``predict(batch)`` method that returns one row of
            action probabilities per input row.
        state: A single observation; it is flattened into a one-row batch.

    Returns:
        Tuple ``(action, probabilities)``: the sampled action index and the
        unmodified ``predict`` output for the one-row batch.
    """
    probabilities = model.predict(np.asarray(state).reshape(1, -1))

    # Renormalise in float64 so the vector handed to np.random.choice sums to
    # one regardless of rounding in the network output.
    p = np.asarray(probabilities[0], dtype=np.float64)
    p = p / p.sum()

    # The number of actions comes from the policy output itself instead of a
    # hard-coded 2, so the helper keeps working if the output layer changes.
    action = np.random.choice(len(p), p=p)
    return (action, probabilities)

In [None]:
def play_and_display(gym_env, model):
    """Play one rendered episode with the given model, then close the environment.

    Actions are sampled with ``take_probabilistic_action``. The environment is
    closed even when rendering, prediction or stepping raises, so a failed run
    does not leave the render window or other environment resources open.

    Args:
        gym_env: Environment exposing ``reset``, ``render``, ``step`` and
            ``close``; ``step`` is expected to return a 4-tuple
            ``(state, reward, done, info)``.
        model: Policy model passed on to ``take_probabilistic_action``.
    """
    try:
        state = gym_env.reset()
        done = False
        while not done:
            gym_env.render()
            action, _ = take_probabilistic_action(model, state)
            state, _reward, done, _info = gym_env.step(action)
    finally:
        gym_env.close()

In [None]:
#play_and_display(gym.make('CartPole-v0'), create_model())

In [None]:
def compute_discounted_reward(reward_history, discount_rate=0.99):
    """Turn per-step rewards into normalised discounted returns.

    For every step ``t`` the discounted return
    ``G_t = r_t + discount_rate * G_{t+1}`` is computed, then the whole vector
    is shifted to zero mean and scaled to unit standard deviation (a simple
    baseline).

    Args:
        reward_history: Sequence of per-step rewards of one episode.
        discount_rate: Discount factor applied to future rewards.

    Returns:
        1-D float64 ``numpy`` array with the same length as ``reward_history``.
        An empty input yields an empty array. If all returns are equal, as in
        a one-step episode, the result is all zeros rather than NaN.
    """
    rewards = np.asarray(reward_history, dtype=np.float64)
    returns = np.zeros_like(rewards)
    if returns.size == 0:
        return returns

    # Walk backwards so each return can reuse the one that follows it; writing
    # by index avoids the quadratic cost of repeatedly inserting at the front.
    running = 0.0
    for t in range(len(rewards) - 1, -1, -1):
        running = rewards[t] + discount_rate * running
        returns[t] = running

    # use simple baseline
    centered = returns - returns.mean()
    spread = returns.std()
    # Only scale when there is a spread to scale by; dividing by a zero
    # standard deviation would turn every entry into NaN.
    if spread > np.finfo(np.float64).eps:
        centered /= spread
    return centered

In [None]:
def format_rewards(action_history, reward_history, action_space=2):
    """Build the training target: each step's reward placed in its action's column.

    Row ``k`` of the result is zero everywhere except in column
    ``action_history[k]``, which holds ``reward_history[k]``.

    Args:
        action_history: Sequence of integer action indices, one per step.
        reward_history: Sequence (list or array) of rewards, one per step.
        action_space: Number of possible actions, i.e. columns of the result.

    Returns:
        float64 array of shape ``(len(action_history), action_space)``.
    """
    # np.asarray accepts plain lists as well as arrays, so callers need not
    # pre-convert the rewards.
    actions = np.asarray(action_history, dtype=np.int64)
    rewards = np.asarray(reward_history, dtype=np.float64)

    formatted = np.zeros((len(actions), action_space), dtype=np.float64)
    # Scatter each reward into the column of the action taken at that step.
    formatted[np.arange(len(actions)), actions] = rewards
    return formatted

In [None]:
def train(gym_env, model):
    """Play one episode with the current policy and apply one update to it.

    The episode is simulated by sampling actions with
    ``take_probabilistic_action``. The visited states are then used as the
    training input and the action-masked, discounted rewards as the target of
    a single ``train_on_batch`` call.

    The environment is left open: it belongs to the caller, who may pass the
    same instance again for the next episode.

    Args:
        gym_env: Environment exposing ``reset`` and ``step``; ``step`` is
            expected to return a 4-tuple ``(state, reward, done, info)``.
        model: Compiled policy model; it is updated in place.

    Returns:
        Tuple ``(loss, total_reward)``: the value returned by
        ``train_on_batch`` and the undiscounted sum of the episode's rewards.
    """
    state = gym_env.reset()
    state_history = []
    action_history = []
    reward_history = []

    done = False
    # simulation
    while not done:
        action, _ = take_probabilistic_action(model, state)
        # Record the state the action was chosen in, before stepping replaces it.
        state_history.append(state)
        action_history.append(action)
        state, reward, done, _info = gym_env.step(action)
        reward_history.append(reward)

    # update weights
    targets = format_rewards(action_history, compute_discounted_reward(reward_history))
    loss = model.train_on_batch(x=np.array(state_history), y=targets)

    return (loss, np.sum(reward_history))

## Play!

In [None]:
# Shared state for the single-environment training loop: one CartPole
# environment, a newly created policy network, and the list that collects the
# total reward of each training episode.
env = gym.make('CartPole-v0')
model = create_model()
total_reward_history = []

In [None]:
# Live reward plot: the blue line shows the total reward of each episode, the
# red line its 10-episode moving average.
fig, ax = plt.subplots()
line1, = ax.plot(total_reward_history, color="blue", label="Total")
line2, = ax.plot([], color="red", label="Moving average")
ax.set(xlim=(0, 500), ylim=(0, 250), xlabel="Simulations", ylabel="Reward")
ax.legend()

for i in range(500):
    # One episode plus one weight update per iteration.
    loss, total_reward = train(env, model)
    total_reward_history.append(total_reward)

    # draw dynamic plot just because we can
    line1.set_data(np.arange(len(total_reward_history)), total_reward_history)
    line2.set_data(np.arange(len(total_reward_history)),
                   pd.DataFrame(total_reward_history).rolling(window=10).mean())
    fig.canvas.draw()
    fig.canvas.flush_events()

## Multithreading

In [None]:
class History:
    """Plain record of one simulation: parallel lists of states, actions and rewards."""

    def __init__(self):
        # Entry k of each list belongs to step k of the simulation.
        self.state, self.action, self.reward = [], [], []

    def append(self, state, action, reward):
        """Record one step by appending each value to its own list."""
        for bucket, value in (
            (self.state, state),
            (self.action, action),
            (self.reward, reward),
        ):
            bucket.append(value)

In [None]:
def create_batch(model_weights, history_ref=None, i=0):
    """Play one CartPole episode with a private model and environment.

    A new environment and a new model are created inside the call, and the
    model is loaded with ``model_weights``, so the function can run in a
    worker without sharing the caller's model object.

    Args:
        model_weights: Weights to load into the new model, as returned by
            ``model.get_weights()``.
        history_ref: Optional mutable sequence. When given, the finished
            history is also stored at ``history_ref[i]``. This is useful for
            thread workers, whose return value is otherwise discarded. A list
            handed to a separate process is a copy, so across processes only
            the return value reaches the caller.
        i: Slot of ``history_ref`` to write to.

    Returns:
        ``History`` holding the states, actions and rewards of the episode.
    """
    gym_env = gym.make('CartPole-v0')
    try:
        model = create_model()
        model.set_weights(model_weights)
        history = History()
        state = gym_env.reset()

        done = False
        # simulation
        while not done:
            action, _ = take_probabilistic_action(model, state)
            new_state, reward, done, _info = gym_env.step(action)
            # Store the state the action was taken in, then advance.
            history.append(state, action, reward)
            state = new_state
    finally:
        # The environment is created here, so it is released here as well.
        gym_env.close()

    if history_ref is not None:
        history_ref[i] = history
    return history

In [None]:
def create_multibatch(model, n):
    """Simulate ``n`` episodes in parallel worker processes.

    Each worker runs ``create_batch`` on a copy of the model's current
    weights. The results are gathered from the workers' return values, because
    a child process cannot write into a list owned by the parent.

    NOTE(review): worker processes must be able to resolve ``create_batch``,
    which is defined in the notebook itself; this presumably relies on the
    ``fork`` start method, so confirm on the target platform.

    Args:
        model: Model whose current weights are handed to every worker.
        n: Number of episodes (and worker processes).

    Returns:
        List of ``n`` histories, one per episode; empty when ``n <= 0``.
    """
    # Local import: the notebook's top import cell only brings in Process.
    from multiprocessing import Pool

    if n <= 0:
        return []

    weights = model.get_weights()
    with Pool(processes=n) as pool:
        return pool.map(create_batch, [weights] * n)

In [None]:
def weasel_histories(model, histories):
    """Train the model once on the combined data of several episodes.

    Args:
        model: Compiled policy model; it is updated in place.
        histories: Non-empty sequence of objects with ``state``, ``action``
            and ``reward`` lists, one object per episode.

    Returns:
        Tuple ``(loss, average_reward)``: the value returned by
        ``train_on_batch`` and the mean undiscounted episode reward over all
        given histories.

    Raises:
        ValueError: If ``histories`` is empty.
    """
    if len(histories) == 0:
        raise ValueError("histories must contain at least one episode")

    # states
    x = np.vstack([episode.state for episode in histories])
    # formatted action & reward data; returns are discounted per episode so
    # that one episode's rewards never leak into another's
    y = np.vstack([
        format_rewards(episode.action, compute_discounted_reward(episode.reward))
        for episode in histories
    ])

    loss = model.train_on_batch(x, y)
    # Average over every episode that was passed in, however many there are.
    average_reward = np.mean([np.sum(episode.reward) for episode in histories])
    return (loss, average_reward)

## Let's Go

In [None]:
# multithreading! :D
# THREADS is the number of episodes simulated per update step by the
# multi-episode training loop further down. The model and the reward log are
# reset here so that loop starts from a newly created network.
THREADS = 5
model = create_model()
total_reward_history = []

In [None]:
from multiprocessing import Process, Pool


# Smoke test for the worker function. Note that this replaces `model` with a
# newly created network.
model = create_model()

# Run create_batch on two copies of the weights in a pool of 5 worker
# processes and print the list of results returned by map().
with Pool(5) as p:
    print(p.map(create_batch, [model.get_weights(), model.get_weights()]))

In [None]:
# Run THREADS simulations concurrently, one per thread, and collect each
# finished history in its own slot of `results`.
results = [None] * THREADS
threads = []
shared_weights = model.get_weights()


def _simulate_into_slot(slot):
    # A Thread discards its target's return value, so the result is written
    # into the shared list instead; threads share the notebook's memory.
    results[slot] = create_batch(shared_weights)


for i in range(THREADS):
    t = Thread(target=_simulate_into_slot, args=(i,))
    threads.append(t)
    t.start()

for t in threads:
    t.join()


In [None]:
# Display the contents of the per-thread result slots.
results

In [None]:
# Live reward plot for the multi-episode loop: the blue line shows the value
# logged per update step, the red line its 10-step moving average.
fig, ax = plt.subplots()
line1, = ax.plot(total_reward_history, color="blue", label="Total")
line2, = ax.plot([], color="red", label="Moving average")
ax.set(xlim=(0, 500), ylim=(0, 250), xlabel="Simulations", ylabel="Reward")
ax.legend()

for i in range(10):
    # Simulate THREADS episodes, then train once on all of them together.
    batch = create_multibatch(model, THREADS)
    _, total_reward = weasel_histories(model, batch)
    total_reward_history.append(total_reward)

    # draw dynamic plot just because we can
    line1.set_data(np.arange(len(total_reward_history)), total_reward_history)
    line2.set_data(np.arange(len(total_reward_history)),
                   pd.DataFrame(total_reward_history).rolling(window=10).mean())
    fig.canvas.draw()
    fig.canvas.flush_events()

In [None]:
# Render one episode played by the current model in a newly created CartPole environment.
play_and_display(gym.make('CartPole-v0'), model)

In [None]:
# Render one episode using the existing `env` instance instead of a new one.
play_and_display(env, model)

In [None]:
# Save the model to the path '01_CartpolePOC/model'.
model.save('01_CartpolePOC/model')

## Load the old Model

In [None]:
# The model was compiled with a custom loss, so the loss function has to be
# handed to Keras via custom_objects for the compiled model to be rebuilt on
# load.
model = keras.models.load_model(
    '01_CartpolePOC/model',
    custom_objects={'custom_loss_function': custom_loss_function})