In [None]:
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import backend as K
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Input, Dense
import tensorflow.keras.utils as utils
import numpy as np
import math
import gym

In [None]:
def create_model(n_inputs=4, n_actions=2, hidden_units=(32, 32)):
    """Build and compile a small fully-connected softmax policy network.

    The defaults reproduce the original architecture: 4 inputs, two hidden
    ReLU layers of 32 units each, and a 2-way softmax output.

    Args:
        n_inputs: Length of the 1-D state vector fed to the network.
        n_actions: Number of output units (one probability per action).
        hidden_units: Iterable with the width of each hidden ReLU layer,
            in order from input to output.

    Returns:
        A compiled ``Sequential`` model using the 'adam' optimizer and the
        'categorical_crossentropy' loss.
    """
    model = Sequential()
    model.add(Input(shape=(n_inputs,)))
    for units in hidden_units:
        model.add(Dense(units, activation='relu'))
    # Softmax so the outputs can be used directly as action probabilities.
    model.add(Dense(n_actions, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy')
    return model

In [None]:
def take_probabilistic_action(model, state):
    """Sample an action index from the policy the model predicts for `state`.

    Args:
        model: Object with a ``predict(batch)`` method that returns one row
            of action probabilities per input row.
        state: Array-like observation; it is reshaped to a single-row batch.

    Returns:
        The sampled action index, as returned by ``np.random.choice``.

    Raises:
        ValueError: Propagated from ``np.random.choice`` if the predicted
            probabilities are not a valid distribution (e.g. contain NaN).
    """
    batch = np.asarray(state).reshape(1, -1)
    # Promote to float64 and renormalise: low-precision softmax outputs can
    # miss summing to 1 by more than np.random.choice tolerates.
    probabilities = np.asarray(model.predict(batch)[0], dtype=np.float64)
    probabilities = probabilities / probabilities.sum()
    # Derive the number of actions from the model output rather than
    # hard-coding it.
    return np.random.choice(len(probabilities), p=probabilities)

In [None]:
def play_and_display(gym_env, model):
    """Play one rendered episode, sampling every action from `model`.

    Args:
        gym_env: Environment exposing ``reset()``, ``render()``, ``close()``
            and a ``step(action)`` that returns a
            ``(state, reward, done, info)`` tuple.
        model: Policy model passed to ``take_probabilistic_action``.

    Returns:
        None. The environment is closed when the episode ends.
    """
    state = gym_env.reset()
    done = False
    try:
        while not done:
            gym_env.render()
            action = take_probabilistic_action(model, state)
            # Reward and info are not needed for pure playback.
            state, _, done, _ = gym_env.step(action)
    finally:
        # Close even if the episode is interrupted or a step raises, so the
        # environment's resources are not left dangling.
        gym_env.close()

In [None]:
def compute_discounted_R(R, discount_rate=.99):
    """Compute normalised discounted returns for a sequence of rewards.

    The return at step ``t`` is ``R[t] + discount_rate * return[t + 1]``,
    accumulated from the last step backwards. The returns are then shifted
    to zero mean and scaled to unit standard deviation (a simple baseline).

    Args:
        R: Sequence of per-step rewards.
        discount_rate: Multiplicative discount applied per step.

    Returns:
        A float32 NumPy array with the same shape as ``R``. If all returns
        are equal (e.g. a one-step episode) the result is all zeros rather
        than NaN; an empty ``R`` yields an empty array.
    """
    discounted_r = np.zeros_like(R, dtype=np.float32)
    if discounted_r.size == 0:
        # Nothing to normalise; avoid mean/std of an empty array.
        return discounted_r

    running_add = 0
    for t in reversed(range(len(R))):
        running_add = running_add * discount_rate + R[t]
        discounted_r[t] = running_add

    # Use a simple baseline: centre on the mean, scale by the std.
    centered = discounted_r - discounted_r.mean()
    spread = discounted_r.std()
    if spread > 0:
        centered = centered / spread
    # With zero spread the centred values are already all zero; dividing
    # would produce NaN.
    return centered

In [None]:
def train(gym_env, model, duration=100, render=False):
    """Play one episode with the current policy, then run one training batch.

    Every visited state is paired with the one-hot action that was taken,
    and each sample is weighted by its normalised discounted return from
    ``compute_discounted_R``.

    Args:
        gym_env: Environment exposing ``reset()``, ``render()``, ``close()``
            and a ``step(action)`` that returns a
            ``(state, reward, done, info)`` tuple.
        model: Keras-style model providing ``predict``, ``output_shape`` and
            ``train_on_batch``.
        duration: Unused; kept so existing callers keep working.
        render: If True, render the environment before every step.

    Returns:
        Tuple ``(loss, total_reward)``: the value returned by
        ``model.train_on_batch`` and the sum of the episode's rewards.
    """
    state = gym_env.reset()
    state_history = []
    action_history = []
    reward_history = []

    done = False
    # simulation
    try:
        while not done:
            if render:
                gym_env.render()
            action = take_probabilistic_action(model, state)
            action_history.append(action)
            state_history.append(state)
            state, reward, done, _ = gym_env.step(action)
            reward_history.append(reward)
    finally:
        # Close even when the rollout raises or is interrupted.
        gym_env.close()

    # update weights
    # Fix the one-hot width to the model's output size. Otherwise
    # to_categorical infers it from the largest action taken, and an episode
    # that never picks the highest action gives a too-narrow target.
    num_actions = model.output_shape[-1]
    loss = model.train_on_batch(
        x=np.array(state_history),
        y=utils.to_categorical(action_history, num_classes=num_actions),
        sample_weight=compute_discounted_R(reward_history))

    return (loss, np.sum(reward_history))

In [None]:
# Untrained policy network, using the defaults defined in create_model().
model = create_model()
# Environment that the training loop and the final playback both run on.
env = gym.make('CartPole-v0')

In [None]:
NUM_EPISODES = 500   # number of train() calls, one episode each
LOG_EVERY = 10       # print an iteration marker every LOG_EVERY episodes
EMA_DECAY = 0.9      # share of the previous average kept each episode

# Exponential moving average of the episode reward. It starts at 0, so the
# first few values are biased low.
moving_average = 0.0
for i in range(NUM_EPISODES):
    if i % LOG_EVERY == 0:
        print("Iteration: ", i)
    loss, total_reward = train(env, model)
    # Written as an explicit EMA so the decay and its complement cannot
    # drift apart.
    moving_average = EMA_DECAY * moving_average + (1 - EMA_DECAY) * total_reward
    print("Keras loss-value ", loss, " --- Cummulative Reward: ", total_reward, " --- Moving AVG:", moving_average)

In [None]:
# Watch one rendered episode played by the trained policy.
play_and_display(gym_env=env, model=model)