In [124]:
import tensorflow as tf
import gym
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, ReLU
from tensorflow.keras.optimizers import Adam
import time
import scipy.signal
from tqdm import tqdm

# TrajectoryStorage


In [125]:
class Storage:
    def __init__(self):
        # init creating a storage for all observation variables during trajectory
        self.observations = []
        # create arrays for chosen actions and rewards
        self.actions = []
        self.logits = []
        self.rewards = []
        self.BaselineEstimate = []
        # finished episodes will be completely stored in this list 
        self.episodes = []


    def store(self,observation, action, logits, reward, BaselineEstimate):
        self.observations.append(observation)
        self.actions.append(action)
        self.logits.append(logits)
        self.rewards.append(reward)
        self.BaselineEstimate.append(BaselineEstimate) # value of critics network
        

    def conclude_episode(self):
        # append all already stored values to finished episodes
        self.episodes.append(
            [self.observations,
             self.actions, 
             self.logits,
             self.rewards,
             self.BaselineEstimate, 
             sum(self.rewards)]) # get the return of the whole episode
             
        # empty the arrays for new trajectory
        self.observations.clear()
        # create arrays for chosen actions and rewards
        self.actions.clear()
        self.logits.clear()
        self.rewards.clear()
        self.BaselineEstimate.clear()


    # return array of finished trajectories stored in self.episodes and the amount of episodes
    def get_episodes(self):
        return self.episodes, len(self.episodes)
        
        

# Actor Model


In [126]:
class Actor(Model):
    def __init__(self):
        super(Actor, self).__init__()


        self.l = [
            Dense(128, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.01)),
            Dense(128, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.01)),
            Dense(64, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.01)),
            Dense(4, activation="softmax")
        ]

    #@tf.function        
    def call(self, x):
        for l in self.l:
            x = l(x)
        return x

        

#@tf.function
def sample_action(observation):
    logits = actor(observation)
   # tf.print(type(logits))
    action = tf.squeeze(tf.random.categorical(logits, 1), axis=1)
   # tf.print(action)
    return logits, action

# Critic Model

In [127]:
class Critic(Model):
    def __init__(self):
        super(Critic, self).__init__()

        self.l = [
            Dense(128, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.01)),
            Dense(128, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.01)),
            Dense(64, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.01)),
            Dense(1, activation="tanh", kernel_regularizer=tf.random_normal_initializer(stddev=0.01))
        ]

    #@tf.function 
    def call(self, x):
        for l in self.l:
            x = l(x)
        return x

In [128]:
# define Hyperparameters
epochs = 1
steps_per_epoch = 1000 # ~10 Episodes per epoch, then compute new parameters (smaller batching)
lr_actor = 3e-4
lr_critic = 3e-4
train_policy_iterations = 80
train_value_iterations = 80
clip_ratio = 0.2
target_kl = 0.01
optimizer = Adam()

render = False

In [129]:
tf.keras.backend.clear_session()

# define environment
env = gym.make("LunarLander-v2")
# get observation_dims and amount of possible actions (1 for CartPole-v1)
observation_dimensions = env.observation_space.shape[0]
num_actions = env.action_space.n

# create Storage for observations, actions, rewards etc during trajectory
T = Storage()

# init the actor and critics model
observation_input = Input(shape=(observation_dimensions,), dtype=tf.float32)
actor = Actor()
critic = Critic()

# Initialize the observation, episode return and episode length
observation, episode_return, episode_length = env.reset(), 0, 0

In [130]:
episodes_total = 0
for epoch in range(epochs):

    sum_return = 0
    sum_length = 0
    num_episodes = 0

    for t in tqdm(range(steps_per_epoch)):
        if render:
            env.render()

        observation = observation.reshape(1,-1)

        # obtain action and logits for this observation by our actor
        logits, action = sample_action(observation=observation)
        
        # make a step in environment and obtain the rewards for it
        observation_new, reward, done, _ = env.step(action[0].numpy())

        # sum up rewards over this episode and count amount of frames
        episode_return += reward
        episode_length += 1

        # get the Base-Estimate from the Critics network
        base_estimate = critic(observation)

        # store Variables collected in this step
        T.store(observation=observation, action=action, logits=logits, reward=reward, BaselineEstimate=base_estimate)
        #update the observations
        observation = observation_new
        # check if terminal state is reached in env, if so save episode and refresh storage, reset env
        if done:
            T.conclude_episode()
            observation, episode_return, episode_length = env.reset(), 0, 0

    # obtain all episodes saved in storage
    episodes, amount_episodes = T.get_episodes()

  

        


100%|██████████| 1000/1000 [00:03<00:00, 297.80it/s]


In [131]:
  # episodes [episode][particular values // 0: Observations, 1: actions, 2: logits, 3, rewards, 4: BaselineEstimates from Critics]
  #print(episodes[0][4])
  print(f'Number of Episodes = {amount_episodes}')



  ### Advantagefunction

  # estimated Value of the current situtation from the critics network
  b_estimates = episodes[0][4] 
  # for i in b_estimates:
  #   print(i.numpy())

  # Discounted sum of rewards
  print(episodes[0][3]) 
  rewards = episodes[0][3]
  gamma = 0.99

    



Number of Episodes = 11
[-0.023313775499132133, 1.1764470098363813, 1.711304128904062, 1.6116477699257825, 1.1936177758052509, 0.30419220512476497, 0.9673390278992144, -0.23056043074254148, -0.4466325746305688, -0.710873833428791, -1.0975583846965435, -3.7870338066730254, 0.08511483214221016, -2.9944370794186055]


In [132]:
def discounted_reward(rewards, gamma):
    i = 0
    discounted_rewards = []
    for r in rewards:
        disc = 0
        for t in rewards[i:-1]:
            discount_t = gamma ** t
            disc += t * discount_t
        i += 1
        discounted_rewards.append(disc)
    return discounted_rewards

    


In [142]:
# get discounted sum of rewards 
disc_sum = discounted_reward(rewards, gamma)

# convert lists to np arrays and flatten
disc_sum_np = np.array(disc_sum)
b_estimates_np = np.array(b_estimates)
b_estimates_np = b_estimates_np.flatten()

# substract arrays to obtain advantages
advantages = np.subtract(disc_sum_np, b_estimates_np)
print(advantages)





[ 4.93418938e-01  5.16735853e-01 -6.45881934e-01 -2.32800279e+00
 -3.91375530e+00 -5.09314033e+00 -5.39640611e+00 -6.35438628e+00
 -6.12329431e+00 -5.67465357e+00 -4.95868324e+00 -3.84895013e+00
  8.50003914e-02 -4.10063149e-05]
