In [205]:
import tensorflow as tf
import gym
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, ReLU
from tensorflow.keras.optimizers import Adam
import time
import scipy.signal
from tqdm import tqdm

# TrajectoryStorage


In [206]:
class Storage:
    def __init__(self):
        # init creating a storage for all observation variables during trajectory
        self.observations = []
        # create arrays for chosen actions and rewards
        self.actions = []
        self.logits = []
        self.rewards = []
        self.BaselineEstimate = []
        # finished episodes will be completely stored in this list 
        self.episodes = []


    def store(self,observation, action, logits, reward, BaselineEstimate):
        self.observations.append(observation)
        self.actions.append(action)
        self.logits.append(logits)
        self.rewards.append(reward)
        self.BaselineEstimate.append(BaselineEstimate) # value of critics network
        

    def conclude_episode(self):
        # append all already stored values to finished episodes
        self.episodes.append(
            [self.observations,
             self.actions, 
             self.logits,
             self.rewards,
             self.BaselineEstimate, 
             sum(self.rewards)]) # get the return of the whole episode
             
        # empty the arrays for new trajectory
        self.observations.clear()
        # create arrays for chosen actions and rewards
        self.actions.clear()
        self.logits.clear()
        self.rewards.clear()
        self.BaselineEstimate.clear()


    # return array of finished trajectories stored in self.episodes and the amount of episodes
    def get_episodes(self):
        return self.episodes, len(self.episodes)
        
        

# Actor Model


In [207]:
class Actor(Model):
    def __init__(self):
        super(Actor, self).__init__()


        self.l = [
            Dense(128, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.01)),
            Dense(128, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.01)),
            Dense(64, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.01)),
            Dense(4, activation="softmax")
        ]

    #@tf.function        
    def call(self, x):
        for l in self.l:
            x = l(x)
        return x

        

#@tf.function
def sample_action(observation):
    logits = actor(observation)
   # tf.print(type(logits))
    action = tf.squeeze(tf.random.categorical(logits, 1), axis=1)
   # tf.print(action)
    return logits, action

# Critic Model

In [208]:
class Critic(Model):
    def __init__(self):
        super(Critic, self).__init__()

        self.l = [
            Dense(128, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.01)),
            Dense(128, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.01)),
            Dense(64, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.01)),
            Dense(1, activation="tanh", kernel_regularizer=tf.random_normal_initializer(stddev=0.01))
        ]

    #@tf.function 
    def call(self, x):
        for l in self.l:
            x = l(x)
        return x

In [209]:
# define Hyperparameters
epochs = 1
steps_per_epoch = 1000 # ~10 Episodes per epoch, then compute new parameters (smaller batching)
lr_actor = 3e-4
lr_critic = 3e-4
train_policy_iterations = 80
train_value_iterations = 80
clip_ratio = 0.2
target_kl = 0.01
optimizer = Adam()

render = False

In [210]:
tf.keras.backend.clear_session()

# define environment
env = gym.make("LunarLander-v2")
# get observation_dims and amount of possible actions (1 for CartPole-v1)
observation_dimensions = env.observation_space.shape[0]
num_actions = env.action_space.n

# create Storage for observations, actions, rewards etc during trajectory
T = Storage()

# init the actor and critics model
observation_input = Input(shape=(observation_dimensions,), dtype=tf.float32)
actor = Actor()
critic = Critic()

# Initialize the observation, episode return and episode length
observation, episode_return, episode_length = env.reset(), 0, 0

In [211]:
episodes_total = 0
for epoch in range(epochs):

    sum_return = 0
    sum_length = 0
    num_episodes = 0

    for t in tqdm(range(steps_per_epoch)):
        if render:
            env.render()

        observation = observation.reshape(1,-1)

        # obtain action and logits for this observation by our actor
        logits, action = sample_action(observation=observation)
        
        # make a step in environment and obtain the rewards for it
        observation_new, reward, done, _ = env.step(action[0].numpy())

        # sum up rewards over this episode and count amount of frames
        episode_return += reward
        episode_length += 1

        # get the Base-Estimate from the Critics network
        base_estimate = critic(observation)

        # store Variables collected in this step
        T.store(observation=observation, action=action, logits=logits, reward=reward, BaselineEstimate=base_estimate)
        #update the observations
        observation = observation_new
        # check if terminal state is reached in env, if so save episode and refresh storage, reset env
        if done:
            T.conclude_episode()
            observation, episode_return, episode_length = env.reset(), 0, 0

    # obtain all episodes saved in storage
    episodes, amount_episodes = T.get_episodes()

  

        


100%|██████████| 1000/1000 [00:03<00:00, 280.08it/s]


In [212]:
# episodes [episode][particular values // 0: Observations, 1: actions, 2: logits, 3, rewards, 4: BaselineEstimates from Critics]
#print(episodes[0][4])
print(f'Number of Episodes = {amount_episodes}')



### Advantagefunction

# estimated Value of the current situtation from the critics network
b_estimates = episodes[0][4] 
# for i in b_estimates:
#   print(i.numpy())

# Discounted sum of rewards
print(episodes[0][3]) 
rewards = episodes[0][3]
gamma = 0.99

    



Number of Episodes = 10
[-0.4790546468764194, 1.6024155095453068, -0.8532222435471215, -2.5304715012188965, -1.6946305504659438, 2.236095066822327, -2.1760836749764962, -2.467359432374819, 2.9053223620032727, 1.4876082282435845, -1.990866033047098, -2.022995974206111, 1.4645913421566206, -2.212581998883222, -1.7352526075670756, -1.9677994285837872, -0.5094523280217402, 0.5343437857893718, -2.2335737105127578, -2.1834460207286086, 2.130561208901111, 2.298631632959837, 0.7327982247573288, -2.449238524607125, -2.4901511196217427, 0.26669237567527376, -2.2859865631965874, -2.2279349523033147, -2.3305846320511976, -2.1585738872512352, -2.1506324638698957, -2.023824062795056, -1.9005912990468516, 0.6577599112616894, -2.1124023881890808, -2.225041217963222, 1.767636115434226, 1.4475674083359251, -1.8783990707850353, -1.9123640750072468, 3.532896661745025, 2.9337701293436966, -1.6234852695887116, -1.646085711590331, -1.3082425922829646, -1.339505249615371, -1.6495327781140066, -1.7495098645589

In [213]:
def discounted_reward(rewards, gamma):
    i = 0
    discounted_rewards = []
    for r in rewards:
        disc = 0
        for t in rewards[i:-1]:
            discount_t = gamma ** t
            disc += t * discount_t
        i += 1
        discounted_rewards.append(disc)
    return discounted_rewards
    
    


In [214]:
# get discounted sum of rewards 
disc_sum = discounted_reward(rewards, gamma)

# convert lists to np arrays and flatten
disc_sum_np = np.array(disc_sum)
b_estimates_np = np.array(b_estimates)
b_estimates_np = b_estimates_np.flatten()

# substract arrays to obtain advantages
advantages = np.subtract(disc_sum_np, b_estimates_np)
print(advantages)





[-8.61598146e+01 -8.56784459e+01 -8.72552626e+01 -8.63946900e+01
 -8.37990386e+01 -8.20752969e+01 -8.42617004e+01 -8.20375001e+01
 -7.95081913e+01 -8.23299072e+01 -8.37954404e+01 -8.17643374e+01
 -7.96997887e+01 -8.11429801e+01 -7.88806456e+01 -7.71148630e+01
 -7.51077559e+01 -7.45956882e+01 -7.51271704e+01 -7.28428888e+01
 -7.06109947e+01 -7.26964210e+01 -7.49425600e+01 -7.56699815e+01
 -7.31597033e+01 -7.06064434e+01 -7.08724218e+01 -6.85333052e+01
 -6.62549188e+01 -6.38690956e+01 -6.16631823e+01 -5.94655550e+01
 -5.74001447e+01 -5.54628984e+01 -5.61163243e+01 -5.39585908e+01
 -5.16832275e+01 -5.34197389e+01 -5.48463993e+01 -5.29322033e+01
 -5.09827265e+01 -5.43923838e+01 -5.72409148e+01 -5.55907229e+01
 -5.39171762e+01 -5.25916194e+01 -5.12339576e+01 -4.95568472e+01
 -4.77762991e+01 -5.11926587e+01 -4.91981648e+01 -5.28220460e+01
 -5.04085543e+01 -4.82695943e+01 -4.58818530e+01 -4.32705483e+01
 -4.12095245e+01 -3.93965948e+01 -3.75548861e+01 -3.79844833e+01
 -3.57590371e+01 -3.84306

# LogProbs and Ratio Computation

We need the ratio of probabilities for an action at state t of the 'new' model vs the old model (maybe because of entropy there is a difference)

In [275]:
# using one Episode as example to get the prob ratio of old vs new
logits_old = episodes[0][2]
actions = episodes[0][1]
obs = episodes[0][0]
logits_new = []
for i in obs:
    tensor = tf.convert_to_tensor(i)
    new, action = sample_action(tensor)
    logits_new.append(new) 


#creating oneHot vector with size actions space and getting the log for the probability of choosing this action
a = actions[0].numpy()
a = a.flatten()
print(a)
# getting the log
logits_old = tf.nn.log_softmax(logits_old[0]) #tryout with only a single action
print(f'log old: {logits_old}')
logits_old = tf.nn.log_softmax(logits_new)

# creating one_hot vectors of the chosen actions for old and new actor logits
logprobability_old = tf.reduce_sum(
        tf.one_hot(a, num_actions) * logits_old, axis=1
    )
logprobability_new = tf.reduce_sum(
        tf.one_hot(actions, num_actions) * logits_new, axis=1
    )
log_prob_old = logprobability_old[a[0]]
# compute the ratio - missing is only that we have old and new probability for an action and not the oneHot

# print(f'actions Length {len(actions)} {actions}')
print(f'LogProbs_old first {log_prob_old[a[0]]}') # check this out, this is the value we want / make streamlining / check why this is not working above


## current problem is that we still get a one_hot and not a single log-prob which we need to compute the ratios of old and new

[3]
log old: [[-1.3863043 -1.386295  -1.3862795 -1.3862981]]
LogProbs_old first -1.3862972259521484
