In [2]:
import tensorflow as tf
import gym
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, ReLU
from tensorflow.keras.optimizers import Adam
import time
import scipy.signal
from tqdm import tqdm

Init Plugin
Init Graph Optimizer
Init Kernel


# TrajectoryStorage


In [3]:
class Storage:
    def __init__(self):
        # init creating a storage for all observation variables during trajectory
        self.observations = []
        # create arrays for chosen actions and rewards
        self.actions = []
        self.logits = []
        self.rewards = []
        self.BaselineEstimate = []
        # finished episodes will be completely stored in this list 
        self.episodes = []


    def store(self,observation, action, logits, reward, BaselineEstimate):
        self.observations.append(observation)
        self.actions.append(action)
        self.logits.append(logits)
        self.rewards.append(reward)
        self.BaselineEstimate.append(BaselineEstimate) # value of critics network
        

    def conclude_episode(self):
        # append all already stored values to finished episodes
        self.episodes.append(
            [self.observations,
             self.actions, 
             self.logits,
             self.rewards,
             self.BaselineEstimate, 
             sum(self.rewards)]) # get the return of the whole episode
             
        # empty the arrays for new trajectory
        self.observations.clear()
        # create arrays for chosen actions and rewards
        self.actions.clear()
        self.logits.clear()
        self.rewards.clear()
        self.BaselineEstimate.clear()


    # return array of finished trajectories stored in self.episodes and the amount of episodes
    def get_episodes(self):
        return self.episodes, len(self.episodes)
        
        

# Actor Model


In [4]:
class Actor(Model):
    def __init__(self):
        super(Actor, self).__init__()


        self.l = [
            Dense(128, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.01)),
            Dense(128, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.01)),
            Dense(64, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.01)),
            Dense(4, activation="softmax")
        ]

    #@tf.function        
    def call(self, x):
        for l in self.l:
            x = l(x)
        return x

        

#@tf.function
def sample_action(observation):
    logits = actor(observation)
   # tf.print(type(logits))
    action = tf.squeeze(tf.random.categorical(logits, 1), axis=1)
   # tf.print(action)
    return logits, action

# Critic Model

In [5]:
class Critic(Model):
    def __init__(self):
        super(Critic, self).__init__()

        self.l = [
            Dense(128, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.01)),
            Dense(128, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.01)),
            Dense(64, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.01)),
            Dense(1, activation="tanh", kernel_regularizer=tf.random_normal_initializer(stddev=0.01))
        ]

    #@tf.function 
    def call(self, x):
        for l in self.l:
            x = l(x)
        return x

In [6]:
# define Hyperparameters
epochs = 1
steps_per_epoch = 1000 # ~10 Episodes per epoch, then compute new parameters (smaller batching)
lr_actor = 3e-4
lr_critic = 3e-4
train_policy_iterations = 80
train_value_iterations = 80
clip_ratio = 0.2
target_kl = 0.01
optimizer = Adam()

render = False

In [7]:
tf.keras.backend.clear_session()

# define environment
env = gym.make("LunarLander-v2")
# get observation_dims and amount of possible actions (1 for CartPole-v1)
observation_dimensions = env.observation_space.shape[0]
num_actions = env.action_space.n

# create Storage for observations, actions, rewards etc during trajectory
T = Storage()

# init the actor and critics model
observation_input = Input(shape=(observation_dimensions,), dtype=tf.float32)
actor = Actor()
critic = Critic()

# Initialize the observation, episode return and episode length
observation, episode_return, episode_length = env.reset(), 0, 0

Metal device set to: Apple M1

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB



2022-04-01 12:47:34.348637: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-04-01 12:47:34.348758: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [8]:
episodes_total = 0
for epoch in range(epochs):

    sum_return = 0
    sum_length = 0
    num_episodes = 0

    for t in tqdm(range(steps_per_epoch)):
        if render:
            env.render()

        observation = observation.reshape(1,-1)

        # obtain action and logits for this observation by our actor
        logits, action = sample_action(observation=observation)
        
        # make a step in environment and obtain the rewards for it
        observation_new, reward, done, _ = env.step(action[0].numpy())

        # sum up rewards over this episode and count amount of frames
        episode_return += reward
        episode_length += 1

        # get the Base-Estimate from the Critics network
        base_estimate = critic(observation)

        # store Variables collected in this step
        T.store(observation=observation, action=action, logits=logits, reward=reward, BaselineEstimate=base_estimate)
        #update the observations
        observation = observation_new
        # check if terminal state is reached in env, if so save episode and refresh storage, reset env
        if done:
            T.conclude_episode()
            observation, episode_return, episode_length = env.reset(), 0, 0

    # obtain all episodes saved in storage
    episodes, amount_episodes = T.get_episodes()

  

        


100%|██████████| 1000/1000 [00:03<00:00, 271.54it/s]


In [9]:
# episodes [episode][particular values // 0: Observations, 1: actions, 2: logits, 3, rewards, 4: BaselineEstimates from Critics]
#print(episodes[0][4])
print(f'Number of Episodes = {amount_episodes}')



### Advantagefunction

# estimated Value of the current situtation from the critics network
b_estimates = episodes[0][4] 
# for i in b_estimates:
#   print(i.numpy())

# Discounted sum of rewards
print(episodes[0][3]) 
rewards = episodes[0][3]
gamma = 0.99

    



Number of Episodes = 3
[-0.34609371917652537, -1.1441734587085353, -3.147144717599832, 0.6080432825797584, 0.5335714213060327, -0.7459594019751876, 0.8882751918218628, 1.2298262085237195, -1.1146788925114766, -0.3167105481699082, -0.45325840379115334, -1.8187136440687357, -0.9073751916136814, -1.9950317094458103, -1.2813329511352833, 0.6390800574922479, -0.4134950484714739, -1.4069343361077415, -2.1420791680450564, -1.3636319865985798, -0.2823183444087636, -0.16836524663113892, -2.4542026352435458, -2.668711889272997, -3.013809187485036, -1.9076649127179053, -1.0192979412152556, -0.5644276992288713, -2.6733384430334, -1.0738551172540542, -1.6914103449409765, -0.18254849058708372, 0.06025454138797387, -1.6537010139899735, -2.856008815280346, -0.9794023038472688, -3.037035460413249, -2.024112343835668, -1.12548695838139, -2.017509509223771, -1.0578023429695225, -2.9283359917820335, -2.010839893032937, -1.9934663444672651, -1.233241717642726, -1.7956646481499945, -1.0236247587074263, 2.02

In [10]:
def discounted_reward(rewards, gamma):
    i = 0
    discounted_rewards = []
    for r in rewards:
        disc = 0
        for t in rewards[i:-1]:
            discount_t = gamma ** t
            disc += t * discount_t
        i += 1
        discounted_rewards.append(disc)
    return discounted_rewards
    
    


In [11]:
# get discounted sum of rewards 
disc_sum = discounted_reward(rewards, gamma)

# convert lists to np arrays and flatten
disc_sum_np = np.array(disc_sum)
b_estimates_np = np.array(b_estimates)
b_estimates_np = b_estimates_np.flatten()

# substract arrays to obtain advantages
advantages = np.subtract(disc_sum_np, b_estimates_np)
print(advantages)





[-2.35283260e+02 -2.34935956e+02 -2.33778553e+02 -2.30530271e+02
 -2.31134611e+02 -2.31665327e+02 -2.30913749e+02 -2.31794131e+02
 -2.33008851e+02 -2.31881619e+02 -2.31563902e+02 -2.31108577e+02
 -2.29256318e+02 -2.28340632e+02 -2.26305194e+02 -2.25007255e+02
 -2.25642241e+02 -2.25227027e+02 -2.23800056e+02 -2.21611360e+02
 -2.20228909e+02 -2.19945791e+02 -2.19777143e+02 -2.17261652e+02
 -2.14520389e+02 -2.11413892e+02 -2.09469299e+02 -2.08439508e+02
 -2.07871874e+02 -2.05125732e+02 -2.04040228e+02 -2.02319820e+02
 -2.02136934e+02 -2.02197151e+02 -2.00515737e+02 -1.97576560e+02
 -1.96587467e+02 -1.93456302e+02 -1.91390592e+02 -1.90252301e+02
 -1.88193467e+02 -1.87124362e+02 -1.84108562e+02 -1.82056672e+02
 -1.80022864e+02 -1.78774245e+02 -1.76945881e+02 -1.75911674e+02
 -1.77898696e+02 -1.77353453e+02 -1.74853551e+02 -1.75222053e+02
 -1.74639334e+02 -1.71780002e+02 -1.69993101e+02 -1.68227590e+02
 -1.67326396e+02 -1.64657098e+02 -1.63422462e+02 -1.60809907e+02
 -1.59814817e+02 -1.58160

# LogProbs and Ratio Computation

We need the ratio of probabilities for an action at state t of the 'new' model vs the old model (maybe because of entropy there is a difference)

In [30]:

#creating oneHot vector with size actions space and getting the log for the probability of choosing this action

print(f'log old: {logits_old}')
print(f'log new: {logits_new}')


### this function currently only takes one single action and 2 sets of logits and computes the ratio of that

def get_ratio(action, logits_old, logits_new):

    #get the Logarithmic version of all logits for computational efficiency
    log_prob_old = tf.nn.log_softmax(logits_old)
    log_prob_new = tf.nn.log_softmax(logits_new)

    # encode in OneHotVector and reduce to sum, giving the log_prob for the action the agent took for both policies
    logprobability_old = tf.reduce_sum(
        tf.one_hot(action, num_actions) * log_prob_old, axis=1
    )
    logprobability_new = tf.reduce_sum(
        tf.one_hot(action, num_actions) * log_prob_new, axis=1
    )
    # get the ratio of new over old prob
    ratio = tf.exp(logprobability_new - logprobability_old)

    print(ratio)


# using one Episode as example to get the prob ratio of old vs new
logits_old = episodes[0][2][0]
action = episodes[0][1][0].numpy()
obs = episodes[0][0]
logits_new = []
for i in obs:
    tensor = tf.convert_to_tensor(i)
    new, _ = sample_action(tensor)
    logits_new.append(new) 

logits_new = logits_new[0]

get_ratio(action, logits_old, logits_new)



log old: [[0.24999921 0.24999891 0.24999474 0.25000718]]
log new: [[0.24999921 0.24999891 0.24999474 0.25000718]]
tf.Tensor([1.], shape=(1,), dtype=float32)
