In [145]:
# Neural Network
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, ReLU
from tensorflow.keras.optimizers import Adam
# Environment


import gym
# Further support
import numpy as np
import time
import scipy.signal
from tqdm.notebook import tqdm_notebook
import datetime

%load_ext tensorboard


The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


# TrajectoryStorage


In [146]:
class Storage:
    '''
    Contains all information the agent collects interacting with the environment.
    '''


    def __init__(self):
        '''
        Initializes empty lists as storages all observation variables during trajectory
        '''
        # Saves information about the current state of the agent at each step
        self.observations = []

        # Saves actions made and rewards achieved
        self.actions = []
        self.rewards = []
        # Outputs from the actor network, an action is sampled from (Probabilities)
        self.logits = []
        # Outputs from the crtitics network (Values)
        self.BaselineEstimate = []

        # finished episodes will be completely stored in this list 
        self.episodes = []


    def store(self, observation, action, logits, reward, BaselineEstimate):
        '''
        Adds given information to the storage.

        Args:
        observation(obj): information (e.g. pixel values) about current state of agent
        action(float): Output of the actor network. Describes the action taken
        logits():
        reward(floats): Rewards collected by agent
        BaselineEstimate():
        '''
        self.observations.append(observation)
        self.actions.append(action)
        self.logits.append(logits)
        self.rewards.append(reward)
        self.BaselineEstimate.append(BaselineEstimate) 
        

    def conclude_episode(self):
        '''
        Append all collected values to episodes list once one episode is finished.
        Computes all rewards collected in one episode. Prepares storage for next episode.
        '''
        self.episodes.append(
            [self.observations,
             self.actions, 
             self.logits,
             self.rewards,
             self.BaselineEstimate,
             # Get the return of the whole episode 
             sum(self.rewards)])
             
        # Empty the arrays for new trajectory
        self.observations.clear()
        self.actions.clear()
        self.logits.clear()
        self.rewards.clear()
        self.BaselineEstimate.clear()

     
    def get_episodes(self):
        '''
        Returns list containing finished trajectories stored in self.episodes
        and the amount of episodes passed.
        '''
        return self.episodes, len(self.episodes)
        
        

# Actor Model


In [147]:
class Actor(Model):
    '''
    Neural network computing the actions the agent will take
    '''


    def __init__(self):
        '''
        Initialize layer architecture for Actor Network.
        '''
        # Subclassing API
        super(Actor, self).__init__()
        
        self.l = [
            # Three Dense Layers with random initial parameters having a standart deviation of 0.01
            Dense(128, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.01)),
            Dense(128, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.01)),
            Dense(64, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.01)),
            
            # Output layer with softmax activation function applied to for neurons.
            # Outputs prpobability for each of our for actions 
            # (Do nothing, fire left orientation engine, fire main engine, fire right orientation engine)
            Dense(4, activation="softmax", kernel_regularizer=tf.random_normal_initializer(stddev=0.01))
        ]


    #@tf.function        
    def call(self, x):
        '''
        Iterates input x through network to create softmax ouutput.

        Args:
        x(): Network input. Pixel values representing the current state of the agent
        '''
        for l in self.l:
            x = l(x)
        return x


    #####  logits = actor(observation) -> actor must be in capitol, gets instantiated twice, maybe idea is wrong
    #@tf.function
    def sample_action(self,observation):
        '''
        Calls the actor network with state of the agent and returns the network object + the samnpled action

        Args:
        observation(): Representation of actors state. Same as x in the call function. 
        '''
        # Output of softmax function
        #logits = self.call(observation)
        logits = self(observation)
    # tf.print(type(logits))
        # Sample action from the Softmax output of the network
        action = tf.squeeze(tf.random.categorical(logits, 1), axis=1)
    # tf.print(action)
        return logits, action

# Critic Model

In [148]:
class Critic(Model):
    '''
    Represents the value function of the network. 
    Input is a certain state and output a float value for that state.
    '''


    def __init__(self):
        '''
        Initialize layer architecture for Actor Network.
        '''
        # Subclassing API
        super(Critic, self).__init__()
        self.l = [
            # Three Dense Layers with ReLu activation function
            # Random initial parameters having a standart deviation of 0.01
            
            Dense(128, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.01)),
            Dense(128, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.01)),
            Dense(64, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.01)),

            # Output layer with Tanh activation function to get float output value ([-1;1])
            # Random initial parameters having a standart deviation of 0.01
            Dense(1, activation="tanh", kernel_regularizer=tf.random_normal_initializer(stddev=0.01))
        ]


    #@tf.function 
    def call(self, x):
        '''
        Iterates input x through network to create tanh output between -1 and 1 
        giving input state x a value.

        Args:
        x(): Network input. Pixel values representing the current state of the agent.
        '''
        for l in self.l:
            x = l(x)
        return x

In [149]:
'''
Adjust Hyperparameters
'''

# Number of iterations
epochs = 1
# Leads to ~10 Episodes per epoch, then compute new parameters (smaller batching)
steps_per_epoch = 1000 

# Learning rate for actor and critic
lr_actor = 3e-4
lr_critic = 3e-4

# Movements in environment (state-space) to collect training data
train_policy_iterations = 80
train_value_iterations = 80

# Parameter to decide how strongly the policy ratio gets clipped therefore how much policy (actor network)
#  updates we allow
# The selected 0.2 is the number proposed by the original paper by OpenAI
clip_ratio = 0.2
# Weighs loss of critic model
c_1 = 0.5

#
target_kl = 0.01


# Update weights with Adam optimizer
optimizer = Adam()

# To toggle displaying of environment
render = False

# Discount variable for rewards to whey immediate rewards stronger
gamma = 0.99

In [150]:
# Reset all states generated by Keras
tf.keras.backend.clear_session()

# Define environment
env = gym.make("LunarLander-v2")
# Get dimensions of state and amount of possible actions (4 for LunarLander-v2)
observation_dimensions = env.observation_space.shape[0]
num_actions = env.action_space.n

# create Storage object to save observations, actions, rewards etc. during trajectory
#storage = Storage()

# initialize actor and critics model
#observation_input = Input(shape=(observation_dimensions,), dtype=tf.float32)
# actor = Actor()
# critic = Critic()

# Initialize: observation(agent state), 
# episode return(summed rewards for singe ) and 
# episode length(amount of steps taken (=frames) before agent finished)
# observation, episode_return, episode_length = env.reset(), 0, 0

In [151]:
class Agent:
    '''
    ###Skizze - Not used yet

    Currently contains:
    - Collects data
    - Training process (iterator, updater, actor loss fun)
    - get advantage function
    - dicount rewards function
    - Get ratio function

    Whats missing: 
    - All the FUCKING self's before variable assignment and for functions (fuck you python, even though i love you)    
    '''

    def __init__(self):
        ''' 
        Initialize Parameters.
        ###Maybe pass hyperparameters?
        '''
        self.actor = Actor()
        self.critic = Critic()
        self.storage = Storage()
        #print(self.actor.trainable_variables())


    def collect_train_data(self):
        '''
        Agent takes steps in environment according to current policy. Information gets saved to update policy.
        -> Data collection
        '''
        observation, episode_return, episode_length = env.reset(), 0, 0
        episodes_total = 0
        # Iteration of whole training process
        for epoch in tqdm_notebook(range(epochs), desc = 'Epochs'):

            # Initialize values for return, length and episodes
            sum_return = 0
            sum_length = 0
            num_episodes = 0

            # Each timestep t of steps_per_epoch (in paper denoted as capital T)
            #  allows takes on action in a state and saves the information in storage object
            for t in tqdm_notebook(range(steps_per_epoch), desc = 'Epoch:' + str(epoch)):

                # Toggles displaying of environment
                if render or epoch == epochs-1 and epochs != 1:
                    env.render()

                # Reshaping observation to fit as input for Actor network (policy)
                observation = observation.reshape(1,-1)
                
                # Obtain action and logits for this observation by our actor
                logits, action = self.actor.sample_action(observation)
                
                # Take action in environment and obtain the rewards for it
                # Variable done represents wether agent has finished 
                # The last variable would be diagnostic information, not needed for training
                observation_new, reward, done, _ = env.step(action[0].numpy())

                # Sum up rewards over this episode and count amount of frames
                episode_return += reward
                episode_length += 1

                # Get the Base-Estimate from the Critics network
                base_estimate = self.critic(observation)

                # Store Variables collected in this timestep t
                self.storage.store(observation=observation, action=action, logits=logits, reward=reward, BaselineEstimate=base_estimate)
                # Save the new state of our agent
                observation = observation_new
                
                # Check if terminal state is reached in environment
                if done:
                    # Save information about episode
                    self.storage.conclude_episode()
                    # Refresh environment and reset return and length value
                    observation, episode_return, episode_length = env.reset(), 0, 0

            # obtain all episodes saved in storage
            # episodes, amount_episodes = self.storage.get_episodes()


    def actor_loss_fun(self, actions, logits_old, logits_new, rewards, b_estimates_new, clip_param):
        '''
        Computes loss for Actor Network output.

        Args:
        logits_old():
        logits_new():
        reward():
        b_estimates_new():
        clip_param():
        '''
        
        ratio = self.get_ratio_episode(actions, logits_old, logits_new)

        ### FIND OUT WHICH: SINGLE OR MULTIPLE ELEMENTS ARE WANTED AND ADJUST EITHER IN GET_ADV OR THE UPPER TWO FUNCTIONS
        advantage = self.get_advantage(rewards, b_estimates_new)
        
        # Unclipped value
        l1 = ratio * advantage
        # Clipped ratio between values determined by Hyperparam and multiplied by advantage (see objective function)
        l2 = np.clip(ratio, a_min=1 - clip_param, a_max=1 + clip_param) * advantage
        #l1 = np.array(l1, dtype="float32")
        #l2 = np.array(l2, dtype="float32")
        

        # Compute minimum of both and take the mean to return float loss
        #actor_loss = -tf.reduce_mean(tf.minimum(l1, l2))
        l1 = tf.convert_to_tensor(np.array([tf.convert_to_tensor(l, dtype=tf.float32) for l in l1]), dtype=tf.float32)
        l2 = tf.convert_to_tensor(np.array([tf.convert_to_tensor(l, dtype=tf.float32) for l in l2]), dtype=tf.float32)
        return tf.convert_to_tensor(l1, dtype=tf.float32), tf.convert_to_tensor(l2, dtype=tf.float32)


    def train_step(self, states, actions, optimizer, train_logits, train_rewards, clip_param, c_1, c_2):
        '''
        Updates actor network parameters and returns the loss to evaluate performance.

        Args:
        model(object): Object of the actor model.
        input(list): contains floats describing the actors state.
        loss_function(function): Clipped objective function for PPO.
        optimizer(object): Optimizer used to train actor.
        train_logits():
        train_rewards():
        clip_param():
        c_1(): 
        c_2():
        '''

        # use tf.gradientTape to compute loss, then gradients and apply these to the model to modify the parameters
        with tf.GradientTape() as tape, tf.GradientTape() as tape2:
            # print(self.actor.trainable_variables())
            # Obtain action and logits for this state selected by policy
            #print(f' Observation shape/type {observation}')
            #print(f'Trainables: {self.actor.layers[0].weights}')


            # logits_new, actions_new = sample_action(states)
            logits_new = []
            b_estimates_new = []

            # Compute values with updated critic network
            # b_estimates_new = critic(states)

            # till we work with np arrays we need to sample each action for this by looping through it
            for i in states:
                logits, _ = self.actor.sample_action(i)
                logits_new.append(logits)
                b_estimate = self.critic(i)
                
                b_estimates_new.append(b_estimate)

            # Compute & weigh entropy 
            #entropy = c_2 * np.mean(-(logits_new * train_logits))   # <----- DOESNT WORK YET Musste ich erstmal rausnehmen für den Rest vom Debugging
            # entropy = 0.01

            # Computes MSE between output of the critics network (value) the discounted sum of rewards
            #  which represents an estimate based on rewards collected during training
            # critic_loss = c_1 * tf.keras.losses.MeanSquaredError(b_estimates_new, self.discounted_reward(train_rewards)).numpy()
            #print('Weewoo')
            #print(tf.reduce_mean((np.array(train_rewards) - tf.convert_to_tensor(b_estimates_new, dtype=tf.float32)) ** 2))
            print('type critic')
            print(type((np.array(train_rewards) - tf.convert_to_tensor(b_estimates_new, dtype=tf.float32)) ** 2))
            print((np.array(train_rewards) - tf.convert_to_tensor(b_estimates_new, dtype=tf.float32)) ** 2)

            critic_loss = tf.reduce_mean((np.array(train_rewards) - tf.convert_to_tensor(b_estimates_new, dtype=tf.float32)) ** 2)
            #actor_loss = entropy * self.actor_loss_fun(actions, train_logits, logits_new, train_rewards, b_estimates_new, clip_param)
            l1,l2 = self.actor_loss_fun(actions, train_logits, logits_new, train_rewards, b_estimates_new, clip_param)
            #print('minimum')
            #print(-tf.reduce_mean(tf.minimum(l1, l2)))
            #print(type(tf.minimum(l1,l2)))

            actor_loss = -tf.reduce_mean(tf.minimum(l1, l2))
            #critic_loss = tf.cast(critic_loss, dtype=tf.float32)
            #print(f'Critics loss:{type(critic_loss)}. Actor Loss {actor_loss.dtype}')

            #print('Actor weights')
            #print(print(self.actor.layers[0].weights))

            #print('actor')
            #print(actor_loss)
            #print(type(actor_loss))
            #print('critic')
            #print(critic_loss)
            #print(type(critic_loss))

            actor_loss = tf.convert_to_tensor(actor_loss, dtype=tf.float32)

            print(actor_loss)
            print(critic_loss)
            print('actor')
            print(self.actor.trainable_variables)
            print('critic')
            print(self.critic.trainable_variables)
            a_gradients = tape.gradient(actor_loss, self.actor.trainable_variables)
            c_gradients = tape2.gradient(critic_loss, self.critic.trainable_variables)
            print(a_gradients)
            print(c_gradients)

            #print(tape)
            #print('Actor loss')
            #print(actor_loss)
            #print('Trainable Weights')
            #print(self.actor.trainable_weights)
        
        #print(f'Gradients Actor: {a_gradients}. Gradients Critic: {c_gradients}')

        # Update parameters
        optimizer.apply_gradients(zip(a_gradients, self.actor.trainable_variables))
        optimizer.apply_gradients(zip(c_gradients, self.critic.trainable_variables))

        

        return actor_loss, critic_loss

    def update_policy(self, episodes, optimizer, clip_param, c_1 = 1, c_2=0.01):
        '''
        Update policy with the collected data (Parameter updates for actor)

        Args: 
        episodes(list): Contains all information on one episode in the following order:
                        [observations, actions, logits, rewards, BaselineEstimate, summed rewards]
        actor(object): Object of the actor model.
        critic(object): Object of the critic model.
        actor_loss(function): Clipped objective function for PPO.
        optimizer(object): Optimizer used to train actor.
        clip_param(float): Hyperparameter to decide values to clip ratio between.
        c_1(float): hyperparameter to determine how strongly loss of the critic network should be weighed
        c_2(float): hyperparameter to determine how strongly entropy should be weighed


        Information stored as:
        storage.episodes[different episodes]
                        [observations, actions, logits, rewards, BaselineEstimate, sum(self.rewards)]
                        [look at single one]
        '''
        # for epoch in training_iteratins:
        # Save network loss
        train_losses_actor = []
        train_losses_critic = []
        
        # Iterate over all finished episodes from collected training data
        for episode in tqdm_notebook(episodes):

            # Update parameters
            # Compute train losses and action by chosen by policy
            actor_loss, critic_loss = self.train_step(
                # States
                episode[0],
                # Actions
                episode[1],
                #optimizer (Adam)
                optimizer,
                # Logits
                episode[2],
                # Rewards
                episode[3],
                clip_param,
                c_1,
                c_2 
            )
            train_losses_actor.append(actor_loss)
            train_losses_critic.append(critic_loss)

            return train_losses_actor, train_losses_critic


    


    


  


    def get_advantage(self, rewards, b_estimates, gamma = 0.99):
        '''
        Computes Advantage for action in state.

        Args:
        rewards(float): Reward for action.
        gamma(float): Discount factor.
        b_estimates(float): Baseline Estimates.
        
        '''
        # Saves list of all rewards in new variable 
        #rewards = episodes[0][3]


        # Get discounted sum of rewards 
        disc_sum = self.discounted_reward(rewards, gamma)


        # # Estimated Value of the current situtation from the critics network
        # b_estimates = self.episodes[0][4] 

        # Convert lists to np arrays and flatten
        disc_sum_np = np.array(disc_sum)
        b_estimates_np = np.array(b_estimates)
        b_estimates_np = b_estimates_np.flatten()

        # substract arrays to obtain advantages
        advantages = np.subtract(disc_sum_np, b_estimates_np)

        return advantages


     ### MIGHT NOT WORK
    #  output for: discounted_reward([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 0.99)
    #  -> [8.91, 7.920000000000001, 6.930000000000001, 5.94, 4.95, 3.96, 2.9699999999999998, 1.98, 0.99, 0]
    #  ###
    def discounted_reward(self, rewards, gamma = 0.99):
        '''
        weighs all rewards in a way such that immediate rewards have a stronger impact than possible future rewards.

        Args:
        rewards(list): list of all rewards collected by the agent in episode t (?)
        gamma(float): Hyperparameter determining how much future rewards should be weighed in
        '''
        # To select the next reward
        i = 0
        discounted_rewards = []

        # Iterates through every reward and appends a discounted version to the output
        for r in rewards:
            disc = 0
            for t in rewards[i:-1]:
                discount_t = gamma ** t
                disc += t * discount_t
            i += 1
            discounted_rewards.append(disc)

        # returns list of discounted rewards.
        return discounted_rewards   



    ## get ratio lutsch noch ARSCH, das Ding verarscht mich anders

    def get_ratio_episode(self, actions, logits_old, logits_new): 
        r = []
        for a, o, n in zip(actions, logits_old, logits_new):
            o = tf.convert_to_tensor(o)
            n = tf.convert_to_tensor(n)
            #print(f'A: {a} O: {type(o)} N: {type(n)}')

            #get the Logarithmic version of all logits for computational efficiency
            log_prob_old = tf.nn.log_softmax(o)
            log_prob_new = tf.nn.log_softmax(n)

            # encode in OneHotVector and reduce to sum, giving the log_prob for the action the agent took for both policies
            logprobability_old = tf.reduce_sum(
                tf.one_hot(a, num_actions) * log_prob_old, axis=1
            )
            logprobability_new = tf.reduce_sum(
                tf.one_hot(a, num_actions) * log_prob_new, axis=1
            )
            # get the ratio of new over old prob
            ratio = tf.exp(logprobability_new - logprobability_old)
            r.append(ratio)
        return r


    def run(self):
        self.collect_train_data()
        data, _ = self.storage.get_episodes()
        #print(data)
        self.update_policy(data, optimizer, clip_ratio)
  

In [152]:

ppo_agent = Agent()
ppo_agent.run()
# ppo_agent.collect_train_data()
# data = storage.get_episodes()
# #print(data)
# print(ppo_agent.update_policy(data[0], actor, critic, optimizer, clip_ratio))

Epochs:   0%|          | 0/1 [00:00<?, ?it/s]

Epoch:0:   0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

type critic
<class 'tensorflow.python.framework.ops.EagerTensor'>
tf.Tensor(
[[[2.418201  2.491415  2.7956443 ... 4.403715  3.319935  3.0709276]]

 [[2.4181893 2.4914033 2.7956316 ... 4.4037304 3.3199487 3.0709407]]

 [[2.4182024 2.4914165 2.795646  ... 4.403713  3.3199337 3.070926 ]]

 ...

 [[2.4181814 2.491395  2.7956233 ... 4.403742  3.319958  3.0709498]]

 [[2.418177  2.4913905 2.7956183 ... 4.403748  3.3199635 3.070955 ]]

 [[2.4181778 2.4913917 2.7956195 ... 4.4037466 3.319962  3.0709536]]], shape=(52, 1, 52), dtype=float32)
tf.Tensor(12.281276, shape=(), dtype=float32)
tf.Tensor(4.8927484, shape=(), dtype=float32)
actor
[<tf.Variable 'actor/dense/kernel:0' shape=(8, 128) dtype=float32, numpy=
array([[-1.63800921e-02, -1.56523436e-02, -7.40791345e-03, ...,
        -1.41428283e-03,  9.91461449e-04,  3.09268874e-03],
       [-2.78602242e-02, -1.35148009e-02, -6.33828156e-03, ...,
        -1.27423191e-02,  5.50283771e-03,  3.61146708e-03],
       [-2.23480235e-03,  3.31171276e-03, 

ValueError: No gradients provided for any variable: (['actor/dense/kernel:0', 'actor/dense/bias:0', 'actor/dense_1/kernel:0', 'actor/dense_1/bias:0', 'actor/dense_2/kernel:0', 'actor/dense_2/bias:0', 'actor/dense_3/kernel:0', 'actor/dense_3/bias:0'],). Provided `grads_and_vars` is ((None, <tf.Variable 'actor/dense/kernel:0' shape=(8, 128) dtype=float32, numpy=
array([[-1.63800921e-02, -1.56523436e-02, -7.40791345e-03, ...,
        -1.41428283e-03,  9.91461449e-04,  3.09268874e-03],
       [-2.78602242e-02, -1.35148009e-02, -6.33828156e-03, ...,
        -1.27423191e-02,  5.50283771e-03,  3.61146708e-03],
       [-2.23480235e-03,  3.31171276e-03, -6.33148616e-03, ...,
        -3.60199600e-03,  5.98243391e-03, -4.54733288e-03],
       ...,
       [-1.15284565e-04,  1.01704868e-02,  7.13728368e-05, ...,
         1.63313504e-02,  8.45513027e-03, -1.05834287e-02],
       [ 7.51028303e-03,  2.15990865e-03,  7.19672255e-03, ...,
         4.51383833e-03, -5.73619828e-03,  5.78765245e-03],
       [-1.20154815e-02,  2.52695680e-02,  2.08671018e-03, ...,
        -1.13718761e-02, -2.88757589e-03,  6.90241344e-03]], dtype=float32)>), (None, <tf.Variable 'actor/dense/bias:0' shape=(128,) dtype=float32, numpy=
array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)>), (None, <tf.Variable 'actor/dense_1/kernel:0' shape=(128, 128) dtype=float32, numpy=
array([[-0.00803269, -0.00755734,  0.00281461, ..., -0.00289996,
        -0.01399054, -0.01239738],
       [-0.00997126, -0.00390268,  0.01072986, ...,  0.01263903,
         0.00274303, -0.00694317],
       [ 0.00753926,  0.0057579 , -0.00174976, ..., -0.00258169,
        -0.00444449,  0.00500562],
       ...,
       [ 0.01215971, -0.01015533,  0.01338909, ..., -0.01340112,
        -0.00911181, -0.00957868],
       [ 0.01117375,  0.01645911, -0.00015771, ...,  0.00299049,
         0.01466596,  0.00439953],
       [ 0.00669039,  0.00507154, -0.00819705, ...,  0.01069654,
         0.01282239, -0.00716095]], dtype=float32)>), (None, <tf.Variable 'actor/dense_1/bias:0' shape=(128,) dtype=float32, numpy=
array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)>), (None, <tf.Variable 'actor/dense_2/kernel:0' shape=(128, 64) dtype=float32, numpy=
array([[ 0.01361657,  0.01444342,  0.01622957, ..., -0.00071591,
         0.0030895 ,  0.01670394],
       [-0.00423875, -0.00315748, -0.01855959, ..., -0.01288712,
        -0.01097716,  0.0046248 ],
       [-0.00877126, -0.01433462,  0.00185777, ...,  0.0126122 ,
         0.00855011,  0.00976727],
       ...,
       [-0.01583707,  0.00445471, -0.01278754, ...,  0.0120641 ,
         0.00279803,  0.02242054],
       [ 0.01717794, -0.00272553,  0.00648627, ..., -0.01912042,
        -0.00965603,  0.00915352],
       [ 0.0120354 , -0.01243899,  0.01101775, ..., -0.00071088,
         0.00749606,  0.02501313]], dtype=float32)>), (None, <tf.Variable 'actor/dense_2/bias:0' shape=(64,) dtype=float32, numpy=
array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)>), (None, <tf.Variable 'actor/dense_3/kernel:0' shape=(64, 4) dtype=float32, numpy=
array([[-0.08866136, -0.27883843,  0.21831375,  0.07456532],
       [-0.26057103, -0.17297573,  0.08374435,  0.01829883],
       [-0.06901233,  0.13466766, -0.11715324,  0.03807408],
       [ 0.22266573,  0.25129378,  0.20198986,  0.07525569],
       [-0.15978846,  0.18887389, -0.15133564, -0.04595765],
       [-0.21875896, -0.1221551 , -0.19874263,  0.08657539],
       [-0.16946946,  0.27828848, -0.18124323,  0.04722887],
       [ 0.14674142,  0.20180514, -0.03865913,  0.06646922],
       [ 0.15452138, -0.09726964,  0.15949994, -0.14067788],
       [-0.09241727, -0.19736305, -0.29537135, -0.09118237],
       [-0.03176451, -0.00958958, -0.04649106,  0.27230364],
       [ 0.08938798, -0.27243954,  0.2684542 ,  0.09244758],
       [ 0.2550072 ,  0.12442201, -0.2113882 ,  0.12940007],
       [ 0.22133005, -0.2559955 , -0.04463938,  0.22393292],
       [ 0.15863678,  0.17518422, -0.06493729, -0.23822415],
       [-0.2185326 ,  0.01482934, -0.11060274,  0.20738167],
       [ 0.05545571, -0.18175264, -0.2543781 ,  0.12666413],
       [ 0.10834956, -0.02718863, -0.00654158,  0.20151278],
       [-0.27479497, -0.0408943 , -0.2140682 , -0.09904158],
       [-0.00838038, -0.12688665,  0.02458426, -0.02885687],
       [ 0.13567105, -0.1930145 ,  0.05416933, -0.18861426],
       [ 0.11851946,  0.19364849,  0.13404053, -0.03026602],
       [ 0.09985596, -0.11999407, -0.08788495, -0.24484058],
       [-0.26895842, -0.22350276,  0.02136254, -0.16803406],
       [ 0.26020503, -0.20071727, -0.16845702,  0.03397307],
       [ 0.01317644, -0.18572676,  0.22592765,  0.17076188],
       [ 0.02814704,  0.03195432, -0.01231137,  0.26940155],
       [ 0.18318889,  0.0384413 , -0.08995682, -0.07394534],
       [ 0.02738586, -0.1223301 , -0.20211464,  0.19569898],
       [ 0.01859918,  0.02911851, -0.19670023,  0.2623127 ],
       [-0.1855167 , -0.15258493,  0.15487137,  0.13096103],
       [ 0.20135975, -0.1783713 , -0.12350133,  0.03085396],
       [-0.19057508, -0.04230607, -0.10429387, -0.22345665],
       [ 0.13837168, -0.15376408, -0.04975931, -0.18469313],
       [ 0.2065857 ,  0.00887653, -0.27666962,  0.21269178],
       [-0.24184854, -0.28785002,  0.2182746 ,  0.01856238],
       [-0.05308783, -0.179622  ,  0.20419061,  0.07292292],
       [ 0.2868412 , -0.21973756,  0.20540375, -0.26088315],
       [-0.19169228,  0.07066464, -0.25452477,  0.00403324],
       [-0.2234359 , -0.21529949, -0.24808659,  0.24839175],
       [-0.21233585, -0.08565757,  0.07344678, -0.05300538],
       [-0.13152266, -0.01565787,  0.10239437,  0.23843557],
       [ 0.1669069 ,  0.29488516,  0.09042558, -0.24668255],
       [-0.14868708, -0.10315025, -0.13161147,  0.15553722],
       [ 0.24543685,  0.1424518 , -0.20946187, -0.06327555],
       [ 0.2839226 ,  0.04985267,  0.08883417, -0.13403219],
       [-0.04393783,  0.15881652,  0.23823214,  0.09107232],
       [ 0.17175883, -0.02798551, -0.11835514,  0.19840065],
       [ 0.13396782, -0.01433223,  0.20598859, -0.1043479 ],
       [-0.00483274,  0.2256707 ,  0.18786329, -0.13234687],
       [ 0.2934358 , -0.25038671, -0.09731893, -0.11705486],
       [-0.1717652 , -0.11204414, -0.09522483,  0.10637039],
       [-0.21854812, -0.25519416, -0.13856904, -0.07926817],
       [-0.05986524,  0.08628008,  0.11967629,  0.15938154],
       [ 0.12173775,  0.07679802, -0.1500695 ,  0.155388  ],
       [ 0.21525037, -0.23135445, -0.07041359,  0.18416685],
       [-0.1577814 , -0.22918323,  0.27919704,  0.00370073],
       [ 0.22809613, -0.05775201,  0.21309894,  0.10020515],
       [ 0.03015816, -0.11456785,  0.25603408,  0.25676334],
       [-0.10822882,  0.24791408, -0.19739032, -0.27287233],
       [-0.2615825 ,  0.17910641,  0.27268726, -0.10725538],
       [-0.18997473, -0.08769579,  0.14024225,  0.08762652],
       [ 0.13527787,  0.03279263,  0.2514047 ,  0.20941943],
       [-0.29669392,  0.08612865,  0.26297712,  0.21619815]],
      dtype=float32)>), (None, <tf.Variable 'actor/dense_3/bias:0' shape=(4,) dtype=float32, numpy=array([0., 0., 0., 0.], dtype=float32)>)).

In [None]:
env.close()