In [109]:
# Neural Network
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, ReLU
from tensorflow.keras.optimizers import Adam
# Environment


import gym
# Further support
import numpy as np
import time
import scipy.signal
from tqdm import tqdm
import datetime
import matplotlib.pyplot as plt

%load_ext tensorboard


The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


# TrajectoryStorage


In [110]:
class Storage:
    '''
    Contains all information the agent collects interacting with the environment.
    '''


    def __init__(self):
        '''
        Initializes empty lists as storages all observation variables during trajectory
        '''
        # Saves information about the current state of the agent at each step
        self.observations = []

        # Saves actions made and rewards achieved
        self.actions = []
        self.rewards = []
        # Outputs from the actor network, an action is sampled from (Probabilities)
        self.logits = []
        # Outputs from the crtitics network (Values)
        self.BaselineEstimate = []

        # finished episodes will be completely stored in this list 
        self.episodes = []


    def store(self, observation, action, logits, reward, BaselineEstimate):
        '''
        Adds given information to the storage.

        Args:
        observation(obj): information (e.g. pixel values) about current state of agent
        action(float): Output of the actor network. Describes the action taken
        logits():
        reward(floats): Rewards collected by agent
        BaselineEstimate():
        '''
        self.observations.append(observation)
        self.actions.append(action)
        self.logits.append(logits)
        self.rewards.append(reward)
        self.BaselineEstimate.append(BaselineEstimate) 
        

    def conclude_episode(self):
        '''
        Append all collected values to episodes list once one episode is finished.
        Computes all rewards collected in one episode. Prepares storage for next episode.
        '''
        self.episodes.append(
            [self.observations,
             self.actions, 
             self.logits,
             self.rewards,
             self.BaselineEstimate,
             # Get the return of the whole episode 
             sum(self.rewards)])
             
        # Empty the arrays for new trajectory
        self.observations.clear()
        self.actions.clear()
        self.logits.clear()
        self.rewards.clear()
        self.BaselineEstimate.clear()

     
    def get_episodes(self):
        '''
        Returns list containing finished trajectories stored in self.episodes
        and the amount of episodes passed.
        '''
        return self.episodes, len(self.episodes)
        
        

# Actor Model


In [111]:
class Actor(Model):
    '''
    Neural network computing the actions the agent will take
    '''


    def __init__(self):
        '''
        Initialize layer architecture for Actor Network.
        '''
        # Subclassing API
        super(Actor, self).__init__()
        
        self.l = [
            # Three Dense Layers with random initial parameters having a standart deviation of 0.01
            Dense(128, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.01)),
            Dense(128, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.01)),
            Dense(64, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.01)),
            
            # Output layer with softmax activation function applied to for neurons.
            # Outputs prpobability for each of our for actions 
            # (Do nothing, fire left orientation engine, fire main engine, fire right orientation engine)
            Dense(4, activation="softmax")
        ]


    #@tf.function        
    def call(self, x):
        '''
        Iterates input x through network to create softmax ouutput.

        Args:
        x(): Network input. Pixel values representing the current state of the agent
        '''
        for l in self.l:
            x = l(x)
        return x


    #####  logits = actor(observation) -> actor must be in capitol, gets instantiated twice, maybe idea is wrong
    #@tf.function
    def sample_action(self,observation):
        '''
        Calls the actor network with state of the agent and returns the network object + the samnpled action

        Args:
        observation(): Representation of actors state. Same as x in the call function. 
        '''
        # Output of softmax function
        #logits = self.call(observation)
        logits = self(observation)
    # tf.print(type(logits))
        # Sample action from the Softmax output of the network
        action = tf.squeeze(tf.random.categorical(logits, 1), axis=1)
    # tf.print(action)
        return logits, action

# Critic Model

In [112]:
class Critic(Model):
    '''
    Represents the value function of the network. 
    Input is a certain state and output a float value for that state.
    '''


    def __init__(self):
        '''
        Initialize layer architecture for Actor Network.
        '''
        # Subclassing API
        super(Critic, self).__init__()
        self.l = [
            # Three Dense Layers with ReLu activation function
            # Random initial parameters having a standart deviation of 0.01
            
            Dense(128, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.01)),
            Dense(128, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.01)),
            Dense(64, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.01)),

            # Output layer with Tanh activation function to get float output value ([-1;1])
            # Random initial parameters having a standart deviation of 0.01
            Dense(1, activation="tanh", kernel_regularizer=tf.random_normal_initializer(stddev=0.01))
        ]


    #@tf.function 
    def call(self, x):
        '''
        Iterates input x through network to create tanh output between -1 and 1 
        giving input state x a value.

        Args:
        x(): Network input. Pixel values representing the current state of the agent.
        '''
        for l in self.l:
            x = l(x)
        return x

In [113]:
'''
Adjust Hyperparameters
'''

# Number of iterations
epochs = 1
# Leads to ~10 Episodes per epoch, then compute new parameters (smaller batching)
steps_per_epoch = 1000 

# Learning rate for actor and critic
lr_actor = 3e-4
lr_critic = 3e-4

# Movements in environment (state-space) to collect training data
train_policy_iterations = 80
train_value_iterations = 80

# Parameter to decide how strongly the policy ratio gets clipped therefore how much policy (actor network)
#  updates we allow
# The selected 0.2 is the number proposed by the original paper by OpenAI
clip_ratio = 0.2
# Weighs loss of critic model
c_1 = 0.5

#
target_kl = 0.01


# Update weights with Adam optimizer
optimizer = Adam()

# To toggle displaying of environment
render = False

# Discount variable for rewards to whey immediate rewards stronger
gamma = 0.99

In [114]:
# Reset all states generated by Keras
tf.keras.backend.clear_session()

# Define environment
env = gym.make("LunarLander-v2")
# Get dimensions of state and amount of possible actions (4 for LunarLander-v2)
observation_dimensions = env.observation_space.shape[0]
num_actions = env.action_space.n

# create Storage object to save observations, actions, rewards etc. during trajectory
#storage = Storage()

# initialize actor and critics model
#observation_input = Input(shape=(observation_dimensions,), dtype=tf.float32)
# actor = Actor()
# critic = Critic()

# Initialize: observation(agent state), 
# episode return(summed rewards for singe ) and 
# episode length(amount of steps taken (=frames) before agent finished)
# observation, episode_return, episode_length = env.reset(), 0, 0

In [115]:
class Agent:
    '''
    ###Skizze - Not used yet

    Currently contains:
    - Collects data
    - Training process (iterator, updater, actor loss fun)
    - get advantage function
    - dicount rewards function
    - Get ratio function

    Whats missing: 
    - All the FUCKING self's before variable assignment and for functions (fuck you python, even though i love you)    
    '''

    def __init__(self):
        ''' 
        Initialize Parameters.
        ###Maybe pass hyperparameters?
        '''
        self.actor = Actor()
        self.critic = Critic()
        self.storage = Storage()
        #print(self.actor.trainable_variables())


    def collect_train_data(self):
        '''
        Agent takes steps in environment according to current policy. Information gets saved to update policy.
        -> Data collection
        '''
        observation, episode_return, episode_length = env.reset(), 0, 0
        episodes_total = 0
        # Iteration of whole training process
        # for epoch in tqdm(range(epochs), desc = 'Epochs'):

            # Initialize values for return, length and episodes
            # sum_return = 0
            # sum_length = 0
            # num_episodes = 0

        # Each timestep t of steps_per_epoch (in paper denoted as capital T)
        #  allows takes on action in a state and saves the information in storage object
        for t in tqdm(range(steps_per_epoch)):

            # Toggles displaying of environment
            # if render or epoch == epochs-1 and epochs != 1:
            #     env.render()

            if render:
                env.render()

            # Reshaping observation to fit as input for Actor network (policy)
            observation = observation.reshape(1,-1)
            
            # Obtain action and logits for this observation by our actor
            logits, action = self.actor.sample_action(observation)
            
            # Take action in environment and obtain the rewards for it
            # Variable done represents wether agent has finished 
            # The last variable would be diagnostic information, not needed for training
            observation_new, reward, done, _ = env.step(action[0].numpy())

            # Sum up rewards over this episode and count amount of frames
            episode_return += reward
            episode_length += 1

            # Get the Base-Estimate from the Critics network
            base_estimate = self.critic(observation)

            # Store Variables collected in this timestep t
            self.storage.store(observation=observation, action=action, logits=logits, reward=reward, BaselineEstimate=base_estimate)
            # Save the new state of our agent
            observation = observation_new
            
            # Check if terminal state is reached in environment
            if done:
                # Save information about episode
                self.storage.conclude_episode()
                # Refresh environment and reset return and length value
                observation, episode_return, episode_length = env.reset(), 0, 0

        # obtain all episodes saved in storage
        # episodes, amount_episodes = self.storage.get_episodes()


    def update_policy(self, episodes, optimizer, clip_param, c_1 = 1, c_2=0.01):
        '''
        Update policy with the collected data (Parameter updates for actor)

        Args: 
        episodes(list): Contains all information on one episode in the following order:
                        [observations, actions, logits, rewards, BaselineEstimate, summed rewards]
        actor(object): Object of the actor model.
        critic(object): Object of the critic model.
        actor_loss(function): Clipped objective function for PPO.
        optimizer(object): Optimizer used to train actor.
        clip_param(float): Hyperparameter to decide values to clip ratio between.
        c_1(float): hyperparameter to determine how strongly loss of the critic network should be weighed
        c_2(float): hyperparameter to determine how strongly entropy should be weighed


        Information stored as:
        storage.episodes[different episodes]
                        [observations, actions, logits, rewards, BaselineEstimate, sum(self.rewards)]
                        [look at single one]
        '''
        # for epoch in training_iteratins:
        # Save network loss
        train_losses_actor = []
        train_losses_critic = []
        
        # Iterate over all finished episodes from collected training data
        for episode in tqdm(episodes):

            # Update parameters
            # Compute train losses and action by chosen by policy
            actor_loss, critic_loss = self.train_step(
                # States
                episode[0],
                # Actions
                episode[1],
                #optimizer (Adam)
                optimizer,
                # Logits
                episode[2],
                # Rewards
                episode[3],
                clip_param,
                c_1,
                c_2 
            )

            train_losses_actor.append(actor_loss)
            train_losses_critic.append(critic_loss)

            return train_losses_actor, train_losses_critic


    def train_step(self, states, actions, optimizer, train_logits, train_rewards, clip_param, c_1, c_2):
        '''
        Updates actor network parameters and returns the loss to evaluate performance.

        Args:
        model(object): Object of the actor model.
        input(list): contains floats describing the actors state.
        loss_function(function): Clipped objective function for PPO.
        optimizer(object): Optimizer used to train actor.
        train_logits():
        train_rewards():
        clip_param():
        c_1(): 
        c_2():
        '''

        

        # use tf.gradientTape to compute loss, then gradients and apply these to the model to modify the parameters
        with tf.GradientTape() as tape, tf.GradientTape() as tape2:
            # print(self.actor.trainable_variables())
            # Obtain action and logits for this state selected by policy
            #print(f' Observation shape/type {observation}')
            #print(f'Trainables: {self.actor.layers[0].weights}')


            # logits_new, actions_new = sample_action(states)
            logits_new = []
            b_estimates_new = []

            # Compute values with updated critic network
            # b_estimates_new = critic(states)

            # till we work with np arrays we need to sample each action for this by looping through it
            for i in states:
                logits, _ = self.actor.sample_action(i)
                logits_new.append(logits)
                b_estimate = self.critic(i)
                
                b_estimates_new.append(b_estimate)

            # Compute & weigh entropy 
            #entropy = c_2 * np.mean(-(logits_new * train_logits))   # <----- DOESNT WORK YET Musste ich erstmal rausnehmen für den Rest vom Debugging
            # entropy = 0.01

            # Computes MSE between output of the critics network (value) the discounted sum of rewards
            #  which represents an estimate based on rewards collected during training
            # critic_loss = c_1 * tf.keras.losses.MeanSquaredError(b_estimates_new, self.discounted_reward(train_rewards)).numpy()
            #print('Weewoo')
            #print(tf.reduce_mean((np.array(train_rewards) - tf.convert_to_tensor(b_estimates_new, dtype=tf.float32)) ** 2))
            critic_loss = tf.reduce_mean((np.array(train_rewards) - tf.convert_to_tensor(b_estimates_new, dtype=tf.float32)) ** 2)
            actor_loss = self.actor_loss_fun(actions, train_logits, logits_new, train_rewards, b_estimates_new, clip_param)
            #actor_loss = self.actor_loss_fun(actions, train_logits, logits_new, train_rewards, b_estimates_new, clip_param)
            #actor_loss = tf.convert_to_tensor(actor_loss)
            #critic_loss = tf.cast(critic_loss, dtype=tf.float32)
            #print(f'Critics loss:{type(critic_loss)}. Actor Loss {actor_loss.dtype}')
            
            print("gradient_fun")
            print(type(actor_loss))
           # print('Actor weights')
           # print(self.critic.layers[0].weights)

            #a_gradients = tape.gradient(actor_loss, self.actor.trainable_variables)
            c_gradients = tape2.gradient(critic_loss, self.critic.trainable_variables)
            
            #actor_loss = 0
            #print(tape)
            #print('Actor loss')
            #print(actor_loss)
            #print('Trainable Weights')
            #print(self.actor.trainable_weights)
        
        #print(f'Gradients Actor: {a_gradients}. Gradients Critic: {c_gradients}')

        # Update parameters
        #optimizer.apply_gradients(zip(a_gradients, self.actor.trainable_variables))
        optimizer.apply_gradients(zip(c_gradients, self.critic.trainable_variables))

        #print("updated weights")
        #print(self.critic.layers[0].weights)

        

        return actor_loss, critic_loss


    def actor_loss_fun(self, actions, logits_old, logits_new, rewards, b_estimates_new, clip_param):
        '''
        Computes loss for Actor Network output.

        Args:
        logits_old():
        logits_new():
        reward():
        b_estimates_new():
        clip_param():
        '''
        
        ratio = self.get_ratio_episode(actions, logits_old, logits_new)

        ### FIND OUT WHICH: SINGLE OR MULTIPLE ELEMENTS ARE WANTED AND ADJUST EITHER IN GET_ADV OR THE UPPER TWO FUNCTIONS
        advantage = self.get_advantage(rewards, b_estimates_new)
        
        # Unclipped value
        l1 = ratio * advantage
        # Clipped ratio between values determined by Hyperparam and multiplied by advantage (see objective function)
        l2 = np.clip(ratio, a_min=1 - clip_param, a_max=1 + clip_param) * advantage
        l1 = tf.convert_to_tensor(l1)
        l2 = tf.convert_to_tensor(l2)
        
        # Compute minimum of both and take the mean to return float loss
        actor_loss = -tf.reduce_mean(tf.minimum(l1, l2))
        print("loss_fun")
        #print(type(tf.convert_to_tensor(np.array(actor_loss), dtype=tf.float32)))
        print(np.array(actor_loss))
        return tf.convert_to_tensor(np.array(actor_loss), dtype=tf.float32)


  


    def get_advantage(self, rewards, b_estimates, gamma = 0.99):
        '''
        Computes Advantage for action in state.

        Args:
        rewards(float): Reward for action.
        gamma(float): Discount factor.
        b_estimates(float): Baseline Estimates.
        
        '''
        # Saves list of all rewards in new variable 
        #rewards = episodes[0][3]


        # Get discounted sum of rewards 
        disc_sum = self.discounted_reward(rewards, gamma)


        # # Estimated Value of the current situtation from the critics network
        # b_estimates = self.episodes[0][4] 

        # Convert lists to np arrays and flatten
        disc_sum_np = np.array(disc_sum)
        b_estimates_np = np.array(b_estimates)
        b_estimates_np = b_estimates_np.flatten()

        # substract arrays to obtain advantages
        advantages = np.subtract(disc_sum_np, b_estimates_np)

        return advantages


     ### MIGHT NOT WORK
    #  output for: discounted_reward([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 0.99)
    #  -> [8.91, 7.920000000000001, 6.930000000000001, 5.94, 4.95, 3.96, 2.9699999999999998, 1.98, 0.99, 0]
    #  ###
    def discounted_reward(self, rewards, gamma = 0.99):
        '''
        weighs all rewards in a way such that immediate rewards have a stronger impact than possible future rewards.

        Args:
        rewards(list): list of all rewards collected by the agent in episode t (?)
        gamma(float): Hyperparameter determining how much future rewards should be weighed in
        '''
        # To select the next reward
        i = 0
        discounted_rewards = []

        # Iterates through every reward and appends a discounted version to the output
        for r in rewards:
            disc = 0
            for t in rewards[i:-1]:
                discount_t = gamma ** t
                disc += t * discount_t
            i += 1
            discounted_rewards.append(disc)

        # returns list of discounted rewards.
        return discounted_rewards   



    ## get ratio lutsch noch ARSCH, das Ding verarscht mich anders

    def get_ratio_episode(self, actions, logits_old, logits_new): 
        r = []
        for a, o, n in zip(actions, logits_old, logits_new):
            o = tf.convert_to_tensor(o)
            n = tf.convert_to_tensor(n)
            #print(f'A: {a} O: {type(o)} N: {type(n)}')

            #get the Logarithmic version of all logits for computational efficiency
            log_prob_old = tf.nn.log_softmax(o)
            log_prob_new = tf.nn.log_softmax(n)

            # encode in OneHotVector and reduce to sum, giving the log_prob for the action the agent took for both policies
            logprobability_old = tf.reduce_sum(
                tf.one_hot(a, num_actions) * log_prob_old, axis=1
            )
            logprobability_new = tf.reduce_sum(
                tf.one_hot(a, num_actions) * log_prob_new, axis=1
            )
            # get the ratio of new over old prob
            ratio = tf.exp(logprobability_new - logprobability_old)
            r.append(ratio)
        return r


    def run(self):

        c_loss = []
        a_loss = []
        save_epochs = []

        for epoch in tqdm(range(epochs), desc=str(epochs)):
            self.collect_train_data()
            data, _ = self.storage.get_episodes()
            #print(data)
            a, c = self.update_policy(data, optimizer, clip_ratio)
            a_loss.append(a)
            c_loss.append(c)
            save_epochs.append(epoch)

        return a_loss, c_loss, save_epochs


            
  

In [116]:

ppo_agent = Agent()
actor_loss, critic_loss, save_epochs = ppo_agent.run()
# ppo_agent.collect_train_data()
# data = storage.get_episodes()
# #print(data)
# print(ppo_agent.update_policy(data[0], actor, critic, optimizer, clip_ratio))

plot = 1
if plot:
    fig, axs = plt.subplots(2)
    fig.suptitle('Critic & Actor Loss')
    axs[0].plot(save_epochs, critic_loss)
    # For actor loss later
    axs[1].plot(save_epochs, critic_loss)
'''
Nikis idea what the problem is:
- Actor_loss needs to be eager tensor dtpye float32 shit
'''

100%|██████████| 1000/1000 [00:03<00:00, 277.34it/s]
  0%|          | 0/11 [00:00<?, ?it/s]
1:   0%|          | 0/1 [00:03<?, ?it/s]


loss_fun
17.674249368990846
gradient_fun
<class 'tensorflow.python.framework.ops.EagerTensor'>


ValueError: No gradients provided for any variable: (['actor/dense/kernel:0', 'actor/dense/bias:0', 'actor/dense_1/kernel:0', 'actor/dense_1/bias:0', 'actor/dense_2/kernel:0', 'actor/dense_2/bias:0', 'actor/dense_3/kernel:0', 'actor/dense_3/bias:0'],). Provided `grads_and_vars` is ((None, <tf.Variable 'actor/dense/kernel:0' shape=(8, 128) dtype=float32, numpy=
array([[-0.00540234,  0.00503504, -0.00397553, ..., -0.00462557,
        -0.00591265,  0.02421263],
       [ 0.00168411,  0.00219151, -0.0221698 , ..., -0.00685579,
        -0.00442842,  0.00893392],
       [-0.0008146 , -0.00633603,  0.00081152, ..., -0.00448825,
         0.00757394,  0.01312999],
       ...,
       [ 0.00897591, -0.00092317, -0.01236722, ...,  0.00358165,
         0.0049873 ,  0.01273157],
       [ 0.00232917,  0.01662414,  0.00691048, ...,  0.00230937,
         0.01516791,  0.00615546],
       [-0.00347868, -0.01057736, -0.01041433, ..., -0.00336184,
        -0.00307776,  0.00679514]], dtype=float32)>), (None, <tf.Variable 'actor/dense/bias:0' shape=(128,) dtype=float32, numpy=
array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)>), (None, <tf.Variable 'actor/dense_1/kernel:0' shape=(128, 128) dtype=float32, numpy=
array([[-0.00181471,  0.00046002, -0.00394883, ..., -0.01144408,
        -0.00889392, -0.00744953],
       [-0.00879102, -0.0114245 , -0.01561856, ..., -0.01409598,
         0.00270666, -0.00794675],
       [ 0.0146388 ,  0.01565737, -0.00444501, ...,  0.00061687,
         0.00112723, -0.00103347],
       ...,
       [ 0.01030375, -0.00128284, -0.02208928, ...,  0.00526952,
        -0.00614633,  0.01031822],
       [ 0.00018099, -0.00272611,  0.01435435, ...,  0.00870736,
         0.01108824,  0.00820426],
       [-0.01041555, -0.0084445 , -0.00974746, ...,  0.01206642,
        -0.01883854, -0.01615048]], dtype=float32)>), (None, <tf.Variable 'actor/dense_1/bias:0' shape=(128,) dtype=float32, numpy=
array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)>), (None, <tf.Variable 'actor/dense_2/kernel:0' shape=(128, 64) dtype=float32, numpy=
array([[-1.09523432e-02,  4.70806984e-03,  7.09796464e-03, ...,
         1.40006014e-03,  3.79924290e-03,  2.05443054e-03],
       [ 1.07491072e-02, -7.25436769e-03, -1.21000791e-02, ...,
         1.30753489e-02, -1.21214315e-02, -4.26769536e-03],
       [ 6.49184873e-03, -2.41763308e-03,  8.49148352e-03, ...,
        -2.85155363e-02, -3.89457028e-03,  1.21120000e-02],
       ...,
       [ 1.90021992e-02, -4.86763820e-05, -1.64333195e-03, ...,
         8.27024691e-03,  1.41660161e-02,  3.21729574e-03],
       [-1.17920861e-02, -2.36707600e-03, -3.88719083e-04, ...,
         1.42745010e-03, -5.17856702e-03, -3.52891535e-03],
       [ 2.00320361e-03,  4.54213005e-03,  2.70186737e-03, ...,
         1.54422307e-02, -2.54795537e-03,  5.33705624e-03]], dtype=float32)>), (None, <tf.Variable 'actor/dense_2/bias:0' shape=(64,) dtype=float32, numpy=
array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)>), (None, <tf.Variable 'actor/dense_3/kernel:0' shape=(64, 4) dtype=float32, numpy=
array([[ 0.10019085,  0.26521456,  0.10242036, -0.12537979],
       [-0.03524053,  0.12316877, -0.07485355,  0.11988726],
       [ 0.04753539, -0.06178357,  0.18550104,  0.06659004],
       [-0.17818199, -0.21180172, -0.25707546, -0.13141543],
       [ 0.2916727 ,  0.22136998,  0.13578764,  0.29403394],
       [-0.03515705,  0.05960038,  0.21002966,  0.05615267],
       [ 0.20811003,  0.05324852, -0.2395409 ,  0.11401558],
       [-0.28650543, -0.26055574, -0.18886843, -0.24259748],
       [-0.13776734,  0.24908608,  0.10414386,  0.2849648 ],
       [ 0.10801578, -0.2896709 , -0.01962474, -0.24921958],
       [-0.06124009,  0.09348625,  0.1074993 ,  0.02223209],
       [-0.24522507, -0.24593002,  0.2188201 ,  0.28615296],
       [-0.01250002, -0.21772787, -0.29578006, -0.0722232 ],
       [-0.06776956,  0.16380182,  0.12931445, -0.21060711],
       [-0.16907415, -0.05005662, -0.16986237, -0.27887213],
       [-0.07729758, -0.00632033,  0.08661166, -0.10217781],
       [-0.04928248,  0.11700663, -0.26074654,  0.07165664],
       [-0.27479434,  0.17639944,  0.2460553 ,  0.23038161],
       [ 0.01591706, -0.05723694,  0.03052974, -0.00324976],
       [-0.15525274, -0.13775864, -0.10672189, -0.2573229 ],
       [-0.09051545, -0.01326051,  0.09552008, -0.28087127],
       [-0.12062664,  0.26510602,  0.16747132,  0.15986452],
       [-0.14615998, -0.23450094,  0.04131392,  0.2441178 ],
       [ 0.16423157,  0.11882815, -0.19333687,  0.13138887],
       [ 0.11745301,  0.11219111, -0.09304574, -0.12135121],
       [-0.04947837,  0.18959534, -0.06760716,  0.05848396],
       [ 0.18167734, -0.1683485 , -0.22654846,  0.29406047],
       [-0.19773422, -0.21731561, -0.01272631, -0.05474482],
       [-0.06799328, -0.29387137, -0.22914794,  0.04203963],
       [ 0.2026591 , -0.09881304,  0.26556903,  0.12053812],
       [-0.13360585, -0.1742622 , -0.26650888,  0.13277128],
       [ 0.25753957,  0.1612364 ,  0.14067405,  0.07968518],
       [-0.21761484, -0.14705111,  0.03937584, -0.08019558],
       [-0.29118535, -0.12227471,  0.03761998, -0.13383155],
       [ 0.01602522,  0.06648189,  0.00551185, -0.27443486],
       [ 0.21630871,  0.12444106,  0.08403394,  0.21860462],
       [ 0.2925511 ,  0.06447589, -0.2122508 , -0.19275849],
       [-0.21858141,  0.17051485,  0.18435636, -0.1318497 ],
       [ 0.25399786,  0.01577881,  0.11870018,  0.2092517 ],
       [-0.12461323,  0.22905809,  0.02524012, -0.2577904 ],
       [ 0.11325249,  0.05704927, -0.18438074,  0.10700837],
       [-0.17039998,  0.25746214,  0.2213614 ,  0.11570764],
       [-0.08480884,  0.27224904,  0.1564329 , -0.18379462],
       [-0.09342675,  0.14960486,  0.25265092,  0.2426433 ],
       [-0.18687803, -0.25093842,  0.25302124,  0.0559465 ],
       [ 0.19995785,  0.08335134,  0.28469396, -0.22185978],
       [ 0.11453846, -0.18357316,  0.25201792,  0.19564697],
       [-0.26780993,  0.18415198,  0.15758312,  0.22869569],
       [-0.14326121,  0.05526415,  0.23584718, -0.20082363],
       [ 0.12953654,  0.09828642, -0.25494877,  0.11612895],
       [-0.233812  ,  0.02414134, -0.14084458,  0.0164344 ],
       [-0.29199284, -0.18788296,  0.07764602,  0.07311583],
       [ 0.22685993, -0.1112511 , -0.00064561,  0.23306245],
       [ 0.09487113, -0.27447337, -0.01611161, -0.18587357],
       [ 0.05438673,  0.00916237,  0.18493044, -0.06180233],
       [ 0.28093153, -0.22755796,  0.07602295,  0.20633197],
       [-0.02245906, -0.20460689, -0.03476879,  0.17667252],
       [-0.05727731,  0.06297761,  0.05765486,  0.29494208],
       [-0.03085488, -0.22136106,  0.16634336,  0.13138065],
       [-0.19744909,  0.23656225,  0.23342359,  0.20696998],
       [-0.21084295,  0.13469201,  0.19871047, -0.13744639],
       [-0.02686024, -0.13753031, -0.18523604, -0.16640987],
       [-0.19464374,  0.08038983, -0.22795442,  0.17523634],
       [ 0.2520337 , -0.07866251, -0.23814398, -0.03437087]],
      dtype=float32)>), (None, <tf.Variable 'actor/dense_3/bias:0' shape=(4,) dtype=float32, numpy=array([0., 0., 0., 0.], dtype=float32)>)).

In [None]:
env.close()