In [71]:
# Neural Network
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, ReLU
from tensorflow.keras.optimizers import Adam
# Environment


import gym
# Further support
import numpy as np
import time
import scipy.signal
from tqdm import tqdm
import datetime

%load_ext tensorboard


The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


# TrajectoryStorage


In [72]:
class Storage:
    '''
    Contains all information the agent collects interacting with the environment.
    '''


    def __init__(self):
        '''
        Initializes empty lists as storages all observation variables during trajectory
        '''
        # Saves information about the current state of the agent at each step
        self.observations = []

        # Saves actions made and rewards achieved
        self.actions = []
        self.rewards = []
        # Outputs from the actor network, an action is sampled from (Probabilities)
        self.logits = []
        # Outputs from the crtitics network (Values)
        self.BaselineEstimate = []

        # finished episodes will be completely stored in this list 
        self.episodes = []


    def store(self, observation, action, logits, reward, BaselineEstimate):
        '''
        Adds given information to the storage.

        Args:
        observation(obj): information (e.g. pixel values) about current state of agent
        action(float): Output of the actor network. Describes the action taken
        logits():
        reward(floats): Rewards collected by agent
        BaselineEstimate():
        '''
        self.observations.append(observation)
        self.actions.append(action)
        self.logits.append(logits)
        self.rewards.append(reward)
        self.BaselineEstimate.append(BaselineEstimate) 
        

    def conclude_episode(self):
        '''
        Append all collected values to episodes list once one episode is finished.
        Computes all rewards collected in one episode. Prepares storage for next episode.
        '''
        self.episodes.append(
            [self.observations,
             self.actions, 
             self.logits,
             self.rewards,
             self.BaselineEstimate,
             # Get the return of the whole episode 
             sum(self.rewards)])
             
        # Empty the arrays for new trajectory
        self.observations.clear()
        self.actions.clear()
        self.logits.clear()
        self.rewards.clear()
        self.BaselineEstimate.clear()

     
    def get_episodes(self):
        '''
        Returns list containing finished trajectories stored in self.episodes
        and the amount of episodes passed.
        '''
        return self.episodes, len(self.episodes)
        
        

# Actor Model


In [73]:
class Actor(Model):
    '''
    Neural network computing the actions the agent will take
    '''


    def __init__(self):
        '''
        Initialize layer architecture for Actor Network.
        '''
        # Subclassing API
        super(Actor, self).__init__()
        
        self.l = [
            # Three Dense Layers with random initial parameters having a standart deviation of 0.01
            Dense(128, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.01)),
            Dense(128, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.01)),
            Dense(64, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.01)),
            
            # Output layer with softmax activation function applied to for neurons.
            # Outputs prpobability for each of our for actions 
            # (Do nothing, fire left orientation engine, fire main engine, fire right orientation engine)
            Dense(4, activation="softmax")
        ]


    #@tf.function        
    def call(self, x):
        '''
        Iterates input x through network to create softmax ouutput.

        Args:
        x(): Network input. Pixel values representing the current state of the agent
        '''
        for l in self.l:
            x = l(x)
        return x


    #####  logits = actor(observation) -> actor must be in capitol, gets instantiated twice, maybe idea is wrong
    #@tf.function
    def sample_action(self,observation):
        '''
        Calls the actor network with state of the agent and returns the network object + the samnpled action

        Args:
        observation(): Representation of actors state. Same as x in the call function. 
        '''
        # Output of softmax function
        #logits = self.call(observation)
        logits = self(observation)
    # tf.print(type(logits))
        # Sample action from the Softmax output of the network
        action = tf.squeeze(tf.random.categorical(logits, 1), axis=1)
    # tf.print(action)
        return logits, action

# Critic Model

In [74]:
class Critic(Model):
    '''
    Represents the value function of the network. 
    Input is a certain state and output a float value for that state.
    '''


    def __init__(self):
        '''
        Initialize layer architecture for Actor Network.
        '''
        # Subclassing API
        super(Critic, self).__init__()
        self.l = [
            # Three Dense Layers with ReLu activation function
            # Random initial parameters having a standart deviation of 0.01
            
            Dense(128, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.01)),
            Dense(128, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.01)),
            Dense(64, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.01)),

            # Output layer with Tanh activation function to get float output value ([-1;1])
            # Random initial parameters having a standart deviation of 0.01
            Dense(1, activation="tanh", kernel_regularizer=tf.random_normal_initializer(stddev=0.01))
        ]


    #@tf.function 
    def call(self, x):
        '''
        Iterates input x through network to create tanh output between -1 and 1 
        giving input state x a value.

        Args:
        x(): Network input. Pixel values representing the current state of the agent.
        '''
        for l in self.l:
            x = l(x)
        return x

In [75]:
'''
Adjust Hyperparameters
'''

# Number of iterations
epochs = 10
# Leads to ~10 Episodes per epoch, then compute new parameters (smaller batching)
steps_per_epoch = 1000 

# Learning rate for actor and critic
lr_actor = 3e-4
lr_critic = 3e-4

# Movements in environment (state-space) to collect training data
train_policy_iterations = 80
train_value_iterations = 80

# Parameter to decide how strongly the policy ratio gets clipped therefore how much policy (actor network)
#  updates we allow
# The selected 0.2 is the number proposed by the original paper by OpenAI
clip_ratio = 0.2
# Weighs loss of critic model
c_1 = 0.5

#
target_kl = 0.01


# Update weights with Adam optimizer
optimizer = Adam()

# To toggle displaying of environment
render = False

# Discount variable for rewards to whey immediate rewards stronger
gamma = 0.99

In [76]:
# Reset all states generated by Keras
tf.keras.backend.clear_session()

# Define environment
env = gym.make("LunarLander-v2")
# Get dimensions of state and amount of possible actions (4 for LunarLander-v2)
observation_dimensions = env.observation_space.shape[0]
num_actions = env.action_space.n

# create Storage object to save observations, actions, rewards etc. during trajectory
#storage = Storage()

# initialize actor and critics model
#observation_input = Input(shape=(observation_dimensions,), dtype=tf.float32)
# actor = Actor()
# critic = Critic()

# Initialize: observation(agent state), 
# episode return(summed rewards for singe ) and 
# episode length(amount of steps taken (=frames) before agent finished)
# observation, episode_return, episode_length = env.reset(), 0, 0

In [77]:
class Agent:
    '''
    ###Skizze - Not used yet

    Currently contains:
    - Collects data
    - Training process (iterator, updater, actor loss fun)
    - get advantage function
    - dicount rewards function
    - Get ratio function

    Whats missing: 
    - All the FUCKING self's before variable assignment and for functions (fuck you python, even though i love you)    
    '''

    def __init__(self):
        ''' 
        Initialize Parameters.
        ###Maybe pass hyperparameters?
        '''
        self.actor = Actor()
        self.critic = Critic()
        self.storage = Storage()
        #print(self.actor.trainable_variables())


    def collect_train_data(self):
        '''
        Agent takes steps in environment according to current policy. Information gets saved to update policy.
        -> Data collection
        '''
        observation, episode_return, episode_length = env.reset(), 0, 0
        episodes_total = 0
        # Iteration of whole training process
        # for epoch in tqdm(range(epochs), desc = 'Epochs'):

            # Initialize values for return, length and episodes
            # sum_return = 0
            # sum_length = 0
            # num_episodes = 0

        # Each timestep t of steps_per_epoch (in paper denoted as capital T)
        #  allows takes on action in a state and saves the information in storage object
        for t in tqdm(range(steps_per_epoch)):

            # Toggles displaying of environment
            # if render or epoch == epochs-1 and epochs != 1:
            #     env.render()

            if render:
                env.render()

            # Reshaping observation to fit as input for Actor network (policy)
            observation = observation.reshape(1,-1)
            
            # Obtain action and logits for this observation by our actor
            logits, action = self.actor.sample_action(observation)
            
            # Take action in environment and obtain the rewards for it
            # Variable done represents wether agent has finished 
            # The last variable would be diagnostic information, not needed for training
            observation_new, reward, done, _ = env.step(action[0].numpy())

            # Sum up rewards over this episode and count amount of frames
            episode_return += reward
            episode_length += 1

            # Get the Base-Estimate from the Critics network
            base_estimate = self.critic(observation)

            # Store Variables collected in this timestep t
            self.storage.store(observation=observation, action=action, logits=logits, reward=reward, BaselineEstimate=base_estimate)
            # Save the new state of our agent
            observation = observation_new
            
            # Check if terminal state is reached in environment
            if done:
                # Save information about episode
                self.storage.conclude_episode()
                # Refresh environment and reset return and length value
                observation, episode_return, episode_length = env.reset(), 0, 0

        # obtain all episodes saved in storage
        # episodes, amount_episodes = self.storage.get_episodes()


    def update_policy(self, episodes, optimizer, clip_param, c_1 = 1, c_2=0.01):
        '''
        Update policy with the collected data (Parameter updates for actor)

        Args: 
        episodes(list): Contains all information on one episode in the following order:
                        [observations, actions, logits, rewards, BaselineEstimate, summed rewards]
        actor(object): Object of the actor model.
        critic(object): Object of the critic model.
        actor_loss(function): Clipped objective function for PPO.
        optimizer(object): Optimizer used to train actor.
        clip_param(float): Hyperparameter to decide values to clip ratio between.
        c_1(float): hyperparameter to determine how strongly loss of the critic network should be weighed
        c_2(float): hyperparameter to determine how strongly entropy should be weighed


        Information stored as:
        storage.episodes[different episodes]
                        [observations, actions, logits, rewards, BaselineEstimate, sum(self.rewards)]
                        [look at single one]
        '''
        # for epoch in training_iteratins:
        # Save network loss
        train_losses_actor = []
        train_losses_critic = []
        
        # Iterate over all finished episodes from collected training data
        for episode in tqdm(episodes):

            # Update parameters
            # Compute train losses and action by chosen by policy
            actor_loss, critic_loss = self.train_step(
                # States
                episode[0],
                # Actions
                episode[1],
                #optimizer (Adam)
                optimizer,
                # Logits
                episode[2],
                # Rewards
                episode[3],
                clip_param,
                c_1,
                c_2 
            )
            train_losses_actor.append(actor_loss)
            train_losses_critic.append(critic_loss)

            return train_losses_actor, train_losses_critic


    def train_step(self, states, actions, optimizer, train_logits, train_rewards, clip_param, c_1, c_2):
        '''
        Updates actor network parameters and returns the loss to evaluate performance.

        Args:
        model(object): Object of the actor model.
        input(list): contains floats describing the actors state.
        loss_function(function): Clipped objective function for PPO.
        optimizer(object): Optimizer used to train actor.
        train_logits():
        train_rewards():
        clip_param():
        c_1(): 
        c_2():
        '''

        

        # use tf.gradientTape to compute loss, then gradients and apply these to the model to modify the parameters
        with tf.GradientTape() as tape, tf.GradientTape() as tape2:
            # print(self.actor.trainable_variables())
            # Obtain action and logits for this state selected by policy
            #print(f' Observation shape/type {observation}')
            #print(f'Trainables: {self.actor.layers[0].weights}')


            # logits_new, actions_new = sample_action(states)
            logits_new = []
            b_estimates_new = []

            # Compute values with updated critic network
            # b_estimates_new = critic(states)

            # till we work with np arrays we need to sample each action for this by looping through it
            for i in states:
                logits, _ = self.actor.sample_action(i)
                logits_new.append(logits)
                b_estimate = self.critic(i)
                
                b_estimates_new.append(b_estimate)

            # Compute & weigh entropy 
            #entropy = c_2 * np.mean(-(logits_new * train_logits))   # <----- DOESNT WORK YET Musste ich erstmal rausnehmen für den Rest vom Debugging
            # entropy = 0.01

            # Computes MSE between output of the critics network (value) the discounted sum of rewards
            #  which represents an estimate based on rewards collected during training
            # critic_loss = c_1 * tf.keras.losses.MeanSquaredError(b_estimates_new, self.discounted_reward(train_rewards)).numpy()
            #print('Weewoo')
            #print(tf.reduce_mean((np.array(train_rewards) - tf.convert_to_tensor(b_estimates_new, dtype=tf.float32)) ** 2))
            critic_loss = tf.reduce_mean((np.array(train_rewards) - tf.convert_to_tensor(b_estimates_new, dtype=tf.float32)) ** 2)
            #actor_loss = entropy * self.actor_loss_fun(actions, train_logits, logits_new, train_rewards, b_estimates_new, clip_param)
            #actor_loss = self.actor_loss_fun(actions, train_logits, logits_new, train_rewards, b_estimates_new, clip_param)
            #actor_loss = tf.convert_to_tensor(actor_loss)
            #critic_loss = tf.cast(critic_loss, dtype=tf.float32)
            #print(f'Critics loss:{type(critic_loss)}. Actor Loss {actor_loss.dtype}')

            print('Actor weights')
            print(self.critic.layers[0].weights)

            #a_gradients = tape.gradient(actor_loss, self.actor.trainable_variables)
            c_gradients = tape2.gradient(critic_loss, self.critic.trainable_variables)
            
            actor_loss = 0
            #print(tape)
            #print('Actor loss')
            #print(actor_loss)
            #print('Trainable Weights')
            #print(self.actor.trainable_weights)
        
        #print(f'Gradients Actor: {a_gradients}. Gradients Critic: {c_gradients}')

        # Update parameters
        #optimizer.apply_gradients(zip(a_gradients, self.actor.trainable_variables))
        optimizer.apply_gradients(zip(c_gradients, self.critic.trainable_variables))

        print("updated weights")
        print(self.critic.layers[0].weights)

        

        return actor_loss, critic_loss


    def actor_loss_fun(self, actions, logits_old, logits_new, rewards, b_estimates_new, clip_param):
        '''
        Computes loss for Actor Network output.

        Args:
        logits_old():
        logits_new():
        reward():
        b_estimates_new():
        clip_param():
        '''
        
        ratio = self.get_ratio_episode(actions, logits_old, logits_new)

        ### FIND OUT WHICH: SINGLE OR MULTIPLE ELEMENTS ARE WANTED AND ADJUST EITHER IN GET_ADV OR THE UPPER TWO FUNCTIONS
        advantage = self.get_advantage(rewards, b_estimates_new)
        
        # Unclipped value
        l1 = ratio * advantage
        # Clipped ratio between values determined by Hyperparam and multiplied by advantage (see objective function)
        l2 = np.clip(ratio, a_min=1 - clip_param, a_max=1 + clip_param) * advantage
        l1 = tf.convert_to_tensor(l1)
        l2 = tf.convert_to_tensor(l2)
        
        # Compute minimum of both and take the mean to return float loss
        actor_loss = -tf.reduce_mean(tf.minimum(l1, l2))
        return actor_loss


  


    def get_advantage(self, rewards, b_estimates, gamma = 0.99):
        '''
        Computes Advantage for action in state.

        Args:
        rewards(float): Reward for action.
        gamma(float): Discount factor.
        b_estimates(float): Baseline Estimates.
        
        '''
        # Saves list of all rewards in new variable 
        #rewards = episodes[0][3]


        # Get discounted sum of rewards 
        disc_sum = self.discounted_reward(rewards, gamma)


        # # Estimated Value of the current situtation from the critics network
        # b_estimates = self.episodes[0][4] 

        # Convert lists to np arrays and flatten
        disc_sum_np = np.array(disc_sum)
        b_estimates_np = np.array(b_estimates)
        b_estimates_np = b_estimates_np.flatten()

        # substract arrays to obtain advantages
        advantages = np.subtract(disc_sum_np, b_estimates_np)

        return advantages


     ### MIGHT NOT WORK
    #  output for: discounted_reward([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 0.99)
    #  -> [8.91, 7.920000000000001, 6.930000000000001, 5.94, 4.95, 3.96, 2.9699999999999998, 1.98, 0.99, 0]
    #  ###
    def discounted_reward(self, rewards, gamma = 0.99):
        '''
        weighs all rewards in a way such that immediate rewards have a stronger impact than possible future rewards.

        Args:
        rewards(list): list of all rewards collected by the agent in episode t (?)
        gamma(float): Hyperparameter determining how much future rewards should be weighed in
        '''
        # To select the next reward
        i = 0
        discounted_rewards = []

        # Iterates through every reward and appends a discounted version to the output
        for r in rewards:
            disc = 0
            for t in rewards[i:-1]:
                discount_t = gamma ** t
                disc += t * discount_t
            i += 1
            discounted_rewards.append(disc)

        # returns list of discounted rewards.
        return discounted_rewards   



    ## get ratio lutsch noch ARSCH, das Ding verarscht mich anders

    def get_ratio_episode(self, actions, logits_old, logits_new): 
        r = []
        for a, o, n in zip(actions, logits_old, logits_new):
            o = tf.convert_to_tensor(o)
            n = tf.convert_to_tensor(n)
            #print(f'A: {a} O: {type(o)} N: {type(n)}')

            #get the Logarithmic version of all logits for computational efficiency
            log_prob_old = tf.nn.log_softmax(o)
            log_prob_new = tf.nn.log_softmax(n)

            # encode in OneHotVector and reduce to sum, giving the log_prob for the action the agent took for both policies
            logprobability_old = tf.reduce_sum(
                tf.one_hot(a, num_actions) * log_prob_old, axis=1
            )
            logprobability_new = tf.reduce_sum(
                tf.one_hot(a, num_actions) * log_prob_new, axis=1
            )
            # get the ratio of new over old prob
            ratio = tf.exp(logprobability_new - logprobability_old)
            r.append(ratio)
        return r


    def run(self):
        for epoch in tqdm(range(epochs), desc=str(epochs)):
            self.collect_train_data()
            data, _ = self.storage.get_episodes()
            #print(data)
            self.update_policy(data, optimizer, clip_ratio)
  

In [78]:

ppo_agent = Agent()
ppo_agent.run()
# ppo_agent.collect_train_data()
# data = storage.get_episodes()
# #print(data)
# print(ppo_agent.update_policy(data[0], actor, critic, optimizer, clip_ratio))

'''
Nikis idea what the problem:
- Actor_loss needs to be eager tensor dtpye float32 shit
'''

100%|██████████| 1000/1000 [00:03<00:00, 259.58it/s]
  0%|          | 0/10 [00:00<?, ?it/s]
10:  10%|█         | 1/10 [00:04<00:37,  4.20s/it]

Actor weights
[<tf.Variable 'critic/dense_4/kernel:0' shape=(8, 128) dtype=float32, numpy=
array([[ 0.00590714,  0.0123431 , -0.01353293, ...,  0.01070632,
         0.02066741, -0.04164343],
       [-0.01133195, -0.00058727,  0.00548537, ..., -0.00211708,
        -0.01763518,  0.00022721],
       [ 0.01235563,  0.00057025,  0.01588361, ..., -0.00657793,
         0.01395506,  0.00834802],
       ...,
       [-0.00907882, -0.00623742, -0.0039287 , ..., -0.00079001,
         0.00231977, -0.01399863],
       [ 0.0049195 , -0.00667758, -0.002179  , ..., -0.00545166,
         0.00582104,  0.00608449],
       [ 0.01332614,  0.02134664, -0.01277834, ..., -0.00598551,
        -0.00671953,  0.00092428]], dtype=float32)>, <tf.Variable 'critic/dense_4/bias:0' shape=(128,) dtype=float32, numpy=
array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.

100%|██████████| 1000/1000 [00:03<00:00, 289.28it/s]
  0%|          | 0/20 [00:00<?, ?it/s]
10:  20%|██        | 2/10 [00:07<00:30,  3.80s/it]

Actor weights
[<tf.Variable 'critic/dense_4/kernel:0' shape=(8, 128) dtype=float32, numpy=
array([[ 0.00566075,  0.0123431 , -0.01389103, ...,  0.01070632,
         0.02037184, -0.0416933 ],
       [-0.01213689, -0.00058727,  0.00648026, ..., -0.00211708,
        -0.01693899,  0.00121712],
       [ 0.01188577,  0.00057025,  0.01504584, ..., -0.00657793,
         0.01334865,  0.00781164],
       ...,
       [-0.00932669, -0.00623742, -0.0030102 , ..., -0.00079001,
         0.00287123, -0.01461196],
       [ 0.0049195 , -0.00667758, -0.002179  , ..., -0.00545166,
         0.00582104,  0.00608449],
       [ 0.01332614,  0.02134664, -0.01277834, ..., -0.00598551,
        -0.00671953,  0.00092428]], dtype=float32)>, <tf.Variable 'critic/dense_4/bias:0' shape=(128,) dtype=float32, numpy=
array([ 0.00064285,  0.        ,  0.00099365,  0.        ,  0.00098309,
       -0.00097545,  0.        ,  0.        , -0.00099612, -0.00090434,
        0.00098281, -0.00094671,  0.        ,  0.00099613,  0.0

100%|██████████| 1000/1000 [00:03<00:00, 291.32it/s]
  0%|          | 0/30 [00:00<?, ?it/s]
10:  30%|███       | 3/10 [00:11<00:26,  3.74s/it]

Actor weights
[<tf.Variable 'critic/dense_4/kernel:0' shape=(8, 128) dtype=float32, numpy=
array([[ 0.00549572,  0.0119762 , -0.01376882, ...,  0.01070632,
         0.02017385, -0.04184141],
       [-0.01267619,  0.00015463,  0.0062977 , ..., -0.00211708,
        -0.01647258,  0.00217053],
       [ 0.01157102, -0.00013831,  0.01550773, ..., -0.00657793,
         0.0129424 ,  0.00708863],
       ...,
       [-0.00949272, -0.00689792, -0.0021062 , ..., -0.00079001,
         0.00324066, -0.01530967],
       [ 0.0049195 , -0.00667758, -0.002179  , ..., -0.00545166,
         0.00582104,  0.00608449],
       [ 0.01332614,  0.02134664, -0.01277834, ..., -0.00598551,
        -0.00671953,  0.00092428]], dtype=float32)>, <tf.Variable 'critic/dense_4/bias:0' shape=(128,) dtype=float32, numpy=
array([ 0.00107352,  0.00074093,  0.00088118,  0.        ,  0.00164182,
       -0.00162905,  0.        ,  0.        , -0.00191502, -0.00151026,
        0.00164135, -0.00158104,  0.        ,  0.00125337,  0.0

100%|██████████| 1000/1000 [00:03<00:00, 275.68it/s]
  0%|          | 0/40 [00:00<?, ?it/s]
10:  40%|████      | 4/10 [00:15<00:23,  3.87s/it]

Actor weights
[<tf.Variable 'critic/dense_4/kernel:0' shape=(8, 128) dtype=float32, numpy=
array([[ 0.0053682 ,  0.0113404 , -0.0143284 , ...,  0.01070632,
         0.02002086, -0.04247082],
       [-0.01309304,  0.0010059 ,  0.00642092, ..., -0.00211708,
        -0.01611209,  0.00296423],
       [ 0.01132778, -0.00085184,  0.01500138, ..., -0.00657793,
         0.01262842,  0.00644049],
       ...,
       [-0.00962101, -0.00650159, -0.00117559, ..., -0.00079001,
         0.00352617, -0.01473431],
       [ 0.0049195 , -0.00667758, -0.002179  , ..., -0.00545166,
         0.00582104,  0.00608449],
       [ 0.01332614,  0.02134664, -0.01277834, ..., -0.00598551,
        -0.00671953,  0.00092428]], dtype=float32)>, <tf.Variable 'critic/dense_4/bias:0' shape=(128,) dtype=float32, numpy=
array([ 1.4063733e-03,  1.5863483e-03,  1.0266597e-03,  0.0000000e+00,
        2.1510101e-03, -2.1342745e-03,  0.0000000e+00,  0.0000000e+00,
       -2.3644043e-03, -1.9786276e-03,  2.1503973e-03, -2.0713743

100%|██████████| 1000/1000 [00:03<00:00, 265.79it/s]
  0%|          | 0/51 [00:00<?, ?it/s]
10:  50%|█████     | 5/10 [00:19<00:19,  3.87s/it]

Actor weights
[<tf.Variable 'critic/dense_4/kernel:0' shape=(8, 128) dtype=float32, numpy=
array([[ 0.00480201,  0.01059739, -0.01505103, ...,  0.01106913,
         0.0205855 , -0.043239  ],
       [-0.01261595,  0.00190937,  0.00674527, ..., -0.00269401,
        -0.01662355,  0.00383209],
       [ 0.01075117, -0.00167422,  0.01429653, ..., -0.00600696,
         0.01310961,  0.00565497],
       ...,
       [-0.00913124, -0.00588566, -0.00023921, ..., -0.00132977,
         0.00295114, -0.01399286],
       [ 0.0049195 , -0.00667758, -0.002179  , ..., -0.00545166,
         0.00582104,  0.00608449],
       [ 0.01383333,  0.02191368, -0.01277834, ..., -0.00598551,
        -0.00671953,  0.00142926]], dtype=float32)>, <tf.Variable 'critic/dense_4/bias:0' shape=(128,) dtype=float32, numpy=
array([ 1.9903800e-03,  2.4881670e-03,  1.3675510e-03, -5.6877412e-04,
        2.8151842e-03, -1.6026157e-03,  0.0000000e+00,  5.7714223e-04,
       -2.4938912e-03, -2.3622622e-03,  2.7457292e-03, -2.4730104

100%|██████████| 1000/1000 [00:03<00:00, 280.21it/s]
  0%|          | 0/61 [00:00<?, ?it/s]
10:  60%|██████    | 6/10 [00:23<00:15,  3.94s/it]

Actor weights
[<tf.Variable 'critic/dense_4/kernel:0' shape=(8, 128) dtype=float32, numpy=
array([[ 0.00432345,  0.01040426, -0.01491295, ...,  0.01137573,
         0.02106274, -0.04377332],
       [-0.0122127 ,  0.00257963,  0.00733486, ..., -0.00318164,
        -0.01705584,  0.00469262],
       [ 0.0102638 , -0.00136485,  0.0146851 , ..., -0.00552436,
         0.01351631,  0.00539047],
       ...,
       [-0.0087173 , -0.00604364, -0.00066743, ..., -0.00178597,
         0.00246512, -0.01409157],
       [ 0.0049195 , -0.00667758, -0.002179  , ..., -0.00545166,
         0.00582104,  0.00608449],
       [ 0.01426199,  0.02239295, -0.01277834, ..., -0.00598551,
        -0.00671953,  0.00185606]], dtype=float32)>, <tf.Variable 'critic/dense_4/bias:0' shape=(128,) dtype=float32, numpy=
array([ 2.4839947e-03,  3.1628935e-03,  1.9594566e-03, -1.0495131e-03,
        3.3765619e-03, -9.5161243e-04,  0.0000000e+00,  1.0649577e-03,
       -2.1440263e-03, -2.6865057e-03,  3.4837676e-03, -2.8124757

100%|██████████| 1000/1000 [00:03<00:00, 269.61it/s]
  0%|          | 0/72 [00:00<?, ?it/s]
10:  70%|███████   | 7/10 [00:27<00:11,  3.93s/it]

Actor weights
[<tf.Variable 'critic/dense_4/kernel:0' shape=(8, 128) dtype=float32, numpy=
array([[ 0.00375709,  0.01016461, -0.01504733, ...,  0.01163993,
         0.02147405, -0.044333  ],
       [-0.01159272,  0.00321357,  0.00791307, ..., -0.00360191,
        -0.01742841,  0.00551071],
       [ 0.00967006, -0.00113457,  0.01495658, ..., -0.00510842,
         0.01386681,  0.00510534],
       ...,
       [-0.00813914, -0.00616573, -0.00100158, ..., -0.00217914,
         0.00204623, -0.01413095],
       [ 0.0049195 , -0.00667758, -0.002179  , ..., -0.00545166,
         0.00582104,  0.00608449],
       [ 0.01463142,  0.02280601, -0.01277834, ..., -0.00598551,
        -0.00671953,  0.00222389]], dtype=float32)>, <tf.Variable 'critic/dense_4/bias:0' shape=(128,) dtype=float32, numpy=
array([ 3.1488342e-03,  3.7963837e-03,  2.5427709e-03, -1.4638451e-03,
        3.9364183e-03, -2.8120511e-04,  0.0000000e+00,  1.4853918e-03,
       -1.7440260e-03, -2.9659492e-03,  4.2047636e-03, -3.1050441

100%|██████████| 1000/1000 [00:03<00:00, 297.06it/s]
  0%|          | 0/83 [00:00<?, ?it/s]
10:  80%|████████  | 8/10 [00:30<00:07,  3.78s/it]

Actor weights
[<tf.Variable 'critic/dense_4/kernel:0' shape=(8, 128) dtype=float32, numpy=
array([[ 0.0032625 ,  0.00961837, -0.01555884, ...,  0.01187061,
         0.02183325, -0.04503484],
       [-0.0110513 ,  0.00395659,  0.00860697, ..., -0.00396894,
        -0.01775377,  0.00638574],
       [ 0.00915155, -0.00147471,  0.01465682, ..., -0.00474518,
         0.0141729 ,  0.00455213],
       ...,
       [-0.00763425, -0.00602034, -0.00100877, ..., -0.00252249,
         0.00168041, -0.01386929],
       [ 0.0049195 , -0.00667758, -0.002179  , ..., -0.00545166,
         0.00582104,  0.00608449],
       [ 0.01495402,  0.02316674, -0.01277834, ..., -0.00598551,
        -0.00671953,  0.00254509]], dtype=float32)>, <tf.Variable 'critic/dense_4/bias:0' shape=(128,) dtype=float32, numpy=
array([ 3.7294358e-03,  4.5412127e-03,  3.2438133e-03, -1.8256782e-03,
        4.4253413e-03,  3.8900389e-04,  0.0000000e+00,  1.8525562e-03,
       -1.1579429e-03, -3.2099760e-03,  5.0081429e-03, -3.3605378

100%|██████████| 1000/1000 [00:03<00:00, 287.88it/s]
  0%|          | 0/93 [00:00<?, ?it/s]
10:  90%|█████████ | 9/10 [00:34<00:03,  3.83s/it]

Actor weights
[<tf.Variable 'critic/dense_4/kernel:0' shape=(8, 128) dtype=float32, numpy=
array([[ 0.00282649,  0.00892977, -0.01622175, ...,  0.01207394,
         0.0221499 , -0.04582629],
       [-0.01057401,  0.00466086,  0.00931828, ..., -0.00429249,
        -0.01804059,  0.00710252],
       [ 0.00869446, -0.00205331,  0.01409605, ..., -0.00442497,
         0.01444273,  0.00388551],
       ...,
       [-0.00718919, -0.00550887, -0.00053725, ..., -0.00282516,
         0.00135792, -0.01333642],
       [ 0.0049195 , -0.00667758, -0.002179  , ..., -0.00545166,
         0.00582104,  0.00608449],
       [ 0.01523839,  0.02348474, -0.01277834, ..., -0.00598551,
        -0.00671953,  0.00282822]], dtype=float32)>, <tf.Variable 'critic/dense_4/bias:0' shape=(128,) dtype=float32, numpy=
array([ 4.2412682e-03,  5.2502924e-03,  3.9612963e-03, -2.1446524e-03,
        4.8563555e-03,  1.0034184e-03,  0.0000000e+00,  2.1762329e-03,
       -5.0805666e-04, -3.4250901e-03,  5.7542562e-03, -3.5857649

100%|██████████| 1000/1000 [00:03<00:00, 303.40it/s]
  0%|          | 0/103 [00:00<?, ?it/s]
10: 100%|██████████| 10/10 [00:38<00:00,  3.84s/it]

Actor weights
[<tf.Variable 'critic/dense_4/kernel:0' shape=(8, 128) dtype=float32, numpy=
array([[ 0.00258909,  0.00835986, -0.01676222, ...,  0.01181054,
         0.022479  , -0.04647947],
       [-0.0099416 ,  0.00529521,  0.00995915, ..., -0.00460033,
        -0.0180862 ,  0.00774948],
       [ 0.00839476, -0.00255879,  0.01360704, ..., -0.00417031,
         0.01482833,  0.00330359],
       ...,
       [-0.00678624, -0.00505399, -0.00011817, ..., -0.00306046,
         0.00106869, -0.01286285],
       [ 0.0049195 , -0.00667758, -0.002179  , ..., -0.00545166,
         0.00582104,  0.00608449],
       [ 0.01549083,  0.02376704, -0.01277834, ..., -0.00598551,
        -0.00671953,  0.00307956]], dtype=float32)>, <tf.Variable 'critic/dense_4/bias:0' shape=(128,) dtype=float32, numpy=
array([ 4.85643744e-03,  5.88971144e-03,  4.60895989e-03, -2.42781406e-03,
        5.28362440e-03,  1.56041689e-03,  4.31408320e-04,  2.66807596e-03,
        8.05705786e-05, -3.84048000e-03,  6.42771181e-03,




'\nNikis idea what the problem:\n- Actor_loss needs to be eager tensor dtpye float32 shit\n'

In [79]:
env.close()