In [1]:
# Neural Network
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, ReLU
from tensorflow.keras.optimizers import Adam
# Environment


import gym
# Further support
import numpy as np
import time
import scipy.signal
from tqdm.notebook import tqdm_notebook
import datetime

%load_ext tensorboard


Init Plugin
Init Graph Optimizer
Init Kernel


# Trajectory Storage 2.0
Numpy Version with full-batch return of epoch 


In [3]:
def discounted_reward_sum(rewards, gamma = 0.99):
        '''
        weighs all rewards in a way such that immediate rewards have a stronger impact than possible future rewards.

        Args:
        rewards(list): list of all rewards collected by the agent in episode t (?)
        gamma(float): Hyperparameter determining how much future rewards should be weighed in
        '''
        # To select the next reward
        i = 0
        discounted_rewards = []

        # Iterates through every reward and appends a discounted version to the output
        for r in rewards:
            disc = 0
            for t in rewards[i:-1]:
                discount_t = gamma ** t
                disc += t * discount_t
            i += 1
            discounted_rewards.append(disc)

        # returns list of discounted rewards.
        return sum(discounted_rewards)   

In [4]:
class Storage2:

    def __init__(self, observation_dimension, size):
        self.observations = np.zeros((size, observation_dimension), dtype=np.float32)
        self.actions = np.zeros(size, dtype=np.int32)
        self.rewards = np.zeros(size, dtype=np.float32)
        self.episode_return = np.zeros(size, dtype=np.float32)
        self.baseline_estimates = np.zeros(size, dtype=np.float32)
        self.pointer_start, self.pointer_end= 0,0
        

    def store(self, observation, action, reward, baseline_estimate):
        self.observations[self.pointer_end] = observation
        self.actions[self.pointer_end] = action
        self.rewards[self.pointer_end] = reward
        self.baseline_estimates[self.pointer_end] = baseline_estimate
        self.pointer_end += 1

    def conclude_episode(self, last_value = 0):
        indexes = slice(self.pointer_start, self.pointer_end)
        rewards_total = np.append(self.rewards[indexes], last_value) # maybe weglassen?
        baseline_estimates_total = np.append(self.baseline_estimates[indexes], last_value) # den maybe auch?
        self.episode_return = discounted_reward_sum(self.rewards[indexes])
        self.pointer_start = self.pointer_end

    def get_episodes(self,actor):
        self.pointer_start, self.pointer_end = 0,0

        return self.observations, self.actions, self.rewards, np.mean(self.episode_return), self.baseline_estimates, actor.get_prob(self.actions, self.observations)
        

# Actor Model


In [5]:
class Actor(Model):
    '''
    Neural network computing the actions the agent will take
    '''


    def __init__(self, actionspace, struct=[256,128,64]):
        '''
        Initialize layer architecture for Actor Network.
        '''
        # Subclassing API
        super(Actor, self).__init__()
        self.actionspace = actionspace
        
        self.l = [
            # Three Dense Layers with random initial parameters having a standart deviation of 0.01
            Dense(struct[0], activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.01)),
            Dense(struct[1], activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.01)),
            Dense(struct[2], activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.01)),
            
            # Output layer with softmax activation function applied to for neurons.
            # Outputs prpobability for each of our for actions 
            # (Do nothing, fire left orientation engine, fire main engine, fire right orientation engine)
            Dense(self.actionspace, activation="softmax", kernel_regularizer=tf.random_normal_initializer(stddev=0.01))
        ]


    @tf.function        
    def call(self, x):
        '''
        Iterates input x through network to create softmax ouutput.

        Args:
        x(): Network input. Pixel values representing the current state of the agent
        '''
        for l in self.l:
            x = l(x)
        return x


    #####  logits = actor(observation) -> actor must be in capitol, gets instantiated twice, maybe idea is wrong
    #@tf.function

    ## logits nur an stelle action zurück
    @tf.function
    def sample_action(self,observation):
        '''
        Calls the actor network with state of the agent and returns the network object + the samnpled action

        Args:
        observation(): Representation of actors state. Same as x in the call function. 
        '''
        # Output of softmax function
        #logits = self.call(observation)
        logits = self(observation)
        # tf.print(type(logits))
        # Sample action from the Softmax output of the network
        action = tf.squeeze(tf.random.categorical(logits, 1), axis=1)
        # tf.print(action)
        return logits, action


    #@tf.function
    def get_prob(self,actions, states):
        # actions = actions.flatten()

        logits = self.call(states)
        #probs = np.ones_like(actions)
        
        # print(f'Indexes: {actions, type(actions)}: logits. {logits}, dtype {logits.dtype}')
        # print(f'Indexes: {len(actions)}: logits. {len(logits)}')
        
        logits_flat = tf.squeeze(logits)
        # print(logits_flat)

        ind_1d = tf.range(len(logits_flat))
        test = tf.stack([ind_1d, actions], axis=1)

        new_probs = tf.gather_nd(logits_flat, test)
        
        return new_probs
            

# Critic Model

In [6]:
class Critic(Model):
    '''
    Represents the value function of the network. 
    Input is a certain state and output a float value for that state.
    '''


    def __init__(self,struct):
        '''
        Initialize layer architecture for Actor Network.
        '''
        # Subclassing API
        super(Critic, self).__init__()
        self.l = [
            # Three Dense Layers with ReLu activation function
            # Random initial parameters having a standart deviation of 0.01
            
            Dense(struct[0], activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.01)),
            Dense(struct[1], activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.01)),
            Dense(struct[2], activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.01)),

            # Output layer with Tanh activation function to get float output value ([-1;1])
            # Random initial parameters having a standart deviation of 0.01
            Dense(1, activation="tanh", kernel_regularizer=tf.random_normal_initializer(stddev=0.01))
        ]


    @tf.function 
    def call(self, x):
        '''
        Iterates input x through network to create tanh output between -1 and 1 
        giving input state x a value.

        Args:
        x(): Network input. Pixel values representing the current state of the agent.
        '''
        for l in self.l:
            x = l(x)
        return x

In [7]:
'''
Adjust Hyperparameters
'''



# Movements in environment (state-space) to collect training data
train_policy_iterations = 80
train_value_iterations = 80


In [8]:
# Reset all states generated by Keras
tf.keras.backend.clear_session()


In [9]:
class Agent:
    '''

    Currently contains:
    - Collects data
    - Training process (iterator, updater, actor loss fun)
    - get advantage function
    - dicount rewards function
    - Get ratio function

  
    '''

    def __init__(self, env_name, render=False, steps_per_epoch=1000, epochs=100, actor_structure=[256,128,64], critic_structure=[256,128,64]):
        ''' 
        Initialize Parameters.
        
        Args:
        env_name(): String Name of the Environment Passed
        render(): Boolean determining if env should be rendered during training
        steps_per_epoch(): how many steps/frame the agent should take during each Epoch of training; Default=1000
        epochs(): How many epochs of training should the agent do; Default=100
        actor_structure(): Define the Structure of the NN, Default: [256,128,64] (Can only take List of len 3)
        critic_structure(): Define the Structure of the NN, Default: [256,128,64] (Can only take List of len 3)
        '''
        # create environemt
        self.env = gym.make(env_name)
        self.observation_dimensions = self.env.observation_space.shape[0]
        self.num_actions = self.env.action_space.n

        # set Hyperparameters
        self.lr = 3e-4
        self.clip_ratio = 0.2
        self.c_1 = 0.5
        self.optimizer = Adam()
        self.render = render
        self.steps_per_epoch = steps_per_epoch
        self.epochs = epochs
        self.actor_struct = actor_structure
        self.critic_struct = critic_structure
        
        # create models and temporary storage
        self.actor = Actor(self.num_actions,self.actor_struct)
        self.critic = Critic(self.critic_struct)
        self.storage = Storage2(self.observation_dimensions, size=steps_per_epoch)


    def collect_train_data(self, epoch):
        '''
        Agent takes steps in environment according to current policy. Information gets saved to update policy.
        -> Data collection
        '''
        observation, episode_return, episode_length = self.env.reset(), 0, 0
        episodes_total = 0
        # Iteration of whole training process
        

        # Initialize values for return, length and episodes
        sum_return = 0
        sum_length = 0
        num_episodes = 0

        # Each timestep t of steps_per_epoch (in paper denoted as capital T)
        #  allows takes on action in a state and saves the information in storage object
        for t in tqdm_notebook(range(self.steps_per_epoch), desc = 'Epoch:' + str(epoch)):

            # Toggles displaying of environment
            if self.render or epoch == self.epochs-1 and self.epochs != 1:
                self.env.render()

            # Reshaping observation to fit as input for Actor network (policy)
            observation = observation.reshape(1,-1)
            
            # Obtain action and logits for this observation by our actor
            logits, action = self.actor.sample_action(observation)
            
            # Take action in environment and obtain the rewards for it
            # Variable done represents wether agent has finished 
            # The last variable would be diagnostic information, not needed for training
            observation_new, reward, done, _ = self.env.step(action[0].numpy())

            # Sum up rewards over this episode and count amount of frames
            episode_return += reward
            episode_length += 1

            # Get the Base-Estimate from the Critics network
            base_estimate = self.critic(observation)

            # Store Variables collected in this timestep t
            self.storage.store(observation, action, reward, base_estimate)
            # Save the new state of our agent
            observation = observation_new
            
            # Check if terminal state is reached in environment
            if done or (t == self.steps_per_epoch - 1):
                # Save information about episode
                self.storage.conclude_episode()
                # Refresh environment and reset return and length value
                observation, episode_return, episode_length = self.env.reset(), 0, 0

        # obtain all episodes saved in storage
        # episodes, amount_episodes = self.storage.get_episodes()


    def actor_loss_fun(self,probs_old, probs_new, rewards, b_estimates, clip_param):
        '''
        Computes loss for Actor Network output.

        Args:
        logits_old():
        logits_new():
        reward():
        b_estimates_new():
        clip_param():
        '''
        

        # ratio = self.get_ratio_episode(actions, logits_old, logits_new)
        ratio = self.get_ratio(probs_old,probs_new)

        ### FIND OUT WHICH: SINGLE OR MULTIPLE ELEMENTS ARE WANTED AND ADJUST EITHER IN GET_ADV OR THE UPPER TWO FUNCTIONS
        advantage = self.get_advantage(rewards, b_estimates)
        
        # Unclipped value
        l1 = ratio * advantage
        # Clipped ratio between values determined by Hyperparam and multiplied by advantage (see objective function)
        
        #l2 = np.clip(ratio, a_min=1 - clip_param, a_max=1 + clip_param) * advantage

        l2 = tf.clip_by_value(ratio, clip_value_min=1-clip_param, clip_value_max=1+clip_param) * advantage
        #l1 = np.array(l1, dtype="float32")
        #l2 = np.array(l2, dtype="float32")
        

        # Compute minimum of both and take the mean to return float loss
        #actor_loss = -tf.reduce_mean(tf.minimum(l1, l2))
        #l1 = tf.convert_to_tensor(np.array([tf.convert_to_tensor(l, dtype=tf.float32) for l in l1]), dtype=tf.float32)
        #l2 = tf.convert_to_tensor(np.array([tf.convert_to_tensor(l, dtype=tf.float32) for l in l2]), dtype=tf.float32)
        return l1, l2


    def train_step(self, observations, actions, optimizer, rewards_old, probs_old, baseline_estimates,returns, clip_param, c_1 = 1, c_2=0.01):
        '''
        Updates actor network parameters and returns the loss to evaluate performance.

        Args:
        model(object): Object of the actor model.
        input(list): contains floats describing the actors state.
        loss_function(function): Clipped objective function for PPO.
        optimizer(object): Optimizer used to train actor.
        train_logits():
        train_rewards():
        clip_param():
        c_1(): 
        c_2():
        '''
        # use tf.gradientTape to compute loss, then gradients and apply these to the model to modify the parameters
        with tf.GradientTape(persistent=True) as tape:
            
            probs_new = self.actor.get_prob(actions, observations)
            # print(f'Probs Old: {probs_old}, oftype: {type(probs_old)}. Probs New: {probs_new}, ofType {type(probs_new)}')

            # Compute & weigh entropy 
            #entropy = c_2 * np.mean(-(logits_new * train_logits))   # <----- DOESNT WORK YET Musste ich erstmal rausnehmen für den Rest vom Debugging
            # entropy = 0.01

            # Computes MSE between output of the critics network (value) the discounted sum of rewards
            #  which represents an estimate based on rewards collected during training
            # critic_loss = c_1 * tf.keras.losses.MeanSquaredError(b_estimates_new, self.discounted_reward(train_rewards)).numpy()            
            critic_loss = tf.reduce_mean((returns - self.critic(observations)) ** 2)
            #actor_loss = entropy * self.actor_loss_fun(actions, train_logits, logits_new, train_rewards, b_estimates_new, clip_param)
            l1,l2 = self.actor_loss_fun(probs_old, probs_new, rewards_old, baseline_estimates, clip_param)


            actor_loss = -tf.reduce_mean(tf.minimum(l1, l2))
            #critic_loss = tf.cast(critic_loss, dtype=tf.float32)
            #print(f'Critics loss:{type(critic_loss)}. Actor Loss {actor_loss.dtype}')

            #print('Actor weights')
            #print(print(self.actor.layers[0].weights))

            #print('actor')
            #print(actor_loss)
            #print(type(actor_loss))
            #print('critic')
            #print(critic_loss)
            #print(type(critic_loss))

            #actor_loss = tf.convert_to_tensor(actor_loss, dtype=tf.float32)

            # print(actor_loss)
            # print(critic_loss)
            # print('actor')
            # print(self.actor.trainable_variables)
            # print('critic')
            #print(self.critic.trainable_variables)
            a_gradients = tape.gradient(actor_loss, self.actor.trainable_variables)
            c_gradients = tape.gradient(critic_loss, self.critic.trainable_variables)
            # print(a_gradients)
            # print(c_gradients)

            #print(tape)
            #print('Actor loss')
            #print(actor_loss)
            #print('Trainable Weights')
            #print(self.actor.trainable_weights)
        
        #print(f'Gradients Actor: {a_gradients}. Gradients Critic: {c_gradients}')

        # Update parameters
        optimizer.apply_gradients(zip(a_gradients, self.actor.trainable_variables))
        optimizer.apply_gradients(zip(c_gradients, self.critic.trainable_variables))

        # del tape

        return actor_loss, critic_loss

    def update_policy(self, episodes, optimizer, clip_param, c_1 = 1, c_2=0.01):
        '''
        Update policy with the collected data (Parameter updates for actor)

        Args: 
        episodes(list): Contains all information on one episode in the following order:
                        [observations, actions, logits, rewards, BaselineEstimate, summed rewards]
        actor(object): Object of the actor model.
        critic(object): Object of the critic model.
        actor_loss(function): Clipped objective function for PPO.
        optimizer(object): Optimizer used to train actor.
        clip_param(float): Hyperparameter to decide values to clip ratio between.
        c_1(float): hyperparameter to determine how strongly loss of the critic network should be weighed
        c_2(float): hyperparameter to determine how strongly entropy should be weighed


        Information stored as:
        storage.episodes[different episodes]
                        [observations, actions, logits, rewards, BaselineEstimate, sum(self.rewards)]
                        [look at single one]
        '''
        # for epoch in training_iteratins:
        # Save network loss
        train_losses_actor = []
        train_losses_critic = []
        
        # Iterate over all finished episodes from collected training data
        for episode in tqdm_notebook(episodes):

            # Update parameters
            # Compute train losses and action by chosen by policy
            actor_loss, critic_loss = self.train_step(
                # States
                episode[0],
                # Actions
                episode[1],
                #optimizer (Adam)
                optimizer,
                # Logits
                episode[2],
                # Rewards
                episode[3],
                clip_param,
                c_1,
                c_2,
                episode[6]
            )
            train_losses_actor.append(actor_loss)
            train_losses_critic.append(critic_loss)

            return train_losses_actor, train_losses_critic


    


    


  


    def get_advantage(self, rewards, b_estimates, gamma = 0.99):
        '''
        Computes Advantage for action in state.

        Args:
        rewards(float): Reward for action.
        gamma(float): Discount factor.
        b_estimates(float): Baseline Estimates.
        
        '''
        # Saves list of all rewards in new variable 
        #rewards = episodes[0][3]


        # Get discounted sum of rewards 
        disc_sum = self.discounted_reward(rewards, gamma)


        # # Estimated Value of the current situtation from the critics network
        # b_estimates = self.episodes[0][4] 

        # Convert lists to np arrays and flatten
        disc_sum_np = np.array(disc_sum)
        b_estimates_np = np.array(b_estimates)
        b_estimates_np = b_estimates_np.flatten()

        # substract arrays to obtain advantages
        advantages = np.subtract(disc_sum_np, b_estimates_np)

        return advantages


     ### MIGHT NOT WORK
    #  output for: discounted_reward([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 0.99)
    #  -> [8.91, 7.920000000000001, 6.930000000000001, 5.94, 4.95, 3.96, 2.9699999999999998, 1.98, 0.99, 0]
    #  ###
    def discounted_reward(self, rewards, gamma = 0.99):
        '''
        weighs all rewards in a way such that immediate rewards have a stronger impact than possible future rewards.

        Args:
        rewards(list): list of all rewards collected by the agent in episode t (?)
        gamma(float): Hyperparameter determining how much future rewards should be weighed in
        '''
        # To select the next reward
        i = 0
        discounted_rewards = []

        # Iterates through every reward and appends a discounted version to the output
        for r in rewards:
            disc = 0
            for t in rewards[i:-1]:
                discount_t = gamma ** t
                disc += t * discount_t
            i += 1
            discounted_rewards.append(disc)

        # returns list of discounted rewards.
        return discounted_rewards   




    def get_ratio(self, probs_old, probs_new):
        log_probs_old = tf.nn.log_softmax(probs_old)
        log_probs_new = tf.nn.log_softmax(probs_new)

        # ratio = tf.exp(log_probs_new-log_probs_old)
        #ratio = tf.exp(probs_new-probs_new)
        ratio = tf.divide(log_probs_new,log_probs_old)
        return ratio


    def run(self):
        for epoch in tqdm_notebook(range(self.epochs), desc = 'Epochs'):
            
            self.collect_train_data((epoch))
            observations, actions, rewards, returns, baseline_estimates, probs = self.storage.get_episodes(self.actor)
            self.train_step(observations, actions, self.optimizer, rewards, probs, baseline_estimates,returns, self.clip_ratio)
            #print(observations, actions, rewards, returns, baseline_estimates, probs)
            print(f'Return: {np.mean(returns)}')
            #print(data)
            #self.update_policy(data, self.optimizer, self.clip_ratio)
        print(self.actor.trainable_variables)
        print(self.critic.trainable_variables)
        self.env.close()
  

In [10]:

ppo_agent = Agent(env_name='LunarLander-v2', render=True,epochs=20)
ppo_agent.run()


Metal device set to: Apple M1

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB



2022-04-09 17:12:34.987221: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-04-09 17:12:34.987306: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Epochs:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch:0:   0%|          | 0/1000 [00:00<?, ?it/s]

2022-04-09 17:12:35.480982: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)
2022-04-09 17:12:35.481607: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-04-09 17:12:35.481698: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-04-09 17:12:35.558657: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-04-09 17:12:43.821210: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-04-09 17:12:43.879749: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-04-09 17:12:43.924131: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:11



2022-04-09 17:12:45.134034: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-04-09 17:12:45.262715: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Return: -2774.4591319226665


Epoch:1:   0%|          | 0/1000 [00:00<?, ?it/s]

Return: -4676.504014997554


Epoch:2:   0%|          | 0/1000 [00:00<?, ?it/s]

Return: -7943.694642010811


Epoch:3:   0%|          | 0/1000 [00:00<?, ?it/s]

Return: -1903.0171753221427


Epoch:4:   0%|          | 0/1000 [00:00<?, ?it/s]

Return: -1253.7566815941932


Epoch:5:   0%|          | 0/1000 [00:00<?, ?it/s]

Return: -1622.3278040004245


Epoch:6:   0%|          | 0/1000 [00:00<?, ?it/s]

Return: -20.97350343076418


Epoch:7:   0%|          | 0/1000 [00:00<?, ?it/s]

Return: -101.52291818916815


Epoch:8:   0%|          | 0/1000 [00:00<?, ?it/s]

Return: -4897.8960047000855


Epoch:9:   0%|          | 0/1000 [00:00<?, ?it/s]

Return: 415.1947104350876


Epoch:10:   0%|          | 0/1000 [00:00<?, ?it/s]

Return: -622.6389877410418


Epoch:11:   0%|          | 0/1000 [00:00<?, ?it/s]

Return: -2624.8872324911435


Epoch:12:   0%|          | 0/1000 [00:00<?, ?it/s]

Return: 160.2467110616285


Epoch:13:   0%|          | 0/1000 [00:00<?, ?it/s]

Return: -49.64934272384537


Epoch:14:   0%|          | 0/1000 [00:00<?, ?it/s]

Return: -108.81403674473816


Epoch:15:   0%|          | 0/1000 [00:00<?, ?it/s]

Return: -1150.1427751286838


Epoch:16:   0%|          | 0/1000 [00:00<?, ?it/s]

Return: -528.6665509486959


Epoch:17:   0%|          | 0/1000 [00:00<?, ?it/s]

Return: -309.6650765461923


Epoch:18:   0%|          | 0/1000 [00:00<?, ?it/s]

Return: -13.52161814753123


Epoch:19:   0%|          | 0/1000 [00:00<?, ?it/s]

Return: -534.8320024434419
[<tf.Variable 'dense/kernel:0' shape=(8, 256) dtype=float32, numpy=
array([[-1.50827961e-02,  2.61323154e-02, -7.41020124e-03, ...,
        -3.02780565e-04,  1.01455664e-02, -6.42375136e-03],
       [ 1.39157055e-02, -2.89546279e-03, -1.34565653e-02, ...,
        -9.69316158e-03, -1.22106774e-02, -5.69066219e-03],
       [ 1.13433087e-02, -2.46069301e-03,  3.77112701e-05, ...,
        -4.51337919e-03, -9.15297493e-03,  1.25844693e-02],
       ...,
       [-3.18263611e-03, -2.21650768e-03, -3.32938624e-03, ...,
        -6.06468786e-03,  5.53688593e-03, -4.57641715e-03],
       [-4.10830975e-03,  1.60619318e-02, -2.41473187e-02, ...,
        -1.19672650e-02,  1.00186327e-02, -2.61615263e-03],
       [-7.56196585e-03,  8.51930212e-03,  1.23066241e-02, ...,
        -8.17059819e-03,  6.53973129e-03, -1.62330773e-02]], dtype=float32)>, <tf.Variable 'dense/bias:0' shape=(256,) dtype=float32, numpy=
array([-3.92505014e-03,  1.07309362e-03, -7.86275417e-03,  1.3498268