In [31]:
from tensorflow.keras.layers import Dense, Activation, Conv2D, MaxPooling2D
from tensorflow.keras.models import Sequential, load_model, Model
import tensorflow as tf
import numpy as np
import gym
import time
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook, tqdm

In [32]:
# Instantiate the Gym "CarRacing-v0" environment.
env = gym.make("CarRacing-v0")

In [33]:
class Storage:
    """Fixed-size, pre-allocated buffer for collected transitions.

    Transitions are appended with `store`, an episode boundary is marked with
    `conclude_episode`, and `get_episodes` hands out the buffers and rewinds
    the write position so the buffer can be refilled.
    """

    def __init__(self, observation_dimension, size):
        """Allocate zero-filled buffers for `size` transitions.

        Args:
            observation_dimension: Either the length of a flat observation
                (an int) or the full shape of a single observation given as a
                sequence of ints, e.g. ``(96, 96, 3)`` for an image.
            size: Maximum number of transitions the buffer can hold.
        """
        # A scalar keeps the historical (size, observation_dimension) layout;
        # a sequence is expanded so multi-dimensional observations fit too.
        if np.ndim(observation_dimension) == 0:
            observation_shape = (observation_dimension,)
        else:
            observation_shape = tuple(observation_dimension)

        self.observations = np.zeros((size, *observation_shape), dtype=np.float32)
        self.actions = np.zeros(size, dtype=np.int32)
        self.rewards = np.zeros(size, dtype=np.float32)
        self.episode_return = np.zeros(size, dtype=np.float32)
        self.baseline_estimates = np.zeros(size, dtype=np.float32)
        self.advantages = np.zeros(size, dtype=np.float32)

        # pointer_start: first slot of the episode currently being recorded.
        # pointer_end: next free slot.
        self.pointer_start, self.pointer_end = 0, 0

    def store(self, observation, action, reward, baseline_estimate):
        """Write one transition into the next free slot and advance the pointer.

        Raises:
            IndexError: If the buffer already holds `size` transitions.
        """
        slot = self.pointer_end
        capacity = self.rewards.shape[0]
        if slot >= capacity:
            raise IndexError(
                f"Storage is full ({capacity} transitions); "
                "call get_episodes() before storing more."
            )

        self.observations[slot] = observation
        self.actions[slot] = action
        self.rewards[slot] = reward
        self.baseline_estimates[slot] = baseline_estimate
        self.pointer_end = slot + 1

    def conclude_episode(self, last_value = 0):
        """Mark the end of the current episode.

        Only moves `pointer_start` up to `pointer_end`. `last_value` is
        accepted but not used yet.

        TODO: fill `episode_return` and `advantages` for the slots
        [pointer_start, pointer_end) of the finished episode (the intended
        helpers `discounted_reward_sum` / `get_advantage` are not defined in
        this notebook section). Until then both buffers stay all-zero.
        """
        self.pointer_start = self.pointer_end

    def get_episodes(self, actor):
        """Rewind the buffer and return everything it holds.

        The pointers are reset before the return tuple is built, so the next
        `store` call overwrites slot 0.

        Args:
            actor: Object exposing ``get_prob(actions, observations)``.

        Returns:
            Tuple ``(observations, actions, rewards, mean_episode_return,
            baseline_estimates, actor.get_prob(actions, observations),
            advantages)``.

        NOTE(review): the complete buffers are returned, including slots that
        were never written (still zero) - presumably callers only call this
        once the buffer is full; confirm.
        """
        self.pointer_start, self.pointer_end = 0, 0

        mean_return = np.mean(self.episode_return)
        action_probabilities = actor.get_prob(self.actions, self.observations)

        return (
            self.observations,
            self.actions,
            self.rewards,
            mean_return,
            self.baseline_estimates,
            action_probabilities,
            self.advantages,
        )
        

In [34]:
class Agent(Model):
    '''
    Actor network with two output heads: one tanh unit (steering mean) and
    two sigmoid units (gas / brake means).
    '''

    def __init__(self, action_space, action_stddev=0.02):
        '''
        Initialize layer architecture for Actor Network.

        Args:
        action_space(): stored on the instance; not otherwise used inside this class
        action_stddev(float): standard deviation of the Gaussian noise that
            sample_action adds around the predicted means
        '''
        super(Agent, self).__init__()
        self.action_space = action_space
        self.action_stddev = action_stddev

        # Shared trunk applied by `call`. It is still empty, so `call`
        # currently returns its input unchanged.
        # TODO: add the feature-extraction layers here (e.g. Conv2D / MaxPooling2D).
        self.l = []

        # tanh keeps the steering mean between -1 and 1
        self.steering_out = Dense(1, activation="tanh")
        # sigmoid keeps both gas / brake means between 0 and 1
        self.gas_break_out = Dense(2, activation="sigmoid")

    @tf.function
    def call(self, x, sample_logs=None, actions=None):
        '''
        Run the input through the trunk layers in `self.l`.

        Args:
        x(): input to the first trunk layer
        sample_logs(): unused; optional so that `self(x)` works
        actions(): unused; optional so that `self(x)` works

        Returns the output of the last trunk layer (or `x` itself while the
        trunk is empty).
        '''
        for layer in self.l:
            x = layer(x)
        return x

    @tf.function
    def sample_action(self, observation):
        '''
        Calls the actor network with the state of the agent and samples an action.

        Args:
        observation(): Representation of the actor's state. Same as x in the call function.

        Returns a tuple (action_mu, action): the predicted means
        [steering, gas, brake] concatenated on the last axis, and one sample
        drawn from a normal distribution around those means.
        '''
        features = self(observation)

        steering_mu = self.steering_out(features)
        gas_break_mu = self.gas_break_out(features)
        action_mu = tf.concat([steering_mu, gas_break_mu], axis=-1)

        # tf.random.normal expects the output *shape* as first argument; the
        # means are passed through `mean` and broadcast against it.
        action = tf.random.normal(
            tf.shape(action_mu), mean=action_mu, stddev=self.action_stddev
        )
        # NOTE(review): samples can fall slightly outside [-1, 1] / [0, 1];
        # clip before stepping the environment if it requires strict bounds.

        return action_mu, action

    def get_logs(self, states, actions):
        '''
        Picks, for every row i, the network output at column actions[i].

        Args:
        states(): batch of inputs passed to the network
        actions(): integer indices, one per row, with the same dtype as tf.range

        NOTE(review): this indexing needs integer (discrete) actions, whereas
        sample_action returns continuous values - confirm which is intended.
        '''
        # get the raw network output
        logits = self(states)
        # NOTE(review): tf.squeeze drops every size-1 axis, so a batch of one
        # would lose its batch axis as well - confirm the expected shapes.
        logits_flat = tf.squeeze(logits)

        # build [row, action] index pairs and gather the selected entries
        row_index = tf.range(len(logits_flat))
        index_2D = tf.stack([row_index, actions], axis=1)

        return tf.gather_nd(logits_flat, index_2D)
            

        

