# A2C, Advantage Actor Critic

We are using an Advantage Actor-Critic (A2C) which <br/>
- Uses **bootstrapped** estimates for its critic's targets, NOT Monte-Carlo estimates.
- Takes a batch of gradients for training; there is NO parallelism in sampling and no asynchronous sampling.
- Has a **separate network** for the Actor and the Critic. No weight sharing between the networks.
- Uses a simple **1-step return** as the Advantage (low variance, but biased). It does not use the n-step return (n = length of episode), where the critic is used only for estimating the baseline (unbiased, but high variance), nor GAE, an average of many n-step returns that trades off bias against variance.



## Step 1: Imports

In [1]:
import gym
import tensorflow as tf
from tensorflow import keras
import numpy as np
import datetime as dt
import math

## Step 2: Environment

In [2]:
# Discount factor applied to future rewards in returns and bootstrapped targets.
GAMMA = 0.99

# Gym environment; the network input/output sizes are read from its
# observation and action spaces.
env = gym.make("CartPole-v0")
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

print("state",state_size, "action", action_size)

# Root directory used for the TensorBoard logs and the saved model weights.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
STORE_PATH = '/Users/SV/Desktop/Lyra/CS285/Actor-Critic'
# Summary writer targeting a run directory suffixed with the current
# timestamp formatted as ddmmYYYYHHMM.
logger = tf.summary.create_file_writer(STORE_PATH + f"/AC-CartPole_{dt.datetime.now().strftime('%d%m%Y%H%M')}")


state 4 action 2


## Step 3: Network

In [3]:
def _build_mlp(output_units, output_activation=None):
    """Build a small fully connected network.

    The network has two hidden Dense layers of 30 ReLU units each, with
    He-normal kernel initialisation, followed by a Dense output layer of
    ``output_units`` units using ``output_activation`` (linear when None).
    """
    hidden_layers = [
        keras.layers.Dense(
            30,
            activation='relu',
            kernel_initializer=keras.initializers.he_normal(),
        )
        for _ in range(2)
    ]
    output_layer = keras.layers.Dense(output_units, activation=output_activation)
    return keras.Sequential(hidden_layers + [output_layer])


# Policy network: outputs a softmax distribution over the discrete actions.
actor = _build_mlp(action_size, output_activation='softmax')

# Value network: outputs a single linear value per input state.
# It is built separately from the actor, so no layers are shared.
critic = _build_mlp(1)



## Step 4: Sample Action

In [4]:
# Only a single state is passed in, so the mini-batch size is N = 1,
# and index 0 selects the output row for that state.

def sample_action(state):
    """Draw one action for ``state`` from the actor's output distribution.

    The state is reshaped into a batch of one row, passed through the
    ``actor`` network, and the first (only) row of the output is used as the
    probability vector for ``np.random.choice`` over ``action_size`` actions.

    Returns:
        The sampled action index.
    """
    single_row_batch = state.reshape((1, -1))
    action_probs = actor(single_row_batch).numpy()[0]
    return np.random.choice(action_size, p=action_probs)


## Step 5: Sample Episode

In [5]:
def get_rewards2go(raw_rewards, gamma=None):
    """Compute the discounted reward-to-go for every step of one episode.

    For step ``t`` the result is
    ``raw_rewards[t] + gamma * raw_rewards[t+1] + gamma**2 * raw_rewards[t+2] + ...``.

    Args:
        raw_rewards: Sequence of per-step rewards of a single episode, in
            chronological order.
        gamma: Discount factor. When None (the default) the module-level
            ``GAMMA`` constant is used.

    Returns:
        A list with the same length and order as ``raw_rewards`` holding the
        discounted return from each step onwards. Empty input gives ``[]``.
    """
    if gamma is None:
        gamma = GAMMA

    running_return = 0
    returns_backwards = []
    # Walk the episode from the last step to the first so each return can be
    # built from the one that follows it.
    for reward in reversed(raw_rewards):
        running_return = reward + gamma * running_return
        returns_backwards.append(running_return)

    # Restore chronological order exactly once, after the loop. Reversing
    # inside the loop would scramble the list on every iteration.
    returns_backwards.reverse()
    return returns_backwards

def sample_episode(states, next_states, rewards, rewards2go, actions):
    """Run one episode in ``env`` and append its transitions to the given lists.

    Actions are chosen with ``sample_action``. Every step appends one entry to
    ``states``, ``next_states`` and ``actions``. When the episode ends, the
    per-step rewards are appended to ``rewards`` and their discounted
    rewards-to-go (from ``get_rewards2go``) to ``rewards2go``.

    All five arguments are mutated in place; nothing is returned.
    """
    observation = env.reset()
    episode_rewards = []
    finished = False

    while not finished:
        chosen_action = sample_action(observation)
        successor, reward, finished, _ = env.step(chosen_action)

        states.append(observation)
        next_states.append(successor)
        actions.append(chosen_action)
        episode_rewards.append(reward)

        observation = successor

    # The episode is over: publish its rewards and the derived returns.
    rewards.extend(episode_rewards)
    rewards2go.extend(get_rewards2go(episode_rewards))

## Step 6: Training

In [None]:
# Minimum number of environment transitions to collect before each update.
sample_size = 15000
# Number of transitions in each gradient mini-batch.
batch_size = 5120
# Number of sample-then-update iterations of the outer training loop.
steps = 1000

# One Adam optimizer is kept for the actor across all training steps.
# `learning_rate` is the supported keyword; `lr` is only a deprecated alias
# that newer Keras releases no longer accept.
actor_optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

def averageGradients(grads, N):
    """Divide every entry of ``grads`` by ``N``, in place.

    Args:
        grads: Mutable sequence of gradients; each entry is replaced by
            ``entry / N``.
        N: Divisor applied to every entry.

    Returns:
        None. The caller's ``grads`` object is modified directly.
    """
    for position, gradient in enumerate(grads):
        grads[position] = gradient / N
        
def addGradients(grads, batch_grads):
    """Return the element-wise sum of two gradient sequences as a new list.

    Entries are paired with ``zip``, so the result is as long as the shorter
    input. Neither input is modified.
    """
    return [accumulated + fresh for accumulated, fresh in zip(grads, batch_grads)]

def generate_sample_batcher(states, next_states, rewards, rewards2go, actions):
    """Convert the sampled lists to NumPy arrays and return a batch slicer.

    Returns:
        A function ``sample_batcher(n)`` that returns the tuple
        ``(states, next_states, rewards, rewards2go, actions)`` restricted to
        rows ``n`` to ``n + batch_size`` (end exclusive). ``batch_size`` is
        the module-level setting, read when the slicer is called.
    """
    columns = tuple(
        np.array(column)
        for column in (states, next_states, rewards, rewards2go, actions)
    )

    # For plain Policy Gradients (currently disabled), a baseline would be
    # subtracted here:
    #     baseline = np.mean(rewards2go)
    #     rewards2go = rewards2go - baseline

    def sample_batcher(n):
        window = slice(n, n + batch_size)
        return tuple(column[window] for column in columns)

    return sample_batcher
    
# Outer training loop. Each iteration samples fresh episodes with the current
# actor, updates the critic on them, and then applies one actor update.
for step in range(steps):

    # Each iteration of this loop performs a single actor gradient step.

    # NumPy arrays have a fixed size, so growing them one element at a time
    # is awkward. The samples are therefore collected in plain Python lists
    # and converted to NumPy arrays once sampling is finished.
    rewards = []
    rewards2go = []
    states = []
    actions = []
    next_states = []

    # N counts the episodes sampled in this step. Whole episodes are added
    # until at least `sample_size` transitions have been collected.
    N = 0 
    while(len(states) < sample_size):
        sample_episode(states, next_states, rewards, rewards2go, actions)
        N += 1
        if(N%10 == 0):
            print("Sampling Episode - ", N)
    print("Sampled", N, "Episodes")

    # Mean undiscounted reward per episode (total reward / episode count).
    avg_reward = np.sum(rewards) / N    
    # Only full batches are used: trailing samples that do not fill a whole
    # batch of `batch_size` are never visited by the loops below.
    bat_per_epoch = math.floor(len(states) / batch_size)
    sample_batcher = generate_sample_batcher(states, next_states, rewards, rewards2go, actions)
    
    ###### Train Critic ######
    
    # A NEW optimizer is created for every step so that the optimizer state
    # accumulated while fitting the previous policy's values is discarded.
    # NOTE(review): `lr` is a deprecated alias of `learning_rate` and may be
    # rejected by newer Keras releases -- confirm against the installed version.
    critic_optimizer = tf.keras.optimizers.Adam(lr=0.001)
    
    # On the first step (step == 0) the critic is fitted for 300 epochs;
    # on every later step it takes a single gradient step.
    epoch = 300
    if step != 0:
        epoch = 1
        
    for j in range(epoch):
        
        # Gradients and loss are accumulated over all batches of this epoch
        # and applied once, at the end of the epoch.
        gradients = None
        loss = 0 
        for i in range(bat_per_epoch): 
            
            # Batches are taken sequentially, not randomly.
            n = i*batch_size
            batch_states, batch_next_states, batch_rewards, batch_rewards2go, batch_actions = sample_batcher(n)            
                         
            # On the first step the targets are the Monte-Carlo rewards-to-go;
            # on later steps they are bootstrapped 1-step targets
            # r + GAMMA * V(s'). The target is computed outside the tape and
            # converted to NumPy, so it is treated as a constant.
            # NOTE(review): no done flag is stored, so V(s') is also added for
            # the final transition of each episode -- confirm this is intended.
            y_true = batch_rewards2go
            if step != 0:
                y_true = batch_rewards + GAMMA * np.squeeze(critic(batch_next_states))
            
            with tf.GradientTape() as tape:

                # Squeeze removes the trailing dimension of size 1.
                # tf.squeeze is required here: np.squeeze would leave the
                # TensorFlow graph, and automatic differentiation would not work.
                y_pred = tf.squeeze(critic(batch_states))         
                batch_loss = tf.keras.losses.MSE(y_true=y_true, y_pred=y_pred)
                
            batch_gradients = tape.gradient(batch_loss, critic.trainable_variables)
            # Sum of the per-batch MSE values; only used for the log line below.
            loss += batch_loss
            
            # Sum the gradients over all batches.
            if gradients is None:
                gradients = batch_gradients
            else:
                gradients = addGradients(gradients, batch_gradients)
                
        critic_optimizer.apply_gradients(zip(gradients, critic.trainable_variables))  
        if j%10 == 0:
            print(f"Training critic for {j+1} epochs ", loss)

    ###### Train Actor ######
    
    # Gradients are summed over all batches and applied in a single update.
    gradients = None
    for i in range(bat_per_epoch):
        n = i*batch_size
        batch_states, batch_next_states, batch_rewards, batch_rewards2go, batch_actions = sample_batcher(n)
        
        with tf.GradientTape() as tape:
            predictions = actor(batch_states)
            # Per-sample cross-entropy between the taken action and the
            # actor's softmax output, i.e. -log pi(a|s).
            batch_loss = tf.keras.losses.sparse_categorical_crossentropy(y_true=batch_actions, y_pred=predictions, from_logits=False)

            # For Actor-Critic: weight each sample by its 1-step advantage
            # r + GAMMA * V(s') - V(s). The critic outputs go through
            # np.squeeze, so the advantage is a NumPy constant here and only
            # actor variables are differentiated below.
            # NOTE(review): as in the critic target, V(s') is used even for
            # the last transition of an episode -- confirm this is intended.
            advantage = batch_rewards + GAMMA * np.squeeze(critic(batch_next_states)) - np.squeeze(critic(batch_states))
            batch_loss = batch_loss * advantage

            # For Policy Gradients (disabled): weight by rewards-to-go instead.
#             batch_loss = batch_loss * batch_rewards2go
            
        # batch_loss is a per-sample vector.
        # NOTE(review): tape.gradient is expected to differentiate the sum of
        # its elements, giving an unaveraged gradient -- confirm against the
        # TensorFlow documentation.
        batch_gradients = tape.gradient(batch_loss, actor.trainable_variables)

        # Sum the gradients over all batches.
        if gradients is None:
            gradients = batch_gradients
        else:
            gradients = addGradients(gradients, batch_gradients)

    # For Policy Gradients (disabled): average the summed gradients over episodes.
#     averageGradients(gradients, N)
    actor_optimizer.apply_gradients(zip(gradients, actor.trainable_variables))     

    # Checkpoint both networks every 100 steps (including step 0); the file
    # names carry the current timestamp.
    if step % 100 == 0:
        print("Saving model, actor & critic @ timestep", step)
        actor.save_weights(STORE_PATH + f"/actor{dt.datetime.now().strftime('%d%m%Y%H%M')}")
        # For Actor-Critic
        critic.save_weights(STORE_PATH + f"/critic{dt.datetime.now().strftime('%d%m%Y%H%M')}")

    # NOTE(review): the message prints `step` twice.
    print(f"Step: {step}, AvgReward: {avg_reward}, step: {step}")
    # Record the average episode reward for TensorBoard.
    with logger.as_default():
            tf.summary.scalar('avgReward', avg_reward, step=step)



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Sampling Episode -  10
Sampling Episode -  20
Sampling Episode -  30
Sampling Episode -  40
Sampling Episode -  50
Sampling Episode -  60
Sampling Episode -  70
Sampling Episode -  80
Sampling Episode -  90
Sampling Episode -  100
Sampling Episode -  110
Sampling Episode -  120
Sampling Episode -  130
Sampling Episode -  140
Sampling Episode -  150
Sampling Episode -  160
Sampling Episode -  170
Sampling Episode -  180
Sampling Episode -  190
Sampling Episode -  200
Sampling Episode -  210
Sampling Episode -  220
Sampling Episode -  230
Sampling Episode -  240
Sampling Episode -  250
Sampling Episode -  260
Sampling Episode -  270
Sampling Episode -  280
Sampling Episode -  290
Sampling Epi

Sampling Episode -  910
Sampling Episode -  920
Sampling Episode -  930
Sampling Episode -  940
Sampling Episode -  950
Sampled 955 Episodes
Training critic for 1 epochs  tf.Tensor(7.260952, shape=(), dtype=float32)
Step: 1, AvgReward: 15.713089005235602, step: 1
Sampling Episode -  10
Sampling Episode -  20
Sampling Episode -  30
Sampling Episode -  40
Sampling Episode -  50
Sampling Episode -  60
Sampling Episode -  70
Sampling Episode -  80
Sampling Episode -  90
Sampling Episode -  100
Sampling Episode -  110
Sampling Episode -  120
Sampling Episode -  130
Sampling Episode -  140
Sampling Episode -  150
Sampling Episode -  160
Sampling Episode -  170
Sampling Episode -  180
Sampling Episode -  190
Sampling Episode -  200
Sampling Episode -  210
Sampling Episode -  220
Sampling Episode -  230
Sampling Episode -  240
Sampling Episode -  250
Sampling Episode -  260
Sampling Episode -  270
Sampling Episode -  280
Sampling Episode -  290
Sampling Episode -  300
Sampling Episode -  310
S

Sampling Episode -  260
Sampling Episode -  270
Sampling Episode -  280
Sampling Episode -  290
Sampling Episode -  300
Sampling Episode -  310
Sampling Episode -  320
Sampling Episode -  330
Sampling Episode -  340
Sampling Episode -  350
Sampling Episode -  360
Sampling Episode -  370
Sampling Episode -  380
Sampling Episode -  390
Sampling Episode -  400
Sampling Episode -  410
Sampling Episode -  420
Sampling Episode -  430
Sampling Episode -  440
Sampling Episode -  450
Sampling Episode -  460
Sampling Episode -  470
Sampling Episode -  480
Sampling Episode -  490
Sampling Episode -  500
Sampling Episode -  510
Sampling Episode -  520
Sampling Episode -  530
Sampling Episode -  540
Sampling Episode -  550
Sampling Episode -  560
Sampling Episode -  570
Sampling Episode -  580
Sampling Episode -  590
Sampling Episode -  600
Sampling Episode -  610
Sampling Episode -  620
Sampling Episode -  630
Sampling Episode -  640
Sampling Episode -  650
Sampling Episode -  660
Sampling Episode

Sampling Episode -  580
Sampling Episode -  590
Sampling Episode -  600
Sampling Episode -  610
Sampling Episode -  620
Sampling Episode -  630
Sampling Episode -  640
Sampling Episode -  650
Sampling Episode -  660
Sampling Episode -  670
Sampling Episode -  680
Sampling Episode -  690
Sampling Episode -  700
Sampling Episode -  710
Sampling Episode -  720
Sampling Episode -  730
Sampling Episode -  740
Sampling Episode -  750
Sampling Episode -  760
Sampling Episode -  770
Sampling Episode -  780
Sampling Episode -  790
Sampling Episode -  800
Sampling Episode -  810
Sampling Episode -  820
Sampling Episode -  830
Sampling Episode -  840
Sampling Episode -  850
Sampling Episode -  860
Sampling Episode -  870
Sampling Episode -  880
Sampling Episode -  890
Sampling Episode -  900
Sampling Episode -  910
Sampling Episode -  920
Sampling Episode -  930
Sampling Episode -  940
Sampling Episode -  950
Sampling Episode -  960
Sampling Episode -  970
Sampling Episode -  980
Sampling Episode

Sampling Episode -  810
Sampling Episode -  820
Sampling Episode -  830
Sampling Episode -  840
Sampling Episode -  850
Sampling Episode -  860
Sampling Episode -  870
Sampling Episode -  880
Sampling Episode -  890
Sampling Episode -  900
Sampling Episode -  910
Sampling Episode -  920
Sampling Episode -  930
Sampling Episode -  940
Sampling Episode -  950
Sampling Episode -  960
Sampling Episode -  970
Sampling Episode -  980
Sampling Episode -  990
Sampling Episode -  1000
Sampling Episode -  1010
Sampling Episode -  1020
Sampling Episode -  1030
Sampling Episode -  1040
Sampling Episode -  1050
Sampling Episode -  1060
Sampled 1068 Episodes
Training critic for 1 epochs  tf.Tensor(9.7735405, shape=(), dtype=float32)
Step: 11, AvgReward: 14.051498127340825, step: 11
Sampling Episode -  10
Sampling Episode -  20
Sampling Episode -  30
Sampling Episode -  40
Sampling Episode -  50
Sampling Episode -  60
Sampling Episode -  70
Sampling Episode -  80
Sampling Episode -  90
Sampling Episo