# <font color='deeppink'><b> Actor-Critic in Reinforcement Learning </b></font>

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions.categorical import Categorical
import random
import time 
import numpy as np
import circuit.ngspice as ng
import torchplot as tp
from itertools import count
from collections import namedtuple
from circuit.fcas import FoldedCascodeRLEnvDiscrete
import math
from torch.optim.lr_scheduler import StepLR

# Plots 
import matplotlib.pyplot as plt

from datetime import datetime
import logging
# Setting a fixed seed for reproducible results
np.random.seed(seed=12)
torch.manual_seed(seed=12)


#import pygame
#from pygame import gfxdraw


# Start every run with a fresh log file whose first line is the start time.
# The trailing newline keeps the first logging record (the file handler set up
# below appends to the file) from being glued onto the timestamp line.
log_file = 'fcas.log'
with open(log_file, "w") as f:
    f.write(str(datetime.now().time()) + "\n")

# Send every record of level DEBUG and above to the same file.
logging.basicConfig(filename=log_file, level=logging.DEBUG)




In [None]:
# Create the environment: a single FoldedCascodeRLEnvDiscrete instance
# (imported from circuit.fcas) that is shared by every cell below.
env = FoldedCascodeRLEnvDiscrete() 

In [None]:
# Per-step record kept for the update: the log-probability of the sampled
# action and the critic's value estimate for the state it was sampled in.
SavedAction = namedtuple('SavedAction', 'log_prob value')

In [None]:
# Quick sanity check: number of discrete actions, then the shape of the
# observation returned by a reset (note that this call resets the environment).
print(f"There are {env.action_size} actions")
initial_observation = env.reset()
print(initial_observation.shape)

# <font color='orangered'><b> Model Network </b></font>

In [None]:
# Network: Actor-Critic
#
# Two shared fully connected layers (30 -> 1024 -> 512 with the default sizes)
# feed two heads: the actor (62 action probabilities by default) and the
# critic (a single state value).


class ActorCritic(nn.Module):
    """Actor-critic network with a shared trunk and two linear heads.

    Args:
        n_inputs: Size of the observation vector fed to the first layer.
        n_actions: Number of discrete actions produced by the actor head.
        hidden1: Width of the first shared layer.
        hidden2: Width of the second shared layer.

    Attributes:
        saved_actions: Per-step records appended by the caller during an
            episode (starts empty).
        rewards: Per-step rewards appended by the caller during an episode
            (starts empty).
    """

    def __init__(self, n_inputs=30, n_actions=62, hidden1=1024, hidden2=512):
        super().__init__()
        # Layers are created in the order trunk -> actor -> critic so that a
        # seeded run draws the same initial weights as before.
        self.common1 = nn.Linear(n_inputs, hidden1)
        self.common2 = nn.Linear(hidden1, hidden2)

        self.actor = nn.Linear(hidden2, n_actions)
        self.critic = nn.Linear(hidden2, 1)  # The critic always emits one value

        # Episode buffers, filled and cleared by the training code.
        self.saved_actions = []
        self.rewards = []

    def forward(self, x):
        """Return ``(action_prob, state_values)`` for observation(s) ``x``.

        Args:
            x: Float tensor whose last dimension equals ``n_inputs``.

        Returns:
            A tuple of the action probabilities (softmax over the last
            dimension) and the critic output (last dimension of size 1).
        """
        x = F.leaky_relu(self.common1(x))
        x = F.leaky_relu(self.common2(x))

        # NOTE(review): the sigmoid bounds every actor score to (0, 1) before
        # the softmax, so no action can be more than e (~2.72) times as likely
        # as another. Kept as-is; confirm whether this flattening is intended.
        action_scores = torch.sigmoid(self.actor(x))
        action_prob = F.softmax(action_scores, dim=-1)

        state_values = self.critic(x)

        return action_prob, state_values
       

# <font color='indianred'><b> New Model Network </b></font>

In [None]:
class ActorCritic_new(nn.Module):
    """Actor-critic network with fully separate actor and critic stacks.

    Unlike a shared-trunk design, the critic and the actor here each own their
    layers. The defaults reproduce the previously hard-coded sizes
    (23 inputs, 60 actions).

    Args:
        n_inputs: Size of the observation vector fed to both stacks.
        n_actions: Number of discrete actions produced by the actor.
        hidden1: Width of the first hidden layer of each stack.
        hidden2: Width of the second hidden layer of each stack.

    Attributes:
        saved_actions: Per-step records appended by the caller (starts empty).
        rewards: Per-step rewards appended by the caller (starts empty).
    """

    def __init__(self, n_inputs=23, n_actions=60, hidden1=1024, hidden2=512):
        super().__init__()

        # Episode buffers, filled and cleared by the training code.
        self.saved_actions = []
        self.rewards = []

        # The critic is built before the actor so that a seeded run draws the
        # same initial weights as before.
        self.critic = nn.Sequential(
            nn.Linear(n_inputs, hidden1),
            nn.ELU(),
            nn.Linear(hidden1, hidden2),
            nn.ELU(),
            nn.Linear(hidden2, 1),
        )

        self.actor = nn.Sequential(
            nn.Linear(n_inputs, hidden1),
            nn.ELU(),
            nn.Linear(hidden1, hidden2),
            nn.Sigmoid(),
            nn.Linear(hidden2, n_actions),
            nn.Softmax(dim=-1),
        )

    def forward(self, x):
        """Return ``(action_probs, state_values)`` for observation(s) ``x``.

        Args:
            x: Float tensor whose last dimension equals ``n_inputs``.

        Returns:
            A tuple of the actor's probabilities (softmax over the last
            dimension) and the critic output (last dimension of size 1).
        """
        state_values = self.critic(x)
        action_probs = self.actor(x)

        return action_probs, state_values


In [None]:
# Problem dimensions as reported by the environment
action_space_size = env.action_size
state_space_size = env.obs.shape[0]

# Module-level histories used for plotting
episode_list, reward_list, rwrd_list = [], [], []
loss_list, policy_loss_list, value_loss_list = [], [], []


# <font color='violet'><b> Specifications </b></font>

In [None]:
# Step size handed to the optimizer
learning_rate = 1e-7

# The model is built first so the optimizer can take its parameters
model = ActorCritic()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

# Adaptive learning rate: multiply the rate by 0.2 every 100 scheduler steps
scheduler = StepLR(optimizer, step_size=100, gamma=0.2)

# float32 machine epsilon as a plain Python float; added to denominators to
# stabilize division operations
eps = float(np.finfo(np.float32).eps)

# <font color='crimson'><b> Action Selection </b></font>

In [None]:
def select_action(state):
    """Sample an action from the current policy for ``state``.

    The observation is converted to a float tensor and passed through the
    module-level ``model``. An action index is then drawn from the categorical
    distribution defined by the returned probabilities. The log-probability of
    the draw and the critic's value estimate are appended to
    ``model.saved_actions`` so the update step can use them later.

    Args:
        state: NumPy array holding the observation (it is handed to
            ``torch.from_numpy``).

    Returns:
        The sampled action index as a Python number (``Tensor.item()``).
    """
    state = torch.from_numpy(state).float()

    # Call the module itself instead of ``.forward`` so that any registered
    # hooks are executed as well.
    probs, state_value = model(state)

    # Lazy %-style arguments produce the same messages as string concatenation
    # but defer the formatting to the logging machinery.
    logging.info('state:%s', state)
    logging.info('state_value:%s', state_value)
    logging.info('probs:%s', probs)

    m = Categorical(probs)
    logging.info('%s', m)

    action = m.sample()
    logging.info('action:%s', action)

    # Keep what the loss computation needs for this step.
    model.saved_actions.append(SavedAction(m.log_prob(action), state_value))

    return action.item()


# <font color='#FF2400'><b> Finish Episode condition</b></font>

In [None]:
from importlib.metadata import entry_points


def finish_episode():
    """Run one actor-critic update from the episode stored on ``model``.

    Reads ``model.rewards`` and ``model.saved_actions``, builds the discounted
    returns, computes the policy (actor) and value (critic) losses, and then
    performs one optimizer step followed by one scheduler step. The summed
    policy loss, value loss and total loss of the episode are appended (as
    detached tensors) to ``policy_loss_list``, ``value_loss_list`` and
    ``loss_list``.

    The episode buffers on ``model`` are NOT cleared here; the caller is
    responsible for that.

    Uses the module-level ``model``, ``optimizer``, ``scheduler`` and ``eps``.
    If no action was recorded the function returns without updating anything.
    """
    gamma = 0.99  # Discount factor applied to future rewards
    saved_actions = model.saved_actions

    logging.info("len(model.rewards)): " + str(len(model.rewards)))
    logging.info(model.rewards)

    # Without any recorded step there is nothing to learn from, and
    # torch.stack would fail on the empty loss lists below.
    if not saved_actions:
        return

    # Discounted return at every time step: the reward received there plus
    # the gamma-discounted return of everything that followed. Walking the
    # rewards backwards and reversing once avoids repeated front insertions.
    # These returns are the regression targets for the critic.
    R = 0
    returns = []
    for r in reversed(model.rewards):
        R = r + gamma * R
        returns.append(R)
    returns.reverse()

    if len(returns) <= 1:
        # A standard deviation is not meaningful for fewer than two samples.
        returns = torch.tensor([0.0])
    else:
        # Normalize. float32 matches the network outputs (observations are
        # cast with .float() before the forward pass) whatever numeric type
        # the environment uses for rewards.
        returns = torch.tensor(returns, dtype=torch.float32)
        returns = (returns - returns.mean()) / (returns.std() + eps)

    logging.info("len(saved_actions)): " + str(len(saved_actions)))
    logging.info("len(returns)): " + str(len(returns)))

    policy_losses = []
    value_losses = []
    # Here R is the (normalized) observed return and `value` is the return
    # the critic predicted for the same step.
    for (log_prob, value), R in zip(saved_actions, returns):
        # The advantage measures how much better the outcome was than the
        # critic expected. `.item()` keeps the critic out of the actor's
        # gradient path.
        advantage = R - value.item()

        # Actor loss: make actions with a positive advantage more likely.
        policy_losses.append(-log_prob * advantage)

        # Critic loss: pull the estimate towards the observed return.
        value_losses.append(F.smooth_l1_loss(value, R.unsqueeze(0)))

        logging.info("log_prob: " + str(log_prob))
        logging.info("R: " + str(R))
        logging.info("value: " + str(value))

    logging.info(value_losses)
    logging.info(policy_losses)

    # PyTorch accumulates gradients, so reset them before backpropagation.
    optimizer.zero_grad()

    policy_loss = torch.stack(policy_losses).sum()
    value_loss = torch.stack(value_losses).sum()
    loss = policy_loss + value_loss

    # One detached entry per episode, so the histories line up with the
    # episode index and do not keep the autograd graph alive.
    policy_loss_list.append(policy_loss.detach())
    value_loss_list.append(value_loss.detach())
    loss_list.append(loss.detach())

    loss.backward()

    # Gradient clipping
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=2)

    optimizer.step()
    scheduler.step()


# <font color='darkorange'><b> Plots </b></font>

In [None]:
# Pretty Colors for plots: named colors, one per plotted interval

Colors = [
    'mediumaquamarine',
    'blueviolet',
    'lightcoral',
    'darkorange',
    'royalblue',
    'crimson',
    'dimgray',
    'navy',
    'violet',
    'teal',
    'hotpink',
    'peru',
]


In [None]:
def plot_reward(episode_list, reward_list, intervals=None):
    """Plot the reward per episode, one colored segment per interval.

    Args:
        episode_list: Episode indices (x values).
        reward_list: Rewards, aligned with ``episode_list`` (y values).
        intervals: Optional sequence of ``[start, stop]`` index pairs. Each
            pair is clamped to the available range and drawn in its own
            color. ``None`` draws everything as a single segment.
    """
    n_ep = len(episode_list)

    if intervals is None:
        intervals = [[0, n_ep]]

    for i, (lo, hi) in enumerate(intervals):
        start, stop = max(0, lo), min(hi, n_ep)
        # Cycle through the palette so that more intervals than colors do not
        # raise an IndexError.
        plt.plot(episode_list[start:stop], reward_list[start:stop],
                 color=Colors[i % len(Colors)])
        print(lo, hi)

    plt.xlabel('x - # Episode')
    plt.ylabel('y - Reward')
    plt.title('Reward over episodes')
    plt.grid()
    plt.show()

In [None]:
def plot_losses(episode_list, loss_list):
    """Plot the total loss against the episode index with torchplot."""
    tp.plot(episode_list, loss_list)

    annotations = (
        (tp.xlabel, 'x - # Episode'),
        (tp.ylabel, 'y - Loss'),
        (tp.title, 'Total Losses over episodes'),
    )
    for annotate, text in annotations:
        annotate(text)

    tp.grid()
    tp.show()

In [None]:
def plot_policy_losses(episode_list, policy_loss_list):
    """Plot the policy (actor) loss against the episode index with torchplot."""
    tp.plot(episode_list, policy_loss_list)

    annotations = (
        (tp.xlabel, 'x - # Episode'),
        (tp.ylabel, 'y - Loss'),
        (tp.title, 'Policy Losses over episodes'),
    )
    for annotate, text in annotations:
        annotate(text)

    tp.grid()
    tp.show()

In [None]:
def plot_value_losses(episode_list, value_loss_list):
    """Plot the value (critic) loss against the episode index with torchplot."""
    tp.plot(episode_list, value_loss_list)

    annotations = (
        (tp.xlabel, 'x - # Episode'),
        (tp.ylabel, 'y - Loss'),
        (tp.title, 'Value Losses over episodes'),
    )
    for annotate, text in annotations:
        annotate(text)

    tp.grid()
    tp.show()

In [None]:
# def plot_last_reward(episode_list)

# <font color='gold'><b> Training </b></font>

In [None]:
def train(max_episodes=None, t_max=50):
    """Train the module-level ``model`` on ``env`` until the task is solved.

    Every episode resets the environment to ``env.values_init``, samples
    actions with ``select_action`` and records the rewards on ``model``.
    It then calls ``finish_episode`` for the update and clears the episode
    buffers. Training stops as soon as the exponentially smoothed last-step
    reward becomes non-negative, or after ``max_episodes`` episodes.

    Args:
        max_episodes: Upper bound on the number of episodes. ``None`` (the
            default) keeps training until the stop condition is met.
        t_max: Step budget bound; an episode takes at most ``t_max - 1``
            steps (the step counter runs over ``range(1, t_max)``).

    Raises:
        ValueError: If ``t_max`` is smaller than 2, which would leave an
            episode without a single step.
    """
    if t_max < 2:
        raise ValueError("t_max must be at least 2 so every episode takes a step")

    running_reward = -1

    # NOTE: these local lists shadow the module-level lists of the same name;
    # they are only useful to the plotting calls sketched at the end.
    episode_list = []
    reward_list = []

    episodes = count() if max_episodes is None else range(max_episodes)

    for i_episode in episodes:
        state = env.reset(env.values_init)
        ep_reward = 0

        for t in range(1, t_max):
            action = select_action(state)                   # Action is selected
            state, reward, done, log = env.step(action)     # State is updated accordingly

            model.rewards.append(reward)

            # TODO (idea from the original notes): undo the step when the
            # reward does not improve.

            # Only the reward of the most recent step is tracked; it is
            # reported as "Last Reward" below.
            ep_reward = reward

            if done:
                print(t, end=', \n')
                break

        # Update running reward to check condition for solving
        running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward

        # Additional lists for plotting
        episode_list.append(i_episode)
        rwrd_list.append(reward)
        reward_list.append(ep_reward)

        finish_episode()

        # finish_episode leaves the buffers untouched, so clear them here
        # before the next episode starts.
        del model.rewards[:]
        del model.saved_actions[:]

        print("------------------------Episode {}\tLast Reward: {:.3f}\tRunning reward: {:.3f}---------------------------------------------\n".format(
            i_episode, ep_reward, running_reward
        ))

        if running_reward >= 0:
            print("Solved, running reward is now {} and the last episode runs to {} time steps".format(
                    running_reward, t
            ))
            print("log: ", log)

            break

    # plot_reward(episode_list, reward_list, [[0,100],[99,300],[299,500],[499,1000],[999,1500],[1499,3000],[2999,4000],[3999,5000],[4999,6000]])
    # plot_losses(episode_list, loss_list)
    # plot_policy_losses(episode_list, policy_loss_list)
    # plot_value_losses(episode_list, value_loss_list)
   

In [None]:
# Start training; returns once the stop condition inside train() is reached.
train()

In [None]:
# Invoke the environment's (private) simulation hook, then show
# `env.measures` as the cell output.
# NOTE(review): presumably the simulation refreshes `measures` - confirm in
# circuit.fcas.
env._run_simulation()
env.measures

In [None]:
# Training is finished. Initialise the flag and the move counter used by the
# inspection cells that follow.
done, cnt = False, 0

In [None]:
# Check the current measurements against the environment's target. The call
# returns three values, unpacked here as (next_performance, done, log); note
# that this rebinds the module-level `done`.
next_performance, done, log = env.target.verify(env.measures)


In [None]:
# Print the three values obtained from the verification, then show the
# measurements as the cell output.
print("next performance", next_performance)
print("done", done)
print("log", log)
env.measures

In [None]:
# Render the environment once. The commented-out lines around the call sketch
# a rollout loop (reset, select an action, step, count the moves) that is
# currently disabled.
# observation = env.reset()
# while True:
# cnt += 1
env.render()
# action = select_action(observation)
# observation, reward, done, _ = env.step(action)
# Lets see how long it lasts until failing
# print(f"Game lasted {cnt} moves")