In [54]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque
import random
import matplotlib.pyplot as plt
from gym import spaces

In [65]:
# Neural network for NAF
class NAFNetwork(nn.Module):
    def __init__(self, state_size, action_size):
        super(NAFNetwork, self).__init__()
        self.fc1 = nn.Linear(state_size, 256)
        self.fc2 = nn.Linear(256, 256)

        self.fc_value = nn.Linear(256, 1)
        self.fc_mu = nn.Linear(256, action_size)
        self.fc_l = nn.Linear(256, action_size * action_size)

    def forward(self, state):
        x = torch.relu(self.fc1(state))
        x = torch.relu(self.fc2(x))

        value = self.fc_value(x)
        mu = torch.tanh(self.fc_mu(x))*torch.tensor([1, 0.167])
        l = self.fc_l(x)

        l_matrix = l.view(-1, action_size, action_size)
        l_matrix = torch.tril(l_matrix, -1) + torch.diag_embed(torch.exp(torch.diagonal(l_matrix, dim1=-2, dim2=-1)))
        p_matrix = torch.bmm(l_matrix, l_matrix.transpose(2, 1))

        return value, mu, p_matrix

    def q_value(self, state, action):
        value, mu, p_matrix = self.forward(state)
        action_diff = action - mu
        advantage = -0.5 * torch.bmm(action_diff.unsqueeze(1), torch.bmm(p_matrix, action_diff.unsqueeze(2))).squeeze(2)
        q_value = value + advantage
        return q_value

In [60]:
# Replay buffer
class ReplayBuffer:
    def __init__(self, size):
        self.memory = deque(maxlen=size)
    
    def add(self, experience):
        self.memory.append(experience)
    
    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)
    
    def __len__(self):
        return len(self.memory)

In [77]:
def train_model():
    if len(memory) < batch_size*10:
        return

    minibatch = memory.sample(batch_size)
    states = torch.FloatTensor([e[0] for e in minibatch])
    actions = torch.FloatTensor([e[1] for e in minibatch])
    rewards = torch.FloatTensor([e[2] for e in minibatch])
    next_states = torch.FloatTensor([e[3] for e in minibatch])
    dones = torch.FloatTensor([e[4] for e in minibatch])

    q_values = naf_network.q_value(states, actions)
    next_actions = target_naf_network(next_states)[1]
    next_q_values = target_naf_network.q_value(next_states, next_actions)
    target_q_values = rewards.unsqueeze(1) + (1 - dones).unsqueeze(1) * discount_factor * next_q_values

    loss = loss_fn(q_values, target_q_values)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    for target_param, param in zip(target_naf_network.parameters(), naf_network.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)

In [62]:
def get_action(state, epsilon):
    if np.random.rand() < epsilon:
        return env.action_space.sample()
    state = torch.FloatTensor(state).unsqueeze(0)
    _, mu, _ = naf_network(state)
    return mu.detach().numpy()[0]

In [74]:
class cstr_env(gym.Env):

    def __init__(self): 
        # Define action & observation space   
        self.action_space = spaces.Box(low = np.array([-1.0, -1.0], dtype=np.float32), 
                                       high = np.array([1.0 , 1.0], dtype=np.float32), 
                                       dtype=np.float32, shape=(2, ))   
        self.observation_space = spaces.Box(low=np.array([-1.0, -1.0], dtype=np.float32), 
                                            high=np.array([1.0, 1.0], dtype=np.float32), 
                                            dtype=np.float32, shape=(2, ))
        self.n_episode = 0 # current episode number.

        
    def is_done(self, x_next):
        done = False
        c1 = (abs(x_next[0] - self.setpoint_states[0]) < 0.01)
        c2 = (abs(x_next[1] - self.setpoint_states[1]) < 0.01)
        steady_state = c1 and c2  

        # Record the steady state status for the current step
        self.goal_state_done[self.ep_step] = steady_state
        
        # Check if there are at least 4 previous steps
        if self.ep_step > 3: 
            # Get the steady state status for the last three steps
            p3 = self.goal_state_done[self.ep_step-2] 
            p2 = self.goal_state_done[self.ep_step-1] 
            p1 = self.goal_state_done[self.ep_step-0] 
            # If the last three steps were steady states, set 'done' to True
            if  p3 and p2 and p1:
                done = True  
        return done 

    
    def get_dx(self, x, u):
        Q = torch.tensor([[9.35, 0.41], [0.41, 0.02]])
        R = torch.tensor([[1/500, 0], [0, 1/100]])
        P = torch.tensor([[9.35, 0.41], [0.41, 0.02]])
        
        params = [0.5734, 395.3268, 100e-3, 0.1, 72e+9, 8.314e+4, 8.314, 310, -4.78e+4, 0.239, 1000, 1]
        CAs, Ts, CF, CV, Ck0, CE, CR, CT0, CDh, Ccp, Crho, CA0s = params                   
        g1, g2 = CF/CV, 1/(Crho*Ccp*CV)
        x1, x2 = x[0], x[1]
        
        f1 = (CF/CV)*(-x1) - Ck0*np.exp(-CE/(CR*(x2+Ts))) * (x1+CAs)+(CF/CV) * (CA0s-CAs)
        f2 = (CF/CV)*(-x2) + (-CDh/(Crho*Ccp))*Ck0*np.exp(-CE/(CR*(x2+Ts)))*(x1+CAs) + CF*(CT0-Ts)/CV
        dx = [f1, f2] + u*[g1, g2]
        return dx

    
    def step(self, action):
        dt = 5e-3
        self.current_u = action
        state = self.current_s
        x_next = self.current_s + dt*self.get_dx(self.current_s, action)
        done = self.is_done(x_next) 
        reward = -np.sum((x_next - self.setpoint_states)**2)*0.01 + np.sum((self.current_u - self.setpoint_actions)**2)  
 
        # changing the previous state to the current state. 
        self.previous_u = self.current_u 
        # changing the current state to next state
        self.current_s = x_next 
        # increase the step by one 
        self.ep_step += 1  

        # this is the trancated condition. 
        trancated = False 
        if self.ep_step == episode_length:
            trancated = True

        if self.ep_step == episode_length-1 or done:      
            self.n_episode += 1 
        
        # if done is true i.e. terminated is equal to done. 
        terminated = done

        return x_next, reward, done


    def reset(self):

        self.ep_step = 0 
        self.current_u= None 
        self.previous_u = None 
        self.current_s = None 

        ## list of true false which stores the weather the state is near to the goal state or not. 
        self.goal_state_done = [False] * (episode_length+5)

        self.setpoint_states = None
        self.setpoint_actions = None 

        ## this function is set the setpoint for the current state and actions. 
        self.setpoint_states  =  np.array([.0, .0], dtype=float)     
        self.setpoint_actions =  np.array([.0, .0], dtype=float)

        # this is the fixed initial state. 
        state, action = np.array([0.2, -5]),  np.array([1.5, 0.1]) 

        self.current_u = action 
        self.previous_u = action  
        self.current_s = state  

        return state

In [76]:
# Hyperparameters
learning_rate = 1e-4
discount_factor = 0.99
batch_size = 256
tau = 0.001
epsilon_decay = 0.995
epsilon_min = 0.01
memory_size = 100000
num_episodes = 3000
episode_length = num_episodes

In [None]:
memory = ReplayBuffer(memory_size)
epsilon = 1.0
# Set up the environment
env = cstr_env()
state_size = env.observation_space.shape[0]
action_size = env.action_space.shape[0]

# Initialize networks and optimizer
naf_network = NAFNetwork(state_size, action_size)
target_naf_network = NAFNetwork(state_size, action_size)
target_naf_network.load_state_dict(naf_network.state_dict())
target_naf_network.eval()

optimizer = optim.Adam(naf_network.parameters(), lr=learning_rate)
loss_fn = nn.MSELoss()

for e in range(num_episodes):
    state = env.reset()
    total_reward = 0
    done = False
    step = 0

    while not done:
        step += 1
        action = get_action(state, epsilon)
        next_state, reward, done = env.step(action)
        total_reward += reward

        memory.add((state, action, reward, next_state, done))
        state = next_state
        train_model()
        if step == 500:
            break

    if epsilon > epsilon_min:
        epsilon *= epsilon_decay

    print(f"Episode: {e+1}/{num_episodes}, Reward: {total_reward}, done: {done}")
    print(f"state: {state}, action: {action}")

env.close()

Episode: 1/3000, Reward: -5239.6881169957305, done: False
state: [  0.27954507 -63.21722602], action: [ 0.22517358 -0.39738828]
Episode: 2/3000, Reward: -3907.569274965583, done: False
state: [  0.30741021 -59.14238107], action: [-0.66914266 -0.22523071]
Episode: 3/3000, Reward: -6812.965087777302, done: False
state: [  0.36885861 -66.80320907], action: [-0.97807145 -0.83533347]
Episode: 4/3000, Reward: -5554.499742730859, done: False
state: [  0.32082394 -63.98188787], action: [0.25814047 0.97887915]
Episode: 5/3000, Reward: -6613.8933134667195, done: False
state: [  0.26951161 -66.37712568], action: [-0.05953976 -0.9532626 ]
Episode: 6/3000, Reward: -6837.878454153776, done: False
state: [  0.32616873 -66.74862398], action: [-0.51179653 -0.86600846]
Episode: 7/3000, Reward: -5384.213796864692, done: False
state: [  0.36175728 -63.48980431], action: [0.23639834 0.94055015]
Episode: 8/3000, Reward: -7613.589143012114, done: False
state: [  0.37240414 -68.27154159], action: [-0.17893514