### HW 3-2. DDPG for Traffic Light Control

Please write down the codes for DDOG algorithm on this file. Also, add your comment with the result here,

In [1]:
### DDPG ###
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import namedtuple, deque

class ReplayMemory:
    
    def __init__(self,length):
        self.memory=deque(maxlen=length)
        
    def push(self, transition):
        self.memory.append(transition)
        
    def sample(self, size):
        sample= random.sample(self.memory, size)
        
        before_state=[i[0] for i in sample]
        action=[i[1] for i in sample]
        reward=[i[2] for i in sample]
        state=[i[3] for i in sample]
        done=[i[4] for i in sample]
        
        before_state=np.stack(before_state)
        before_state=torch.tensor(before_state).squeeze()
        ## print(before_state.shape)
        
        action=np.array(action)
        action=torch.tensor(action, dtype=torch.float32).squeeze(-1)
        
        ##print(action.shape)
        state=np.stack(state)
        state=torch.tensor(state).squeeze()
        
        reward=np.array(reward)
        reward=torch.tensor(reward,dtype=torch.float32).reshape(-1,1)
        
        done = np.array(done).astype(int)
        done=torch.tensor(done).reshape(-1,1)
        
        return before_state, action, reward, state, done
    
    def __len__(self):
        return len(self.memory)

class Actor(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dim=[64,64]):
        super(Actor,self).__init__()
        self.layers=nn.ModuleList()
        self.activations=nn.ModuleList()
        
        input_dims=[state_dim]+hidden_dim
        output_dims=hidden_dim + [action_dim]
        
        for in_dim, out_dim in zip(input_dims, output_dims):
            self.layers.append(nn.Linear(in_dim, out_dim))
            
        for i in range(len(hidden_dim)):
            self.activations.append(nn.LeakyReLU())
            
        self.activations.append(nn.Tanh())
        
    def forward(self, state):
        x=state
        for l, activation in zip(self.layers, self.activations):
            x=l(x)
            x=activation(x)
        return x
    
    
class Critic(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dim=64, output_dim=1):
        super(Critic, self).__init__()
        self.l1=nn.Linear(state_dim, hidden_dim)
        self.l2=nn.Linear(hidden_dim + action_dim, hidden_dim)
        self.l3=nn.Linear(hidden_dim, output_dim)
        
    def forward(self, state, action):
        x=F.relu(self.l1(state))
        x=F.relu(self.l2(torch.cat([x,action],dim=-1)))
        x=self.l3(x)
        
        return x
    
class OUNoise:
    
    def __init__(self, action_space, mu=0.0, theta=0.15, max_sigma=0.3, min_sigma=0.3, decay_period=1000):
        self.mu=mu
        self.theta=theta
        self.sigma=max_sigma
        self.max_sigma=max_sigma
        self.min_sigma=min_sigma
        self.decay_period=decay_period
        self.action_dim=action_space
        self.reset()
        
        self.epsilon=1.0
        self.epsilon_decay=0.00001
        self.epsilon_min=0.05
        
    def reset(self):
        self.state=np.ones(self.action_dim)*self.mu
        
    def evolve_state(self):
        x=self.state
        dx=self.theta*(self.mu-x)+self.sigma*np.random.randn(self.action_dim)
        self.state=x+dx
        return self.state
    
    def get_action(self, action,t=0):
        ou_state=self.evolve_state()*self.epsilon
        
        self.epsilon-=self.epsilon_decay
        if self.epsilon<self.epsilon_min:
            self.epsilon=self.epsilon_min
            
        self.sigma=self.max_sigma-(self.max_sigma-self.min_sigma) * min(1.0, t/ self.decay_period)
        return np.clip(action+ou_state,-1.0,1.0)

class DDPGAgent(nn.Module):
    def __init__(self, state_dim, action_dim, action_min, action_max, gamma=0.99):
        super(DDPGAgent,self).__init__()
        self.action_min=np.array(action_min)
        self.action_max=np.array(action_max)
        
        self.gamma=gamma
        self.ou_noise=OUNoise(action_dim)
        
        self.actor=Actor(state_dim, action_dim)
        self.critic=Critic(state_dim,action_dim)
        
        self.actor_target=Actor(state_dim,action_dim)
        self.critic_target=Critic(state_dim, action_dim)
        
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())
        
        self.actor_optimizer=torch.optim.Adam(self.actor.parameters(), lr=1e-4)
        self.critic_optimizer=torch.optim.Adam(self.critic.parameters(), lr=1e-3)
        
        self.memory=ReplayMemory(5000)
        self.batch_size=50
        
        self.num_fit=0
        
        self.loss_ftn=nn.MSELoss()
        
    def train_start(self):
        
        if len(self.memory) > self.batch_size:
            return True
        else:
            return False
        
    def forward(self, state, t=0):
        action_before_norm=self.actor(state).detach().numpy()
        action_before_norm_with_noise = self.ou_noise.get_action(action_before_norm, t)
        action_after_norm=(action_before_norm_with_noise + 1) / 2 * (self.action_max-self.action_min)+self.action_min
        
        return action_before_norm, action_after_norm
    
    def save_memory(self, transition):
        self.memory.push(transition)
        
    def train(self):
        
        state, action, reward, next_state, done= self.memory.sample(self.batch_size)
        
        q=self.critic(state,action)
        
        next_q_val=self.critic_target(next_state,self.actor_target(next_state))
        target_q=reward+self.gamma*(1-done)*next_q_val
        
        critic_loss=self.loss_ftn(q,target_q)
        
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()
        
        actor_loss=-self.critic(state,self.actor(state))
        actor_loss=actor_loss.mean()
        
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        
        if self.num_fit % 100==0:
            self.critic_target.load_state_dict(self.critic.state_dict())
            self.actor_target.load_state_dict(self.actor.state_dict())
            
        return critic_loss.item(), actor_loss.item()

In [2]:
### Training (Do not revise this code)###
from SUMO.TrafficEnv_DDPG import TrafficEnv

import torch
import random
import numpy as np
import matplotlib.pyplot as plt

def main_DDPG() :

    exp_dir = './SUMO/Single_Intersection'
    exp_type = 'binary'

    max_episode = 300
    max_epi_step = 800

    state_dim = 12
    action_dim = 1

    action_min = 5
    action_max = 20

    env = TrafficEnv(exp_dir, exp_type)
    agent = DDPGAgent(state_dim, action_dim, action_min, action_max)

    actor_loss_list = []
    critic_loss_list = []
    reward_list = []

    for episode in range(max_episode):

        state = env.reset()
        state = torch.tensor(state, dtype=torch.float32).reshape(-1, state_dim)

        actor_loss_epi = []
        critic_loss_epi = []
        reward_epi = []
        timing_data_epi = []
        action = None
        step = 0
        current_phase = 0

        for epi_step in range(max_epi_step):

            # make an action based on epsilon greedy action
            before_action = action

            action, action_norm = agent.forward(state, step)

            before_state = state

            state, reward, done = env.step(action_norm)

            state = torch.tensor(state, dtype=torch.float32).reshape(-1, state_dim)
            reward_epi.append(reward)

            # make a transition and save to replay memory
            transition = [before_state, action, reward, state, done]
            agent.save_memory(transition)

            if agent.train_start():
                critic_loss, actor_loss = agent.train()

                critic_loss_epi.append(critic_loss)
                actor_loss_epi.append(actor_loss)

            if done:
                if agent.train_start():
                    critic_mean = sum(critic_loss_epi) / len(critic_loss_epi)
                    actor_mean = sum(actor_loss_epi) / len(actor_loss_epi)
                    
                    critic_loss_list.append(critic_mean)
                    actor_loss_list.append(actor_mean)
                break

            step += 1

        reward_list.append(sum(reward_epi))  
    
        env.close()

        print(episode+1, reward_list[-1])

    return critic_loss_list, actor_loss_list, reward_list

if __name__ == "__main__":

    critic_loss_list, actor_loss_list, reward_list = main_DDPG()

    plt.plot(critic_loss_list)
    plt.title("Critic Loss of DDPG Agent")
    plt.savefig("./Visualization/DDPG/critic_loss.png")
    plt.close('all')
    
    plt.plot(actor_loss_list)
    plt.title("Actor Loss of DDPG Agent")
    plt.savefig("./Visualization/DDPG/actor_loss.png")
    plt.close('all')

    plt.plot(reward_list)
    plt.title("Reward of DDPG Agent")
    plt.savefig("./Visualization/DDPG/reward.png")
    plt.close('all')


1 -112
2 -168
3 -278
4 -703
5 -564
6 -118
7 -93
8 -111
9 -110
10 -124
11 -107
12 -93
13 -96
14 -116
15 -119
16 -97
17 -90
18 -98
19 -115
20 -111
21 -107
22 -114
23 -99
24 -103
25 -110
26 -97
27 -129
28 -96
29 -126
30 -120
31 -102
32 -109
33 -90
34 -100
35 -92
36 -100
37 -110
38 -105
39 -113
40 -93
41 -109
42 -107
43 -111
44 -97
45 -92
46 -112
47 -92
48 -152
49 -99
50 -129
51 -121
52 -108
53 -114
54 -92
55 -103
56 -110
57 -195
58 -104
59 -99
60 -95
61 -110
62 -111
63 -89
64 -95
65 -90
66 -115
67 -117
68 -92
69 -118
70 -113
71 -98
72 -103
73 -105
74 -106
75 -163
76 -104
77 -94
78 -95
79 -95
80 -94
81 -100
82 -114
83 -121
84 -110
85 -89
86 -92
87 -112
88 -106
89 -118
90 -93
91 -108
92 -94
93 -108
94 -93
95 -92
96 -87
97 -103
98 -107
99 -124
100 -114
101 -110
102 -111
103 -109
104 -99
105 -96
106 -103
107 -93
108 -107
109 -100
110 -120
111 -98
112 -105
113 -93
114 -97
115 -109
116 -110
117 -118
118 -111
119 -110
120 -92
121 -119
122 -116
123 -119
124 -110
125 -99
126 -108
127 -118
128 -97


In [None]:
### Add your comment for the result here ###
step을 진행할수록 reward가 fluctuate하긴 하지만, -100 정도로 유지됨을 볼 수 있다.
hyperparameter를 바꿔가며 모델을 발전시킨 것은 아니고, train의 결과이지만,
DQN의 방법에서 나온 reward보다 훨씬 좋게 나왔다.
actor loss 와 critic loss 는 점점 줄어드는 것을 볼 수 있다.