In [1]:
import copy
import random
from collections import deque

import gym
import numpy as np
import torch
from torch import nn
from torch.nn import functional

def cuda_tensor(x, device=None):
    """Convert ``x`` to a float32 tensor on the training device.

    Args:
        x: Anything ``torch.tensor`` accepts (scalar, bool, list, ``np.ndarray``).
        device: Target device. When omitted, ``'cuda'`` is used if CUDA is
            available and ``'cpu'`` otherwise, so the helper also works on
            CPU-only machines.

    Returns:
        A new ``torch.float32`` tensor holding a copy of ``x``.
    """
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.tensor(x, dtype=torch.float32, device=device)

class ReplayMemory:
    """Bounded FIFO store of (state, action, reward, done, next_state) transitions."""

    def __init__(self, maxlen):
        # A deque with maxlen drops its oldest entry automatically once full.
        self.memory = deque(maxlen=maxlen)

    def push(self, i_state:np.ndarray, i_action:np.ndarray, reward:float, done:bool, f_state:np.ndarray):
        """Append one transition to the buffer."""
        self.memory.append((i_state, i_action, reward, done, f_state))

    def len(self):
        """Return the number of transitions currently stored."""
        return len(self.memory)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns:
            ``(i_states, i_actions, rewards, dones, f_states)`` as float32
            tensors produced by ``cuda_tensor``, each with a leading dimension
            of ``batch_size``. ``dones`` is encoded as 0.0 / 1.0.

        Raises:
            ValueError: from ``random.sample`` if ``batch_size`` exceeds the
                number of stored transitions.
        """
        batch = random.sample(self.memory, batch_size)
        # Transpose the list of transitions into per-field columns and convert
        # each column in one go: a single array -> tensor conversion per field
        # instead of one tiny tensor per transition followed by a stack.
        i_states, i_actions, rewards, dones, f_states = (
            cuda_tensor(np.array(column)) for column in zip(*batch)
        )
        return i_states, i_actions, rewards, dones, f_states
    
class Actor(nn.Module):
    """Deterministic policy network producing actions in [-1, 1].

    Layers: ``state_dim -> 400 -> 300 -> action_dim`` with ReLU between the
    linear layers and a final Tanh. The module also owns the Adam optimizer
    (learning rate 1e-4) for its parameters.

    Args:
        state_dim: Length of the input state vector (default 8).
        action_dim: Length of the output action vector (default 2).
        device: Device for the parameters. When omitted, ``'cuda'`` is used if
            CUDA is available and ``'cpu'`` otherwise.
    """

    def __init__(self, state_dim=8, action_dim=2, device=None):
        super(Actor, self).__init__()
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.network = nn.Sequential(
            nn.Linear(state_dim, 400),
            nn.ReLU(),
            nn.Linear(400, 300),
            nn.ReLU(),
            nn.Linear(300, action_dim),
            nn.Tanh()
        ).to(device)
        self.optimizer = torch.optim.Adam(self.network.parameters(), 0.0001)

    def forward(self, x):
        """Return the noise-free action(s) for state tensor ``x``."""
        return self.network(x)

    def choose_action(self, state, noise_scalar):
        """Return an exploratory action computed by a weight-perturbed copy.

        Gaussian noise with standard deviation ``noise_scalar`` is added to
        every weight matrix (biases are left untouched) of a temporary copy of
        the network; this module's own parameters are never modified.

        Args:
            state: State tensor on the same device as the network.
            noise_scalar: Standard deviation of the weight perturbation.

        Returns:
            The perturbed network's output as a NumPy array.
        """
        with torch.no_grad():
            # Copy only the layers: cheaper than building a whole new Actor
            # (random init + optimizer) just to overwrite its weights.
            noisy_network = copy.deepcopy(self.network)
            for name, param in noisy_network.named_parameters():
                if name.endswith('weight'):
                    param.add_(torch.randn_like(param) * noise_scalar)
            return noisy_network(state).cpu().numpy()
    
class Critic(nn.Module):
    """Value network scoring a concatenated (state, action) vector with one scalar.

    Layers: ``(state_dim + action_dim) -> 400 -> 300 -> 1`` with ReLU between
    the linear layers and no output activation. The module also owns its Adam
    optimizer (learning rate 1e-3, weight decay 1e-2).

    Args:
        state_dim: Length of the state part of the input (default 8).
        action_dim: Length of the action part of the input (default 2).
        device: Device for the parameters. When omitted, ``'cuda'`` is used if
            CUDA is available and ``'cpu'`` otherwise.
    """

    def __init__(self, state_dim=8, action_dim=2, device=None):
        super(Critic, self).__init__()
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.network = nn.Sequential(
            nn.Linear(state_dim + action_dim, 400),
            nn.ReLU(),
            nn.Linear(400, 300),
            nn.ReLU(),
            nn.Linear(300, 1)
        ).to(device)
        self.optimizer = torch.optim.Adam(self.network.parameters(), 0.001, weight_decay=0.01)

    def forward(self, x):
        """Return the value estimate(s) for ``x``; the last dimension of the output is 1."""
        return self.network(x)

class Agent:
    """Actor-critic agent with target networks and a replay buffer.

    Args:
        batch_size: Number of transitions used per call to ``learn`` (default 128).
        memory_size: Capacity of the replay buffer (default 1,000,000).
    """

    def __init__(self, batch_size=128, memory_size=1000000):
        self.noise = 0.3         # scale of the exploration noise passed to Actor.choose_action
        self.noise_decay = .002  # not used inside this class; read by the training loop
        self.gamma = .99         # discount factor applied to the bootstrapped value
        self.tau = 0.999         # share of the old target kept by update_target(soft=True)
        self.batch_size = batch_size

        self.main_actor = Actor()
        self.target_actor = Actor()

        self.main_critic = Critic()
        self.target_critic = Critic()

        # Targets start as exact copies of the main networks.
        self.target_actor.load_state_dict(self.main_actor.state_dict())
        self.target_critic.load_state_dict(self.main_critic.state_dict())

        self.replay = ReplayMemory(memory_size)

    def learn(self):
        """Run one critic step and one actor step on a sampled minibatch.

        Returns immediately (``None``) while the replay buffer holds fewer
        than ``batch_size`` transitions.
        """
        if self.replay.len() < self.batch_size:
            return

        i_states, i_actions, rewards, dones, f_states = self.replay.sample(self.batch_size)

        # --- critic update: regress Q(s, a) onto the one-step TD target ---
        with torch.no_grad():
            f_actions = self.target_actor(f_states)
            # squeeze(-1) removes only the trailing size-1 axis, so the batch
            # axis survives even when batch_size == 1.
            f_state_action_vals = self.target_critic(torch.cat((f_states, f_actions), dim=1)).squeeze(-1)
            td_targets = f_state_action_vals * (1 - dones) * self.gamma + rewards

        i_state_action_vals = self.main_critic(torch.cat((i_states, i_actions), dim=1)).squeeze(-1)

        critic_loss = functional.smooth_l1_loss(i_state_action_vals, td_targets)
        self.main_critic.optimizer.zero_grad()
        critic_loss.backward()
        self.main_critic.optimizer.step()

        # --- actor update: ascend the critic's value of the actor's own actions ---
        det_actions = self.main_actor(i_states)
        actor_loss = -self.main_critic(torch.cat((i_states, det_actions), dim=1)).mean()

        self.main_actor.optimizer.zero_grad()
        # This also leaves gradients on the critic's parameters; they are
        # cleared by the critic's zero_grad() at the start of the next call.
        actor_loss.backward()
        self.main_actor.optimizer.step()

    def choose_action(self, state:np.ndarray):
        """Return a noisy action for ``state`` from the main actor as a NumPy array."""
        return self.main_actor.choose_action(cuda_tensor(state), self.noise)

    def update_target(self, soft=False):
        """Synchronise the target networks with the main networks.

        Args:
            soft: ``False`` (default) copies the main weights verbatim.
                ``True`` blends instead:
                ``target = tau * target + (1 - tau) * main``.
        """
        if soft:
            self._blend(self.target_actor, self.main_actor, self.tau)
            self._blend(self.target_critic, self.main_critic, self.tau)
            return
        self.target_actor.load_state_dict(self.main_actor.state_dict())
        self.target_critic.load_state_dict(self.main_critic.state_dict())

    @staticmethod
    def _blend(target, source, tau):
        """Move each parameter of ``target`` a fraction ``1 - tau`` towards ``source``."""
        with torch.no_grad():
            for t_param, s_param in zip(target.parameters(), source.parameters()):
                t_param.copy_(tau * t_param + (1 - tau) * s_param)


In [2]:

# Environment, agent and bookkeeping shared with the training cell.
env = gym.make('LunarLanderContinuous-v2')
agent = Agent()
total_steps = 0  # environment steps taken so far, across all episodes

# Best episode return seen so far. Starts at -inf rather than 0 so the first
# finished episode always produces a checkpoint, even while returns are negative.
max_score = float('-inf')
actor_model_path = "ddpg_actor.pth"
critic_model_path = "ddpg_critic.pth"


In [None]:

# Training: 1000 episodes, one learning step per environment step.
for episode in range(1000):
    i_state = env.reset()
    score = 0
    done = False
    while not done:
        total_steps += 1

        action = agent.choose_action(i_state)
        # Bind the info dict to a real name: `_` is IPython's last-output variable.
        f_state, reward, done, info = env.step(action)

        # gym's TimeLimit wrapper reports hitting the step cap as done=True and
        # flags it in info. Such a transition is not a true terminal state, so
        # it is stored as non-terminal and the critic keeps bootstrapping from
        # f_state. When the key is absent this is identical to storing `done`.
        terminal = done and not info.get('TimeLimit.truncated', False)
        agent.replay.push(np.copy(i_state), np.copy(action), reward, terminal, np.copy(f_state))
        agent.learn()
        score += reward

        env.render('rgb_array')
        # Hard-sync the target networks every 30 environment steps.
        if total_steps % 30 == 0:
            agent.update_target()

        i_state = np.copy(f_state)

    # Checkpoint whenever an episode beats the best return seen so far.
    if score > max_score:
        max_score = score
        torch.save(agent.main_actor.state_dict(), actor_model_path)
        torch.save(agent.main_critic.state_dict(), critic_model_path)
    # Linearly anneal exploration noise, never going below 0.01.
    agent.noise = max(0.01, agent.noise - agent.noise_decay)
        