In [1]:
import time
from collections import deque

import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.distributions as dist

import gym

from utils import ReplayBuffer,clip

In [2]:
device = 'cuda'

In [3]:
class Actor(nn.Module):
    def __init__(self, N_s, N_a, max_a):
        super().__init__()
        self.layer_1 = nn.Linear(N_s,200)
        self.layer_2 = nn.Linear(200,200)
        self.layer_3 = nn.Linear(200,N_a)
        self.N_s = N_s
        self.N_a = N_a
        self.max_a = torch.tensor(max_a, dtype = torch.float32).to(device)
    
    def forward(self, s):
        assert type(s) == torch.Tensor
        
        h = F.leaky_relu(self.layer_1(s))
        h = F.leaky_relu(self.layer_2(h))
        h = torch.tanh(self.layer_3(h)) * self. max_a
        return h

In [4]:
class Critic(nn.Module):
    def __init__(self,N_s,N_a):
        super().__init__()
        self.N_s = N_s
        self.N_a = N_a
        
        self.layer1 = nn.Linear(N_s+N_a,200)
        self.layer2 = nn.Linear(200,200)
        self.layer3 = nn.Linear(200,1)
    
    def forward(self,s,a):
        x = torch.cat((s,a),dim=-1)
        
        Q = F.leaky_relu(self.layer1(x))
        Q = F.leaky_relu(self.layer2(Q))
        Q = self.layer3(Q)
        
        return Q

In [5]:
class DDPG():
    def __init__(self, N_s, N_a, max_a):
        self.N_s = N_s
        self.N_a = N_a
        self.max_a = max_a
        
        self.actor = Actor(N_s, N_a, max_a)
        self.critic = Critic(N_s, N_a)
        self.target_critic = Critic(N_s, N_a)
        self.target_critic.load_state_dict(self.critic.state_dict())
        
        self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=1e-3)
        self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=1e-3)
        
    def to(self, device):
        self.actor.to(device)
        self.critic.to(device)
        self.target_critic.to(device)
        
    def polyak(self, tau):
        for param, target_param in zip(self.critic.parameters(),self.target_critic.parameters()):
            target_param.data.copy_(tau*param.data + (1-tau)*target_param.data)
    
    def target_Q(self, batch, gamma):
        NS = torch.tensor(batch['next_state'], dtype=torch.float32).to(device)
        R = torch.tensor(batch['reward'], dtype=torch.float32).to(device)
        D = torch.tensor(batch['done'],dtype=torch.int8).to(device)
        R = R.view(*R.shape,1)
        D = D.view(*D.shape,1)
        
        
        with torch.no_grad():
            A = self.actor(NS)
            NQ = self.target_critic(NS,A)
            
            return R + ( gamma * (~D) * NQ )
    
    def critic_loss(self, batch, gamma):
        S = torch.tensor(batch['state'], dtype=torch.float32).to(device)
        A = torch.tensor(batch['action'],dtype=torch.float32).to(device)
        
        Q = self.critic(S,A)
        target_Q = self.target_Q(batch, gamma)
        
        return F.mse_loss(Q,target_Q)
    
    def actor_loss(self, batch):
        S = torch.tensor(batch['state'], dtype=torch.float32).to(device)
        A = self.actor(S)
        loss = -self.critic(S,A).mean()
        #loss = -self.target_critic(S,A).mean()
        
        return loss
    
    def train_critic_once(self, batch, gamma=0.99, tau=0.005):
        self.critic_optim.zero_grad()
        L = self.critic_loss(batch, gamma)
        L.backward()
        self.critic_optim.step()
        
        self.polyak(tau)
        
        return L.item()
    
    def train_actor_once(self, batch):
        self.actor_optim.zero_grad()
        L = self.actor_loss(batch)
        L.backward()
        self.actor_optim.step()
        
        return L.item()
    
    def explore(self,s,sigma = 0.3):
        s = torch.tensor(s,dtype=torch.float32).to(device)
        with torch.no_grad():
            a = self.actor(s)
        a = a.detach().cpu().numpy()
        eps = np.random.normal(size = self.N_a, scale = sigma)
        
        return clip(a + eps,-self.max_a,self.max_a)

In [6]:
def run_episode(env, buffer, model, render = False, sigma = 0.3):
    s = env.reset()
    done = 0
    if render:
        env.render()
        
    R = 0
    while(not done):
        a = model.explore(s,sigma)
        ns, r, done, info = env.step(a)
        T = [s,ns,a,r,done]
        buffer.append(T)
        s = ns.copy()
        if render:
            env.render()
        R += r
        
    return R
    

In [7]:
buffer = ReplayBuffer(max_len = 1e6)
env = gym.make('HalfCheetah-v2')

model = DDPG(17,6, np.array([1., 1., 1., 1., 1., 1., ]))
model.to(device)

In [8]:
# Add initial episodes to buffer w/o training
for episode in tqdm(range(3)):
    R = run_episode(env, buffer, model, render = False, sigma = 0.5)

100%|██████████| 3/3 [00:01<00:00,  2.66it/s]


In [9]:
# train
N_episodes = 1000
pbar = tqdm(range(N_episodes))
for episode in pbar:
    render = False
    sigma = 0.8 * np.exp(-3*episode/N_episodes)
        
    R = run_episode(env, buffer, model, render = render, sigma = sigma)
    pbar.set_postfix({' ' : f'R: {R:.3f}, sigma: {sigma:.3f}'})
    
    
    
    for n in range(20):
        batch = buffer.sample(64)
        critic_L = model.train_critic_once(batch)
        if n%2 == 0:
            actor_L = model.train_actor_once(batch)
            #print(f"critic loss: {critic_L}")
            #print(f"actor loss: {actor_L}")
            

100%|██████████| 1000/1000 [10:01<00:00,  1.66it/s,  =R: -163.850, sigma: 0.040]


In [10]:
# test
for episode in range(1):
    run_episode(env, buffer, model, render = True, sigma = 0.0)

env.close()

Creating window glfw
