In [2]:
from gym import make

# Build the "Pendulum-v0" environment with gym's `make` factory.
env = make("Pendulum-v0")

In [19]:
from torch import nn
from torch.nn import functional as F
from torch.distributions import Normal
import pickle


class Critic(nn.Module):
    """Value network: one ReLU hidden layer followed by a linear head.

    Both layers are initialised with ``ortho_init`` (weight scale 1.0,
    zero bias).

    Args:
        hidden (int): Width of the hidden layer. Default: 400
        in_dim (int): Size of the input vector. Default: 3
        out_dim (int): Number of values produced by the head. Default: 1
    """

    def __init__(self, hidden=400, in_dim=3, out_dim=1):
        super().__init__()
        self.fc2 = nn.Linear(in_dim, hidden)
        # Honour `out_dim` instead of hard-coding a single output; the
        # default of 1 keeps the previous behaviour.
        self.valNet = nn.Linear(hidden, out_dim)
        ortho_init(self.fc2, weight_scale=1.0, constant_bias=0.0)
        ortho_init(self.valNet, weight_scale=1.0, constant_bias=0.0)

    def forward(self, x):
        """Return the output of the value head for input ``x``."""
        hidden_act = F.relu(self.fc2(x))
        return self.valNet(hidden_act)

def orthoInit(model):
    """Orthogonally initialise the ``fc``, ``sigma`` and ``mu`` members of ``model``.

    Every member that is an ``nn.Module`` is passed to ``ortho_init`` with
    ``weight_scale=0.01`` and a zero bias. Members that are not modules are
    left untouched: ``Actor.sigma`` is a bare ``nn.Parameter`` without
    ``.weight``/``.bias``, so handing it to ``ortho_init`` would raise
    ``AttributeError``.

    Args:
        model: Object exposing ``fc``, ``sigma`` and ``mu`` attributes.
    """
    # Same visiting order as before so random-number consumption is unchanged
    # for models whose three members are all layers.
    for attr in ("fc", "sigma", "mu"):
        member = getattr(model, attr)
        if isinstance(member, nn.Module):
            ortho_init(member, weight_scale=0.01, constant_bias=0.0)
    
class Actor(nn.Module):
    """Gaussian policy network with a state-independent standard deviation.

    A single ReLU hidden layer feeds a linear head whose output is squashed
    to ``[-2, 2]`` via ``2 * tanh``. The standard deviation is the exponential
    of one learnable parameter, initialised to ``log(0.6)``.

    Args:
        hidden (int): Width of the hidden layer. Default: 400
        in_dim (int): Size of the input vector. Default: 3
        out_dim (int): Size of the mean vector. Default: 1
    """

    def __init__(self, hidden=400, in_dim=3, out_dim=1):
        super().__init__()
        self.fc = nn.Linear(in_dim, hidden)
        self.mu = nn.Linear(hidden, out_dim)
        # Stored in log space so that exp() in forward() keeps sigma positive.
        self.sigma = nn.Parameter(torch.full((1,), float(np.log(0.6))))

    def forward(self, x):
        """Return ``(mu, sigma)`` for input ``x``; ``sigma`` is broadcast to ``mu``'s shape."""
        z2 = F.relu(self.fc(x))
        # torch.tanh replaces the deprecated torch.nn.functional.tanh.
        mu = 2 * torch.tanh(self.mu(z2))
        sigma = self.sigma.expand_as(mu).exp()
        return mu, sigma

    def save(self, path='agent.pkl'):
        """Pickle the whole module to ``path`` (default ``'agent.pkl'``).

        NOTE(review): the file stores the object by class reference, so the
        ``Actor`` class must be importable/defined wherever it is unpickled.
        """
        with open(path, 'wb') as f:
            pickle.dump(self, f)

In [20]:
import wandb

# Log in to Weights & Biases before training; `train` later calls
# wandb.init() and wandb.log().
wandb.login()

True

In [22]:
import numpy as np
import ray
from copy import deepcopy


def policy_loss(logProb, A):
    """Return the negated mean of ``logProb * A``."""
    weighted = logProb * A
    return weighted.mean().neg()

def zeroGrads(ann):
    """Reset every existing gradient of ``ann`` to zero in place.

    Parameters whose ``.grad`` is ``None`` are skipped.

    Args:
        ann (nn.Module): Module whose parameter gradients are cleared.
    """
    for param in ann.parameters():
        if param.grad is None:
            continue
        # zero_() rather than `*= 0`: multiplying keeps NaN/inf entries
        # (inf * 0 == nan), which would poison the next accumulation.
        param.grad.data.zero_()

def value_loss(val, ret):
    """Return the mean of half the squared difference between ``val`` and ``ret``."""
    residual = val - ret
    half_squared = residual ** 2 / 2
    return half_squared.mean()


def getGrads(ann):
    """Flatten all parameter gradients of ``ann`` into one Python list.

    Gradients are concatenated in ``ann.parameters()`` order. A parameter
    that has no gradient yet contributes zeros, so the result always has one
    entry per parameter element and stays aligned with the flat parameter
    vector instead of raising ``AttributeError`` on ``None``.

    Args:
        ann (nn.Module): Module whose gradients are read.

    Returns:
        list[float]: Flat gradient vector.
    """
    flat = []
    for param in ann.parameters():
        if param.grad is None:
            flat.extend([0.0] * param.numel())
        else:
            flat.extend(param.grad.data.reshape(-1).numpy().tolist())
    return flat

def setParameters(ann, meanVec):
    """Overwrite the parameters of ``ann`` from a flat vector.

    Consecutive slices of ``meanVec`` are reshaped and assigned to the
    parameters in ``ann.parameters()`` order.

    Args:
        ann (nn.Module): Module whose parameter data is replaced.
        meanVec (sequence of float): Flat values, at least as many as the
            module has parameter elements.
    """
    ind = 0
    for param in ann.parameters():
        # numel() is always a Python int; np.prod(shape) yields the float 1.0
        # for 0-dim parameters, which cannot be used as a slice bound.
        count = param.numel()
        values = np.array(meanVec[ind:ind + count]).reshape(tuple(param.shape))
        # Keep the parameter's own dtype/device rather than whatever
        # the incoming vector happens to use.
        param.data = torch.tensor(values, dtype=param.dtype, device=param.device)
        ind += count
        

@ray.remote
class Runner:
    """Ray actor that collects short rollouts and returns actor/critic gradients.

    Each call to ``compute_gradient`` loads the supplied flat parameter
    vectors into the local networks, runs ``update_len`` environment steps,
    and back-propagates a combined value + entropy + policy loss.

    Args:
        env_name (str): Environment id passed to ``make``.
        actor_id: Identifier echoed back in the ``info`` dict.
    """

    def __init__(self, env_name, actor_id):
        self.env = make(env_name)
        self.id = actor_id
        self.policy = Actor()
        self.critic = Critic()
        self.i = 0                # environment steps taken so far
        self.stats_len = 100      # report/reset accumulated reward every this many steps
        self.target_len = 800     # re-copy the target critic every this many steps
        self.episode_len = 200    # env is reset whenever self.i is a multiple of this
        self.update_len = 10      # environment steps per rollout
        self.entropy = 0.01       # weight of the entropy term in the loss
        self.gamma = 0.9          # discount applied to the bootstrapped target value
        self.total_reward = 0     # raw reward accumulated since the last report
        self.target = deepcopy(self.critic)

    def gather_rollout(self):
        """Run ``update_len`` steps with the current policy.

        Returns:
            tuple: ``(rollout, total_reward)`` where ``rollout`` is a dict of
            ``(update_len, 1)`` tensors keyed ``'logProb'``, ``'entropy'``,
            ``'rets'``, ``'actions'`` and ``'vals'``, and ``total_reward`` is
            the raw reward accumulated on this runner since the last report.
        """
        # self.state is first created here because self.i starts at 0.
        if self.i % self.episode_len == 0:
            self.state = self.env.reset()

        rewards, actions, values = [], [], []
        targets, logProb, entropy = [], [], []
        total_reward = 0

        for _ in range(self.update_len):
            obs = torch.from_numpy(self.state).float()
            mu, sigma = self.policy(obs)
            val = self.critic(obs)
            target = self.target(obs)
            pi = Normal(mu, sigma)
            action = pi.sample().detach()
            # gym environments take array-like actions, not torch tensors.
            self.next_state, reward, done, _ = self.env.step(action.numpy())
            reward = float(reward)
            total_reward += reward
            self.i += 1
            # Shift/scale the raw reward before it enters the return.
            rewards.append((reward + 8.1) / 8.1)
            actions.append(action)
            values.append(val)
            targets.append(target)
            logProb.append(pi.log_prob(action))
            entropy.append(pi.entropy())
            self.state = self.next_state
        self.total_reward += total_reward

        # One-step bootstrapped returns: r_t + gamma * target(s_{t+1}).
        # The value after the last step is zero at an episode boundary.
        if self.i % self.episode_len == 0:
            next_target = torch.zeros(1)
        else:
            next_target = self.target(torch.from_numpy(self.next_state).float())
        # Concatenating one list also works when update_len == 1
        # (targets[1:] is then empty, which torch.cat alone rejects).
        next_vals = torch.cat(targets[1:] + [next_target]).view(-1, 1).detach()
        returns = torch.tensor(rewards, dtype=torch.float32).view(-1, 1) + self.gamma * next_vals

        return {'logProb': torch.cat(logProb).view(-1, 1),
                'entropy': torch.cat(entropy).view(-1, 1),
                'rets': returns,
                'actions':  torch.cat(actions).view(-1, 1),
                'vals':  torch.cat(values).view(-1, 1)}, self.total_reward

    def backward(self, rollout):
        """Back-propagate the combined loss for ``rollout``.

        Returns:
            tuple: ``(policy_grads, critic_grads, valLoss, entLoss, polLoss)``
            with the gradients as flat lists and the losses as floats.
        """
        # `advantage` was never defined in this notebook; use the return-minus-
        # value residual, detached so the policy term does not push gradients
        # into the critic.
        adv = (rollout['rets'] - rollout['vals']).detach()
        polLoss = policy_loss(rollout['logProb'], adv)
        entLoss = -rollout['entropy'].mean() * self.entropy
        valLoss = value_loss(rollout['vals'], rollout['rets'])
        totLoss = valLoss + entLoss + polLoss
        totLoss.backward()
        return getGrads(self.policy), getGrads(self.critic), float(valLoss), float(entLoss), float(polLoss)

    def compute_gradient(self, params, critic):
        """Load fresh parameters, collect one rollout and return its gradients.

        Args:
            params: Indexable whose element ``0`` is the flat policy parameter vector.
            critic: Indexable whose element ``0`` is the flat critic parameter vector.

        Returns:
            tuple: ``({0: policy_grads}, {0: critic_grads}, info)``. ``info``
            holds the runner id, the three loss values and the mean action;
            ``'totalReward'`` is added whenever ``self.i`` is a multiple of
            ``stats_len``.
        """
        setParameters(self.policy, params[0])
        zeroGrads(self.policy)
        setParameters(self.critic, critic[0])
        zeroGrads(self.critic)
        if self.i % self.target_len == 0:
            self.target = deepcopy(self.critic)
        rollout, total_reward = self.gather_rollout()
        grads, gradsCritic, valLoss, entLoss, polLoss = self.backward(rollout)
        info = {"id": self.id,
                'valLoss': valLoss,
                'entLoss': entLoss,
                'polLoss': polLoss,
                # Plain float, like the loss entries, so it logs as a scalar.
                'action': float(rollout['actions'].mean())}
        if self.i % self.stats_len == 0:
            info["totalReward"] = self.total_reward
            self.total_reward = 0
        return {0: grads}, {0: gradsCritic}, info

In [23]:
import numpy as np
import ray
from torch.nn.parameter import Parameter
from torch import optim
from torch.autograd import Variable
from collections import defaultdict
from torch.optim.lr_scheduler import StepLR
import torch


def getParameters(ann):
    """Return every parameter value of ``ann`` as one flat Python list.

    Values are concatenated in ``ann.named_parameters()`` order.
    """
    return [
        value
        for _, tensor in ann.named_parameters()
        for value in tensor.data.view(-1).numpy().tolist()
    ]

  
class ManualAdam(optim.Adam):
    """Adam optimizer whose ``step`` takes the gradient as an argument.

    The gradient is assigned to the first parameter of the first parameter
    group before the regular Adam update runs.
    """

    def step(self, grads):
        """Apply one Adam update using ``grads`` as that parameter's gradient.

        Args:
            grads: Tensor, array or nested list with the parameter's shape.

        Returns:
            Whatever ``optim.Adam.step`` returns.
        """
        target = self.param_groups[0]['params'][0]
        if not isinstance(grads, torch.Tensor):
            grads = np.asarray(grads)
        # Plain tensor instead of the deprecated autograd.Variable wrapper;
        # clone so the caller's buffer is never aliased as .grad.
        target.grad = torch.as_tensor(
            grads, dtype=target.dtype, device=target.device
        ).detach().clone()
        return super().step()
        
def stepOpt(opt, gradDict, params):
    """Clip the gradients in ``gradDict`` and hand them to ``opt.step``.

    Each entry of ``gradDict`` maps a row index of ``params`` to a gradient.
    The gradient is clipped element-wise to ``[-5, 5]`` and written into the
    matching row of a zero tensor shaped like ``params``, which is then
    passed to ``opt.step``.
    """
    gradAry = torch.zeros_like(params)

    for worker, grad in gradDict.items():
        # Each key holds exactly one gradient, so this mean over the wrapping
        # axis simply yields that gradient back as a float array.
        stacked = np.array([grad])
        averaged = stacked.mean(axis=0)
        gradAry[worker] = torch.Tensor(np.clip(averaged, -5, 5))

    opt.step(gradAry)
        
        
def train(num_workers, env_name="Pendulum-v0", max_updates=None):
    """Train the actor/critic from gradients computed by remote ``Runner`` actors.

    The flat policy and critic parameter vectors live in this process, each
    with its own ``ManualAdam`` optimizer. Every finished runner task yields
    one gradient pair, which is applied immediately; that runner is then
    re-launched with the updated parameters.

    Args:
        num_workers (int): Number of ``Runner`` actors to start.
        env_name (str): Environment id given to each runner.
        max_updates (int, optional): Stop after this many gradient updates.
            ``None`` (default) loops until interrupted, as before.

    Returns:
        Actor: The policy loaded with the latest parameters. Only reachable
        when ``max_updates`` is given.
    """
    policy = Actor()
    critic = Critic()
    # Shape (1, n_params): row 0 is the flat parameter vector, matching the
    # `{0: grads}` dicts returned by the runners.
    params = Parameter(torch.Tensor(np.array([getParameters(policy)])))
    paramsCritic = Parameter(torch.Tensor(np.array([getParameters(critic)])))
    opt = ManualAdam([params], lr=0.0001)
    optCritic = ManualAdam([paramsCritic], lr=0.001)
    wandb.init()
    # ray.init() raises if ray is already running in this kernel (e.g. when
    # the cell is re-run), so only start it when needed.
    if not ray.is_initialized():
        ray.init()

    agents = [Runner.remote(env_name, i) for i in range(num_workers)]
    gradient_list = [
        agent.compute_gradient.remote(params.detach().numpy(), paramsCritic.detach().numpy())
        for agent in agents
    ]
    updates = 0

    while max_updates is None or updates < max_updates:
        done_id, gradient_list = ray.wait(gradient_list)

        grads, gradsCritic, info = ray.get(done_id)[0]
        wandb.log(info)
        updates += 1
        stepOpt(opt, grads, params)
        opt.zero_grad()
        stepOpt(optCritic, gradsCritic, paramsCritic)
        optCritic.zero_grad()
        # Checkpoint the policy whenever a runner reports an accumulated
        # reward above -10.
        if 'totalReward' in info and info['totalReward'] > -10:
            setParameters(policy, params.detach().numpy()[0])
            policy.save()
        parameters = params.detach().numpy()
        parCritic = paramsCritic.detach().numpy()
        # Re-launch the runner that just finished with the fresh parameters.
        gradient_list.append(agents[info["id"]].compute_gradient.remote(parameters, parCritic))

    # Make the returned policy reflect the final optimizer state.
    setParameters(policy, params.detach().numpy()[0])
    return policy

In [24]:
# Shut down ray before `train` runs, since `train` calls ray.init() itself.
ray.shutdown()

In [None]:
# Start training with a single Runner worker. `train` loops on `while True`,
# so this cell keeps running until it is interrupted.
train(1)

In [1]:
import torch.nn as nn


def ortho_init(module, nonlinearity=None, weight_scale=1.0, constant_bias=0.0):
    r"""Applies orthogonal initialization for the parameters of a given module.
    
    Args:
        module (nn.Module): A module to apply orthogonal initialization over its parameters. 
        nonlinearity (str, optional): Nonlinearity followed by forward pass of the module. When nonlinearity
            is not ``None``, the gain will be calculated and :attr:`weight_scale` will be ignored. 
            Default: ``None``
        weight_scale (float, optional): Scaling factor to initialize the weight. Ignored when
            :attr:`nonlinearity` is not ``None``. Default: 1.0
        constant_bias (float, optional): Constant value to initialize the bias. Default: 0.0
        
    .. note::
    
        Currently, the only supported :attr:`module` are elementary neural network layers, e.g.
        nn.Linear, nn.Conv2d, nn.LSTM. The submodules are not supported.

    .. note::

        Modules created without a bias (e.g. ``nn.Linear(..., bias=False)``) are supported:
        only their weight is initialized.
    
    Example::
    
        >>> a = nn.Linear(2, 3)
        >>> ortho_init(a)
    
    """
    if nonlinearity is not None:
        gain = nn.init.calculate_gain(nonlinearity)
    else:
        gain = weight_scale
        
    if isinstance(module, (nn.RNNBase, nn.RNNCellBase)):
        # Recurrent modules expose several weight_*/bias_* tensors by name.
        for name, param in module.named_parameters():
            if 'weight_' in name:
                nn.init.orthogonal_(param, gain=gain)
            elif 'bias_' in name:
                nn.init.constant_(param, constant_bias)
    else:  # other modules with single .weight and .bias
        nn.init.orthogonal_(module.weight, gain=gain)
        # `bias` is None when the layer was built with bias=False.
        if getattr(module, 'bias', None) is not None:
            nn.init.constant_(module.bias, constant_bias)

In [8]:
import pickle
import torch
# Load the object stored in agent.pkl (the file name `Actor.save` writes to)
# and keep it in `weights`.
# NOTE(review): pickle.load can execute arbitrary code while loading, so only
# open files you created yourself. Presumably the `Actor` class must already
# be defined in this kernel for the load to succeed; confirm on a fresh kernel.
with open("agent.pkl", "rb") as f:
        weights = pickle.load(f)

In [1]:
from gym import make
from torch.nn import functional as F

env = make("Pendulum-v0")

# Replay the loaded policy for 100 episodes of at most 75 steps each, always
# taking the mean action (no sampling) and rendering every step.
# Requires `weights` (the unpickled policy) and `torch` to already exist in
# the kernel; running this cell first gives the NameError shown below.
for epoch in range(100):
    state = env.reset()
    for i in range(75):
        # No gradients are needed for evaluation.
        with torch.no_grad():
            mu, sigma = weights(torch.from_numpy(state).float())
        # gym environments take array-like actions, not torch tensors.
        action = mu.numpy()
        state, reward, done, _ = env.step(action)
        env.render()
        if done:
            break

# Release the render window and any other environment resources.
env.close()

NameError: name 'weights' is not defined