In [1]:
import gym

from stable_baselines3 import PPO

# Build the CartPole-v1 environment through the gym registry.
env = gym.make('CartPole-v1')

# Train a PPO agent with the 'MlpPolicy' policy for 10,000 environment steps.
# verbose=1 is what produces the training tables shown in the cell output.
model = PPO('MlpPolicy', env, verbose=1)
model.learn(total_timesteps=10000)

# Roll out the trained policy for 1000 steps, rendering after every step.
# NOTE(review): the 4-tuple unpacking of env.step() and the bare `obs = env.reset()`
# assume the classic gym API — confirm against the installed gym version.
obs = env.reset()
for i in range(1000):
    # deterministic=True asks the policy for its deterministic action.
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    env.render()
    if done:
      # Episode ended: reset so the rollout can continue for the remaining steps.
      obs = env.reset()

# Release the environment's resources once the rollout is finished.
env.close()

Using cuda device
Wrapping the env in a DummyVecEnv.
-----------------------------
| time/              |      |
|    fps             | 631  |
|    iterations      | 1    |
|    time_elapsed    | 3    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 396         |
|    iterations           | 2           |
|    time_elapsed         | 10          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008806784 |
|    clip_fraction        | 0.234       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.685      |
|    explained_variance   | -1.47e+04   |
|    learning_rate        | 0.0003      |
|    loss                 | 9.7         |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0176     |
|    value_loss           | 53.9        |
-----------------------

In [7]:
import stable_baselines3
# Print the package-level help for stable_baselines3 (contents, version, file path).
help(stable_baselines3)
# Print the help for the on-policy algorithm module.
# NOTE(review): plain attribute access on `stable_baselines3.common` only works if the
# `on_policy_algorithm` submodule has already been imported (presumably as a side effect
# of an earlier `from stable_baselines3 import PPO`) — confirm on a fresh kernel.
help(stable_baselines3.common.on_policy_algorithm)

Help on package stable_baselines3:

NAME
    stable_baselines3

PACKAGE CONTENTS
    a2c (package)
    common (package)
    ddpg (package)
    dqn (package)
    ppo (package)
    sac (package)
    stable_baselines3 (package)
    td3 (package)

DATA
    file_handler = <_io.TextIOWrapper name='/home/darijan/.local/li...asel...
    version_file = '/home/darijan/.local/lib/python3.8/site-packages/stabl...

VERSION
    0.7.0

FILE
    /home/darijan/.local/lib/python3.8/site-packages/stable_baselines3/__init__.py


Help on module stable_baselines3.common.on_policy_algorithm in stable_baselines3.common:

NAME
    stable_baselines3.common.on_policy_algorithm

CLASSES
    stable_baselines3.common.base_class.BaseAlgorithm(abc.ABC)
        OnPolicyAlgorithm
    
    class OnPolicyAlgorithm(stable_baselines3.common.base_class.BaseAlgorithm)
     |  OnPolicyAlgorithm(policy: Union[str, Type[stable_baselines3.common.policies.ActorCriticPolicy]], env: Union[gym.core.Env, stable_baselines3.common.vec_

In [None]:
import torch
import torch.nn.functional as F


def maml_grad(model, inputs, outputs, lr, batch=1):
    """
    Update a model's gradient using MAML.

    The data is split into consecutive mini-batches of size ``batch``; each
    mini-batch is one inner-loop SGD step applied on top of the previous one.
    The resulting gradient points in the direction that improves the total
    loss across all inner-loop mini-batches, differentiating through every
    inner SGD update (second-order).

    The result is *accumulated* into ``p.grad`` for each parameter (assigned
    if ``p.grad`` is None); the parameter values themselves are restored to
    what they were on entry whenever at least one inner step ran.

    Args:
        model: an nn.Module for training. Its output is passed through
          ``torch.log``, so it is presumably expected to emit probabilities.
        inputs: a large batch of model inputs, sliced along dim 0.
        outputs: the targets matching ``inputs`` row for row; they are
          multiplied element-wise with the log-outputs (cross-entropy style).
        lr: the inner-loop SGD learning rate.
        batch: the inner-loop batch size.

    Returns:
        A list of Python floats, one loss value per inner-loop step.
    """
    params = list(model.parameters())
    device = params[0].device
    initial_values = []
    final_values = []
    losses = []
    scalar_losses = []

    # Forward sweep: run the inner-loop SGD steps, remembering for each step
    # the loss graph, the parameter values it started from, and the
    # (differentiable) updated parameter tensors it produced.
    for start in range(0, inputs.shape[0], batch):
        x = inputs[start:start + batch].to(device)
        target = outputs[start:start + batch].to(device)
        out = model(x)

        # Score this step against its own mini-batch targets (already moved to
        # the model's device), not against the full `outputs` tensor.
        loss = -(target * torch.log(out + 1e-8)).mean()
        losses.append(loss)
        scalar_losses.append(loss.item())
        initial_values.append([p.clone().detach() for p in params])
        updated = []
        grads = torch.autograd.grad(loss, params, create_graph=True, retain_graph=True)
        for grad, param in zip(grads, params):
            new_value = param - lr * grad
            updated.append(new_value)
            # Write through `.data` so the retained graphs are not invalidated
            # by autograd's in-place modification check.
            param.data.copy_(new_value)
        final_values.append(updated)

    # Backward sweep: walk the steps in reverse, chaining the gradient that
    # flows in from later steps through each SGD update.
    gradient = [torch.zeros_like(p) for p in params]
    for loss, initial, final in list(zip(losses, initial_values, final_values))[::-1]:
        # The step's graph must be evaluated at the values it was built with.
        for p, value in zip(params, initial):
            p.data.copy_(value)
        grad1 = torch.autograd.grad(loss, params, retain_graph=True)
        grad2 = torch.autograd.grad(final, params, grad_outputs=gradient, retain_graph=True)
        gradient = [v1 + v2 for v1, v2 in zip(grad1, grad2)]

    for p, g in zip(params, gradient):
        if p.grad is None:
            p.grad = g
        else:
            p.grad.add_(g)

    return scalar_losses


In [2]:
def mymaml_grad(model, inputs_train, targets_train, inputs_test, targets_test, lr):
    """
    Accumulate a first-order MAML gradient over a batch of tasks.

    For every task the model takes one SGD step of size ``lr`` on the task's
    training data, the test loss is evaluated at the adapted weights, and
    ``backward()`` adds that loss's gradient (taken at the adapted weights)
    to the parameters' ``.grad``. The SGD step itself is not differentiated
    through, so this is the first-order approximation. The original weights
    are restored after every task, so each task adapts from the same starting
    point and the model is unchanged on return.

    Args:
        model: an nn.Module; its output is passed through ``torch.log``, so it
          is presumably expected to emit probabilities.
        inputs_train: sequence with one training-input batch per task.
        targets_train: sequence with the matching training targets.
        inputs_test: sequence with one test-input batch per task.
        targets_test: sequence with the matching test targets.
        lr: the inner-loop SGD learning rate.

    Returns:
        A list of Python floats: each task's test loss, already divided by the
        number of tasks (``len(inputs_train)``).
    """
    params = list(model.parameters())
    # Private snapshots of the starting weights. They are only ever read via
    # copy_(), so they never end up sharing storage with the live parameters.
    saved_values = [p.detach().clone() for p in params]
    batch_size = len(inputs_train)
    test_scalar_losses = []

    for x_train, y_train, x_test, y_test in zip(inputs_train, targets_train, inputs_test, targets_test):
        train_loss = -(y_train * torch.log(model(x_train) + 1e-8)).mean()
        # No create_graph: the update below is written through `.data`, which
        # would discard any higher-order graph anyway.
        grads = torch.autograd.grad(train_loss, params)
        for param, grad in zip(params, grads):
            param.data.copy_(param.data - lr * grad)

        test_loss = -(y_test * torch.log(model(x_test) + 1e-8)).mean() / batch_size
        test_scalar_losses.append(test_loss.item())
        test_loss.backward()

        # Restore by value. Rebinding `param.data = saved` would alias the
        # snapshot, and the next task's in-place update would overwrite it.
        for param, saved in zip(params, saved_values):
            param.data.copy_(saved)

    return test_scalar_losses


In [None]:
def mymaml_grad(model, intrain, targtrain, intest, targtest, lr):
    """
    Accumulate a second-order MAML gradient over a batch of tasks.

    For every task:
      1. take one differentiable SGD step of size ``lr`` on the training data,
      2. evaluate the test loss at the adapted weights,
      3. push the test-loss gradient back through the SGD step (chain rule)
         to obtain the gradient with respect to the original weights.
    The per-task gradients are added to each parameter's ``.grad`` (assigned
    if it is None). Parameter values are restored after every task, so the
    model's weights are unchanged on return.

    Args:
        model: an nn.Module; its output is passed through ``torch.log``, so it
          is presumably expected to emit probabilities.
        intrain: sequence with one training-input batch per task.
        targtrain: sequence with the matching training targets.
        intest: sequence with one test-input batch per task.
        targtest: sequence with the matching test targets.
        lr: the inner-loop SGD learning rate.

    Returns:
        A list of Python floats: each task's test loss, already divided by the
        number of tasks (``len(intrain)``).
    """
    params = list(model.parameters())
    batch_size = len(intrain)
    test_scalar_losses = []

    for x_train, y_train, x_test, y_test in zip(intrain, targtrain, intest, targtest):
        start_values = [p.detach().clone() for p in params]

        # Inner step, kept differentiable so it can be back-propagated through.
        train_loss = -(y_train * torch.log(model(x_train) + 1e-8)).mean()
        train_grads = torch.autograd.grad(train_loss, params, create_graph=True)
        adapted = [p - lr * g for p, g in zip(params, train_grads)]

        # Load the adapted values into the live parameters. Writing through
        # `.data` keeps the inner-step graph usable for the chain rule below.
        for p, new_value in zip(params, adapted):
            p.data.copy_(new_value.detach())

        test_loss = -(y_test * torch.log(model(x_test) + 1e-8)).mean() / batch_size
        test_scalar_losses.append(test_loss.item())
        # Gradient of the test loss with respect to the adapted weights.
        outer_grads = torch.autograd.grad(test_loss, params)

        # The inner-step graph must be evaluated at the values it was built
        # with, so restore the starting weights before differentiating it.
        for p, value in zip(params, start_values):
            p.data.copy_(value)

        # Chain rule: d(test_loss)/d(theta) = (d(adapted)/d(theta))^T @ outer_grads.
        meta_grads = torch.autograd.grad(adapted, params, grad_outputs=outer_grads)

        for p, g in zip(params, meta_grads):
            if p.grad is None:
                p.grad = g
            else:
                p.grad.add_(g)

    return test_scalar_losses