In [1]:
import torch 
import tensorflow as tf
import torch.nn as nn
import numpy as np
import gym
import matplotlib.pyplot as plt
import ncps 
from ncps.torch import LTC
from ncps.torch import CfC
from ncps.wirings import AutoNCP
import pytorch_lightning as pl
import torch.utils.data as data
from state_indep_ppo import PPO

2025-02-27 10:10:46.411690: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-02-27 10:10:46.442352: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Lightning wrapper that fits a sequence model with a mean-squared-error loss.
class SequenceLearner(pl.LightningModule):
    """Train, validate and test a wrapped sequence model using MSE.

    Args:
        model: Module whose ``forward(x)`` returns a 2-tuple; the first item
            is the prediction and the second is ignored here. ``parameters()``
            of this module are what the optimizer updates.
        lr: Learning rate handed to the Adam optimizer (default ``0.005``).
    """

    def __init__(self, model, lr=0.005):
        super().__init__()
        self.lr = lr
        self.model = model

    def _mse_for_batch(self, batch):
        """Return the MSE between the model's prediction and the batch target."""
        inputs, targets = batch
        # forward() is called directly (not via __call__), as the steps always did.
        predictions, _unused = self.model.forward(inputs)
        # The prediction is reshaped to the target's shape before comparison.
        predictions = predictions.view_as(targets)
        criterion = nn.MSELoss()
        return criterion(predictions, targets)

    def training_step(self, batch, batch_idx):
        """Log ``train_loss`` for the batch and return it under the ``loss`` key."""
        train_loss = self._mse_for_batch(batch)
        self.log("train_loss", train_loss, prog_bar=True)
        return {"loss": train_loss}

    def validation_step(self, batch, batch_idx):
        """Log ``val_loss`` for the batch and return the loss tensor."""
        val_loss = self._mse_for_batch(batch)
        self.log("val_loss", val_loss, prog_bar=True)
        return val_loss

    def test_step(self, batch, batch_idx):
        """Delegate to ``validation_step``; the loss is logged as ``val_loss``."""
        return self.validation_step(batch, batch_idx)

    def configure_optimizers(self):
        """Create an Adam optimizer over the wrapped model's parameters."""
        trainable = self.model.parameters()
        return torch.optim.Adam(trainable, lr=self.lr)

In [9]:
# Create the Pendulum environment and read the first dimension of its
# action-space and observation-space shapes.
env = gym.make("Pendulum-v1")
actionspace = env.action_space.shape[0]
obsspace = env.observation_space.shape[0]
print(f"actionspace: {actionspace}, obs space {obsspace}")

# Build the PPO agent from `state_indep_ppo`. The meaning of each
# hyperparameter is defined by that module — TODO confirm against its source.
agent = PPO(
    ob_space=obsspace,
    actions=actionspace,
    n_batches=10,
    gamma=0.95,
    lam=0.95,
    kl_coeff=0.2,
    clip_rewards=False,
    clip_param=0.2,
    vf_clip_param=10.0,
    entropy_coeff=0,
    a_lr=5e-4,
    c_lr=5e-4,
    device="cpu",
    max_ts=100,
    # Additional custom keyword arguments can be supplied here, for example:
    rollouts_per_batch=5,
    max_timesteps_per_episode=200,
    n_updates_per_iteration=3,
)

# Run training. `rollout_buffer` starts empty and is handed to `learn`
# (presumably filled in place — verify in state_indep_ppo); the value
# returned by `learn` is kept in `rollouts`.
rollout_buffer = []
total_timesteps = 2000 // 5  # training length passed to `learn`
rollouts = agent.learn(
    total_timesteps=total_timesteps,
    env=env,
    rollout_buffer=rollout_buffer,
)

actionspace: 1, obs space 3
Learning... Running 200 timesteps per episode, 5 timesteps per batch for a total of 400 rollouts


Rollouts:   0%|                                                                         | 0/400 [00:00<?, ?it/s, loss=0]


In [4]:
# Baseline run: call `train_ppo` from the `baseline` module with the
# environment name and keep its return value (presumably a trained agent —
# confirm in baseline.py).
# NOTE(review): this rebinds the notebook-level name `agent`, replacing
# whatever an earlier-executed cell stored there.
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so a fresh "Run All" surfaces missing modules early.
from baseline import train_ppo

env_name = "Pendulum-v1"
agent = train_ppo(env_name)

actionspace: 1, obs space 3
Learning... Running 200 timesteps per episode, 5 timesteps per batch for a total of 400 rollouts


Rollouts:   0%|                                                                         | 0/400 [00:00<?, ?it/s, loss=0]


In [5]:
# Prints a single empty line and has no other effect.
# NOTE(review): looks like a leftover scratch cell — consider removing it.
print()


