# Overview

In [1]:

import gymnasium as gym
import numpy as np
import torch

from tianshou.data import Collector, VectorReplayBuffer
from tianshou.env import DummyVectorEnv
from tianshou.policy import PPOPolicy
from tianshou.trainer import onpolicy_trainer
from tianshou.utils.net.common import ActorCritic, Net
from tianshou.utils.net.discrete import Actor, Critic

import warnings
# Suppress every Python warning for the rest of the session (keeps cell output short).
warnings.filterwarnings(action='ignore')

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

2023-11-09 11:13:00.343560: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# environments
# One constant for the task id so the probe env and both vectorised sets cannot drift apart.
ENV_ID = 'CartPole-v0'
NUM_TRAIN_ENVS = 20
NUM_TEST_ENVS = 10
# Fixed seed: training is stochastic, so without it every Restart & Run All gives different numbers.
SEED = 0

# Single env instance; used below only to read the observation/action spaces.
env = gym.make(ENV_ID)
train_envs = DummyVectorEnv([lambda: gym.make(ENV_ID) for _ in range(NUM_TRAIN_ENVS)])
test_envs = DummyVectorEnv([lambda: gym.make(ENV_ID) for _ in range(NUM_TEST_ENVS)])

# Seed numpy and torch before the networks are created in the next cell, then seed the
# vectorised environments. NOTE(review): bit-exact repeatability on CUDA may additionally
# require torch's deterministic-algorithm settings.
np.random.seed(SEED)
torch.manual_seed(SEED)
train_envs.seed(SEED)
test_envs.seed(SEED)

In [3]:
# model and optimiser
obs_shape = env.observation_space.shape
num_actions = env.action_space.n

# Both heads below are built on this same `net` object (two hidden layers of 64 units).
net = Net(obs_shape, hidden_sizes=[64, 64], device=device)

actor = Actor(net, num_actions, device=device)
actor = actor.to(device)

critic = Critic(net, device=device)
critic = critic.to(device)

# Wrap actor and critic together so one optimiser is created from the wrapper's parameters.
actor_critic = ActorCritic(actor, critic)
optim = torch.optim.Adam(actor_critic.parameters(), lr=0.0003)

In [4]:
# PPO policy
# The action space is discrete (see `env.action_space.n` above), so actions are
# drawn from a categorical distribution.
dist = torch.distributions.Categorical
policy = PPOPolicy(
    actor,
    critic,
    optim,
    dist,
    action_space=env.action_space,
    deterministic_eval=True,
)

In [5]:
# collector
# Storage for training rollouts: sized 20000, split across the training environments.
train_buffer = VectorReplayBuffer(20000, len(train_envs))
train_collector = Collector(policy, train_envs, train_buffer)

# The evaluation collector is created without an explicit buffer.
test_collector = Collector(policy, test_envs)

In [6]:
# trainer
def reached_target(mean_reward):
    """Return True once the mean test reward is at least 195, which stops training."""
    return mean_reward >= 195


# Hyper-parameters handed to the on-policy trainer, grouped in one place for easy tuning.
trainer_settings = {
    "max_epoch": 10,
    "step_per_epoch": 50000,
    "repeat_per_collect": 10,
    "episode_per_test": 10,
    "batch_size": 256,
    "step_per_collect": 2000,
}

result = onpolicy_trainer(
    policy,
    train_collector,
    test_collector,
    stop_fn=reached_target,
    **trainer_settings,
)
print(result)

Epoch #1: 50001it [00:10, 4937.64it/s, env_step=50000, len=136, loss=31.220, loss/clip=-0.005, loss/ent=0.555, loss/vf=62.460, n/ep=11, n/st=2000, rew=136.27]                           


Epoch #1: test_reward: 146.700000 ± 6.148984, best_reward: 146.700000 ± 6.148984 in #1


Epoch #2:  60%|######    | 30000/50000 [00:05<00:03, 5791.43it/s, env_step=80000, len=199, n/ep=9, n/st=2000, rew=199.22]                                                                

{'duration': '15.48s', 'train_time/model': '9.57s', 'test_step': 3561, 'test_episode': 30, 'test_time': '0.34s', 'test_speed': '10490.51 step/s', 'best_reward': 200.0, 'best_result': '200.00 ± 0.00', 'train_step': 80000, 'train_episode': 1367, 'train_time/collector': '5.57s', 'train_speed': '5284.40 step/s'}





In [7]:
# evaluate the trained policy on one test episode (rendering is switched off)
policy.eval()
result = test_collector.collect(n_episode=1, render=False)

# Average the per-episode rewards and lengths reported by the collector.
mean_reward = result["rews"].mean()
mean_length = result["lens"].mean()
print("Final reward: {}, length: {}".format(mean_reward, mean_length))

Final reward: 200.0, length: 200.0
