In [1]:
import yuzu as yz

In [2]:
from yuzu import torch_graph

In [3]:
# Initialise yuzu's torch_graph module before the cells below use yuzu.
# NOTE(review): what init() actually sets up is defined inside yuzu and is not visible here — confirm.
torch_graph.init()

In [4]:
import sys
import os

import torch
from dataclasses import dataclass, field
from abc import ABC, abstractmethod
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import gym
from typing import Any, NamedTuple
from yuzu import rllib

In [5]:
# Gym environment id; create_env() passes it to gym.make().
envname = 'Taxi-v3'
class ClipRewardEnv(gym.RewardWrapper):
    """
    Scales every reward by a constant factor of 1/100.

    Despite the class name, no clipping or sign extraction happens here:
    the reward coming from the wrapped environment is only divided by 100.

    Args:
        env (gym.Env): The environment to wrap
    """

    def __init__(self, env: gym.Env):
        super().__init__(env)

    def reward(self, reward: float) -> float:
        """Return the wrapped environment's reward divided by 100."""
        return reward / 100

def create_env(mode='rgb_array'):
    """Build the wrapped environment used for training.

    Args:
        mode: Value forwarded to ``gym.make`` as ``render_mode``.
            Defaults to ``'rgb_array'``.

    Returns:
        The ``envname`` environment wrapped, in order, by
        ``rllib.OneHotObservationWrapper``, ``rllib.YuzuObservationWrapper``
        and ``ClipRewardEnv``.
    """
    # Forward the caller's choice; previously ``mode`` was accepted but a
    # hard-coded 'rgb_array' was always used.
    env = gym.make(envname, render_mode=mode)
    env = rllib.OneHotObservationWrapper(env)
    env = rllib.YuzuObservationWrapper(env)
    env = ClipRewardEnv(env)
    return env


# Module-level environment instance; later cells read its observation and
# action spaces to size the network.
env = create_env()

In [6]:
# Default width of the two hidden layers.
N = 64


class NeuralNetwork(nn.Module):
    """Fully connected network: obs_size -> hidden -> hidden -> act_size, with ReLU between layers."""

    def __init__(self, obs_size, act_size, hidden_size=None):
        """
        Args:
            obs_size: Number of input features.
            act_size: Number of outputs.
            hidden_size: Width of both hidden layers. When omitted, the
                module-level ``N`` is looked up at construction time, which
                matches the previous behaviour.
        """
        super().__init__()
        width = N if hidden_size is None else hidden_size
        self.net = nn.Sequential(
            nn.Linear(obs_size, width),
            nn.ReLU(),
            nn.Linear(width, width),
            nn.ReLU(),
            nn.Linear(width, act_size),
        )

    def forward(self, x):
        """Run the network on a yuzu-wrapped input and return a yuzu-wrapped output.

        ``x`` must expose ``.inner()``; the value it returns is fed to the
        layers and the result is passed through ``yz.wrap``.
        """
        x = x.inner()
        if x.ndim == 1:
            # Promote a single 1-D sample to a batch with one row.
            x = torch.reshape(x, (-1,) + x.shape)
        return yz.wrap(self.net(x))


def create_model():
    """Build a fresh NeuralNetwork sized from the module-level ``env``'s spaces."""
    n_inputs = env.observation_space.shape[0]
    n_actions = env.action_space.n
    return NeuralNetwork(n_inputs, n_actions)

In [7]:
import torch
from torch.utils.tensorboard import SummaryWriter

# Step counters for the TensorBoard scalars; objective() resets them on every call.
ri = 0
li = 0


def objective(config):
    """Train a DQN agent with the given hyperparameters and report its score.

    Args:
        config: Mapping that must contain the keys ``lr``, ``gamma``,
            ``replay_buf_size``, ``batch_size``, ``train_count``,
            ``update_interval``, ``eps_decay`` and ``N``.
            NOTE: ``N`` is only written into the run label; the network is
            built by the module-level ``create_model()`` and does not read it.

    Returns:
        dict with
            ``score``: mean of the last 128 entries of ``stats.reward_history``,
            ``model``: the network whose parameters were handed to the optimizer,
            ``agent``: the ``rllib.DQNAgent`` that was trained.

    Side effects:
        Resets and advances the module-level ``ri`` / ``li`` counters and
        writes TensorBoard event files.
    """
    global ri
    global li
    ri = 0
    li = 0
    suffix = "lr={},gamma={},replay_buf_size={},batch_size={},train_count={},update_interval={},eps_decay={},N={}".format(
        config["lr"], config["gamma"], config["replay_buf_size"], config["batch_size"], config["train_count"],
        config["update_interval"], config["eps_decay"], config['N'])
    # ``comment`` is appended to SummaryWriter's default run directory name,
    # so each run is labelled with its hyperparameters without needing an
    # explicit log directory.
    writer = SummaryWriter(comment="_" + suffix)

    def add_reward(x):
        """Log one reward value and advance the reward step counter."""
        global ri
        writer.add_scalar("Reward/train", x, ri)
        ri += 1

    def add_loss(x):
        """Log every entry of a training report dict and advance the loss step counter."""
        global li
        for k, v in x.items():
            writer.add_scalar("Loss/" + k, v, li)
        li += 1

    try:
        model = create_model()
        # NOTE(review): the second network is presumably the target network and
        # 1.0 / 0.0 presumably the epsilon start / end values — confirm against
        # rllib.DQNAgent. (A stray "#300000" note sat here in the original,
        # possibly an earlier eps_decay value.)
        agent = rllib.DQNAgent(model, create_model(), None, env.action_space, 1.0, 0.0, config["eps_decay"])
        optimizer = torch.optim.Adam(model.parameters(), lr=config["lr"])
        options = rllib.DQNOptions(optimizer)
        options.replay_buf_size = config["replay_buf_size"]
        options.num_steps = 1024000
        options.double_dqn = True
        options.max_episode_len = 128
        options.gamma = config["gamma"]
        options.update_interval = config["update_interval"]
        options.batch_size = config["batch_size"]
        options.train_count = config["train_count"]
        options.train_interval = 128
        options.report_reward = add_reward
        options.report_train = add_loss
        stats = rllib.dqn_train(create_env, agent, None, options)
    finally:
        # Flush and release the event file even if training fails or is
        # interrupted from the keyboard.
        writer.close()

    score = np.mean(stats.reward_history[-128:])
    return {"score": score, "model": model, "agent": agent}

# Run a single training session with a fixed hyperparameter set and keep only
# the trained agent from the returned dict.
agent = objective(
    {
        "lr": 0.001,
        "gamma": 0.9,
        "replay_buf_size": 256 * 128,
        "batch_size": 128,
        "N": 64,  # objective() only formats this into its suffix string
        "train_count": 1,
        "update_interval": 1000,
        "eps_decay": 700000,
    }
)["agent"]

  if not isinstance(terminated, (bool, np.bool8)):


KeyboardInterrupt: 