In [None]:
from tianshou_drl.tianshou_ddpg import DDPGTrainer
from tianshou_setup import get_env_continous

# Build the DDPG trainer. The environment factory is handed over uncalled
# (the function object itself, not an environment instance).
ddpgt = DDPGTrainer(get_env_continous)

In [None]:
# Collect exactly one episode with the trainer's training collector.
ddpgt.train_collector.collect(n_episode=1)

In [None]:
# Build a fresh environment from the factory and display its `agents` attribute.
get_env_continous().agents

In [None]:
# Display the policies held by the trainer's policy manager.
ddpgt.policy_manager.policies

In [None]:
# Draw a sample of size 1 from the training buffer and display the `obs`
# field of the first element of what `sample` returns.
ddpgt.train_collector.buffer.sample(1)[0].obs

In [None]:
import torch

In [None]:
# `obs` was never assigned anywhere in this notebook, so this cell raised
# NameError on a fresh kernel. Bind it from a buffer sample, the same
# expression the earlier inspection cell displays.
# NOTE(review): assumes the sampled `obs` is a numeric array-like that
# torch.as_tensor can convert — confirm against the buffer's obs layout.
obs = ddpgt.train_collector.buffer.sample(1)[0].obs
torch.as_tensor(obs, dtype=torch.float32)

In [None]:
# Run an update of the policy manager against the training collector's buffer.
# NOTE(review): 100 is presumably the number of transitions to sample for
# the update — confirm against the signature of `policy_manager.update`.
ddpgt.policy_manager.update(100,ddpgt.train_collector.buffer)

In [None]:
# Start training through the trainer object.
# NOTE(review): the method name is spelled "onploicy_train"; confirm that it
# matches the name actually defined on DDPGTrainer (possible typo of
# "onpolicy_train").
ddpgt.onploicy_train()

In [None]:
from argparse import ArgumentParser

import os
import time

from malib.runner import run
from malib.agent import IndependentAgent
from malib.scenarios.psro_scenario import PSROScenario
from malib.rl.dqn import DQNPolicy, DQNTrainer, DEFAULT_CONFIG
from malib.rollout.envs.open_spiel import env_desc_gen
from malib.rollout.envs.env import Wrapper
from malib_setup import PettingZooEnv



In [None]:
# Instantiate the project's PettingZooEnv wrapper (imported from malib_setup).
# NOTE(review): it is constructed without arguments here, whereas the
# get_env() helper defined further down passes it a wrapped environment —
# confirm which call form is intended.
env = PettingZooEnv()

In [None]:
class PSROTrainer:
    """Assemble a malib PSRO scenario that uses DQN and run it.

    The constructor only prepares configuration dictionaries and the log
    directory; no training happens until :meth:`train` is called.
    """

    def __init__(self, log_dir="./logs/", env=None):
        """Prepare training/rollout configuration and create the run directory.

        Args:
            log_dir: Base directory for logs. A ``psro/<timestamp>``
                sub-directory is created beneath it for this run.
            env: Object forwarded to ``env_desc_gen`` to build the
                environment description. Note that ``log_dir`` is the first
                positional parameter, so pass this one by keyword.
        """
        self.log_dir = log_dir
        self.env = env
        # Shallow copy: overriding a top-level key below leaves DEFAULT_CONFIG
        # itself untouched.
        self.trainer_config = DEFAULT_CONFIG["training_config"].copy()
        self.trainer_config["total_timesteps"] = int(1e6)
        self.training_config = {
            "type": IndependentAgent,
            "trainer_config": self.trainer_config,
            "custom_config": {},
        }
        self.rollout_config = {
            "fragment_length": 2000,  # every thread
            "max_step": 200,
            "num_eval_episodes": 10,
            "num_threads": 2,
            "num_env_per_thread": 10,
            "num_eval_threads": 1,
            "use_subproc_env": False,
            "batch_mode": "time_step",
            "postprocessor_types": ["defaults"],
            # every # rollout epoch run evaluation.
            "eval_interval": 1,
            "inference_server": "ray",  # three kinds of inference server: `local`, `pipe` and `ray`
        }
        # Identity mapping: every environment agent is used as its own key.
        self.agent_mapping_func = lambda agent: agent
        self.algorithms = {
            "default": (
                DQNPolicy,
                DQNTrainer,
                # model configuration, None for default
                {},
                {},
            )
        }
        # NOTE(review): env_desc_gen is imported from malib's open_spiel
        # package but receives whatever `env` the caller passed — confirm it
        # accepts that object.
        self.env_description = env_desc_gen(env)
        self.runtime_logdir = os.path.join(self.log_dir, f"psro/{time.time()}")
        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test.
        os.makedirs(self.runtime_logdir, exist_ok=True)

    def train(self):
        """Build the PSROScenario from the stored configuration and run it."""
        scenario = PSROScenario(
            name="psro",
            log_dir=self.runtime_logdir,
            algorithms=self.algorithms,
            env_description=self.env_description,
            training_config=self.training_config,
            rollout_config=self.rollout_config,
            # control the outer loop.
            global_stopping_conditions={"max_iteration": 50},
            agent_mapping_func=self.agent_mapping_func,
            # for the training of best response.
            stopping_conditions={
                "training": {"max_iteration": int(1e4)},
                "rollout": {"max_iteration": 100},
            },
        )

        run(scenario)


In [None]:
from environment.Environment import VehicleJobSchedulingEnvACE, VehicleJobSchedulingParameters
from pettingzoo.utils.wrappers import BaseWrapper
def get_env():
    """Create the vehicle job-scheduling environment wrapped for training.

    A new VehicleJobSchedulingEnvACE is wrapped in pettingzoo's BaseWrapper
    and then in PettingZooEnv; the outermost wrapper is returned.

    Note: a later cell in this notebook defines another ``get_env``; once
    that cell has run, it replaces this definition in the kernel namespace.
    """
    return PettingZooEnv(BaseWrapper(VehicleJobSchedulingEnvACE()))

In [None]:
# Pass the environment by keyword: PSROTrainer's first positional parameter
# is `log_dir`, so a positional `env` would be used as the log directory and
# the trainer's own `env` would stay None.
psro = PSROTrainer(env=env)

In [None]:
# Build the PSRO scenario from the trainer's stored configuration and run it.
psro.train()

In [None]:
import numpy as np
from numba import cuda
from numba.cuda.random import create_xoroshiro128p_states, xoroshiro128p_uniform_float32
import hashlib

# Define the size and capacity of the hash table
# SIZE is the number of key/value pairs generated further down; CAPACITY is
# the length of the probe table (twice SIZE).
SIZE = 1000000
CAPACITY = 2 * SIZE

# Define a simple hash function using hashlib.md5
# NOTE(review): this is declared as a CUDA device function, yet its body uses
# hashlib and str.encode, which are host-side Python object operations.
# It is unlikely that Numba can compile this for the GPU — confirm, and
# consider an integer-arithmetic hash over numeric keys instead.
@cuda.jit(device=True)
def hash_func(key):
    """Return an index in [0, CAPACITY) from the MD5 hex digest of the encoded key."""
    return int(hashlib.md5(key.encode()).hexdigest(), 16) % CAPACITY

# Define a kernel function to insert key-value pairs into the hash table
# One thread handles one (keys[i], values[i]) pair and probes `table` linearly,
# wrapping at CAPACITY.
# NOTE(review): the `states` parameter is never read in the body.
# NOTE(review): `cuda.atomic.add(table, hash, 1)` increments table[hash] on
# every probe, so table entries behave like visit counters rather than stored
# slot indices; the value derived from it is then used to index keys/values,
# including when it equals -1. This looks inconsistent with the intent stated
# in the comments — verify the algorithm (a compare-and-swap that claims an
# empty entry is the usual approach).
@cuda.jit
def insert_kernel(keys, values, table, states):
    # Get the thread index
    i = cuda.grid(1)
    # Check the bounds
    if i < keys.size:
        # Get the key and value
        key = keys[i]
        value = values[i]
        # Compute the hash of the key
        hash = hash_func(key)
        # Probe the table until an empty slot is found
        # NOTE(review): there is no exit besides the `break` below; confirm
        # the loop always terminates.
        while True:
            # Get the slot index using atomic add to avoid collisions
            slot = cuda.atomic.add(table, hash, 1) - 1
            # Check if the slot is empty or occupied by the same key
            if slot == -1 or keys[slot] == key:
                # Store the key and value in the slot
                # (this overwrites entries of the input arrays themselves)
                keys[slot] = key
                values[slot] = value
                break
            # Increment the hash and wrap around the table capacity
            hash += 1
            if hash == CAPACITY:
                hash = 0

# Define a kernel function to lookup values by keys from the hash table
# One thread handles index i: it reads keys[i] as the query, probes `table`
# linearly from the key's hash, and writes the outcome to results[i].
# NOTE(review): the same `keys` array supplies both the query (keys[i]) and
# the stored keys (keys[slot]); there is no separate query-key parameter.
# NOTE(review): the bounds check uses keys.size, not results.size — confirm
# the two arrays always have the same length at the call site.
@cuda.jit
def lookup_kernel(keys, values, table, results):
    # Get the thread index
    i = cuda.grid(1)
    # Check the bounds
    if i < keys.size:
        # Get the key
        key = keys[i]
        # Compute the hash of the key
        hash = hash_func(key)
        # Probe the table until the key is found or an empty slot is encountered
        while True:
            # Get the slot index from the table
            slot = table[hash]
            # Check if the slot is empty
            if slot == -1:
                # The key is not in the table, return None
                # NOTE(review): the caller allocates `results` as float64;
                # storing None into a numeric array is probably not valid in
                # a kernel — a sentinel such as NaN may be needed. Confirm.
                results[i] = None
                break
            # Check if the slot contains the key
            if keys[slot] == key:
                # The key is found, return the value
                results[i] = values[slot]
                break
            # Increment the hash and wrap around the table capacity
            hash += 1
            if hash == CAPACITY:
                hash = 0

# Create some random keys and values as numpy arrays
# NOTE(review): the keys are Python strings ("key0", "key1", ...), so `keys`
# is a NumPy string array — confirm that this dtype can be transferred to the
# device and compared inside the kernels.
keys = np.array([f"key{i}" for i in range(SIZE)])
values = np.random.rand(SIZE)

# Create a hash table as a numpy array of integers
# The table stores the slot indices for each hash value, initialized to -1
table = np.full(CAPACITY, -1, dtype=np.int32)

# Create a random number generator state for each thread
# (launch geometry: enough 256-thread blocks to cover SIZE elements)
# NOTE(review): rng_states is passed to insert_kernel, whose body never reads it.
threads_per_block = 256
blocks_per_grid = (SIZE + threads_per_block - 1) // threads_per_block
rng_states = create_xoroshiro128p_states(threads_per_block * blocks_per_grid, seed=1)

# Copy the keys, values and table to the device memory
d_keys = cuda.to_device(keys)
d_values = cuda.to_device(values)
d_table = cuda.to_device(table)

# Launch the insert kernel on the device
insert_kernel[blocks_per_grid, threads_per_block](d_keys, d_values, d_table, rng_states)

# Copy the table back to the host memory
table = d_table.copy_to_host()

# Print some statistics of the hash table
print(f"Hash table size: {SIZE}")
print(f"Hash table capacity: {CAPACITY}")
print(f"Hash table load factor: {np.count_nonzero(table != -1) / CAPACITY}")

# Create some query keys as numpy arrays, some are in the table and some are not
# (indices are drawn from [0, 2 * SIZE), while inserted keys cover [0, SIZE))
query_keys = np.array([f"key{i}" for i in np.random.randint(0, 2 * SIZE, 10)])

# Create a result array to store the lookup results
results = np.empty(10, dtype=np.float64)

# Copy the query keys and result array to the device memory
d_query_keys = cuda.to_device(query_keys)
d_results = cuda.to_device(results)

# Launch the lookup kernel on the device
# NOTE(review): d_query_keys is created above but never passed to the kernel;
# the launch receives d_keys (SIZE entries) together with d_results
# (10 entries) and reuses the SIZE-sized launch geometry. Confirm the intended
# arguments — as written, the printed pairs below are not the result of
# looking up query_keys.
lookup_kernel[blocks_per_grid, threads_per_block](d_keys, d_values, d_table, d_results)

# Copy the result array back to the host memory
results = d_results.copy_to_host()

# Print the lookup results
for key, result in zip(query_keys, results):
    print(f"{key} -> {result}")


In [None]:
import argparse
import os
from copy import deepcopy
from functools import partial
from typing import Optional, Tuple

import gymnasium
import numpy as np
import torch
from pettingzoo.classic import tictactoe_v3
from torch.utils.tensorboard import SummaryWriter

from tianshou.data import Collector, VectorReplayBuffer
from tianshou.env import DummyVectorEnv
from tianshou.env.pettingzoo_env import PettingZooEnv
from tianshou.policy import (
    BasePolicy,
    DQNPolicy,
    MultiAgentPolicyManager,
    RandomPolicy,
)
from tianshou.trainer import offpolicy_trainer
from tianshou.utils import TensorboardLogger
from tianshou.utils.net.common import Net


def get_env(render_mode: Optional[str] = None):
    """Create a tic-tac-toe environment wrapped in Tianshou's PettingZooEnv.

    Args:
        render_mode: Forwarded unchanged to ``tictactoe_v3.env``.

    Returns:
        The PettingZooEnv wrapping the newly created tic-tac-toe environment.
    """
    raw_env = tictactoe_v3.env(render_mode=render_mode)
    wrapped_env = PettingZooEnv(raw_env)
    return wrapped_env


def get_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the tic-tac-toe DQN experiment.

    Options are declared in a table and registered in that order, so the
    generated help output lists them in the same sequence.

    Returns:
        A new ``argparse.ArgumentParser`` with every experiment option added.
    """
    # Prefer the GPU when torch reports one, otherwise fall back to the CPU.
    default_device = "cuda" if torch.cuda.is_available() else "cpu"

    # (flag, keyword arguments for add_argument), in registration order.
    option_table = [
        ("--seed", dict(type=int, default=1626)),
        ("--eps-test", dict(type=float, default=0.05)),
        ("--eps-train", dict(type=float, default=0.1)),
        ("--buffer-size", dict(type=int, default=20000)),
        ("--lr", dict(type=float, default=1e-4)),
        (
            "--gamma",
            dict(type=float, default=0.9, help="a smaller gamma favors earlier win"),
        ),
        ("--n-step", dict(type=int, default=3)),
        ("--target-update-freq", dict(type=int, default=320)),
        ("--epoch", dict(type=int, default=50)),
        ("--step-per-epoch", dict(type=int, default=1000)),
        ("--step-per-collect", dict(type=int, default=10)),
        ("--update-per-step", dict(type=float, default=0.1)),
        ("--batch-size", dict(type=int, default=64)),
        ("--hidden-sizes", dict(type=int, nargs="*", default=[128, 128, 128, 128])),
        ("--training-num", dict(type=int, default=10)),
        ("--test-num", dict(type=int, default=10)),
        ("--logdir", dict(type=str, default="log")),
        ("--render", dict(type=float, default=0.1)),
        (
            "--win-rate",
            dict(
                type=float,
                default=0.6,
                help="the expected winning rate: Optimal policy can get 0.7",
            ),
        ),
        (
            "--watch",
            dict(
                default=False,
                action="store_true",
                help="no training, watch the play of pre-trained models",
            ),
        ),
        (
            "--agent-id",
            dict(
                type=int,
                default=2,
                help="the learned agent plays as the agent_id-th player. "
                "Choices are 1 and 2.",
            ),
        ),
        (
            "--resume-path",
            dict(
                type=str,
                default="",
                help="the path of agent pth file for resuming from a pre-trained agent",
            ),
        ),
        (
            "--opponent-path",
            dict(
                type=str,
                default="",
                help="the path of opponent agent pth file "
                "for resuming from a pre-trained agent",
            ),
        ),
        ("--device", dict(type=str, default=default_device)),
    ]

    cli = argparse.ArgumentParser()
    for flag, options in option_table:
        cli.add_argument(flag, **options)
    return cli


def get_args() -> argparse.Namespace:
    """Parse the process arguments with the experiment parser.

    Uses ``parse_known_args`` so that arguments the parser does not recognise
    are ignored instead of raising; only the namespace of recognised options
    is returned.
    """
    known, _unrecognised = get_parser().parse_known_args()
    return known


def get_agents(
    args: Optional[argparse.Namespace] = None,
    agent_learn: Optional[BasePolicy] = None,
    agent_opponent: Optional[BasePolicy] = None,
    optim: Optional[torch.optim.Optimizer] = None,
) -> Tuple[BasePolicy, Optional[torch.optim.Optimizer], list]:
    """Build the two-player policy manager for the tic-tac-toe experiment.

    Args:
        args: Experiment options. When omitted, they are parsed afresh with
            ``get_args()`` on each call. (Previously the default was
            evaluated once at definition time, so every call shared — and
            mutated — the same namespace.) ``state_shape`` and
            ``action_shape`` are written onto this namespace.
        agent_learn: Policy to train. When omitted, a DQN policy is created
            and, if ``args.resume_path`` is set, its weights are loaded.
        agent_opponent: Opponent policy. When omitted, it is a deep copy of
            the learning agent loaded from ``args.opponent_path`` if that is
            set, otherwise a ``RandomPolicy``.
        optim: Optimizer for a newly created learning agent. An Adam
            optimizer is created when omitted.

    Returns:
        ``(policy, optim, agents)``: the MultiAgentPolicyManager, the
        optimizer (``None`` if ``agent_learn`` was supplied without one), and
        the environment's agent list.
    """
    if args is None:
        args = get_args()

    env = get_env()
    # Dict observation spaces carry the actual observation under 'observation'.
    observation_space = env.observation_space['observation'] if isinstance(
        env.observation_space, gymnasium.spaces.Dict
    ) else env.observation_space
    args.state_shape = observation_space.shape or observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    if agent_learn is None:
        # model
        net = Net(
            args.state_shape,
            args.action_shape,
            hidden_sizes=args.hidden_sizes,
            device=args.device
        ).to(args.device)
        if optim is None:
            optim = torch.optim.Adam(net.parameters(), lr=args.lr)
        agent_learn = DQNPolicy(
            net,
            optim,
            args.gamma,
            args.n_step,
            target_update_freq=args.target_update_freq
        )
        if args.resume_path:
            # map_location lets a checkpoint saved on another device (e.g. a
            # GPU) be loaded onto the device selected for this run.
            agent_learn.load_state_dict(
                torch.load(args.resume_path, map_location=args.device)
            )

    if agent_opponent is None:
        if args.opponent_path:
            agent_opponent = deepcopy(agent_learn)
            agent_opponent.load_state_dict(
                torch.load(args.opponent_path, map_location=args.device)
            )
        else:
            agent_opponent = RandomPolicy()

    # agent_id is 1-based: 1 puts the learner first, anything else second.
    if args.agent_id == 1:
        agents = [agent_learn, agent_opponent]
    else:
        agents = [agent_opponent, agent_learn]
    policy = MultiAgentPolicyManager(agents, env)
    return policy, optim, env.agents


def train_agent(
    args: Optional[argparse.Namespace] = None,
    agent_learn: Optional[BasePolicy] = None,
    agent_opponent: Optional[BasePolicy] = None,
    optim: Optional[torch.optim.Optimizer] = None,
) -> Tuple[dict, BasePolicy]:
    """Train the learning agent with Tianshou's off-policy trainer.

    Args:
        args: Experiment options. When omitted, they are parsed afresh with
            ``get_args()`` on each call instead of sharing one namespace
            created at definition time.
        agent_learn: Optional pre-built learning policy (see ``get_agents``).
        agent_opponent: Optional pre-built opponent policy.
        optim: Optional optimizer forwarded to ``get_agents``.

    Returns:
        ``(result, learner)``: the value returned by ``offpolicy_trainer``
        and the learning agent's policy taken from the policy manager.
    """
    if args is None:
        args = get_args()

    train_envs = DummyVectorEnv([get_env for _ in range(args.training_num)])
    test_envs = DummyVectorEnv([get_env for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)

    policy, optim, agents = get_agents(
        args, agent_learn=agent_learn, agent_opponent=agent_opponent, optim=optim
    )

    # collector
    train_collector = Collector(
        policy,
        train_envs,
        VectorReplayBuffer(args.buffer_size, len(train_envs)),
        exploration_noise=True
    )
    test_collector = Collector(policy, test_envs, exploration_noise=True)
    # policy.set_eps(1)
    # Pre-fill the replay buffer before the trainer starts updating.
    train_collector.collect(n_step=args.batch_size * args.training_num)
    # log
    log_path = os.path.join(args.logdir, 'tic_tac_toe', 'dqn')
    writer = SummaryWriter(log_path)
    writer.add_text("args", str(args))
    logger = TensorboardLogger(writer)

    def save_best_fn(policy):
        """Save the learning agent's weights to the configured path."""
        if hasattr(args, 'model_save_path'):
            model_save_path = args.model_save_path
        else:
            model_save_path = os.path.join(
                args.logdir, 'tic_tac_toe', 'dqn', 'policy.pth'
            )
        torch.save(
            policy.policies[agents[args.agent_id - 1]].state_dict(), model_save_path
        )

    def stop_fn(mean_rewards):
        """Stop once the mean reward reaches the target win rate."""
        return mean_rewards >= args.win_rate

    def train_fn(epoch, env_step):
        """Set the learner's epsilon to the training value."""
        policy.policies[agents[args.agent_id - 1]].set_eps(args.eps_train)

    def test_fn(epoch, env_step):
        """Set the learner's epsilon to the testing value."""
        policy.policies[agents[args.agent_id - 1]].set_eps(args.eps_test)

    def reward_metric(rews):
        """Select the learner's column (agent_id is 1-based) from the rewards."""
        return rews[:, args.agent_id - 1]

    # trainer
    result = offpolicy_trainer(
        policy,
        train_collector,
        test_collector,
        args.epoch,
        args.step_per_epoch,
        args.step_per_collect,
        args.test_num,
        args.batch_size,
        train_fn=train_fn,
        test_fn=test_fn,
        stop_fn=stop_fn,
        save_best_fn=save_best_fn,
        update_per_step=args.update_per_step,
        logger=logger,
        test_in_train=False,
        reward_metric=reward_metric
    )

    return result, policy.policies[agents[args.agent_id - 1]]


def watch(
    args: Optional[argparse.Namespace] = None,
    agent_learn: Optional[BasePolicy] = None,
    agent_opponent: Optional[BasePolicy] = None,
) -> None:
    """Play one rendered episode with the given (or freshly built) agents.

    Args:
        args: Experiment options. When omitted, they are parsed afresh with
            ``get_args()`` on each call instead of sharing one namespace
            created at definition time.
        agent_learn: Optional learning policy forwarded to ``get_agents``.
        agent_opponent: Optional opponent policy forwarded to ``get_agents``.

    Prints the learner's mean reward and the mean episode length.
    """
    if args is None:
        args = get_args()

    env = DummyVectorEnv([partial(get_env, render_mode="human")])
    # The optimizer returned by get_agents is not needed for evaluation.
    policy, _, agents = get_agents(
        args, agent_learn=agent_learn, agent_opponent=agent_opponent
    )
    policy.eval()
    policy.policies[agents[args.agent_id - 1]].set_eps(args.eps_test)
    collector = Collector(policy, env, exploration_noise=True)
    result = collector.collect(n_episode=1, render=args.render)
    rews, lens = result["rews"], result["lens"]
    print(f"Final reward: {rews[:, args.agent_id - 1].mean()}, length: {lens.mean()}")

In [None]:
# Parse the experiment options and build the agents.
# get_agents returns (policy manager, optimizer, agent list), bound here to
# ma, a and b respectively.
arg = get_args()
ma,a,b = get_agents(arg)

In [None]:
# Display the agent list (third value returned by get_agents).
b

In [None]:
# Display the agent_id attribute of the policy registered under 'player_2'.
ma.policies['player_2'].agent_id

In [None]:
# Vectorise ten environments, each created by calling get_env.
train_envs = DummyVectorEnv([get_env for _ in range(10)])

In [None]:
# Collector that steps `train_envs` with the multi-agent policy and stores
# transitions in a vector replay buffer (total size 10000, one sub-buffer
# count per environment via len(train_envs)).
train_collector = Collector(
        ma,
        train_envs,
        VectorReplayBuffer(10000, len(train_envs)),
        exploration_noise=True
    )

In [None]:
# Collect 64 environment steps into the replay buffer.
train_collector.collect(n_step=64)

In [None]:
# Draw a sample of size 1 from the buffer and display the first element of
# what `sample` returns.
train_collector.buffer.sample(1)[0]