In [1]:
import os
import logging
import torch

from pufferlib.vectorization import Serial, Multiprocessing
from pufferlib.policy_store import DirectoryPolicyStore
from pufferlib.frameworks import cleanrl

import environment

from reinforcement_learning import clean_pufferl, policy, config

# Pickled curriculum files (tasks with embeddings). The paths are relative;
# presumably they are resolved against the repo root / working directory — TODO confirm.
# NOTE: this file changes when running curriculum generation track
# Run test_task_encoder.py to regenerate this file (or get it from the repo)
BASELINE_CURRICULUM_FILE = "reinforcement_learning/curriculum_with_embedding.pkl"
# Target passed as save_to_file= to get_task_embedding() in curriculum_generation_track().
CUSTOM_CURRICULUM_FILE = "curriculum_generation/custom_curriculum_with_embedding.pkl"

In [2]:
def setup_env(args):
    """Create the run directory and build a CleanPuffeRL trainer from ``args``.

    Args:
        args: Configuration object. This function reads ``runs_dir``,
            ``run_name``, ``policy_store_dir``, the policy sizes
            (``input_size``, ``hidden_size``, ``task_size``), ``device``,
            ``seed``, the wandb fields, and the vectorization / rollout / PPO
            settings forwarded to the trainer below. When
            ``policy_store_dir`` is None it is set in place to
            ``<run_dir>/policy_store``.

    Returns:
        The constructed ``clean_pufferl.CleanPuffeRL`` trainer.
    """
    run_dir = os.path.join(args.runs_dir, args.run_name)
    os.makedirs(run_dir, exist_ok=True)
    logging.info("Training run: %s (%s)", args.run_name, run_dir)
    logging.info("Training args: %s", args)

    # Fall back to a store inside the run directory when none was configured.
    if args.policy_store_dir is None:
        args.policy_store_dir = os.path.join(run_dir, "policy_store")
    # Build the store in both cases. Previously it was only created inside the
    # fallback branch, so an explicitly configured policy_store_dir was
    # ignored and policy_store=None was handed to the trainer.
    logging.info("Using policy store from %s", args.policy_store_dir)
    policy_store = DirectoryPolicyStore(args.policy_store_dir)

    def make_policy(envs):
        """Build the Baseline policy for ``envs.driver_env`` and wrap it for cleanrl."""
        learner_policy = policy.Baseline(
            envs.driver_env,
            input_size=args.input_size,
            hidden_size=args.hidden_size,
            task_size=args.task_size
        )
        return cleanrl.Policy(learner_policy)

    trainer = clean_pufferl.CleanPuffeRL(
        device=torch.device(args.device),
        seed=args.seed,
        env_creator=environment.make_env_creator(args),
        env_creator_kwargs={},
        agent_creator=make_policy,
        data_dir=run_dir,
        exp_name=args.run_name,
        policy_store=policy_store,
        wandb_entity=args.wandb_entity,
        wandb_project=args.wandb_project,
        wandb_extra_data=args,
        checkpoint_interval=args.checkpoint_interval,
        vectorization=Serial if args.use_serial_vecenv else Multiprocessing,
        total_timesteps=args.train_num_steps,
        num_envs=args.num_envs,
        # A falsy num_cores (None or 0) falls back to one core per environment.
        num_cores=args.num_cores or args.num_envs,
        num_buffers=args.num_buffers,
        batch_size=args.rollout_batch_size,
        learning_rate=args.ppo_learning_rate,
        selfplay_learner_weight=args.learner_weight,
        # The learner itself plus up to max_opponent_policies opponents.
        selfplay_num_policies=args.max_opponent_policies + 1,
        #record_loss = args.record_loss,
    )
    return trainer

def reinforcement_learning_track(trainer, args):
    """Alternate evaluation and training until the trainer reports completion.

    Args:
        trainer: Object exposing ``done_training()``, ``evaluate()`` and
            ``train(...)``.
        args: Configuration providing ``ppo_update_epochs``, ``bptt_horizon``,
            ``ppo_training_batch_size`` and ``clip_coef``.
    """
    while True:
        if trainer.done_training():
            return
        trainer.evaluate()
        horizon = args.bptt_horizon
        # The training batch is expressed in rows of `horizon` steps each.
        rows_per_batch = args.ppo_training_batch_size // horizon
        trainer.train(
            update_epochs=args.ppo_update_epochs,
            bptt_horizon=horizon,
            batch_rows=rows_per_batch,
            clip_coef=args.clip_coef,
        )

def curriculum_generation_track(trainer, args, use_elm=True):
    """Build a custom curriculum, write its task embeddings, then train on it.

    With ``use_elm=True`` new tasks are evolved from ``manual_curriculum`` over
    several rounds, each round writing embeddings and evaluating the trainer to
    update task statistics; the final curriculum is sampled from the task
    generator. Otherwise the curriculum from ``curriculum_tutorial`` is used
    as-is. In both cases the embeddings are saved to ``CUSTOM_CURRICULUM_FILE``
    and ``reinforcement_learning_track`` is run afterwards.

    Args:
        trainer: Trainer used for the intermediate evaluations and for the
            final training run.
        args: Configuration forwarded to ``reinforcement_learning_track``.
        use_elm: Whether to generate tasks with ``OpenELMTaskGenerator``.
    """
    from curriculum_generation.task_encoder import TaskEncoder
    LLM_CHECKPOINT = "Salesforce/codegen25-7b-instruct"

    task_encoder = None
    try:
        if use_elm:
            from curriculum_generation import manual_curriculum
            from curriculum_generation.elm import OpenELMTaskGenerator
            NUM_SEED_TASKS = 20
            NUM_NEW_TASKS = 5
            NUM_ELM_ROUNDS = 3
            ELM_DEBUG = True

            task_encoder = TaskEncoder(LLM_CHECKPOINT, manual_curriculum, batch_size=2)
            task_generator = OpenELMTaskGenerator(manual_curriculum.curriculum, LLM_CHECKPOINT)

            # Generating new tasks and evaluating all candidate training tasks
            for _ in range(NUM_ELM_ROUNDS):
                # NOTE: adjust NUM_SEED_TASKS to fit your gpu
                seed_task_list = task_generator.sample_tasks(NUM_SEED_TASKS, random_ratio=1)
                new_task_list = task_generator.evolve_tasks(seed_task_list, NUM_NEW_TASKS, debug=ELM_DEBUG)
                task_generator.add_tasks(new_task_list)
                task_encoder.get_task_embedding(seed_task_list + new_task_list, save_to_file=CUSTOM_CURRICULUM_FILE)
                # CHECK ME: the trainer will automatically use the new task embedding file
                _, _, infos = trainer.evaluate()
                task_generator.update(infos) # update the task stats

            # NOTE: sample_tasks() uses task stats to sample learnable tasks
            curriculum = task_generator.sample_tasks(NUM_SEED_TASKS*3, random_ratio=0.3) # NOTE: arbitrary numbers

        else:
            from curriculum_generation import curriculum_tutorial  # custom tutorial
            task_encoder = TaskEncoder(LLM_CHECKPOINT, curriculum_tutorial, batch_size=2)
            curriculum = curriculum_tutorial.curriculum

        # Use the train_task_spec to train agents
        task_encoder.get_task_embedding(curriculum, save_to_file=CUSTOM_CURRICULUM_FILE)
    finally:
        # Always close the encoder, including when generation, embedding or
        # evaluation raises (close() presumably releases the loaded model —
        # TODO confirm). Skipped only if the encoder was never constructed.
        if task_encoder is not None:
            task_encoder.close()

    # Clear the sort keys accumulated by the evaluate() calls above before training.
    trainer.data.sort_keys = []
    reinforcement_learning_track(trainer, args)

In [3]:
logging.basicConfig(level=logging.INFO)

# You can either edit the defaults in config.py or override fields on args here.
# NOTE(review): this binds the Config class itself (not an instance), so the
# assignments below set class-level attributes — confirm this is intended.
args = config.Config
args.local_mode = True
# Write runs under the working directory (or $NMMO_RUNS_DIR when set) rather
# than a hardcoded, machine-specific absolute path.
args.runs_dir = os.environ.get("NMMO_RUNS_DIR", os.path.join(os.getcwd(), "run"))

# Avoid OOMing your machine for local testing
if args.local_mode:
    args.num_envs = 1
    args.num_buffers = 1
    args.use_serial_vecenv = True
    args.rollout_batch_size = 2**10

In [10]:
# Echo the local_mode flag assigned in the configuration cell.
args.local_mode

True

In [4]:
# Select the baseline curriculum file, then build the trainer with setup_env().
# NOTE(review): tasks_path is presumably read by environment.make_env_creator(args) — confirm.
args.tasks_path = BASELINE_CURRICULUM_FILE
trainer = setup_env(args)

INFO:root:Training run: nmmo_20231111_235700 (/Users/qqxyyy/DeepLearning/nmmo/nmmo2023baseline/run/nmmo_20231111_235700)
INFO:root:Training args: <class 'reinforcement_learning.config.Config'>
INFO:root:Using policy store from /Users/qqxyyy/DeepLearning/nmmo/nmmo2023baseline/run/nmmo_20231111_235700/policy_store


Allocated 244.99 MB to environments. Only accurate for Serial backend.
PolicyPool sample_weights: [128]


  from .autonotebook import tqdm as notebook_tqdm


Allocated to storage - Pytorch: 0.00 GB, System: 0.10 GB


In [34]:
# Display the trainer's agent; its repr lists the policy's sub-modules.
trainer.agent

Policy(
  (policy): Baseline(
    (tile_encoder): TileEncoder(
      (embedding): Embedding(768, 32)
      (tile_conv_1): Conv2d(96, 32, kernel_size=(3, 3), stride=(1, 1))
      (tile_conv_2): Conv2d(32, 8, kernel_size=(3, 3), stride=(1, 1))
      (tile_fc): Linear(in_features=968, out_features=256, bias=True)
    )
    (player_encoder): PlayerEncoder(
      (embedding): Embedding(7936, 32)
      (agent_fc): Linear(in_features=992, out_features=256, bias=True)
      (my_agent_fc): Linear(in_features=992, out_features=256, bias=True)
    )
    (item_encoder): ItemEncoder(
      (embedding): Embedding(256, 32)
      (fc): Linear(in_features=76, out_features=256, bias=True)
    )
    (inventory_encoder): InventoryEncoder(
      (fc): Linear(in_features=3072, out_features=256, bias=True)
    )
    (market_encoder): MarketEncoder(
      (fc): Linear(in_features=256, out_features=256, bias=True)
    )
    (task_encoder): TaskEncoder(
      (fc): Linear(in_features=4096, out_features=256, bia

In [35]:
# List every attribute name available on the agent object.
dir(trainer.agent)

['T_destination',
 '__annotations__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_apply',
 '_backward_hooks',
 '_backward_pre_hooks',
 '_buffers',
 '_call_impl',
 '_compiled_call_impl',
 '_forward_hooks',
 '_forward_hooks_always_called',
 '_forward_hooks_with_kwargs',
 '_forward_pre_hooks',
 '_forward_pre_hooks_with_kwargs',
 '_get_backward_hooks',
 '_get_backward_pre_hooks',
 '_get_name',
 '_is_full_backward_hook',
 '_load_from_state_dict',
 '_load_state_dict_post_hooks',
 '_load_state_dict_pre_hooks',
 '_maybe_warn_non_full_backward_hook',
 '_modules',
 '_named_members',
 '_non_persistent_buffers_se

In [32]:
# Agent count reported by the trainer's first buffer.
trainer.buffers[0].num_agents

128

In [18]:
# Peek at the policy pool's env/agent counts. These are underscore-prefixed
# (private) attributes, so they may not be a stable API.
trainer.policy_pool._num_envs,trainer.policy_pool._num_agents

(1, 128)

In [19]:
# Check the agent's is_recurrent flag.
trainer.agent.is_recurrent

False

In [5]:
# Inspect the trainer's data namespace (by execution count, before the evaluate() cell).
trainer.data

namespace(buf=0,
          sort_keys=[],
          next_obs=[[]],
          next_done=[tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
                             0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
                             0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
                             0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
                             0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
                             0., 0., 0., 0., 0., 0., 0., 0.], device='mps:0')],
          next_lstm_state=[None],
          obs=tensor([[0., 0., 0.,  ..., 0., 0., 0.],
                      [0., 0., 0.,  ..., 0., 0., 0.],
                      [0., 0., 0.,  ..., 0., 0., 0.],
                      ...,
      

In [6]:
# Call evaluate() once; its return value is echoed as the cell output.
trainer.evaluate()

INFO:root:PolicyPool: Updated policies: dict_keys(['learner'])


Allocated during evaluation - Pytorch: 0.00 GB, System: 0.39 GB
Epoch: 0 - 1K steps - 0:00:53 Elapsed
	Steps Per Second: Env=1715, Inference=348


(namespace(buf=0,
           sort_keys=[(0, 0, 1),
                      (0, 1, 1),
                      (0, 2, 1),
                      (0, 3, 1),
                      (0, 4, 1),
                      (0, 5, 1),
                      (0, 6, 1),
                      (0, 7, 1),
                      (0, 8, 1),
                      (0, 9, 1),
                      (0, 10, 1),
                      (0, 11, 1),
                      (0, 12, 1),
                      (0, 13, 1),
                      (0, 14, 1),
                      (0, 15, 1),
                      (0, 16, 1),
                      (0, 17, 1),
                      (0, 18, 1),
                      (0, 19, 1),
                      (0, 20, 1),
                      (0, 21, 1),
                      (0, 22, 1),
                      (0, 23, 1),
                      (0, 24, 1),
                      (0, 25, 1),
                      (0, 26, 1),
                      (0, 27, 1),
                      (0, 28, 1),
      

In [11]:
# Inspect the data namespace again (by execution count, after the evaluate() cell).
trainer.data

namespace(buf=0,
          sort_keys=[(0, 0, 1),
                     (0, 1, 1),
                     (0, 2, 1),
                     (0, 3, 1),
                     (0, 4, 1),
                     (0, 5, 1),
                     (0, 6, 1),
                     (0, 7, 1),
                     (0, 8, 1),
                     (0, 9, 1),
                     (0, 10, 1),
                     (0, 11, 1),
                     (0, 12, 1),
                     (0, 13, 1),
                     (0, 14, 1),
                     (0, 15, 1),
                     (0, 16, 1),
                     (0, 17, 1),
                     (0, 18, 1),
                     (0, 19, 1),
                     (0, 20, 1),
                     (0, 21, 1),
                     (0, 22, 1),
                     (0, 23, 1),
                     (0, 24, 1),
                     (0, 25, 1),
                     (0, 26, 1),
                     (0, 27, 1),
                     (0, 28, 1),
                     (0, 29, 1),
   

In [8]:
# Load the memory_profiler IPython extension, which provides the %memit magic.
%load_ext memory_profiler

In [9]:
# NOTE(review): %memit is invoked without a statement to profile, so this
# presumably only reports the kernel's current peak memory — confirm intent.
%memit

peak memory: 282.71 MiB, increment: 0.01 MiB


In [25]:
import pufferlib

In [26]:
# Check whether the agent is an instance of pufferlib.models.Policy.
isinstance(trainer.agent,pufferlib.models.Policy)

False

In [28]:
# Check the concrete types: agent against cleanrl.Policy, policy_pool against PolicyPool.
isinstance(trainer.agent,cleanrl.Policy),isinstance(trainer.policy_pool,pufferlib.policy_pool.PolicyPool)

(True, True)

In [37]:
# List the attribute names defined on the cleanrl.Policy class.
dir(cleanrl.Policy)

['T_destination',
 '__annotations__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_apply',
 '_call_impl',
 '_compiled_call_impl',
 '_get_backward_hooks',
 '_get_backward_pre_hooks',
 '_get_name',
 '_load_from_state_dict',
 '_maybe_warn_non_full_backward_hook',
 '_named_members',
 '_register_load_state_dict_pre_hook',
 '_register_state_dict_hook',
 '_replicate_for_data_parallel',
 '_save_to_state_dict',
 '_slow_forward',
 '_version',
 '_wrapped_call_impl',
 'add_module',
 'apply',
 'bfloat16',
 'buffers',
 'call_super_init',
 'children',
 'compile',
 'cpu',
 'cuda',
 'double',
 'dump_patches',
 'eval',


In [12]:
# Batch size held by the trainer (setup_env passes args.rollout_batch_size as batch_size).
trainer.batch_size

1024

In [13]:
import torch
from torch.distributions import Categorical

# Build a categorical distribution over three outcomes
probs = torch.tensor([0.2, 0.3, 0.5])
dist = Categorical(probs)

# Draw ten samples from the distribution
samples = dist.sample(torch.Size([10]))
print(samples)

# Log-probability of each drawn sample
log_probs = dist.log_prob(samples)
print(log_probs)

tensor([2, 1, 2, 0, 1, 0, 2, 0, 2, 1])
tensor([-0.6931, -1.2040, -0.6931, -1.6094, -1.2040, -1.6094, -0.6931, -1.6094,
        -0.6931, -1.2040])


In [14]:
# List the attribute names on the policy wrapped by the agent.
dir(trainer.agent.policy)

['T_destination',
 '__annotations__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_apply',
 '_backward_hooks',
 '_backward_pre_hooks',
 '_buffers',
 '_call_impl',
 '_compiled_call_impl',
 '_forward_hooks',
 '_forward_hooks_always_called',
 '_forward_hooks_with_kwargs',
 '_forward_pre_hooks',
 '_forward_pre_hooks_with_kwargs',
 '_get_backward_hooks',
 '_get_backward_pre_hooks',
 '_get_name',
 '_is_full_backward_hook',
 '_load_from_state_dict',
 '_load_state_dict_post_hooks',
 '_load_state_dict_pre_hooks',
 '_maybe_warn_non_full_backward_hook',
 '_modules',
 '_named_members',
 '_non_persistent_buffers_se

In [18]:
# Show the policy's task_encoder sub-module.
trainer.agent.policy.task_encoder

TaskEncoder(
  (fc): Linear(in_features=4096, out_features=256, bias=True)
)

In [19]:
# Shape of the observation tensor stored in the trainer's data namespace.
trainer.data.obs.shape

torch.Size([1025, 26035])

In [33]:
# Shape of the action tensor stored in the trainer's data namespace.
trainer.data.actions.shape

torch.Size([1025, 12])

In [25]:
# Keep a handle to the first env object inside the first buffer for inspection.
multienv=trainer.buffers[0].envs[0]

In [28]:
# Show the flat observation space and its structure for the first wrapped env.
multienv.envs[0].flat_observation_space,multienv.envs[0].flat_observation_structure

({'DActionTargets.DAttack.DStyle.V': Box(0, 1, (3,), int8),
  'DActionTargets.DAttack.DTarget.V': Box(0, 1, (101,), int8),
  'DActionTargets.DBuy.DMarketItem.V': Box(0, 1, (1025,), int8),
  'DActionTargets.DDestroy.DInventoryItem.V': Box(0, 1, (13,), int8),
  'DActionTargets.DGive.DInventoryItem.V': Box(0, 1, (13,), int8),
  'DActionTargets.DGive.DTarget.V': Box(0, 1, (101,), int8),
  'DActionTargets.DGiveGold.DPrice.V': Box(0, 1, (99,), int8),
  'DActionTargets.DGiveGold.DTarget.V': Box(0, 1, (101,), int8),
  'DActionTargets.DMove.DDirection.V': Box(0, 1, (5,), int8),
  'DActionTargets.DSell.DInventoryItem.V': Box(0, 1, (13,), int8),
  'DActionTargets.DSell.DPrice.V': Box(0, 1, (99,), int8),
  'DActionTargets.DUse.DInventoryItem.V': Box(0, 1, (13,), int8),
  'DAgentId.V': Discrete(129),
  'DCurrentTick.V': Discrete(1025),
  'DEntity.V': Box(-32768, 32767, (100, 31), int16),
  'DInventory.V': Box(-32768, 32767, (12, 16), int16),
  'DMarket.V': Box(-32768, 32767, (1024, 16), int16),
  '

In [32]:
# Per-agent observation and action spaces exposed by the first buffer.
trainer.buffers[0].single_observation_space,trainer.buffers[0].single_action_space

(Box(-3.4028234663852886e+38, 3.4028234663852886e+38, (26035,), float64),
 MultiDiscrete([   3  101 1025   13   13  101   99  101    5   13   99   13]))