In [1]:
import numpy as np

import ray
from ray.tune import register_env
from ray.tune.logger import pretty_print
from ray.rllib.agents.ppo import PPOTrainer, DEFAULT_CONFIG

In [2]:
info = ray.init(ignore_reinit_error=True)

2021-02-06 14:55:31,500	INFO services.py:1171 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


In [3]:
from utils import retro_wrappers

import retro

In [4]:
def retro_env_creator(game, state):
    base = retro.make(game=game, state=state)
    base = retro_wrappers.wrap_megaman(base)
    return base

In [5]:
def register_retro(game, state):
    env_creator = lambda env_config: retro_env_creator(game, state)
    register_env(game, env_creator)

In [6]:
trainer_config = DEFAULT_CONFIG.copy()
trainer_config['framework'] = 'torch'
trainer_config['num_workers'] = 1
trainer_config["train_batch_size"] = 400
trainer_config["sgd_minibatch_size"] = 64
trainer_config["num_sgd_iter"] = 10

In [7]:
def train(env, config=trainer_config, checkpoint=None, iterations=1000000):
    agent = PPOTrainer(config=config, env=env)

    for i in range(iterations):
        result = agent.train()
        print(pretty_print(result))

        if i % 50 == 0:
            checkpoint = agent.save()
            print("checkpoint salvo em", checkpoint)

In [8]:
register_retro("MegaMan2-Nes", "Normal.Metalman.Fight.state")

In [9]:
trainer = train("MegaMan2-Nes")

22378968
  mean_raw_obs_processing_ms: 0.24782315523846066
time_since_restore: 23036.13481926918
time_this_iter_s: 77.75088739395142
time_total_s: 23036.13481926918
timers:
  learn_throughput: 6.785
  learn_time_ms: 58949.584
  sample_throughput: 76.557
  sample_time_ms: 5224.832
  update_time_ms: 5.815
timestamp: 1612657196
timesteps_since_restore: 0
timesteps_total: 158400
training_iteration: 396

custom_metrics: {}
date: 2021-02-06_21-21-13
done: false
episode_len_mean: 368.96
episode_reward_max: 720.0
episode_reward_mean: 375.65
episode_reward_min: -20.0
episodes_this_iter: 1
episodes_total: 440
experiment_id: 75b89c0b5aae424298536ee0dbb13c02
hostname: debian
info:
  learner:
    default_policy:
      allreduce_latency: 0.0
      cur_kl_coeff: 0.2536541107416499
      cur_lr: 5.000000000000001e-05
      entropy: 0.9954563634736198
      entropy_coeff: 0.0
      kl: 0.022274833704744066
      policy_loss: -0.00725916214287281
      total_loss: 2137.948556082589
      vf_explained_va

KeyboardInterrupt: 

In [36]:
def test(config=trainer_config, checkpoint=None, testdelay=0, render=False, envcreator=None, maxepisodelen=10000000):
    """Tests and renders a previously trained model"""

    agent = PPOTrainer(config=config, env='MegaMan2-Nes')
    if checkpoint is None:
        raise ValEuerror(f"A previously trained checkpoint must be provided for algorithm {alg}")
    agent.restore(checkpoint)

    game_rom = "MegaMan2-Nes" #Nome da rom
    state = "Normal.Metalman.Fight.state" 
    scenario = "scenario"
    env = retro.make(game_rom, state=state, scenario=scenario)
    env = retro_wrappers.wrap_megaman(env)

    while True:
        state = env.reset()
        done = False
        reward_total = 0.0
        step = 0
        while not done and step < maxepisodelen:
            action = agent.compute_action(state)
            next_state, reward, done, _ = env.step(action)
            reward_total += reward
            if render:
                env.render()
            state = next_state
            step = step + 1
        print("Episode reward", reward_total)


In [37]:
s = "/home/nelson/ray_results/PPO_MegaMan2-Nes_2021-02-06_14-55-43pae6ulzr/checkpoint_401/checkpoint-401"
test(checkpoint=s, render=True)

2021-02-06 22:17:22,661	INFO trainable.py:328 -- Restored on 192.168.0.3 from checkpoint: /home/nelson/ray_results/PPO_MegaMan2-Nes_2021-02-06_14-55-43pae6ulzr/checkpoint_401/checkpoint-401
2021-02-06 22:17:22,666	INFO trainable.py:336 -- Current state after restoring: {'_iteration': 401, '_timesteps_total': None, '_time_total': 23422.335732460022, '_episodes_total': 444}
Episode reward 675.0
Episode reward 675.0
Episode reward 660.0
Episode reward 100.0
Episode reward 660.0
Episode reward 645.0
Episode reward 690.0
Episode reward 645.0
Episode reward 675.0
Episode reward 675.0
Episode reward 690.0
Episode reward 660.0
Episode reward 660.0
Episode reward 100.0
Episode reward 675.0
Episode reward 100.0
Episode reward 675.0
Episode reward 640.0
Episode reward 100.0
Episode reward 60.0
Episode reward 120.0
Episode reward 120.0
Episode reward 660.0
Episode reward 690.0
Episode reward 120.0
Episode reward 690.0
Episode reward 120.0
Episode reward 675.0
Episode reward 645.0
Episode reward 66

KeyboardInterrupt: 

In [12]:
ray.shutdown()

In [35]:
env.close()

NameError: name 'env' is not defined