In [1]:
import numpy as np

import ray
from ray.tune import register_env
from ray.tune.logger import pretty_print
from ray.rllib.agents.ppo import PPOTrainer, DEFAULT_CONFIG

In [2]:
info = ray.init(ignore_reinit_error=True)

2021-02-07 18:39:08,701	INFO services.py:1171 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


In [3]:
from utils import retro_wrappers

import retro

In [4]:
def retro_env_creator(game, state):
    base = retro.make(game=game, state=state)
    base = retro_wrappers.wrap_megaman(base, transpose=False)
    return base

In [5]:
def register_retro(game, state):
    env_creator = lambda env_config: retro_env_creator(game, state)
    register_env(game, env_creator)

In [6]:
trainer_config = DEFAULT_CONFIG.copy()
trainer_config['framework'] = 'torch'
trainer_config['lambda'] = 0.95
trainer_config['kl_coeff'] = 0.5
trainer_config['clip_rewards'] = True
trainer_config['clip_param'] = 0.1
trainer_config['vf_clip_param'] = 10.0
trainer_config['entropy_coeff'] = 0.01
trainer_config['num_workers'] = 8
trainer_config["train_batch_size"] = 5000
trainer_config['rollout_fragment_length'] = 100
trainer_config['sgd_minibatch_size'] = 500
trainer_config['num_sgd_iter'] = 10
trainer_config['batch_mode'] = "truncate_episodes"
trainer_config['observation_filter'] = "NoFilter"
trainer_config['model']['vf_share_layers'] = True
trainer_config['num_gpus'] = 1

In [7]:
def train(env, config=trainer_config, checkpoint=None, iterations=1000000):
    agent = PPOTrainer(config=config, env=env)

    if checkpoint is not None:
        try:
            agent.restore(checkpoint)
            print(f"-------------------------------\n"
                  f"Resumed checkpoint {checkpoint}\n"
                  f"-------------------------------\n")
        except:
            print(f"------------------------\n"
                  f"Checkpoint not found: restarted policy network from scratch\n"
                  f"------------------------\n")

    s = "Iteração: {:3d}, Recompensas (Min/Mean/Max): {:6.2f}/{:6.2f}/{:6.2f}, Duração Média: {:6.2f}, Checkpoint: {}"

    for i in range(iterations):
        result = agent.train()
          print(s.format(
                n + 1,
                result["episode_reward_min"],
                result["episode_reward_mean"],
                result["episode_reward_max"],
                result["episode_len_mean"],
                file_name
            ))
            
        if i % 50 == 0:
            checkpoint = agent.save()
            print("checkpoint saved at: ", checkpoint)

In [8]:
register_retro("MegaMan2-Nes", "Normal.Metalman.Fight.state")

In [9]:
s = "/home/nelson/ray_results/PPO_MegaMan2-Nes_2021-02-06_14-55-43pae6ulzr/checkpoint_401/checkpoint-401"
trainer = train("MegaMan2-Nes", checkpoint=s)

2021-02-07 18:39:25,269	INFO trainer.py:616 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
2021-02-07 18:39:30,304	INFO trainable.py:328 -- Restored on 192.168.0.3 from checkpoint: /home/nelson/ray_results/PPO_MegaMan2-Nes_2021-02-06_14-55-43pae6ulzr/checkpoint_401/checkpoint-401
2021-02-07 18:39:30,305	INFO trainable.py:336 -- Current state after restoring: {'_iteration': 401, '_timesteps_total': None, '_time_total': 23422.335732460022, '_episodes_total': 444}
-------------------------------Resumed checkpoint /home/nelson/ray_results/PPO_MegaMan2-Nes_2021-02-06_14-55-43pae6ulzr/checkpoint_401/checkpoint-401-------------------------------


KeyboardInterrupt: 

In [8]:
def test(config=trainer_config, checkpoint=None, testdelay=0, render=False, envcreator=None, maxepisodelen=10000000):
    """Tests and renders a previously trained model"""

    agent = PPOTrainer(config=config, env='MegaMan2-Nes')
    if checkpoint is None:
        raise ValEuerror(f"A previously trained checkpoint must be provided for algorithm {alg}")
    agent.restore(checkpoint)

    game_rom = "MegaMan2-Nes" #Nome da rom
    state = "Normal.Metalman.Fight.state" 
    scenario = "scenario"
    env = retro.make(game_rom, state=state, scenario=scenario)
    env = retro_wrappers.wrap_megaman(env)

    while True:
        state = env.reset()
        done = False
        reward_total = 0.0
        step = 0
        while not done and step < maxepisodelen:
            action = agent.compute_action(state)
            next_state, reward, done, _ = env.step(action)
            reward_total += reward
            if render:
                env.render()
            state = next_state
            step = step + 1
        print("Episode reward", reward_total)


In [9]:
s = "/home/nelson/ray_results/PPO_MegaMan2-Nes_2021-02-06_14-55-43pae6ulzr/checkpoint_401/checkpoint-401"
test(checkpoint=s, render=True)

2021-02-07 17:20:03,859	INFO trainer.py:616 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
2021-02-07 17:20:07,378	INFO trainable.py:328 -- Restored on 192.168.0.3 from checkpoint: /home/nelson/ray_results/PPO_MegaMan2-Nes_2021-02-06_14-55-43pae6ulzr/checkpoint_401/checkpoint-401
2021-02-07 17:20:07,379	INFO trainable.py:336 -- Current state after restoring: {'_iteration': 401, '_timesteps_total': None, '_time_total': 23422.335732460022, '_episodes_total': 444}
Episode reward 100.0
Episode reward 120.0
Episode reward 40.0
Episode reward 660.0
Episode reward 660.0
Episode reward 690.0
Episode reward 100.0
Episode reward 675.0
Episode reward 100.0
Episode reward 675.0
Episode reward 690.0
Episode reward 705.0
Episode reward 60.0


KeyboardInterrupt: 

In [10]:
ray.shutdown()

In [10]:
env.close()

NameError: name 'env' is not defined