In [1]:
import gym
import random, math
import numpy as np
import arcade
from skimage import data, color
from skimage.transform import rescale, resize, downscale_local_mean
from PIL import Image

        
from LightEnvCopy import LightEnv

import gym.spaces
from gym.spaces import Discrete, Box

from ray.rllib.env.env_context import EnvContext
from ray.rllib.models import ModelCatalog

from collections import namedtuple

# Do the math to figure out our screen dimensions
SCREEN_WIDTH = 800
SCREEN_HEIGHT = 600
SCREEN_TITLE = "Game 1: Let There Be Light!"

# COnvenient data structure to hold information about actions
Action = namedtuple('Action', 'name index delta_i delta_j')

up = Action('up', 0, -1, 0)    
down = Action('down', 1, 1, 0)    
left = Action('left', 2, 0, -1)    
right = Action('right', 3, 0, 1)    

index_to_actions = {}
for action in [up, down, left, right]:
    index_to_actions[action.index] = action
# print(index_to_actions[0].name)
str_to_actions = {}
for action in [up, down, left, right]:
    str_to_actions[action.name] = action
#TF End - Adding in actions for action conversion


class LightEnvWrapper(gym.Env, LightEnv):
    """Class that wraps the Lights Environment to make it 
    compatible with RLLib."""

    metadata = {"render.modes": ["rgb_array", "state_pixels"]}
    
    def __init__(self, config: EnvContext):
        super().__init__(SCREEN_WIDTH, SCREEN_HEIGHT, SCREEN_TITLE)
        self.torch_collected = False
        self.torch_collected_count = []
        self.mygame = LightEnv
        self.steps_taken = 0
        #The action space is a choice of 4 actions: U/D/L/R.
        self.action_space = Discrete(4)
        
        #The observation space is a fixed image of the current game screen
        self.observation_space = Box(low=0, high=255, shape=(84,84, 4), dtype=np.uint8)
        
    def reset(self):
        print("resetting in wrapper")
        
        if self.torch_collected == 1:
            print("Torch was collected this episode!")
        else:
            print("Torch was not collected this episode...")
        self.torch_collected_count.append(self.torch_collected)
        print(self.torch_collected_count)

        self.render(self)
        #Resets the state of the environment for a new episode and an initial observation.
        obs_mygame = self.mygame.reset(self)
        
        #Open up the resetted image to verify working correctly.
        obs_mygame.show()
        
        self.mygame.on_draw(self)
        #Convert observation to 84x84 resolution and np array for rllib.
        obs = self.convert_observations(obs_mygame)
        
        self.steps_taken = 0
        return obs

    def step(self, action):
        self.steps_taken += 1
        
        #Making sure an action is chosen, either: 0, 1, 2, 3.
        assert action in [0, 1, 2, 3] #0-up,1-down,2-left,3-right.
        
        #Convert the numeric action to a keyword: up, down, left, right.
        actions_myenv = index_to_actions[action].name #returns a word, one of: up/down/left/right
#         print(f"action taken: {actions_myenv}")
        
        #Update the window with on_update()
        self.render(self)
#         print("env rendered")
        #Compute observation extracted from the window (800x600), with reward and done flag.
        obs, reward, done, torch_collected, fps_check = self.mygame.step(self,actions_myenv)
        if torch_collected == True:
            self.torch_collected = 1
        else:
            self.torch_collected = 0
                    
        if self.steps_taken % 100 == 0: #33 steps roughly equates to 1 second in game time
            print(f"total score is {self.score} at time: {self.mygame.time_taken_reported(self)}")
            print(f"FPS is currently: {fps_check}")
            print(f"steps taken: {self.steps_taken}")

        #Convert observation to 84x84 resolution and np array for rllib.
        obs_mygame = self.convert_observations(obs)
        
        #If the reward has been obtained, reset the environment and start again
        if done == True:
            print(f"done is {done}, resetting environment in wrapper.")
            print(f"steps taken: {self.steps_taken}")
            obs.show()
            self.reset()
        
        return obs_mygame, reward, done, {}

    def seed(self, seed=None):
        random.seed(seed)

    def convert_observations(self, obs_mygame): #resizing and converting to array for rllib processing
        # We normalize and concatenate observations
        obs = obs_mygame
        obs_resized = obs.resize((84,84))
        obsarray = np.array(obs_resized)
        return obsarray
    
    def render(self, mode='state_pixels'):
#         self.mygame.update(self)
        self.mygame.on_draw(self)
        test = self.mygame.time_taken_reported(self)
        

  if (distutils.version.LooseVersion(tf.__version__) <


### Now run the rllib script to train the agent

In [4]:
import ray.rllib.algorithms.sac.sac as sac
from ray.rllib.algorithms.sac.sac import SACConfig

config = SACConfig().training(gamma=0.9, lr=0.1, initial_alpha=5)\
    .resources(num_gpus=0)\
    .rollouts(num_rollout_workers=1, recreate_failed_workers=True)

config.replay_buffer_config['capacity']=100000
# config.replay_buffer_config['learning_starts']=2500
config.optimization['entropy_learning_rate']=0.05

config.env=LightEnvWrapper
print(config.to_dict())

RAY_DISABLE_MEMORY_MONITOR=1

# Build a Algorithm object from the config and run 1 training iteration.
# trainer = config.build(env=LightEnvWrapper)
trainer = sac.SAC(config=config)


avg_rewards = []
num_iterations = []
time_spent = []
for episode in range(100):
    print("Starting episode ", episode)
    # Perform one iteration of training the policy with SAC
    result = trainer.train()
    #print(pretty_print(result))
    print("episode reward mean: ", result['episode_reward_mean'])
    avg_rewards.append(result['episode_reward_mean'])
    num_iterations.append(episode)
#     if episode % 10 == 0:
#         checkpoint = trainer.save()
#         print("checkpoint saved at", checkpoint)
    print("End of episode ", episode)





{'extra_python_environs_for_driver': {}, 'extra_python_environs_for_worker': {}, 'num_gpus': 0, 'num_cpus_per_worker': 1, 'num_gpus_per_worker': 0, '_fake_gpus': False, 'custom_resources_per_worker': {}, 'placement_strategy': 'PACK', 'eager_tracing': False, 'eager_max_retraces': 20, 'tf_session_args': {'intra_op_parallelism_threads': 2, 'inter_op_parallelism_threads': 2, 'gpu_options': {'allow_growth': True}, 'log_device_placement': False, 'device_count': {'CPU': 1}, 'allow_soft_placement': True}, 'local_tf_session_args': {'intra_op_parallelism_threads': 8, 'inter_op_parallelism_threads': 8}, 'env': <class '__main__.LightEnvWrapper'>, 'env_config': {}, 'observation_space': None, 'action_space': None, 'env_task_fn': None, 'render_env': False, 'clip_rewards': None, 'normalize_actions': True, 'clip_actions': False, 'disable_env_checking': False, 'num_workers': 1, 'num_envs_per_worker': 1, 'sample_collector': <class 'ray.rllib.evaluation.collectors.simple_list_collector.SimpleListCollector

[2m[36m(pid=23032)[0m Windows fatal exception: code 0xc0000139
[2m[36m(pid=23032)[0m 
[2m[36m(pid=23032)[0m   if (distutils.version.LooseVersion(tf.__version__) <


[2m[36m(RolloutWorker pid=23032)[0m resetting in wrapper
[2m[36m(RolloutWorker pid=23032)[0m Torch was not collected this episode...
[2m[36m(RolloutWorker pid=23032)[0m [False]
[2m[36m(RolloutWorker pid=23032)[0m resetting


2022-09-28 13:56:52,108	INFO trainable.py:160 -- Trainable.setup took 22.014 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


Starting episode  0
[2m[36m(RolloutWorker pid=23032)[0m resetting in wrapper
[2m[36m(RolloutWorker pid=23032)[0m Torch was not collected this episode...
[2m[36m(RolloutWorker pid=23032)[0m [False, 0]
[2m[36m(RolloutWorker pid=23032)[0m resetting


2022-09-28 13:56:57,608	ERROR algorithm.py:2173 -- Error in training or evaluation attempt! Trying to recover.
Traceback (most recent call last):
  File "C:\Users\Tim\Anaconda3\envs\rllib\lib\site-packages\ray\rllib\algorithms\algorithm.py", line 2373, in _run_one_training_iteration
    results = self.training_step()
  File "C:\Users\Tim\Anaconda3\envs\rllib\lib\site-packages\ray\rllib\algorithms\dqn\dqn.py", line 358, in training_step
    new_sample_batch = synchronous_parallel_sample(
  File "C:\Users\Tim\Anaconda3\envs\rllib\lib\site-packages\ray\rllib\execution\rollout_ops.py", line 100, in synchronous_parallel_sample
    sample_batches = ray.get(
  File "C:\Users\Tim\Anaconda3\envs\rllib\lib\site-packages\ray\_private\client_mode_hook.py", line 105, in wrapper
    return func(*args, **kwargs)
  File "C:\Users\Tim\Anaconda3\envs\rllib\lib\site-packages\ray\_private\worker.py", line 2275, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(RayOutOfMemoryError): 

[2m[36m(RolloutWorker pid=23032)[0m total score is -4 at time: 2
[2m[36m(RolloutWorker pid=23032)[0m FPS is currently: 60
[2m[36m(RolloutWorker pid=23032)[0m steps taken: 100
episode reward mean:  nan
End of episode  0
Starting episode  1
episode reward mean:  nan
End of episode  1
Starting episode  2
[2m[36m(RolloutWorker pid=23032)[0m total score is -5 at time: 3
[2m[36m(RolloutWorker pid=23032)[0m FPS is currently: 72
[2m[36m(RolloutWorker pid=23032)[0m steps taken: 200
[2m[36m(RolloutWorker pid=23032)[0m total score is -7 at time: 5
[2m[36m(RolloutWorker pid=23032)[0m FPS is currently: 71
[2m[36m(RolloutWorker pid=23032)[0m steps taken: 300
episode reward mean:  nan
End of episode  2
Starting episode  3
episode reward mean:  nan
End of episode  3
Starting episode  4
[2m[36m(RolloutWorker pid=23032)[0m total score is -9 at time: 7
[2m[36m(RolloutWorker pid=23032)[0m FPS is currently: 75
[2m[36m(RolloutWorker pid=23032)[0m steps taken: 400
[2m[36

2022-09-28 14:01:56,284	ERROR worker.py:399 -- Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): [36mray::RolloutWorker.set_weights()[39m (pid=23032, ip=127.0.0.1, repr=<ray.rllib.evaluation.rollout_worker.RolloutWorker object at 0x000001EABC68DF10>)
  File "python\ray\_raylet.pyx", line 620, in ray._raylet.execute_task
  File "C:\Users\Tim\Anaconda3\envs\rllib\lib\site-packages\ray\_private\memory_monitor.py", line 162, in raise_if_low_memory
    raise RayOutOfMemoryError(
ray._private.memory_monitor.RayOutOfMemoryError: More than 95% of the memory on node DESKTOP-BKAPO4O is used (7.4 / 7.73 GB). The top 10 memory consumers are:

PID	MEM	COMMAND
9316	0.69GiB	C:\Users\Tim\Anaconda3\envs\rllib\python.exe -m ipykernel_launcher -f C:\Users\Tim\AppData\Roaming\j
17236	0.48GiB	C:\Users\Tim\AppData\Local\Microsoft\OneDrive\OneDrive.exe /background
23032	0.43GiB	C:\Users\Tim\Anaconda3\envs\rllib\python.exe C:\Users\Tim\Anaconda3\envs\rllib\lib\site-packages\ray
30580	0.3GiB	C

[2m[36m(RolloutWorker pid=23032)[0m total score is -35 at time: 30
[2m[36m(RolloutWorker pid=23032)[0m FPS is currently: 2
[2m[36m(RolloutWorker pid=23032)[0m steps taken: 1800
episode reward mean:  nan
End of episode  17
Starting episode  18
[2m[36m(RolloutWorker pid=23032)[0m total score is -37 at time: 32
[2m[36m(RolloutWorker pid=23032)[0m FPS is currently: 2
[2m[36m(RolloutWorker pid=23032)[0m steps taken: 1900
episode reward mean:  nan
End of episode  18
Starting episode  19
[2m[36m(RolloutWorker pid=23032)[0m total score is -38 at time: 33
[2m[36m(RolloutWorker pid=23032)[0m FPS is currently: 2
[2m[36m(RolloutWorker pid=23032)[0m steps taken: 2000
episode reward mean:  nan
End of episode  19
Starting episode  20
[2m[36m(RolloutWorker pid=23032)[0m total score is -40 at time: 35
[2m[36m(RolloutWorker pid=23032)[0m FPS is currently: 2
[2m[36m(RolloutWorker pid=23032)[0m steps taken: 2100
episode reward mean:  nan
End of episode  20
Starting epis

AssertionError: 