In [1]:
import torch
import time
import pygame
import os #NEW LINE
import gymnasium as gym 
import tensorflow as tf
from stable_baselines3 import PPO, DQN, A2C, SAC
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
from tensorflow.python.client import device_lib
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

pygame 2.1.3 (SDL 2.0.22, Python 3.9.13)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [2]:
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 18236890925598444872
xla_global_id: -1
]


In [3]:
#env = gym.make("FrozenLake-v1", map_name="8x8",render_mode="human", device="cuda")
#env = gym.make("FrozenLake-v1", map_name="8x8",render_mode="human")
#env = gym.make("FrozenLake-v1", render_mode="human", device="cuda")
env = gym.make("FrozenLake-v1", render_mode="human")
log_path = os.path.join('Training', 'Logs')
save_path = os.path.join('Training', 'Saved Models')
# sess.graph contains the graph definition; that enables the Graph Visualizer.
file_writer = tf.summary.create_file_writer(log_path)

# Adding Callbacks

In [4]:
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=190, verbose=1)
eval_callback = EvalCallback(env, callback_on_new_best=stop_callback, eval_freq=10000, best_model_save_path=save_path, verbose=1)
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_path, histogram_freq=1)
'''
t_board_callback = tf.keras.callbacks.TensorBoard(
    log_dir=log_path,
    histogram_freq=0,
    write_graph=True,
    write_images=False,
    write_steps_per_second=False,
    update_freq='epoch',
    profile_batch=0,
    embeddings_freq=0,
    embeddings_metadata=None,
)
'''

"\nt_board_callback = tf.keras.callbacks.TensorBoard(\n    log_dir=log_path,\n    histogram_freq=0,\n    write_graph=True,\n    write_images=False,\n    write_steps_per_second=False,\n    update_freq='epoch',\n    profile_batch=0,\n    embeddings_freq=0,\n    embeddings_metadata=None,\n)\n"

# Pre-training: Testing our environment, Exploration

In [5]:
import time

episodes = 3
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0 
    
    while not done:
        env.render()
        #time.sleep(0.01)
        action = env.action_space.sample()
        #needed to split done into truncated and terminated for it to work. See abvove cell for why
        n_state, reward, terminated, truncated, info = env.step(action)
        score+=reward
        done = truncated or terminated 
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

Episode:1 Score:0.0
Episode:2 Score:0.0
Episode:3 Score:0.0


In [6]:
print("The action space is: {}".format(env.action_space))
print("The observation space is: {}".format(env.observation_space))
print("Sample observation space value: {}".format(env.observation_space.sample()))

The action space is: Discrete(4)
The observation space is: Discrete(16)
Sample observation space value: 9


# Model 1 Evaluation: PPO Algorithm, No GPU

In [7]:
env = gym.make("FrozenLake-v1", render_mode="human",map_name="8x8")
#Reinterprets the env
env = DummyVecEnv([lambda: env])
#Defines the 'agent'
model = PPO('MlpPolicy', env, verbose = 1, tensorboard_log=log_path)

Using cpu device


In [None]:
!tensorboard --logdir={training_log_path}

In [8]:
timesteps = 10000
start = time.time()
#Re-running this training will accumulate
model.learn(total_timesteps=timesteps, callback=eval_callback)
stop = time.time()
print("PPO: Total Training time for {} timesteps : {}s".format(timesteps, stop-start))

Logging to Training\Logs\PPO_13
-----------------------------
| time/              |      |
|    fps             | 3    |
|    iterations      | 1    |
|    time_elapsed    | 530  |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 3           |
|    iterations           | 2           |
|    time_elapsed         | 1062        |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.004726704 |
|    clip_fraction        | 0.00928     |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.38       |
|    explained_variance   | -21.5       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.00226    |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.00697    |
|    value_loss           | 0.00316     |
-----------------------------------------
--



error: display Surface quit

# Saving, Teardown, and Reloading

In [9]:
PPO_path = os.path.join(save_path, 'PPO_model_frozen_8x8')
model.save(PPO_path)

In [10]:
# Model 1 Evaluation:
evaluate_policy(model, env, n_eval_episodes=12, render=True)
env.close()
del model

In [11]:
#For Recovery Point after the close()
PPO_path = os.path.join(save_path, 'PPO_model_frozen_8x8')
env = gym.make("FrozenLake-v1", render_mode="human",map_name="8x8")
env = DummyVecEnv([lambda: env])
model = PPO.load(PPO_path, env=env)

In [12]:
# Model 1 Test
obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, done, info = env.step(action)
    env.render()
    if done: 
        print('info', info)
        break
env.close()

info [{'prob': 0.3333333333333333, 'TimeLimit.truncated': False, 'terminal_observation': 19}]


In [13]:
'''
For Troubleshooting Display Error:
del model
'''
del model

# Model 2 Evaluation: DQN Algorithm; No GPU

In [None]:
#MAY NEED TO BREAK HEAR, RESTART FROM LOAD IN NEXT CELL
env = gym.make("FrozenLake-v1", render_mode="human",map_name="8x8")
#Reinterprets the env
env = DummyVecEnv([lambda: env])
model = DQN('MlpPolicy', env, verbose = 1, tensorboard_log=log_path)

In [None]:
#Re-running this training will accumulate
timesteps = 20000
start = time.time()
model.learn(total_timesteps=timesteps, callback=eval_callback)
stop = time.time()
print("DQN: Total Training time for {} timesteps : {}s".format(timesteps, stop-start))

In [None]:
DQN_path = os.path.join(save_path, 'DQN_model_frozen')
model.save(DQN_path)

In [None]:
evaluate_policy(model, env, n_eval_episodes=12, render=True)
env.close()

# Model 3 Evaluation: A2C Algorithm; No GPU

In [19]:
#MAY NEED TO BREAK HEAR, RESTART FROM LOAD IN NEXT CELL
env = gym.make("FrozenLake-v1", render_mode="human",map_name="8x8")
env = DummyVecEnv([lambda: env])
model = A2C('MlpPolicy', env, verbose = 1, tensorboard_log=log_path)

Using cpu device


In [20]:
#Re-running this training will accumulate
timesteps = 20000
start = time.time()
model.learn(total_timesteps=timesteps, callback=eval_callback)
stop = time.time()
print("A2C: Total Training time for {} timesteps : {}s".format(timesteps, stop-start))

Logging to Training\Logs\A2C_5
------------------------------------
| time/                 |          |
|    fps                | 3        |
|    iterations         | 100      |
|    time_elapsed       | 127      |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.38    |
|    explained_variance | -4.56    |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | 0.00316  |
|    value_loss         | 8.33e-06 |
------------------------------------
-------------------------------------
| time/                 |           |
|    fps                | 3         |
|    iterations         | 200       |
|    time_elapsed       | 256       |
|    total_timesteps    | 1000      |
| train/                |           |
|    entropy_loss       | -1.38     |
|    explained_variance | 0         |
|    learning_rate      | 0.0007    |
|    n_updates          | 199       |
|    policy_loss        | -1.52e-

error: display Surface quit

In [21]:
A2C_path = os.path.join('Training', 'Saved Models', 'A2C_model_frozen_8x8')
model.save(A2C_path)

In [22]:
evaluate_policy(model, env, n_eval_episodes=12, render=True)
env.close()