In [1]:
import torch
import time
import pygame
import os #NEW LINE
import gymnasium as gym 
import tensorflow as tf
from tf import Session as sess
from stable_baselines3 import PPO, DQN, A2C, SAC
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
from tensorflow.python.client import device_lib
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

pygame 2.1.3 (SDL 2.0.22, Python 3.9.13)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [2]:
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 4532033940334466157
xla_global_id: -1
]


In [None]:
# GPU config
physical devices = tf.config.list_physical_devices("GPU")
tf.config.experimental.set_memory_growth(physical_devices[0], True)

In [15]:
#env = gym.make("FrozenLake-v1", map_name="8x8",render_mode="human", device="cuda")
#env = gym.make("FrozenLake-v1", map_name="8x8",render_mode="human")
env = gym.make("FrozenLake-v1", render_mode="human", device="cuda")
#env = gym.make("FrozenLake-v1", render_mode="human")
log_path = os.path.join('Training', 'Logs')
save_path = os.path.join('Training', 'Saved Models')
# sess.graph contains the graph definition; that enables the Graph Visualizer.
file_writer = tf.summary.create_file_writer(log_path, sess.graph)

# Adding Callbacks

In [16]:
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=190, verbose=1)
eval_callback = EvalCallback(env, callback_on_new_best=stop_callback, eval_freq=10000, best_model_save_path=save_path, verbose=1)
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_path)
'''
t_board_callback = tf.keras.callbacks.TensorBoard(
    log_dir=log_path,
    histogram_freq=0,
    write_graph=True,
    write_images=False,
    write_steps_per_second=False,
    update_freq='epoch',
    profile_batch=0,
    embeddings_freq=0,
    embeddings_metadata=None,
)
'''

# Pre-training: Testing our environment, Exploration

In [None]:
import time

episodes = 3
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0 
    
    while not done:
        env.render()
        #time.sleep(0.01)
        action = env.action_space.sample()
        #needed to split done into truncated and terminated for it to work. See abvove cell for why
        n_state, reward, terminated, truncated, info = env.step(action)
        score+=reward
        done = truncated or terminated 
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

In [None]:
print("The action space is: {}".format(env.action_space))
print("The observation space is: {}".format(env.observation_space))
print("Sample observation space value: {}".format(env.observation_space.sample()))

# Model 1 Evaluation: PPO Algorithm, With GPU

In [None]:
env = gym.make("FrozenLake-v1", render_mode="human")
#Reinterprets the env
env = DummyVecEnv([lambda: env])
#Defines the 'agent'
model = PPO('MlpPolicy', env, verbose = 1, tensorboard_log=log_path, device='cuda')

In [None]:
#!tensorboard --logdir={training_log_path}

In [None]:
timesteps = 20000
start = time.time()
#Re-running this training will accumulate
model.learn(total_timesteps=timesteps, callback=eval_callback)
stop = time.time()
print("PPO: Total Training time for {} timesteps : {}s".format(timesteps, stop-start))

# Saving, Teardown, and Reloading, Evaluation

In [None]:
PPO_GPU_path = os.path.join(save_path, 'PPO_model_frozen_GPU')
model.save(PPO_GPU_path)

In [None]:
# Model 1 Evaluation:
evaluate_policy(model, env, n_eval_episodes=12, render=True, callback=tensorboard_callback)
env.close()
del model

In [None]:
#For Recovery Point after the close()
PPO_GPU_path = os.path.join(save_path, 'PPO_model_frozen_GPU')
env = gym.make("FrozenLake-v1", render_mode="human")
env = DummyVecEnv([lambda: env])
model = PPO.load(PPO_GPU_path, env=env)

In [None]:
# Model 1 Test
obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, done, info = env.step(action)
    env.render()
    if done: 
        print('info', info)
        break
env.close()

In [None]:
'''
For Troubleshooting Display Error:
del model
'''

# Model 2 Evaluation: DQN Algorithm; With GPU

In [None]:
#MAY NEED TO BREAK HEAR, RESTART FROM LOAD IN NEXT CELL
env = DummyVecEnv([lambda: env])
model = DQN('MlpPolicy', env, verbose = 1, tensorboard_log=log_path, device='cuda')

In [None]:
#Re-running this training will accumulate
timesteps = 20000
start = time.time()
model.learn(total_timesteps=timesteps, callback=eval_callback)
stop = time.time()
print("DQN (with GPU): Total Training time for {} timesteps : {}s".format(timesteps, stop-start))

In [None]:
DQN_GPU_path = os.path.join(save_path, 'DQN_model_frozen_GPU')
model.save(DQN_GPU_path)

In [None]:
evaluate_policy(model, env, n_eval_episodes=12, render=True)
env.close()

# Model 3 Evaluation: SAC Algorithm; With GPU

In [10]:
#MAY NEED TO BREAK HEAR, RESTART FROM LOAD IN NEXT CELL
env = gym.make("FrozenLake-v1", render_mode="human")
env = DummyVecEnv([lambda: env])
model = A2C('MlpPolicy', env, verbose = 1, tensorboard_log=log_path, device='cuda')

Using cpu device


In [None]:
#Re-running this training will accumulate
timesteps = 20000
start = time.time()
model.learn(total_timesteps=timesteps, callback=eval_callback)
stop = time.time()
print("A2C (with GPU): Total Training time for {} timesteps : {}s".format(timesteps, stop-start))

Logging to Training\Logs\A2C_1
------------------------------------
| time/                 |          |
|    fps                | 3        |
|    iterations         | 100      |
|    time_elapsed       | 142      |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.38    |
|    explained_variance | -659     |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | 0.127    |
|    value_loss         | 0.00981  |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 3        |
|    iterations         | 200      |
|    time_elapsed       | 284      |
|    total_timesteps    | 1000     |
| train/                |          |
|    entropy_loss       | -1.38    |
|    explained_variance | -33.6    |
|    learning_rate      | 0.0007   |
|    n_updates          | 199      |
|    policy_loss        | -0.00202 |
|    va

In [None]:
A2C_GPU_path = os.path.join('Training', 'Saved Models', 'A2C_model_frozen_GPU')
model.save(SAC_path)

In [None]:
evaluate_policy(model, env, n_eval_episodes=12, render=True)
env.close()