In [9]:
!pip install pygame

Collecting pygame
  Using cached pygame-2.6.1-cp312-cp312-win_amd64.whl.metadata (13 kB)
Using cached pygame-2.6.1-cp312-cp312-win_amd64.whl (10.6 MB)
Installing collected packages: pygame
Successfully installed pygame-2.6.1


In [1]:
import os
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

In [2]:
import numpy as np
if not hasattr(np, 'bool8'):
    np.bool8 = np.bool_

In [7]:
watch_env = gym.make('FrozenLake-v1', render_mode='human')

In [3]:
watch_env.action_space

Discrete(4)

In [4]:
watch_env.observation_space

Discrete(16)

In [14]:
episodes = 5
for episode in range(episodes):
    state, _ = watch_env.reset()
    done = False
    truncated = False
    score = 0

    while not done and not truncated:
        action = watch_env.action_space.sample()
        n_state, reward, done, truncated, info = watch_env.step(action)
        score += reward
    print(f"Episode: {episode+1} Score: {score}")
watch_env.close()

Episode: 1 Score: 0.0
Episode: 2 Score: 0.0
Episode: 3 Score: 0.0
Episode: 4 Score: 0.0
Episode: 5 Score: 0.0


In [3]:
train_env = gym.make('FrozenLake-v1')
train_env = DummyVecEnv([lambda : train_env])
model = PPO('MlpPolicy', train_env, verbose=1)

Using cpu device


In [32]:
model.learn(total_timesteps=1000000)

-----------------------------
| time/              |      |
|    fps             | 2115 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1479        |
|    iterations           | 2           |
|    time_elapsed         | 2           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.018039888 |
|    clip_fraction        | 0.0773      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.38       |
|    explained_variance   | -3.49       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0237     |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.00998    |
|    value_loss           | 0.0147      |
-----------------------------------------
----------------------------------

<stable_baselines3.ppo.ppo.PPO at 0x175a2a44140>

In [33]:
PPO_path = os.path.join('Saved Models', 'PPO_1M')
model.save(PPO_path)

In [34]:
del model

In [35]:
model = PPO.load(PPO_path)

In [36]:
evaluate_policy(model, train_env, n_eval_episodes=10)

(np.float64(0.6), np.float64(0.4898979485566356))

In [37]:
test_env = gym.make("FrozenLake-v1", render_mode='human')
episodes = 5
for episode in range(episodes):
    obs, _ = test_env.reset()
    done = False
    truncated = False
    score = 0

    while not done and not truncated:
        action, _ = model.predict(obs)
        n_state, reward, done, truncated, info = test_env.step(int(action))
        score += reward
    print(f"Episode: {episode+1} Score: {score}")
test_env.close()

Episode: 1 Score: 0.0
Episode: 2 Score: 0.0
Episode: 3 Score: 0.0
Episode: 4 Score: 0.0
Episode: 5 Score: 0.0


In [11]:
from stable_baselines3 import DQN
model = DQN('MlpPolicy', train_env, verbose=1)
model.learn(total_timesteps=50000)

Using cpu device
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.997    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 8401     |
|    time_elapsed     | 0        |
|    total_timesteps  | 18       |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.991    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 22402    |
|    time_elapsed     | 0        |
|    total_timesteps  | 48       |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.982    |
| time/               |          |
|    episodes         | 12       |
|    fps              | 6876     |
|    time_elapsed     | 0        |
|    total_timesteps  | 94       |
----------------------------------
----------------------------------
| r

<stable_baselines3.dqn.dqn.DQN at 0x255731ab6b0>

In [12]:
DQN_path = os.path.join('Saved Models', 'DQN_50K')
model.save(DQN_path)

In [13]:
del model

In [14]:
model = DQN.load(DQN_path)

In [16]:
evaluate_policy(model, train_env, n_eval_episodes=10)

(np.float64(0.0), np.float64(0.0))

In [17]:
test_env = gym.make("FrozenLake-v1", render_mode='human')
episodes = 5
for episode in range(episodes):
    obs, _ = test_env.reset()
    done = False
    truncated = False
    score = 0

    while not done and not truncated:
        action, _ = model.predict(obs)
        n_state, reward, done, truncated, info = test_env.step(int(action))
        score += reward
    print(f"Episode: {episode+1} Score: {score}")
test_env.close()

Episode: 1 Score: 0.0
Episode: 2 Score: 0.0
Episode: 3 Score: 0.0
Episode: 4 Score: 0.0
Episode: 5 Score: 0.0


In [30]:
from stable_baselines3 import A2C
model = A2C('MlpPolicy', train_env, verbose=1)
model.learn(100000)

Using cpu device
------------------------------------
| time/                 |          |
|    fps                | 763      |
|    iterations         | 100      |
|    time_elapsed       | 0        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.36    |
|    explained_variance | -936     |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | 0.000238 |
|    value_loss         | 1.91e-07 |
------------------------------------
-------------------------------------
| time/                 |           |
|    fps                | 782       |
|    iterations         | 200       |
|    time_elapsed       | 1         |
|    total_timesteps    | 1000      |
| train/                |           |
|    entropy_loss       | -1.38     |
|    explained_variance | -86.8     |
|    learning_rate      | 0.0007    |
|    n_updates          | 199       |
|    policy_loss        | -0.000961 |
|    valu

<stable_baselines3.a2c.a2c.A2C at 0x255731a82c0>

In [31]:
A2C_path = os.path.join('Saved Models', 'A2C_1L')
model.save(A2C_path)

In [32]:
del model

In [33]:
model = A2C.load(A2C_path)

In [34]:
evaluate_policy(model, train_env, n_eval_episodes=10)

(np.float64(0.8), np.float64(0.4))

In [36]:
test_env = gym.make("FrozenLake-v1", render_mode='human')
episodes = 5
for episode in range(episodes):
    obs, _ = test_env.reset()
    done = False
    truncated = False
    score = 0

    while not done and not truncated:
        action, _ = model.predict(obs)
        n_state, reward, done, truncated, info = test_env.step(int(action))
        score += reward
    print(f"Episode: {episode+1} Score: {score}")
test_env.close()

Episode: 1 Score: 0.0
Episode: 2 Score: 0.0
Episode: 3 Score: 0.0
Episode: 4 Score: 0.0
Episode: 5 Score: 0.0


In [38]:
net_arch = [dict(pi=[128,128,128,128], vf=[128,128,128,128])]
model = PPO('MlpPolicy', train_env, verbose=1, policy_kwargs={'net_arch':net_arch})
model.learn(total_timesteps=50000)

Using cpu device




-----------------------------
| time/              |      |
|    fps             | 1534 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 973         |
|    iterations           | 2           |
|    time_elapsed         | 4           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.016897254 |
|    clip_fraction        | 0.11        |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.38       |
|    explained_variance   | -3.44       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.018      |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.00769    |
|    value_loss           | 0.0313      |
-----------------------------------------
----------------------------------

<stable_baselines3.ppo.ppo.PPO at 0x25575b17c50>

In [39]:
PPO_path = os.path.join('Saved Models', 'PPO_50K_netArch')
model.save(PPO_path)

In [40]:
del model

In [41]:
model = PPO.load(PPO_path)

In [42]:
evaluate_policy(model, train_env, n_eval_episodes=10)

(np.float64(0.3), np.float64(0.45825756949558394))

In [43]:
test_env = gym.make("FrozenLake-v1", render_mode='human')
episodes = 5
for episode in range(episodes):
    obs, _ = test_env.reset()
    done = False
    truncated = False
    score = 0

    while not done and not truncated:
        action, _ = model.predict(obs)
        n_state, reward, done, truncated, info = test_env.step(int(action))
        score += reward
    print(f"Episode: {episode+1} Score: {score}")
test_env.close()

Episode: 1 Score: 0.0
Episode: 2 Score: 0.0
Episode: 3 Score: 1.0
Episode: 4 Score: 0.0
Episode: 5 Score: 0.0
