# Q-Learning Case Study: Elevator Choosing Floors

**Analogy:** Imagine an elevator learning which floor to go to. Here, we simulate with FrozenLake-v1.

In [6]:
import numpy as np
import gymnasium as gym

env = gym.make('FrozenLake-v1', is_slippery=False)
n_states = env.observation_space.n
n_actions = env.action_space.n
Q = np.zeros((n_states, n_actions))

# alpha, gamma = 0.1, 0.99
# epsilon, eps_min, eps_decay = 1.0, 0.01, 0.995
returns = []

In [7]:
n_episodes = 20000   # more training
alpha = 0.8          # stronger updates
gamma = 0.95         # discount
epsilon = 1.0
epsilon_min = 0.1
epsilon_decay = 0.999

In [8]:
def epsilon_greedy_action(Q, state, epsilon, action_space):
    return action_space.sample() if np.random.rand() < epsilon else int(np.argmax(Q[state]))

In [9]:
for ep in range(n_episodes):
    state, _ = env.reset()
    done = False
    while not done:
        action = epsilon_greedy_action(Q, state, epsilon, env.action_space)
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        Q[state, action] += alpha * (reward + gamma * np.max(Q[next_state]) - Q[state, action])
        state = next_state
    epsilon = max(epsilon_min, epsilon * epsilon_decay)

# Evaluation
successes = 0
for _ in range(500):
    s,_=env.reset()
    done=False
    while not done:
        a = np.argmax(Q[s])
        s,r,terminated,truncated,_ = env.step(a)
        done=terminated or truncated
        successes+=r
print("Success rate over 500 eval episodes:", successes/500)

Success rate over 500 eval episodes: 1.0


In [19]:
# import time

# env = gym.make("FrozenLake-v1", is_slippery=False, render_mode="human")  # for local machine
# state, _ = env.reset()

# for _ in range(50):   # play 50 steps
#     action = np.argmax(Q[state])  # follow learned policy
#     state, reward, terminated, truncated, info = env.step(action)
#     time.sleep(0.5)   # slow down for visibility
#     if terminated or truncated:
#         state, _ = env.reset()
#         time.sleep(2)

# env.close()

# DQN Case Study: Playing Atari

Analogy: Computer learns to play Atari. Here simulated with CartPole.

In [21]:
!pip install stable-baselines3[extra]

Collecting stable-baselines3[extra]
  Downloading stable_baselines3-2.7.0-py3-none-any.whl.metadata (4.8 kB)
Downloading stable_baselines3-2.7.0-py3-none-any.whl (187 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m187.2/187.2 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: stable-baselines3
Successfully installed stable-baselines3-2.7.0


In [22]:
import gymnasium as gym
from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy

env = gym.make('CartPole-v1')
model = DQN('MlpPolicy', env, learning_rate=1e-3, buffer_size=50000,
            exploration_fraction=0.1, exploration_final_eps=0.02, verbose=1)
model.learn(total_timesteps=20000)
mean,std=evaluate_policy(model,env,n_eval_episodes=10)
print('DQN Eval:',mean,std)

Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


  return datetime.utcnow().replace(tzinfo=utc)


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 23       |
|    ep_rew_mean      | 23       |
|    exploration_rate | 0.955    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 11747    |
|    time_elapsed     | 0        |
|    total_timesteps  | 92       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 21.1     |
|    ep_rew_mean      | 21.1     |
|    exploration_rate | 0.917    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 1263     |
|    time_elapsed     | 0        |
|    total_timesteps  | 169      |
| train/              |          |
|    learning_rate    | 0.001    |
|    loss             | 0.328    |
|    n_updates        | 17       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean    

  return datetime.utcnow().replace(tzinfo=utc)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
|    learning_rate    | 0.001    |
|    loss             | 0.000433 |
|    n_updates        | 1271     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 10.1     |
|    ep_rew_mean      | 10.1     |
|    exploration_rate | 0.02     |
| time/               |          |
|    episodes         | 420      |
|    fps              | 1407     |
|    time_elapsed     | 3        |
|    total_timesteps  | 5229     |
| train/              |          |
|    learning_rate    | 0.001    |
|    loss             | 0.000257 |
|    n_updates        | 1282     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 10.1     |
|    ep_rew_mean      | 10.1     |
|    exploration_rate | 0.02     |
| time/               |          |
|    episodes         | 424      |
|    fps              | 1



In [None]:
# import time

# env = gym.make("CartPole-v1", render_mode="human")  # needs render_mode
# obs, _ = env.reset()

# for _ in range(500):
#     action, _ = model.predict(obs, deterministic=True)
#     obs, reward, terminated, truncated, info = env.step(action)
#     time.sleep(0.02)   # smooth animation
#     if terminated or truncated:
#         obs, _ = env.reset()

# PPO Case Study: Robot Arm Control

Analogy: A robot arm must move smoothly. PPO ensures small updates.

In [23]:
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy

env=gym.make('Pendulum-v1')
model=PPO('MlpPolicy',env,verbose=1)
model.learn(total_timesteps=50000)
mean,std=evaluate_policy(model,env,n_eval_episodes=5)
print('PPO Eval:',mean,std)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 200       |
|    ep_rew_mean     | -1.14e+03 |
| time/              |           |
|    fps             | 1899      |
|    iterations      | 1         |
|    time_elapsed    | 1         |
|    total_timesteps | 2048      |
----------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -1.12e+03  |
| time/                   |            |
|    fps                  | 884        |
|    iterations           | 2          |
|    time_elapsed         | 4          |
|    total_timesteps      | 4096       |
| train/                  |            |
|    approx_kl            | 0.00219273 |
|    clip_fraction        | 0.00732    |
|    clip_range           | 0.2        |
|    entropy_loss      

In [None]:
# env = gym.make("Pendulum-v1", render_mode="human")
# obs, _ = env.reset()

# for _ in range(500):
#     action, _ = model.predict(obs, deterministic=True)
#     obs, reward, terminated, truncated, info = env.step(action)
#     if terminated or truncated:
#         obs, _ = env.reset()


# A2C Case Study: Self-Driving Cars

Analogy: Actor (driver) and Critic (coach) for safe driving. Simulated with LunarLander.

In [27]:
!pip install swig
!pip install gymnasium[box2d]

Collecting swig
  Downloading swig-4.3.1.post0-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (3.5 kB)
Downloading swig-4.3.1.post0-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.9 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.4/1.9 MB[0m [31m11.7 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.9/1.9 MB[0m [31m32.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: swig
Successfully installed swig-4.3.1.post0
Collecting box2d-py==2.3.5 (from gymnasium[box2d])
  Downloading box2d-py-2.3.5.tar.gz (374 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.4/374.4 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[

In [28]:
import gymnasium as gym
from stable_baselines3 import A2C
from stable_baselines3.common.evaluation import evaluate_policy

env=gym.make('LunarLander-v3')
model=A2C('MlpPolicy',env,verbose=1)
model.learn(total_timesteps=50000)
mean,std=evaluate_policy(model,env,n_eval_episodes=5)
print('A2C Eval:',mean,std)



Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 88.4     |
|    ep_rew_mean        | -342     |
| time/                 |          |
|    fps                | 804      |
|    iterations         | 100      |
|    time_elapsed       | 0        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.38    |
|    explained_variance | 0.000633 |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | -17.1    |
|    value_loss         | 169      |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 91.6     |
|    ep_rew_mean        | -324     |
| time/                 |          |
|    fps                | 839      |
|    iterations         | 200      |
|    time_elapsed 



A2C Eval: 157.85720770067437 93.98855303882262


In [None]:
# env = gym.make("LunarLander-v3", render_mode="human")
# obs, _ = env.reset()

# for _ in range(1000):
#     action, _ = model.predict(obs, deterministic=True)
#     obs, reward, terminated, truncated, info = env.step(action)
#     if terminated or truncated:
#         obs, _ = env.reset()
