In [8]:
!pip install pygame

Collecting pygame
  Using cached pygame-2.6.1-cp312-cp312-win_amd64.whl.metadata (13 kB)
Using cached pygame-2.6.1-cp312-cp312-win_amd64.whl (10.6 MB)
Installing collected packages: pygame
Successfully installed pygame-2.6.1


In [25]:
!pip install --upgrade gym numpy



In [31]:
import numpy as np
# Compatibility shim: make sure `np.bool8` resolves, aliasing it to `np.bool_`
# on NumPy builds that no longer provide the name.
try:
    np.bool8
except AttributeError:
    np.bool8 = np.bool_

In [38]:
import gymnasium as gym
import os
from stable_baselines3 import PPO
from stable_baselines3.common. vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

### Understanding the Environment

In [33]:
# Human-rendered MountainCar instance, used below to inspect the spaces and to
# watch an untrained (random) agent.
watch_env = gym.make(
    "MountainCar-v0",
    render_mode="human",
)

In [14]:
# Display the set of actions the agent can choose from.
watch_env.action_space

Discrete(3)

In [15]:
# Display the bounds, shape and dtype of the observations the env returns.
watch_env.observation_space

Box([-1.2  -0.07], [0.6  0.07], (2,), float32)

In [34]:
# Baseline: play a few episodes with randomly sampled actions and print the
# total reward collected in each one.
episodes = 5
try:
    for episode in range(episodes):
        watch_env.reset()
        terminated = False
        truncated = False
        score = 0

        # An episode is over once the env reports termination or truncation.
        while not (terminated or truncated):
            action = watch_env.action_space.sample()
            _, reward, terminated, truncated, _ = watch_env.step(action)
            score += reward
        print(f"Episode:{episode+1} Score:{score}")
finally:
    # Release the render window even if an episode raises or is interrupted.
    watch_env.close()

Episode:1 Score:-200.0
Episode:2 Score:-200.0
Episode:3 Score:-200.0
Episode:4 Score:-200.0
Episode:5 Score:-200.0


### Training the RL Model

In [45]:
# The factory builds its own environment instead of capturing a variable, so it
# stays valid no matter when (or how often) the vectorized wrapper calls it.
env = DummyVecEnv([lambda: gym.make("MountainCar-v0")])

# PPO agent with an MLP policy, training on the vectorized environment.
model = PPO('MlpPolicy', env, verbose=1)

Using cpu device


In [46]:
# Train the agent; as the cell's last expression, the returned model is echoed.
model.learn(total_timesteps=250_000)

-----------------------------
| time/              |      |
|    fps             | 2160 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1409        |
|    iterations           | 2           |
|    time_elapsed         | 2           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.010163624 |
|    clip_fraction        | 0.0239      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.09       |
|    explained_variance   | 0.0005      |
|    learning_rate        | 0.0003      |
|    loss                 | 15.1        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.00423    |
|    value_loss           | 134         |
-----------------------------------------
----------------------------------

<stable_baselines3.ppo.ppo.PPO at 0x21ae197b350>

In [47]:
# "2.5L" presumably refers to the 250000 training timesteps (2.5 lakh).
# The ".zip" extension is spelled out on purpose: the dot in "2.5L" looks like
# a file suffix, so the archive would otherwise be written without ".zip".
# NOTE(review): stable-baselines3 appears to append ".zip" only to names that
# have no suffix -- confirm against the installed version.
PPO_path = os.path.join('Saved Model', 'PPO_2.5L.zip')
model.save(PPO_path)

### Evaluate Model

In [48]:
# Score the trained policy over a few rendered episodes and report the result.
eval_env = gym.make("MountainCar-v0", render_mode='human')
try:
    # evaluate_policy returns (mean episode reward, standard deviation).
    mean_reward, std_reward = evaluate_policy(
        model, eval_env, n_eval_episodes=5, render=True
    )
finally:
    # Release the render window even if evaluation raises or is interrupted.
    eval_env.close()
print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

### Test the Model

In [49]:
# Watch the trained agent play a few rendered episodes and print each score.
test_env = gym.make("MountainCar-v0", render_mode='human')
episodes = 5
try:
    for episode in range(episodes):
        obs, _ = test_env.reset()
        terminated = False
        truncated = False
        score = 0

        # An episode is over once the env reports termination or truncation.
        while not (terminated or truncated):
            # Use the policy's preferred action rather than sampling one, so
            # the score reflects what was learned and matches the
            # deterministic default of evaluate_policy.
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, terminated, truncated, _ = test_env.step(action)
            score += reward
        print(f"Episode:{episode+1} Score:{score}")
finally:
    # Release the render window even if an episode raises or is interrupted.
    test_env.close()

Episode:1 Score:-200.0
Episode:2 Score:-200.0
Episode:3 Score:-200.0
Episode:4 Score:-200.0
Episode:5 Score:-200.0


### Alternate Algorithms

In [50]:
from stable_baselines3 import DQN

In [51]:
# Build a DQN agent on the same vectorized environment. Note that this rebinds
# `model`, so the previously trained agent is no longer reachable by that name.
model = DQN(policy='MlpPolicy', env=env, verbose=1)

Using cpu device


In [52]:
# Train the agent; as the cell's last expression, the returned model is echoed.
model.learn(total_timesteps=1_000_000)

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.992    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 2045     |
|    time_elapsed     | 0        |
|    total_timesteps  | 800      |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.127    |
|    n_updates        | 174      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.985    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 1985     |
|    time_elapsed     | 0        |
|    total_timesteps  | 1600     |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00351  |
|    n_updates        | 374      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rat

<stable_baselines3.dqn.dqn.DQN at 0x21ae1979610>

In [53]:
# Persist the trained agent under the "Saved Model" directory.
save_dir, save_name = 'Saved Model', 'DQN_1M'
DQN_Path = os.path.join(save_dir, save_name)
model.save(path=DQN_Path)

In [54]:
# Watch the trained agent play a few rendered episodes and print each score.
test_env = gym.make("MountainCar-v0", render_mode='human')
episodes = 5
try:
    for episode in range(episodes):
        obs, _ = test_env.reset()
        terminated = False
        truncated = False
        score = 0

        # An episode is over once the env reports termination or truncation.
        while not (terminated or truncated):
            # Always take the greedy action: without deterministic=True the
            # agent may still pick exploratory random actions at test time.
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, terminated, truncated, _ = test_env.step(action)
            score += reward
        print(f"Episode:{episode+1} Score:{score}")
finally:
    # Release the render window even if an episode raises or is interrupted.
    test_env.close()

Episode:1 Score:-200.0
Episode:2 Score:-200.0
Episode:3 Score:-200.0
Episode:4 Score:-200.0
Episode:5 Score:-200.0
