## 0. Install library


In [None]:
# %pip install "stable-baselines3[extra]"  # run once: SB3 + extras for env wrappers and callbacks (%pip installs into the kernel's environment; quotes protect the brackets from the shell)

## 1. Import dependencies


In [None]:
# imports: os for paths, gymnasium for envs, PPO algorithm, DummyVecEnv to vectorize, evaluate_policy for quick eval
import os
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv  # -> vectorize environment
from stable_baselines3.common.evaluation import evaluate_policy

### Important RL tips (short):

- Start with simple envs (CartPole) to understand observation/action spaces.
- Watch renderings to debug agent behavior, but disable during heavy training.
- Track rewards and use evaluation callbacks to avoid overfitting or wasted compute.
- Small experiments (network size, timesteps) teach faster than big runs.


## 2. Load the environment


In [None]:
env_name = "CartPole-v1"  # Gymnasium environment id reused by the cells below
# render_mode='human' shows the env (use only for testing/visualizing)
env = gym.make(env_name, render_mode="human")
env  # display the created (wrapped) environment object

<TimeLimit<OrderEnforcing<PassiveEnvChecker<CartPoleEnv<CartPole-v1>>>>>

In [None]:
# (must be run every time)  Random-play loop: watch a few episodes driven by
# random actions to get a feel for the environment dynamics.
env = gym.make(env_name, render_mode="human")

episodes = 5
try:
    for episode in range(1, episodes + 1):
        # gymnasium's reset() returns an (observation, info) tuple
        state, info = env.reset()
        done = False
        score = 0

        while not done:
            # No explicit env.render() call: with render_mode="human" the
            # environment is drawn automatically during reset()/step().
            # random action (useful to inspect the action space)
            action = env.action_space.sample()
            state, reward, terminated, truncated, info = env.step(action)
            # the episode is over on failure (terminated) or time limit (truncated)
            done = terminated or truncated
            score += reward

        print('Episode:{} Score:{}'.format(episode, score))
finally:
    # release the render window even if the loop is interrupted
    env.close()

Episode:1 Score:27.0
Episode:2 Score:19.0
Episode:3 Score:14.0
Episode:4 Score:30.0
Episode:5 Score:34.0


In [None]:
env.action_space  # the space of valid actions (its type and size)

Discrete(2)

In [None]:
# 0 -> push cart to the left, 1 -> push cart to the right
env.action_space.sample()  # draw one random valid action

np.int64(1)

In [None]:
env.observation_space  # the space of observations: per-dimension bounds, shape and dtype

Box([-4.8               -inf -0.41887903        -inf], [4.8               inf 0.41887903        inf], (4,), float32)

In [None]:
# layout: [cart position, cart velocity, pole angle, pole angular velocity]
env.observation_space.sample()  # draw one random observation from the space

array([-4.4427724 ,  0.7804759 , -0.04131574, -0.873845  ], dtype=float32)

### Action Space

The action is an `ndarray` with shape `(1,)` which can take values `{0, 1}` indicating the direction
of the fixed force the cart is pushed with.

| Num | Action                 |
| --- | ---------------------- |
| 0   | Push cart to the left  |
| 1   | Push cart to the right |

**Note**: The velocity that is reduced or increased by the applied force is not fixed; it depends on the angle
at which the pole is pointing. The center of gravity of the pole varies the amount of energy needed to move the cart underneath it.

### Observation Space

The observation is an `ndarray` with shape `(4,)` with the values corresponding to the following positions and velocities:

| Num | Observation           | Min                 | Max               |
| --- | --------------------- | ------------------- | ----------------- |
| 0   | Cart Position         | -4.8                | 4.8               |
| 1   | Cart Velocity         | -Inf                | Inf               |
| 2   | Pole Angle            | ~ -0.418 rad (-24°) | ~ 0.418 rad (24°) |
| 3   | Pole Angular Velocity | -Inf                | Inf               |


## 3. Train the model


![image.png](attachment:image.png)


In [None]:
# Folder handed to the model as its TensorBoard log directory.
log_dir_parts = ("Training", "Logs")
log_path = os.path.join(*log_dir_parts)
log_path  # echo the resolved path (the separator is platform-specific)

'Training\\Logs'

In [None]:
# Build the (non-rendering, hence faster) training env inside the factory.
# DummyVecEnv calls each factory to create its environment, so the lambda must
# not read the global name `env`, which this very statement rebinds to the
# vectorized wrapper.
env = DummyVecEnv([lambda: gym.make(env_name)])
model = PPO('MlpPolicy', env, verbose=1,
            tensorboard_log=log_path)  # PPO with an MLP policy; TensorBoard logs go to log_path

Using cuda device




In [None]:
# Train for 20,000 timesteps; raise or lower total_timesteps depending on
# how quickly the reward converges
model.learn(total_timesteps=20000)

## 4. Save model


In [None]:
# Location the trained PPO model is saved to and later loaded from.
saved_model_parts = ["Training", "Saved_Models", "PPO_Model_CartPole"]
PPO_Path = os.path.join(*saved_model_parts)

In [None]:
model.save(PPO_Path)  # write the trained model to PPO_Path

In [None]:
del model  # drop the in-memory model so the next cell demonstrates loading it from disk

In [None]:
# Reload the saved model from PPO_Path; the env is passed along so the loaded
# model can be used for further training or evaluation
model = PPO.load(PPO_Path, env=env)

## 5. Evaluate the model


In [None]:
# Evaluation env without a render mode, so evaluation runs headless.
# (To watch instead, create it with render_mode="human" and pass render=True.)
env = gym.make(env_name)

# evaluate_policy returns (mean episode reward, std of episode reward);
# render=False matches the env above, which has no render mode to draw with
mean_reward, std_reward = evaluate_policy(
    model, env, n_eval_episodes=10, render=False)
env.close()  # release the evaluation env once the scores are computed

mean_reward, std_reward  # shown as the cell output



(np.float64(500.0), np.float64(0.0))

## 6. Test the model


In [None]:
# (must be run every time)  Deterministic play: watch the trained model drive
# the environment to inspect its behavior.
env = gym.make(env_name, render_mode="human")

episodes = 5
try:
    for episode in range(1, episodes + 1):
        # gymnasium's reset() returns an (observation, info) tuple
        observations, info = env.reset()
        done = False
        score = 0

        while not done:
            # No explicit env.render() call: with render_mode="human" the
            # environment is drawn automatically during reset()/step().
            # deterministic=True: take the policy's preferred action instead of
            # sampling from its action distribution (this is what makes the
            # play "deterministic", as the cell header promises)
            action, _ = model.predict(observations, deterministic=True)
            observations, reward, terminated, truncated, info = env.step(action)
            # the episode is over on failure (terminated) or time limit (truncated)
            done = terminated or truncated
            score += reward

        print('Episode:{} Score:{}'.format(episode, score))
finally:
    # release the render window even if the loop is interrupted
    env.close()

Episode:1 Score:500.0
Episode:2 Score:264.0
Episode:3 Score:238.0
Episode:4 Score:317.0
Episode:5 Score:500.0


In [None]:
model.predict(observations)  # quick single-step prediction on the last observation; returns an (action, state) tuple

(array(0), None)

In [None]:
# One manual step: reset the env, ask the model for an action, apply it.
# NOTE(review): `env` was closed at the end of the test loop above — confirm
# that calling reset() on the closed human-mode env behaves as intended.
observations, info = env.reset()
action, _ = model.predict(observations)
env.step(action)  # step the env with model action -> (observation, reward, terminated, truncated, info)

(array([-0.04126324, -0.19534215,  0.00097332,  0.27127814], dtype=float32),
 1.0,
 False,
 False,
 {})

## 6. Adding callbacks


In [None]:
# callbacks to auto-eval and stop training
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

In [None]:
save_path = os.path.join("Training", "Saved_Models")  # folder for the best model found during evaluation
env = gym.make(env_name)       # training env (handed to the model in the next cell)
# Separate env used only for periodic evaluation: evaluating on the training
# env instance would reset and step it in the middle of a training rollout.
eval_env = gym.make(env_name)

stop_callback = StopTrainingOnRewardThreshold(
    reward_threshold=300, verbose=1)  # stop when target reward reached
eval_callback = EvalCallback(eval_env,
                             callback_on_new_best=stop_callback,
                             eval_freq=10000,
                             best_model_save_path=save_path,
                             verbose=1)  # evaluate periodically and save best model

In [None]:
# Fresh, untrained PPO model on `env`, to be trained with the callbacks
model = PPO("MlpPolicy", env, verbose=1)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




In [None]:
# Train with periodic evaluation; eval_callback's stop_callback ends training
# early once a new best evaluation reaches the reward threshold
model.learn(total_timesteps=20000, callback=eval_callback)

## 7. Change Policy


In [None]:
# Four hidden layers of 128 units each for the policy (pi) and the value (vf)
# network. Passed as a plain dict: the list-wrapped form
# [dict(pi=..., vf=...)] is deprecated since SB3 v1.8.0.
net_arch = dict(pi=[128, 128, 128, 128], vf=[128, 128, 128, 128])
model = PPO('MlpPolicy', env, verbose=1,
            # new policy
            policy_kwargs={'net_arch': net_arch})  # customize policy network sizes

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




In [None]:
# Train the new policy architecture with its own evaluation callback.
# An EvalCallback instance keeps the best mean reward it has seen, so reusing
# the callback from the previous run would only save a model / stop training
# when this model beats the earlier one.
arch_eval_callback = EvalCallback(
    gym.make(env_name),  # dedicated evaluation env, separate from the training env
    callback_on_new_best=StopTrainingOnRewardThreshold(
        reward_threshold=300, verbose=1),  # stop when target reward reached
    eval_freq=10000,
    best_model_save_path=save_path,
    verbose=1)
model.learn(total_timesteps=20000, callback=arch_eval_callback)

## Using an alternate algorithm


In [None]:
# DQN is another RL algorithm (value-based), good for discrete actions
from stable_baselines3 import DQN

In [None]:
model = DQN("MlpPolicy", env, verbose=1)  # create a DQN model with an MLP policy on the same env

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [None]:
model.learn(total_timesteps=20000)  # train DQN for 20,000 timesteps