# MVP Pipeline

## Step 1 : Install dependencies

In [1]:
import gym
from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy
import matplotlib.pyplot as plt 

## Step 2 : setup RL environment

In [2]:
# Create environment
env_name = 'LunarLander-v2'
env = gym.make(env_name)

## Step 3 : RL model training

In [5]:
# Instantiate the agent
model = DQN('MlpPolicy', env, verbose=1)
## exploration_final_eps=0.1, target_update_interval=250

# Train the agent
model.learn(total_timesteps=int(1e6))

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 95       |
|    ep_rew_mean      | -135     |
|    exploration rate | 0.996    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 4961     |
|    time_elapsed     | 0        |
|    total timesteps  | 380      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 105      |
|    ep_rew_mean      | -158     |
|    exploration rate | 0.992    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 6215     |
|    time_elapsed     | 0        |
|    total timesteps  | 842      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 107      |
|    ep_rew_mean      | -146     |
|    exploration rate | 0.988    |
| time/               |          |
|    episodes       

<stable_baselines3.dqn.dqn.DQN at 0x14ed440a0>

## Step 4 : RL performance evaluation

In [6]:
# Evaluate rewards of X episodes
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=50)
print(f"mean_reward = {mean_reward:.2f} +/- {std_reward}")

mean_reward = 206.94 +/- 66.29538645610968


In [5]:
# Evaluate score of X episodes
episodes = 10
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0 
    
    while not done:
        # action = env.action_space.sample()
        action, _states = model.predict(state, deterministic=True)
        state, reward, done, info = env.step(action)
        score += reward
        env.render()
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

Episode:1 Score:253.03489491225028
Episode:2 Score:241.82517805466304
Episode:3 Score:124.11066242996381
Episode:4 Score:111.84710417887614
Episode:5 Score:182.53179117837925
Episode:6 Score:228.1282998993443
Episode:7 Score:223.57938434818303
Episode:8 Score:247.12375418244366
Episode:9 Score:284.8110964051093
Episode:10 Score:250.41523609771204


In [23]:
# render X episodes
evaluate_policy(model, env, n_eval_episodes=5, render=True)
env.close()

In [8]:
# render 1 episode
obs = env.reset()
for i in range(1000):
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    env.render()
    # env_screen = env.render(mode='rgb_array')  # collect frame of the env.
    if done:
        print("done at", i)
        break
env.close()
# plt.imshow(env_screen)

done at 489


## Step 5 : save RL model

In [9]:
# Save the agent
model.save("dqn_lunar_X")

In [10]:
del model

In [3]:
# Load the trained agent
model = DQN.load("dqn_lunar_2", env=env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [4]:
model.policy

DQNPolicy(
  (q_net): QNetwork(
    (features_extractor): FlattenExtractor(
      (flatten): Flatten(start_dim=1, end_dim=-1)
    )
    (q_net): Sequential(
      (0): Linear(in_features=8, out_features=64, bias=True)
      (1): ReLU()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): ReLU()
      (4): Linear(in_features=64, out_features=4, bias=True)
    )
  )
  (q_net_target): QNetwork(
    (features_extractor): FlattenExtractor(
      (flatten): Flatten(start_dim=1, end_dim=-1)
    )
    (q_net): Sequential(
      (0): Linear(in_features=8, out_features=64, bias=True)
      (1): ReLU()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): ReLU()
      (4): Linear(in_features=64, out_features=4, bias=True)
    )
  )
)

# ENV modifications (optional)

## Step 1 : Explore Observation and Action Spaces

In [14]:
obs_space = env.observation_space
action_space = env.action_space
print("The observation space: ", obs_space)
print("The action space: ", action_space)

The observation space:  Box([-inf -inf -inf -inf -inf -inf -inf -inf], [inf inf inf inf inf inf inf inf], (8,), float32)
The action space:  Discrete(4)


In [16]:
print("Upper Bound for Env Observation", obs_space.high)
print("Lower Bound for Env Observation", obs_space.low)

Upper Bound for Env Observation [inf inf inf inf inf inf inf inf]
Lower Bound for Env Observation [-inf -inf -inf -inf -inf -inf -inf -inf]


In [18]:
print("The observation space's shape: ", obs_space.shape)

The observation space's shape:  (8,)


In [48]:
# Explore action_space

env.reset()

# Works
env.step(2)
print("It works!")

# Doesn't work (Discrete(4) have actions 0-3)
env.step(4)
print("It works!")

It works!


AssertionError: 4 (<class 'int'>) invalid 

## Step 2 : Apply Wrapper

### vanilla wrapper
objective: make time-series observation_space

In [31]:
from collections import deque
from gym import spaces
import numpy as np

class ConcatObs(gym.Wrapper):
    def __init__(self, env, k):
        gym.Wrapper.__init__(self, env)
        self.k = k
        self.frames = deque([], maxlen=k)
        shp = env.observation_space.shape
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=((k,) + shp), dtype=env.observation_space.dtype)

    def reset(self):
        ob = self.env.reset()
        for _ in range(self.k):
            self.frames.append(ob)
        return self._get_ob()

    def step(self, action):
        ob, reward, done, info = self.env.step(action)
        self.frames.append(ob)
        return self._get_ob(), reward, done, info

    def _get_ob(self):
        return np.array(self.frames)

    def test(self):
        print("test")

In [32]:
env_name = 'LunarLander-v2'
env = gym.make(env_name)
vanilla_wrapped_env = ConcatObs(env, 4)
print("The new observation space is", vanilla_wrapped_env.observation_space)
print("The new observation space's shape: ", vanilla_wrapped_env.observation_space.shape)

The new observation space is Box([[-inf -inf -inf -inf -inf -inf -inf -inf]
 [-inf -inf -inf -inf -inf -inf -inf -inf]
 [-inf -inf -inf -inf -inf -inf -inf -inf]
 [-inf -inf -inf -inf -inf -inf -inf -inf]], [[inf inf inf inf inf inf inf inf]
 [inf inf inf inf inf inf inf inf]
 [inf inf inf inf inf inf inf inf]
 [inf inf inf inf inf inf inf inf]], (4, 8), float32)
The new observation space's shape:  (4, 8)


In [45]:
# class attributes and methods
print([temp for temp in dir(vanilla_wrapped_env) if not temp.startswith("_")])

['action_space', 'class_name', 'close', 'compute_reward', 'env', 'frames', 'k', 'metadata', 'observation_space', 'render', 'reset', 'reward_range', 'seed', 'spec', 'step', 'test', 'unwrapped']


In [42]:
# Reset the Env
obs = vanilla_wrapped_env.reset()
print("Intial obs is of the shape", obs.shape)
print(obs.round(2))

# Take one step
obs, _, _, _  = vanilla_wrapped_env.step(2)
print("Obs after taking a step is", obs.shape)
print(obs.round(2))

Intial obs is of the shape (4, 8)
[[-0.    1.41 -0.37 -0.11  0.    0.08  0.    0.  ]
 [-0.    1.41 -0.37 -0.11  0.    0.08  0.    0.  ]
 [-0.    1.41 -0.37 -0.11  0.    0.08  0.    0.  ]
 [-0.    1.41 -0.37 -0.11  0.    0.08  0.    0.  ]]
Obs after taking a step is (4, 8)
[[-0.    1.41 -0.37 -0.11  0.    0.08  0.    0.  ]
 [-0.    1.41 -0.37 -0.11  0.    0.08  0.    0.  ]
 [-0.    1.41 -0.37 -0.11  0.    0.08  0.    0.  ]
 [-0.01  1.41 -0.39 -0.09  0.01  0.07  0.    0.  ]]


### specific wrappers

In [16]:
import random

class ObservationWrapper(gym.ObservationWrapper):
    def __init__(self, env):
        super().__init__(env)
    
    def observation(self, obs):
        # Normalise observation by 255
        return obs / 255.0

    def test(self):
        print("test")
    
class RewardWrapper(gym.RewardWrapper):
    def __init__(self, env):
        super().__init__(env)
    
    def reward(self, reward):
        # Clip reward between 0 to 1
        return np.clip(reward, 0, 1)
    
class ActionWrapper(gym.ActionWrapper):
    def __init__(self, env):
        super().__init__(env)
    
    def action(self, action):
        if action == 2:
            return random.choice([0,1,3])
        else:
            return action

In [17]:
env_name = 'LunarLander-v2'
env = gym.make(env_name)
wrapped_env = ObservationWrapper(RewardWrapper(ActionWrapper(env)))

In [54]:
obs = wrapped_env.reset()

for i in range(500):
    action = wrapped_env.action_space.sample()
    obs, reward, done, info = wrapped_env.step(action)
    
    # Raise a flag if values have not been vectorised properly
    if (obs > 2.0).any() or (obs < -2.0).any():
        print("Max or min value of observations out of range")
    
    # Raise a flag if reward has not been clipped.
    if reward < 0.0 or reward > 1.0:
        assert False, "Reward out of bounds"
    
    # Check the rendering if the spaceship don't fire the main engine
    wrapped_env.render()
    
    # if done:
    #     print("done at", i)
    #     break

wrapped_env.close()

print("All checks passed")

All checks passed


In [46]:
# class attributes and methods
print([temp for temp in dir(wrapped_env) if not temp.startswith("_")])

['action_space', 'class_name', 'close', 'compute_reward', 'env', 'metadata', 'observation', 'observation_space', 'render', 'reset', 'reward_range', 'seed', 'spec', 'step', 'test', 'unwrapped']


## Step 3 : Vectorized Environment

In [27]:
import gym
from stable_baselines3.common.vec_env.subproc_vec_env import SubprocVecEnv

# List of Env
num_envs = 3
env_name = 'LunarLander-v2'
envs = [lambda: gym.make(env_name) for i in range(num_envs)]

# Vec Env 
envs = SubprocVecEnv(envs)

In [28]:
# Get initial state
init_obs = envs.reset()

# We get a list of observations corresponding to parallel environments 
print("Number of Envs:", len(init_obs))
print(init_obs)

# Check out of the obs 
one_obs = init_obs[0]
print("\nShape of one Env:", one_obs.shape)
print(one_obs)

# prepare a list of actions and apply them to each environment 
actions = [0, 1, 2]
obs, rewards, dones, _ = envs.step(actions)
print("\nResults of step function:")
print("obs: ", obs)
print("rewards: ", rewards)
print("dones: ", dones)

Number of Envs: 3
[[ 2.0866394e-04  1.4159008e+00  2.1123327e-02  2.2136579e-01
  -2.3503168e-04 -4.7847279e-03  0.0000000e+00  0.0000000e+00]
 [-6.9787027e-03  1.4141301e+00 -7.0688903e-01  1.4264348e-01
   8.0934241e-03  1.6012080e-01  0.0000000e+00  0.0000000e+00]
 [ 6.8333624e-03  1.3996953e+00  6.9212472e-01 -4.9890202e-01
  -7.9113161e-03 -1.5677658e-01  0.0000000e+00  0.0000000e+00]]

Shape of one Env: (8,)
[ 2.0866394e-04  1.4159008e+00  2.1123327e-02  2.2136579e-01
 -2.3503168e-04 -4.7847279e-03  0.0000000e+00  0.0000000e+00]

Results of step function:
obs:  [[ 4.1723251e-04  1.4203035e+00  2.1094989e-02  1.9567196e-01
  -4.7141171e-04 -4.7273817e-03  0.0000000e+00  0.0000000e+00]
 [-1.4029121e-02  1.4167694e+00 -7.1484143e-01  1.1721561e-01
   1.7791443e-02  1.9397831e-01  0.0000000e+00  0.0000000e+00]
 [ 1.3715362e-02  1.3892778e+00  6.9584286e-01 -4.6305344e-01
  -1.5459266e-02 -1.5097317e-01  0.0000000e+00  0.0000000e+00]]
rewards:  [ 2.09264548 -1.59399329  1.7180019 ]
do

In [None]:
init_obs = envs.reset()

for i in range(1000):
    actions = [envs.action_space.sample() for i in range(num_envs)]
    envs.step(actions)
    envs.render()

envs.close()