In [None]:
# Let's import some useful libraries
import warnings
warnings.filterwarnings(action="ignore", category=FutureWarning)
%matplotlib notebook
%load_ext autoreload
%autoreload 2
from change_param import Param
import matplotlib.pyplot as plt
import gym
import numpy as np
import param
from gym_film.envs import make_env
from stable_baselines.common.vec_env import SubprocVecEnv
from stable_baselines.common.vec_env import DummyVecEnv
# Parameter helper: the cells below call p.update_dic({...}) to override
# simulation settings (number of jets, spacing, jet power, ...).
# NOTE(review): presumably these overrides are what the `param` module exposes
# afterwards (param.n_jets, param.jets_position) — confirm in change_param.
p= Param()

# Exploiting locality by making each jet a separate environment

### Parallelism in DRL

One of the nice things you can do in DRL to accelerate learning is **using several cloned environments** executing in parallel to train the **same agent**.

**These environments are expected to be identical**; their trajectories only diverge because the policy is stochastic. This lets the agent explore more trajectories, more quickly, than it could with a single environment.

### In our case

Now we will use that. But **we will not actually use cloned environments** of the environment we used before. That would mean duplicating the simulation itself, which would be very compute-intensive, the simulation being the bottleneck of our computation.

Instead, we will make an environment for each jet, **"tricking" the agent into thinking that every one of our jets is the same entity !** The agent doesn't need to understand the actual reality of the simulation.

I will spare you the details of the implementation, but let's see a figure that simplifies what's going on here:

![m3](img/method3.png)

### Let's have some training going

Let's try something - we want **a policy that can work when we use a large number of jets.**

Then, let's have two jets:
- one at $x=150$, where very small waves are just starting to develop
- one at $x=200$, where there are already large waves 

This way, our agent will see quite **different trajectories** from these two jets, and will learn a policy that can adapt to both situations.

We can then see how good it is on more jets ($10+$)

In [None]:
# Training scenario: two jets, the first at x=150 and the second 50 further (x=200).
n_jets = 2
position_first_jet = 150
space_between_jets = 50

# NOTE(review): presumably the size of the observation window used to compute
# the reward — confirm in gym_film.
size_obs_to_reward = 20

# Jet power
JET_MAX_POWER = 5.0

# Push the overrides into the shared parameter helper.
p.update_dic(dict(
    n_jets=n_jets,
    position_first_jet=position_first_jet,
    space_between_jets=space_between_jets,
    size_obs_to_reward=size_obs_to_reward,
    JET_MAX_POWER=JET_MAX_POWER,
))

# The simulation and the environments communicate through sockets in this
# method, so they need a communication port. This is only the base value;
# the cells that create environments increment it before using it.
port = 12000

In [None]:
# Use a fresh port for this set of environments.
port = port + 1

from gym_film.envs import make_env
from stable_baselines import PPO2
from gym_film.model.custom_mlp import CustomPolicy

policy = CustomPolicy

# Build the '1env_1jet' environments (no rendering during training) and wrap
# them in a vectorized environment so that a single agent is trained on all of them.
envs = make_env.make_env(
    '1env_1jet',
    param.n_jets,
    param.jets_position,
    render=False,
    port=port,
)
env = DummyVecEnv(envs)
obs = env.reset()

model = PPO2(
    policy,
    env=env,
    n_steps=param.nb_timestep_per_simulation,
    verbose=1,
)

# Train the agent for 40000 environment steps, i.e. 50 episodes:
# one episode is now 800 steps, because the two environments each need
# 400 steps to complete an episode.
n_step_training = 50 * 800
model.learn(total_timesteps=n_step_training)

In [None]:
# Render the policy trained in the previous cell on the same two-jet setup.

# Use a fresh port for the rendering environments, exactly like every other
# cell that builds environments. Without it, make_env falls back to its default
# port while the training environments are still alive.
port += 1

from gym_film.envs import make_env
envs = make_env.make_env('1env_1jet', param.n_jets, param.jets_position,
                         render=True, plot_jets=True, port=port)
env = DummyVecEnv(envs)
obs = env.reset()

# Duration of the rendering (same unit as param.simulation_step_time) -
# you can increase it to see how the control adapts to big waves created by a perturbation jet
time_simulation = 20
render_total_timesteps = int(time_simulation / param.simulation_step_time)

# Set to False to render the uncontrolled flow (zero action) instead of the agent.
use_agent = True

# NOTE(review): the environment is reset a second time right before the rollout;
# the earlier reset looks redundant — confirm it has no needed side effect before removing it.
obs = env.reset()
for i in range(render_total_timesteps):
    if use_agent:
        action, _states = model.predict(obs)
    else:
        action = [np.array([0 for k in range(param.n_jets)])]
    obs, rewards, done, info = env.step(action)

## Let's transfer that policy to a different case

As I said before, with this method **the number of jets doesn't matter**, so let's put more and see how our model does !

I tried with 20 jets, with no space between them. Let's see the rendering:

In [None]:
# Transfer scenario: keep the trained policy, but control 20 jets spaced 5 apart.
n_jets = 20
space_between_jets = 5

p.update_dic(dict(n_jets=n_jets, space_between_jets=space_between_jets))

In [None]:
# Fresh port for this new set of environments.
port = port + 1

from gym_film.envs import make_env

# Rebuild the environments with the updated jet layout, this time with rendering on.
envs = make_env.make_env(
    '1env_1jet',
    param.n_jets,
    param.jets_position,
    render=True,
    plot_jets=True,
    port=port,
)
env = DummyVecEnv(envs)
obs = env.reset()

# Duration of the rendering here -
# you can increase it to see how the control adapts to big waves created by a perturbation jet
time_simulation = 20
render_total_timesteps = int(time_simulation / param.simulation_step_time)

# Switch to False to apply a zero action instead of querying the agent.
use_agent = True

obs = env.reset()
for step in range(render_total_timesteps):
    if not use_agent:
        action = [np.array([0] * param.n_jets)]
    else:
        action, _states = model.predict(obs)
    obs, rewards, done, info = env.step(action)

## Good policy

Let's see what a good policy can do !

This one was trained with **20 jets**, without any space between them.

The jets' maximum power was $5.0$, and the agent was trained for $1.2$M steps, i.e. $150$ episodes of the simulation. However, the training reached a reward plateau after the $30$th episode and showed no improvement after that.

In [None]:
# Evaluation scenario for the pre-trained policy: 15 jets spaced 10 apart,
# with the maximum jet power lowered to 1.
n_jets = 15
space_between_jets = 10
JET_MAX_POWER = 1

p.update_dic(dict(
    n_jets=n_jets,
    space_between_jets=space_between_jets,
    JET_MAX_POWER=JET_MAX_POWER,
))

As you can see, I don't even use the same parameters as during the training, but **the results are even better** this way ! You can try using the training's parameters in comparison.

This makes me think we could dynamically change the maximum power of the jets during the training or validation phase to get better results.

Anyway, let's see the result:

In [None]:
# Fresh port for this new set of environments.
port = port + 1

from gym_film.envs import make_env

# Rebuild the environments for the current jet layout, with rendering on.
envs = make_env.make_env(
    '1env_1jet',
    param.n_jets,
    param.jets_position,
    render=True,
    plot_jets=True,
    port=port,
)
env = DummyVecEnv(envs)
obs = env.reset()

# Load a previously saved PPO2 model from disk and attach it to these environments.
model_path = 'm3.zip'
model = PPO2.load(model_path, env=env)

# Duration of the rendering here -
# you can increase it to see how the control adapts to big waves created by a perturbation jet
time_simulation = 20
render_total_timesteps = int(time_simulation / param.simulation_step_time)

# Roll the loaded policy out for the requested number of steps.
obs = env.reset()
for step in range(render_total_timesteps):
    action, _states = model.predict(obs)
    obs, rewards, done, info = env.step(action)

### This is it for me

I hope you liked this little practical exercise, and that you now want to know more about DRL. It's a very fast-growing field, and one that already shows very promising results.