# Space Navigator with Baseline

In [1]:
import numpy as np

from space_navigator.models.CE import CrossEntropy
from space_navigator.models.baseline import Baseline
from space_navigator.utils import read_environment

In [2]:
# Path to the environment file (relative path, resolved against the
# notebook's working directory).
env_path = "../../../data/environments/collision.env"
# Load the environment object that is handed to the models as `env`.
env = read_environment(env_path)

# Training

### Init

In [3]:
# Show the docstring of Baseline.__init__ to see which constructor
# arguments the baseline model accepts.
print(Baseline.__init__.__doc__)


        Agrs:
            env (Environment): environment with given parameteres.
            step (float): time step in simulation.
            reverse (bool): 
                if True: there are selected exactly 2 maneuvers
                    while the second of them is reversed to the first one;
                if False: one maneuver.

        


In [4]:
# Keyword arguments for the Baseline constructor.
init_parameters = dict(
    env=env,       # environment loaded from the .env file
    step=1e-6,     # time step in simulation
    reverse=True,  # two maneuvers, the second one reversing the first
)

In [5]:
# Build the baseline model from the constructor arguments in `init_parameters`.
baseline_model = Baseline(**init_parameters)

### Iteration

(Currently the baseline is trained in a single iteration.)

In [6]:
# Show the docstring of the model's `iteration` method (arguments and
# return value of one training iteration).
print(baseline_model.iteration.__doc__)

Training iteration.

        Args:
            print_out (bool): print iteration information.
            n_sessions (int): number of sessions to generate.

        Returns:
            stop (bool): whether to stop training after iteration.

        


In [7]:
# Keyword arguments for a training iteration:
# n_sessions is the number of sessions to generate.
iteration_parameters = dict(n_sessions=1000)

In [8]:
# one iteration
# baseline_model.iteration(**iteration_parameters)

### Train

In [9]:
# Show the docstring of the model's `train` method (training arguments).
print(baseline_model.train.__doc__)

Training agent policy (self.action_table).

        Args:
            n_iterations (int): number of iterations.
            print_out (bool): print information during the training.
            *args and **kwargs: iteration arguments, depend on method (inheritor class).

        TODO:
            add early stopping
            add log
            decorate by print_out and log?
        


In [10]:
# Baseline training settings: one iteration, with training information printed.
n_iterations, print_out = 1, True

In [11]:
# Train the baseline policy: the number of iterations and the print flag are
# passed positionally, the iteration settings (n_sessions) as keyword arguments.
baseline_model.train(n_iterations, print_out, **iteration_parameters)

  0%|          | 0/1000 [00:00<?, ?it/s]


Start training.

Initial action table:
[]
Initial Reward: -13489.530475182863

iteration: 1/1


100%|██████████| 1000/1000 [00:09<00:00, 107.06it/s]


Training completed in 9.3426 sec.
Total Reward: -0.5260470455285451
Action Table:
[[ 0.          0.          0.          0.        ]
 [ 1.60913022  1.51055235  0.02636679  0.07934858]
 [-1.60913022 -1.51055235 -0.02636679         nan]]





In [12]:
# Display the action table obtained by training the baseline model.
# NOTE(review): the last row ends with nan in the output — confirm this is
# the intended encoding for the final action.
baseline_model.action_table

array([[ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 1.60913022,  1.51055235,  0.02636679,  0.07934858],
       [-1.60913022, -1.51055235, -0.02636679,         nan]])

In [13]:
# save_path =
# baseline_model.save_action_table(save_path)

# Tuning

In [14]:
# Constructor settings intended for the CrossEntropy tuning model.
CE_init_parameters = dict(
    env=env,
    step=1e-6,
    reverse=True,
    first_maneuver_time="early",
    n_maneuvers=2,
    lr=0.9,
    percentile=95,
    sigma_dV=None,
    sigma_t=None,
)
# Per-iteration settings passed to CrossEntropy training as keyword arguments.
CE_iteration_parameters = dict(
    n_sessions=50,
    sigma_decay=0.9,
    lr_decay=0.9,
    percentile_growth=1.01,
    show_progress=False,
    dV_angle="complanar",
    step_if_low_reward=False,
    early_stopping=True,
)

In [15]:
# Build the tuning model from the CrossEntropy-specific settings
# (CE_init_parameters) rather than the baseline's init_parameters, so that
# lr, percentile, n_maneuvers, etc. configured for tuning actually take effect.
# NOTE(review): stored outputs below were produced before this change and may
# differ after re-running.
tune_CE_model = CrossEntropy(**CE_init_parameters)

In [16]:
# Tuning settings: at most 50 iterations, with the training printout disabled.
n_iterations, print_out = 50, False

# Start from the policy found by the baseline model, then tune it.
tune_CE_model.set_action_table(baseline_model.action_table)
tune_CE_model.train(n_iterations, print_out, **CE_iteration_parameters)

# Report the tuned action table and the reward of the resulting policy.
print(f"Actions table after tuning:\n{tune_CE_model.action_table}")
print(f"\nReward: {tune_CE_model.policy_reward}")

100%|██████████| 50/50 [00:00<00:00, 95.70it/s]
100%|██████████| 50/50 [00:00<00:00, 97.24it/s] 
100%|██████████| 50/50 [00:00<00:00, 102.37it/s]
100%|██████████| 50/50 [00:00<00:00, 100.37it/s]
100%|██████████| 50/50 [00:00<00:00, 101.10it/s]
100%|██████████| 50/50 [00:00<00:00, 99.67it/s] 
100%|██████████| 50/50 [00:00<00:00, 99.80it/s] 
100%|██████████| 50/50 [00:00<00:00, 97.30it/s]
100%|██████████| 50/50 [00:00<00:00, 99.84it/s] 
100%|██████████| 50/50 [00:00<00:00, 101.80it/s]
100%|██████████| 50/50 [00:00<00:00, 101.58it/s]
100%|██████████| 50/50 [00:00<00:00, 101.42it/s]
100%|██████████| 50/50 [00:00<00:00, 101.26it/s]
100%|██████████| 50/50 [00:00<00:00, 101.64it/s]
100%|██████████| 50/50 [00:00<00:00, 102.21it/s]
100%|██████████| 50/50 [00:00<00:00, 101.11it/s]
100%|██████████| 50/50 [00:00<00:00, 99.65it/s] 
100%|██████████| 50/50 [00:00<00:00, 99.72it/s] 
100%|██████████| 50/50 [00:00<00:00, 100.02it/s]
100%|██████████| 50/50 [00:00<00:00, 100.44it/s]

Actions table after tuning:
[[ 0.          0.          0.          0.        ]
 [ 1.60150488  1.51416711  0.02642989  0.07934858]
 [-1.60150488 -1.51416711 -0.02642989         nan]]

Reward: -0.5259282348274475



