# Space Navigator with the Cross-Entropy (CE) method

In [1]:
import numpy as np

from space_navigator.models.CE import CrossEntropy
from space_navigator.models.collinear_GS import CollinearGridSearch
from space_navigator.utils import read_environment

In [2]:
# Fix the random seed so the stochastic training runs below are repeatable
# under Restart Kernel -> Run All.
# NOTE(review): assumes space_navigator samples from NumPy's global RNG -- confirm.
SEED = 0
np.random.seed(SEED)

# Load the environment shared by every model built in this notebook.
env_path = "../../../data/environments/collision.env"
env = read_environment(env_path)

# Training

### Init

In [3]:
# Print the raw docstring of CrossEntropy.__init__ to see the constructor arguments.
print(CrossEntropy.__init__.__doc__)


        Agrs:
            env (Environment): environment with given parameteres.
            step (float): time step in simulation.
            reverse (bool): if True, there are selected exactly 2 maneuvers
                while the second of them is reversed to the first one.
            first_maneuver_time (str): time to the first maneuver. Could be:
                "early": max time to the first maneuver, namely
                    max(0, 0.5, 1.5, 2.5 ... orbital_periods before collision);
                "auto".
            n_maneuvers (int): total number of maneuvers.
            lr (float): learning rate for stability.
            percentile_growth (float): coefficient of changing percentile.
            sigma_dV, sigma_t (float): sigma of dV and sigma of time_to_req.
                If None, the values are calculated automatically.

        TODO:
            path to save plots.
            variable step propagation step.

        


In [4]:
# Constructor arguments for the Cross-Entropy model trained in this section.
init_parameters = dict(
    env=env,
    step=1e-6,
    reverse=True,
    first_maneuver_time='early',
    n_maneuvers=2,
    lr=0.7,
    percentile=80,
    # Per the constructor docstring, None means the sigmas are calculated automatically.
    sigma_dV=None,
    sigma_t=None,
)

In [5]:
# Build the Cross-Entropy model from the constructor arguments defined above.
CE_model = CrossEntropy(**init_parameters)

### Iteration

In [6]:
# Print the docstring of the model's `iteration` method (arguments of one training iteration).
print(CE_model.iteration.__doc__)

Training iteration.

        Args:
            print_out (bool): print iteration information.
            n_sessions (int): number of sessions per iteration.
            sigma_decay (float): coefficient of changing sigma per iteration.
            lr_decay (float): coefficient of changing learning rate per iteration.
            percentile_growth (float): coefficient of changing percentile.
            show_progress (bool): show training chart.
            dV_angle (str): "complanar", "collinear" or "auto".
            step_if_low_reward (bool): whether to step to the new table
                if reward is lower than current or not.
            early_stopping (bool): whether to stop training
                if change of reward is negligibly small or not.

        Returns:
            stop (bool): whether to stop training after iteration.

        TODO:
            parallel
            log
            test

        


In [7]:
# Keyword arguments forwarded to every training iteration.
iteration_parameters = dict(
    n_sessions=30,
    sigma_decay=0.98,
    lr_decay=0.98,
    percentile_growth=1.005,
    show_progress=False,
    dV_angle='complanar',
    step_if_low_reward=False,
    early_stopping=True,
)

In [8]:
# Optional: run a single training iteration by hand (left disabled; `train` below runs the full loop).
# CE_model.iteration(**iteration_parameters)

### Train

In [9]:
# Print the docstring of the model's `train` method (arguments of the full training run).
print(CE_model.train.__doc__)

Training agent policy (self.action_table).

        Args:
            n_iterations (int): number of iterations.
            print_out (bool): print information during the training.
            *args and **kwargs: iteration arguments, depend on method (inheritor class).

        TODO:
            add early stopping
            add log
            decorate by print_out and log?
        


In [10]:
# Number of training iterations and verbosity flag, passed positionally to `train`.
n_iterations, print_out = 10, True

In [11]:
# Run training: n_iterations and print_out go positionally, the iteration settings as keywords.
CE_model.train(n_iterations, print_out, **iteration_parameters)

  0%|          | 0/30 [00:00<?, ?it/s]


Start training.

Initial action table:
[[0.         0.         0.         0.        ]
 [0.         0.         0.         0.03333333]
 [0.         0.         0.                nan]]
Initial Reward: -13489.530475182863

iteration: 1/10


100%|██████████| 30/30 [00:00<00:00, 86.73it/s]
 27%|██▋       | 8/30 [00:00<00:00, 79.07it/s]

Policy Reward: -4334.831385075398
Mean Reward:   -211.18501199231866
Max Reward:    -58.81214384367388
Threshold:     -59.51630430836516

iteration: 2/10


100%|██████████| 30/30 [00:00<00:00, 78.65it/s]
 30%|███       | 9/30 [00:00<00:00, 83.51it/s]

Policy Reward: -4334.831385075398
Mean Reward:   -954.0713019521978
Max Reward:    -45.99339565139638
Threshold:     -46.884056743425894

iteration: 3/10


100%|██████████| 30/30 [00:00<00:00, 84.74it/s]
 30%|███       | 9/30 [00:00<00:00, 85.07it/s]

Policy Reward: -4334.831385075398
Mean Reward:   -476.17674687232386
Max Reward:    -58.37479359328544
Threshold:     -69.41031537138508

iteration: 4/10


100%|██████████| 30/30 [00:00<00:00, 80.12it/s]
 30%|███       | 9/30 [00:00<00:00, 81.16it/s]

Policy Reward: -1166.223965270826
Mean Reward:   -863.589874523709
Max Reward:    -6.378934485327951
Threshold:     -42.35021561815753

iteration: 5/10


100%|██████████| 30/30 [00:00<00:00, 83.19it/s]
 33%|███▎      | 10/30 [00:00<00:00, 89.06it/s]

Policy Reward: -20.813393154452054
Mean Reward:   -485.81189305686513
Max Reward:    -23.509985662744135
Threshold:     -34.96188717512047

iteration: 6/10


100%|██████████| 30/30 [00:00<00:00, 83.59it/s]
 33%|███▎      | 10/30 [00:00<00:00, 95.83it/s]

Policy Reward: -5.972422555274126
Mean Reward:   -1165.8291152748013
Max Reward:    -50.8567699230661
Threshold:     -51.333481862693

iteration: 7/10


100%|██████████| 30/30 [00:00<00:00, 89.42it/s]
 30%|███       | 9/30 [00:00<00:00, 84.16it/s]

Policy Reward: -5.972422555274126
Mean Reward:   -310.48981825255373
Max Reward:    -1.1999662744447743
Threshold:     -16.045335484101777

iteration: 8/10


100%|██████████| 30/30 [00:00<00:00, 85.93it/s]
 30%|███       | 9/30 [00:00<00:00, 89.73it/s]

Policy Reward: -1.250985909188426
Mean Reward:   -318.3544154687656
Max Reward:    -5.3311370296320995
Threshold:     -35.572154159373625

iteration: 9/10


100%|██████████| 30/30 [00:00<00:00, 83.09it/s]
 30%|███       | 9/30 [00:00<00:00, 86.54it/s]

Policy Reward: -1.250985909188426
Mean Reward:   -243.46379630848887
Max Reward:    -6.132162269411417
Threshold:     -38.151115719375795

iteration: 10/10


100%|██████████| 30/30 [00:00<00:00, 87.72it/s]

Policy Reward: -1.250985909188426
Mean Reward:   -424.6471472863572
Max Reward:    -28.392408561735632
Threshold:     -32.0333861449681

Training completed in 3.7023 sec.
Total Reward: -1.250985909188426
Action Table:
[[ 0.          0.          0.          0.        ]
 [ 1.28539765  0.70594096  0.01232225  0.07934858]
 [-1.28539765 -0.70594096 -0.01232225         nan]]





In [12]:
# Display the action table held by the model after training.
CE_model.action_table

array([[ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 1.28539765,  0.70594096,  0.01232225,  0.07934858],
       [-1.28539765, -0.70594096, -0.01232225,         nan]])

In [13]:
# Optional: save the learned action table (left disabled; set `save_path` before uncommenting).
# save_path =
# CE_model.save_action_table(save_path)

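# Tuning

Refine an action table obtained with `CollinearGridSearch` by running Cross-Entropy training from it.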

In [14]:
# Tuning configuration.
# Note: this rebinds `init_parameters` and `iteration_parameters`, replacing the
# values used in the training section above.
init_parameters = dict(
    env=env,
    step=1e-6,
    reverse=True,
    first_maneuver_time='early',
    n_maneuvers=2,
    lr=0.9,
    percentile=95,
    sigma_dV=None,
    sigma_t=None,
)
iteration_parameters = dict(
    n_sessions=50,
    sigma_decay=0.9,
    lr_decay=0.9,
    percentile_growth=1.01,
    show_progress=False,
    dV_angle='complanar',
    step_if_low_reward=False,
    early_stopping=False,
)

In [15]:
# Grid-search model that supplies the starting action table for tuning.
init_model = CollinearGridSearch(env=env, step=1e-6, reverse=True)
# Cross-Entropy model to be tuned, built from the tuning `init_parameters` above.
tune_CE_model = CrossEntropy(**init_parameters)

In [16]:
# Fit the grid-search model, then keep its action table and reward as the
# starting point for the Cross-Entropy tuning.
init_model.train()
init_action_table = init_model.action_table
init_reward = init_model.policy_reward

# One print call with a newline separator emits the same text as two calls.
print(
    f"Actions table for tuning:\n{init_action_table}",
    f"\nReward: {init_reward}",
    sep="\n",
)

100%|██████████| 100/100 [00:01<00:00, 98.07it/s]

Actions table for tuning:
[[ 0.          0.          0.          0.        ]
 [ 1.58325676  1.48626394  0.02594283  0.07934858]
 [-1.58325676 -1.48626394 -0.02594283         nan]]

Reward: -0.7804488228736668





In [17]:
# Number of tuning iterations and verbosity flag for this run.
n_iterations, print_out = 10, True

# Start from the grid-search action table, then run Cross-Entropy training on it.
tune_CE_model.set_action_table(init_action_table)
tune_CE_model.train(n_iterations, print_out, **iteration_parameters)

# Report the tuned table and its reward; output matches two separate prints.
print(
    f"Actions table after tuning:\n{tune_CE_model.action_table}",
    f"\nReward: {tune_CE_model.policy_reward}",
    sep="\n",
)

 20%|██        | 10/50 [00:00<00:00, 95.28it/s]


Start training.

Initial action table:
[[ 0.          0.          0.          0.        ]
 [ 1.58325676  1.48626394  0.02594283  0.07934858]
 [-1.58325676 -1.48626394 -0.02594283         nan]]
Initial Reward: -0.7804488228736668

iteration: 1/10


100%|██████████| 50/50 [00:00<00:00, 92.86it/s]
 20%|██        | 10/50 [00:00<00:00, 96.15it/s]

Policy Reward: -0.7804488228736668
Mean Reward:   -205.85944084916238
Max Reward:    -1.0865420760132871
Threshold:     -1.1656146652801582

iteration: 2/10


100%|██████████| 50/50 [00:00<00:00, 93.69it/s]
 20%|██        | 10/50 [00:00<00:00, 94.47it/s]

Policy Reward: -0.6663004448026217
Mean Reward:   -125.00055635666851
Max Reward:    -1.293161415468465
Threshold:     -9.564547401108538

iteration: 3/10


100%|██████████| 50/50 [00:00<00:00, 91.99it/s]
 20%|██        | 10/50 [00:00<00:00, 93.91it/s]

Policy Reward: -0.6663004448026217
Mean Reward:   -267.4718325081348
Max Reward:    -0.7955816567957126
Threshold:     -2.878131935869903

iteration: 4/10


100%|██████████| 50/50 [00:00<00:00, 91.62it/s]
 16%|█▌        | 8/50 [00:00<00:00, 79.21it/s]

Policy Reward: -0.6663004448026217
Mean Reward:   -184.13633857728536
Max Reward:    -0.8309265852610184
Threshold:     -1.2278645283321548

iteration: 5/10


100%|██████████| 50/50 [00:00<00:00, 88.64it/s]
 20%|██        | 10/50 [00:00<00:00, 93.96it/s]

Policy Reward: -0.6663004448026217
Mean Reward:   -174.5851165301085
Max Reward:    -1.3490737420974879
Threshold:     -1.4984324299824419

iteration: 6/10


100%|██████████| 50/50 [00:00<00:00, 93.14it/s]
 20%|██        | 10/50 [00:00<00:00, 90.90it/s]

Policy Reward: -0.6663004448026217
Mean Reward:   -130.7456229997302
Max Reward:    -1.1906639271156187
Threshold:     -1.3785332629441558

iteration: 7/10


100%|██████████| 50/50 [00:00<00:00, 91.86it/s]
 18%|█▊        | 9/50 [00:00<00:00, 87.83it/s]

Policy Reward: -0.6663004448026217
Mean Reward:   -85.56890943161889
Max Reward:    -0.5271301554347526
Threshold:     -0.5365806489534398

iteration: 8/10


100%|██████████| 50/50 [00:00<00:00, 86.95it/s]
 20%|██        | 10/50 [00:00<00:00, 92.69it/s]

Policy Reward: -0.6663004448026217
Mean Reward:   -60.325112879613954
Max Reward:    -0.7800767517500613
Threshold:     -1.0973592496491154

iteration: 9/10


100%|██████████| 50/50 [00:00<00:00, 89.76it/s]
 18%|█▊        | 9/50 [00:00<00:00, 88.08it/s]

Policy Reward: -0.6663004448026217
Mean Reward:   -50.642182358885414
Max Reward:    -1.1057049100654184
Threshold:     -1.1436183294446007

iteration: 10/10


100%|██████████| 50/50 [00:00<00:00, 85.71it/s]

Policy Reward: -0.6663004448026217
Mean Reward:   -88.17987576634405
Max Reward:    -0.5297031697990151
Threshold:     -0.5573985618510595

Training completed in 5.6531 sec.
Total Reward: -0.6663004448026217
Action Table:
[[ 0.          0.          0.          0.        ]
 [ 1.50530357  0.66501406  0.01160786  0.07934858]
 [-1.50530357 -0.66501406 -0.01160786         nan]]
Actions table after tuning:
[[ 0.          0.          0.          0.        ]
 [ 1.50530357  0.66501406  0.01160786  0.07934858]
 [-1.50530357 -0.66501406 -0.01160786         nan]]

Reward: -0.6663004448026217



