# Space Navigator with Collinear Grid Search

In [1]:
import numpy as np

from space_navigator.models.CE import CrossEntropy
from space_navigator.models.collinear_GS import CollinearGridSearch
from space_navigator.utils import read_environment

In [2]:
# Location of the environment description file. The path is relative, so it is
# presumably resolved against the kernel's working directory - TODO confirm.
env_path = "../../../data/environments/collision.env"
# Load the environment object that the models in this notebook are built on.
env = read_environment(env_path)

# Training

### Init

In [3]:
# Print the constructor docstring to see which arguments CollinearGridSearch accepts.
print(CollinearGridSearch.__init__.__doc__)


        Agrs:
            env (Environment): environment with given parameteres.
            step (float): time step in simulation.
            reverse (bool): 
                if True: there are selected exactly 2 maneuvers
                    while the second of them is reversed to the first one;
                if False: one maneuver.
            first_maneuver_direction (str): first maneuver is collinear
                to the velocity vector and could be:
                    "forward" (co-directed)
                    "backward" (oppositely directed)
                    "auto" (just collinear).
        


In [4]:
# Constructor arguments for CollinearGridSearch:
#   env     - the environment loaded earlier in the notebook
#   step    - time step in simulation
#   reverse - True selects exactly 2 maneuvers, the second reversing the first
init_parameters = dict(env=env, step=1e-6, reverse=True)

In [5]:
# Build the grid-search model from the constructor arguments in `init_parameters`.
collinear_GS_model = CollinearGridSearch(**init_parameters)

### Iteration

(Currently, the grid search consists of just one iteration.)

In [6]:
# Print the docstring of `iteration` to see which arguments a training iteration accepts.
print(collinear_GS_model.iteration.__doc__)

Training iteration.

        Args:
            print_out (bool): print iteration information.
            n_sessions (int): number of sessions to generate.

        Returns:
            stop (bool): whether to stop training after iteration.

        


In [7]:
# Keyword arguments for a training iteration: the number of sessions to generate.
iteration_parameters = dict(n_sessions=1000)

In [8]:
# Optional: run a single iteration by hand. Left disabled here because
# `train` below is called with n_iterations = 1.
# collinear_GS_model.iteration(**iteration_parameters)

### Train

In [9]:
# Print the docstring of `train` to see which arguments the training loop accepts.
print(collinear_GS_model.train.__doc__)

Training agent policy (self.action_table).

        Args:
            n_iterations (int): number of iterations.
            print_out (bool): print information during the training.
            *args and **kwargs: iteration arguments, depend on method (inheritor class).

        TODO:
            add early stopping
            add log
            decorate by print_out and log?
        


In [10]:
# Training settings: a single iteration, with progress information printed.
n_iterations, print_out = 1, True

In [11]:
# Run the training loop; the extra keyword arguments are the iteration arguments.
collinear_GS_model.train(n_iterations, print_out, **iteration_parameters)

  1%|          | 11/1000 [00:00<00:09, 107.68it/s]


Start training.

Initial action table:
[]
Initial Reward: -13489.530475182863

iteration: 1/1


100%|██████████| 1000/1000 [00:09<00:00, 105.58it/s]


Training completed in 9.4744 sec.
Total Reward: -0.5260470455285451
Action Table:
[[ 0.          0.          0.          0.        ]
 [ 1.60913022  1.51055235  0.02636679  0.07934858]
 [-1.60913022 -1.51055235 -0.02636679         nan]]





In [12]:
# Action table obtained by the grid search (shown as the cell's output).
collinear_GS_model.action_table

array([[ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 1.60913022,  1.51055235,  0.02636679,  0.07934858],
       [-1.60913022, -1.51055235, -0.02636679,         nan]])

In [13]:
# Optional: save the action table (presumably to the file at `save_path`).
# Fill in `save_path` before uncommenting; it is intentionally left unset here.
# save_path =
# collinear_GS_model.save_action_table(save_path)

# Tuning

In [14]:
# Constructor arguments for the CrossEntropy model used in the tuning stage.
CE_init_parameters = dict(
    env=env,
    step=1e-6,
    reverse=True,
    first_maneuver_time='early',
    n_maneuvers=2,
    lr=0.9,
    percentile=95,
    sigma_dV=None,
    sigma_t=None,
)

# Keyword arguments handed to CrossEntropy training in the tuning cell below.
CE_iteration_parameters = dict(
    n_sessions=100,
    sigma_decay=0.9,
    lr_decay=0.9,
    percentile_growth=1.01,
    show_progress=False,
    dV_angle='complanar',
    step_if_low_reward=False,
    early_stopping=True,
)

In [15]:
# Build the cross-entropy model used for tuning.
# It must be configured with CE_init_parameters (defined in the previous cell),
# not the grid-search `init_parameters`: otherwise the CE-specific settings
# (lr, percentile, n_maneuvers, sigma_dV, sigma_t, first_maneuver_time) are
# silently ignored and the constructor defaults are used instead.
tune_CE_model = CrossEntropy(**CE_init_parameters)

In [16]:
# Tuning budget: up to 50 iterations, with progress information printed.
n_iterations, print_out = 50, True

# Warm start: seed the cross-entropy model with the grid-search result,
# then refine it with the CE iteration settings.
tune_CE_model.set_action_table(collinear_GS_model.action_table)
tune_CE_model.train(n_iterations, print_out, **CE_iteration_parameters)

# Report the tuned action table and the reward of the resulting policy.
print(f"Actions table after tuning:\n{tune_CE_model.action_table}")
print(f"\nReward: {tune_CE_model.policy_reward}")

  8%|▊         | 8/100 [00:00<00:01, 76.83it/s]


Start training.

Initial action table:
[[ 0.          0.          0.          0.        ]
 [ 1.60913022  1.51055235  0.02636679  0.07934858]
 [-1.60913022 -1.51055235 -0.02636679         nan]]
Initial Reward: -0.5260470455285451

iteration: 1/50


100%|██████████| 100/100 [00:01<00:00, 91.56it/s]
 10%|█         | 10/100 [00:00<00:00, 95.32it/s]

Policy Reward: -0.5260470455285451
Mean Reward:   -233.1354991146465
Max Reward:    -15.547391400031907
Threshold:     -35.594369310229496

iteration: 2/50


100%|██████████| 100/100 [00:01<00:00, 99.42it/s]
 10%|█         | 10/100 [00:00<00:00, 95.06it/s]

Policy Reward: -0.5260470455285451
Mean Reward:   -169.2179326244298
Max Reward:    -0.6648954874118923
Threshold:     -22.661941460580636

iteration: 3/50


100%|██████████| 100/100 [00:00<00:00, 100.03it/s]
 10%|█         | 10/100 [00:00<00:00, 98.07it/s]

Policy Reward: -0.5260470455285451
Mean Reward:   -134.13024309572742
Max Reward:    -4.505469222120893
Threshold:     -14.713987741470781

iteration: 4/50


100%|██████████| 100/100 [00:01<00:00, 95.37it/s]
 10%|█         | 10/100 [00:00<00:00, 96.98it/s]

Policy Reward: -0.5260470455285451
Mean Reward:   -135.9158082609062
Max Reward:    -23.791370914330287
Threshold:     -23.95303424854044

iteration: 5/50


100%|██████████| 100/100 [00:01<00:00, 99.61it/s]
 10%|█         | 10/100 [00:00<00:00, 97.08it/s]

Policy Reward: -0.5260470455285451
Mean Reward:   -88.79543734816703
Max Reward:    -9.397669260166847
Threshold:     -13.71184888257834

iteration: 6/50


100%|██████████| 100/100 [00:01<00:00, 99.45it/s]
 11%|█         | 11/100 [00:00<00:00, 102.43it/s]

Policy Reward: -0.5260470455285451
Mean Reward:   -72.3236538347343
Max Reward:    -3.3561204523590993
Threshold:     -15.970342113724119

iteration: 7/50


100%|██████████| 100/100 [00:00<00:00, 101.41it/s]
 10%|█         | 10/100 [00:00<00:00, 98.80it/s]

Policy Reward: -0.5260470455285451
Mean Reward:   -62.13333797135094
Max Reward:    -1.851620578046373
Threshold:     -3.669993854349833

iteration: 8/50


100%|██████████| 100/100 [00:01<00:00, 99.83it/s]
 10%|█         | 10/100 [00:00<00:00, 98.53it/s]

Policy Reward: -0.5260470455285451
Mean Reward:   -56.060379718053845
Max Reward:    -0.4292906125915815
Threshold:     -5.535676195610343

iteration: 9/50


100%|██████████| 100/100 [00:01<00:00, 92.66it/s]
 10%|█         | 10/100 [00:00<00:00, 91.49it/s]

Policy Reward: -0.5260470455285451
Mean Reward:   -52.1756787073549
Max Reward:    -0.6909815705539876
Threshold:     -5.451149769021014

iteration: 10/50


100%|██████████| 100/100 [00:01<00:00, 99.51it/s]
 10%|█         | 10/100 [00:00<00:00, 95.05it/s]

Policy Reward: -0.5260470455285451
Mean Reward:   -53.17564993041132
Max Reward:    -3.8372358305704988
Threshold:     -6.523882662034369

iteration: 11/50


100%|██████████| 100/100 [00:01<00:00, 95.29it/s]
 10%|█         | 10/100 [00:00<00:00, 95.90it/s]

Policy Reward: -0.5260470455285451
Mean Reward:   -43.94856124427077
Max Reward:    -2.8244018009937752
Threshold:     -6.125561430573137

iteration: 12/50


100%|██████████| 100/100 [00:01<00:00, 99.40it/s]
 10%|█         | 10/100 [00:00<00:00, 95.19it/s]

Policy Reward: -0.5260470455285451
Mean Reward:   -48.10064367425386
Max Reward:    -1.3834131768786952
Threshold:     -5.833776932025648

iteration: 13/50


100%|██████████| 100/100 [00:00<00:00, 100.22it/s]
 10%|█         | 10/100 [00:00<00:00, 99.95it/s]

Policy Reward: -0.5260470455285451
Mean Reward:   -49.768805839307475
Max Reward:    -0.800628829073637
Threshold:     -3.4054794690021706

iteration: 14/50


100%|██████████| 100/100 [00:01<00:00, 99.10it/s]
 10%|█         | 10/100 [00:00<00:00, 94.80it/s]

Policy Reward: -0.5260470455285451
Mean Reward:   -50.59319638939412
Max Reward:    -4.138963937982124
Threshold:     -4.401670122848138

iteration: 15/50


100%|██████████| 100/100 [00:01<00:00, 97.85it/s]
 10%|█         | 10/100 [00:00<00:00, 98.33it/s]

Policy Reward: -0.5258205239009067
Mean Reward:   -41.677995515114986
Max Reward:    -0.6866365611725413
Threshold:     -1.3160421641605686

iteration: 16/50


100%|██████████| 100/100 [00:01<00:00, 99.88it/s]
 10%|█         | 10/100 [00:00<00:00, 99.41it/s]

Policy Reward: -0.5258205239009067
Mean Reward:   -49.26818196859784
Max Reward:    -0.819215519944822
Threshold:     -1.5131960491771859

iteration: 17/50


100%|██████████| 100/100 [00:01<00:00, 97.70it/s]
  9%|▉         | 9/100 [00:00<00:01, 86.25it/s]

Policy Reward: -0.5258205239009067
Mean Reward:   -42.02128629126624
Max Reward:    -0.7803658993882148
Threshold:     -1.0315060850902806

iteration: 18/50


100%|██████████| 100/100 [00:01<00:00, 95.03it/s]
 11%|█         | 11/100 [00:00<00:00, 101.18it/s]

Policy Reward: -0.5258205239009067
Mean Reward:   -42.947304519446334
Max Reward:    -0.5292041812633463
Threshold:     -0.7133077016589888

iteration: 19/50


100%|██████████| 100/100 [00:00<00:00, 101.25it/s]
 11%|█         | 11/100 [00:00<00:00, 102.51it/s]

Policy Reward: -0.5258205239009067
Mean Reward:   -44.61188542629712
Max Reward:    -0.6836780165430262
Threshold:     -0.7947430344716759

iteration: 20/50


100%|██████████| 100/100 [00:00<00:00, 101.19it/s]

Policy Reward: -0.5258205239009067
Mean Reward:   -45.424706776887234
Max Reward:    -0.5288992337292051
Threshold:     -0.67696453540506

Early stopping.

Training completed in 20.618 sec.
Total Reward: -0.5258205239009067
Action Table:
[[ 0.          0.          0.          0.        ]
 [ 1.60131478  1.51420806  0.0264306   0.07934858]
 [-1.60131478 -1.51420806 -0.0264306          nan]]
Actions table after tuning:
[[ 0.          0.          0.          0.        ]
 [ 1.60131478  1.51420806  0.0264306   0.07934858]
 [-1.60131478 -1.51420806 -0.0264306          nan]]

Reward: -0.5258205239009067



