diff --git a/.gitignore b/.gitignore index c379545b0..6ea9113c0 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ safe_control_gym/envs/gym_control/assets/cartpole.urdf experiments/figure6/trained_gp_model/bak_best_model_*.pth experiments/figure7/safe_exp_results/ experiments/figure8/unsafe_ppo_temp_data/ +experiments/figure8/unsafe_ppo_model/bak_unsafe_ppo_model_30000.pt temp-data/ z_docstring.py TODOs.md diff --git a/experiments/figure6/config_overrides/gp_mpc_quad_training.yaml b/experiments/figure6/config_overrides/gp_mpc_quad_training.yaml index 8262c63d5..4c6758394 100644 --- a/experiments/figure6/config_overrides/gp_mpc_quad_training.yaml +++ b/experiments/figure6/config_overrides/gp_mpc_quad_training.yaml @@ -101,4 +101,3 @@ task_config: distrib: 'uniform' low: -0.1 high: 0.1 - diff --git a/experiments/figure6/gp_mpc_experiment.py b/experiments/figure6/gp_mpc_experiment.py index 9b1e80b2b..5f2efe52e 100644 --- a/experiments/figure6/gp_mpc_experiment.py +++ b/experiments/figure6/gp_mpc_experiment.py @@ -15,7 +15,14 @@ from safe_control_gym.utils.configuration import ConfigFactory -def plot_xz_comparison_diag_constraint(prior_run, run, init_ind, dir=None): +def plot_xz_comparison_diag_constraint(prior_run, + run, + init_ind, + dir=None + ): + """ + + """ state_inds = [0,2] goal = [0, 1] ax = plot_2D_comparison_with_prior(state_inds, prior_run, run, goal, init_ind, dir=dir) @@ -34,7 +41,15 @@ def plot_xz_comparison_diag_constraint(prior_run, run, init_ind, dir=None): plt.tight_layout() -def plot_2D_comparison_with_prior(state_inds, prior_run, run, goal, init_ind, dir=None): +def plot_2D_comparison_with_prior(state_inds, + prior_run, + run, goal, + init_ind, + dir=None + ): + """ + + """ horizon_cov = run.state_horizon_cov[init_ind] horizon_states = run.horizon_states[init_ind] prior_horizon_states = prior_run.horizon_states[init_ind] @@ -94,7 +109,14 @@ def plot_2D_comparison_with_prior(state_inds, prior_run, run, goal, init_ind, di return ax -def add_2d_cov_ellipse(position, cov, ax, legend=False): +def add_2d_cov_ellipse(position, + cov, + ax, + legend=False + ): + """ + + """ evals, evecs = np.linalg.eig(cov) major_axis_ind = np.argmax(evals) minor_axis_ind = 0 if major_axis_ind == 1 else 1 @@ -127,7 +149,6 @@ def add_2d_cov_ellipse(position, cov, ax, legend=False): fac = ConfigFactory() fac.add_argument("--train_only", type=bool, default=False, help="True if only training is performed.") config = fac.merge() - # Create environment. 
env_func = partial(make, config.task, diff --git a/experiments/figure7/config_overrides/safe_explorer_ppo_cartpole.yaml b/experiments/figure7/config_overrides/safe_explorer_ppo_cartpole.yaml index 9b8429289..ef4fa8928 100644 --- a/experiments/figure7/config_overrides/safe_explorer_ppo_cartpole.yaml +++ b/experiments/figure7/config_overrides/safe_explorer_ppo_cartpole.yaml @@ -2,7 +2,6 @@ algo_config: pretraining: False pretrained: null constraint_slack: 0.05 - log_interval: 1000 save_interval: 1000 num_checkpoints: 10 diff --git a/experiments/figure7/config_overrides/safe_explorer_ppo_cartpole_pretrain.yaml b/experiments/figure7/config_overrides/safe_explorer_ppo_cartpole_pretrain.yaml index 8c39ebda7..6a96cf27e 100644 --- a/experiments/figure7/config_overrides/safe_explorer_ppo_cartpole_pretrain.yaml +++ b/experiments/figure7/config_overrides/safe_explorer_ppo_cartpole_pretrain.yaml @@ -10,7 +10,6 @@ algo_config: constraint_eval_interval: 5 constraint_buffer_size: 1000000 constraint_slack: 0.05 - log_interval: 10 save_interval: 10 num_checkpoints: 5 @@ -27,4 +26,3 @@ task_config: constraint_input_type: 'STATE' active_dims: 0 done_on_violation: True - diff --git a/experiments/figure7/create_fig7.sh b/experiments/figure7/create_fig7.sh index ea7b94124..c20405239 100755 --- a/experiments/figure7/create_fig7.sh +++ b/experiments/figure7/create_fig7.sh @@ -1,3 +1,4 @@ #!/bin/bash +# Safe Explorer vs PPO baselines. python3 ./safe_exp_plots.py --plot_dir ./safe_exp_results diff --git a/experiments/figure7/create_safe_exp_results.sh b/experiments/figure7/create_safe_exp_results.sh index fd716a025..a23b01b24 100755 --- a/experiments/figure7/create_safe_exp_results.sh +++ b/experiments/figure7/create_safe_exp_results.sh @@ -1,30 +1,34 @@ #!/bin/bash -# proper cleanup for background processes + +# Allow proper cleanup of background processes. trap "exit" INT TERM ERR trap "kill 0" EXIT +# Remove previous results. rm -r -f ./safe_exp_results/ +# Writing paths. OUTPUT_DIR="./" TAG_ROOT="safe_exp_results" + +# Configuration files path. CONFIG_PATH_ROOT="./config_overrides" +# Options. seeds=(2 22 222 2222 22222 9 90 998 9999 90001) thread=1 -########################## ppo +# PPO. TAG="ppo" CONFIG_PATH="${CONFIG_PATH_ROOT}/ppo_cartpole.yaml" - for seed in "${seeds[@]}" do python3 ../main.py --algo ppo --task cartpole --overrides $CONFIG_PATH --output_dir ${OUTPUT_DIR} --tag $TAG_ROOT/$TAG --thread $thread --seed $seed done -########################## ppo + reward shaping +# PPO with reward shaping. TAG="ppo_rs" CONFIG_PATH="${CONFIG_PATH_ROOT}/ppo_rs_cartpole.yaml" - tolerances=(0.15 0.2) for tolerance in "${tolerances[@]}" do @@ -34,19 +38,16 @@ do done done -########################## pretrain safe explorer +# Safe Explorer pre-training. TAG="safe_exp_pretrain" CONFIG_PATH="${CONFIG_PATH_ROOT}/safe_explorer_ppo_cartpole_pretrain.yaml" - train_seed=88890 python3 ../main.py --algo safe_explorer_ppo --task cartpole --overrides $CONFIG_PATH --output_dir ${OUTPUT_DIR} --tag $TAG_ROOT/$TAG --thread $thread --seed $train_seed -########################## train safe explorer +# Safe Explorer. 
PRETRAINED_PATH=(${OUTPUT_DIR}/$TAG_ROOT/$TAG/seed${train_seed}*) - TAG="safe_exp_" CONFIG_PATH="${CONFIG_PATH_ROOT}/safe_explorer_ppo_cartpole.yaml" - slacks=(0.15 0.2) for slack in "${slacks[@]}" do diff --git a/experiments/figure7/safe_exp_plots.py b/experiments/figure7/safe_exp_plots.py index 2b9627127..371d35b4d 100755 --- a/experiments/figure7/safe_exp_plots.py +++ b/experiments/figure7/safe_exp_plots.py @@ -21,9 +21,6 @@ from safe_control_gym.utils.registration import make from safe_control_gym.utils.utils import mkdirs, set_dir_from_config, set_device_from_config, set_seed_from_config, save_video -# ----------------------------------------------------------------------------------- -# Plot Funcs -# ----------------------------------------------------------------------------------- def plot_from_exps(legend_dir_specs, out_path="temp.jpg", @@ -35,7 +32,9 @@ def plot_from_exps(legend_dir_specs, x_num_max=None, num_std=1, use_median_quantile=False): - """Plots 1 stat figure at a time.""" + """Plots 1 stat figure at a time. + + """ # Get all stats. stats = defaultdict(list) for l, dirs in legend_dir_specs.items(): @@ -43,13 +42,11 @@ def plot_from_exps(legend_dir_specs, # Pick from either log source (tensorboard or log text files). path = os.path.join(d, "logs", scalar_name + ".log") _, x, _, y = load_from_log_file(path) - # Smoothing. x, y = np.asarray(x), np.asarray(y) if window: x, y = window_func(x, y, window, np.mean) stats[l].append([x, y]) - # Post-processing. x_max = float("inf") for _, runs in stats.items(): @@ -58,19 +55,16 @@ def plot_from_exps(legend_dir_specs, x_max = min(x_max, len(x)) if x_num_max: x_max = min(x_max, x_num_max) - processed_stats = {} for name, runs in stats.items(): # Use same x for all runs to an algo. x = np.array([x[:x_max] for x, _ in runs])[0] - # Different y for different runs. y = np.stack([y[:x_max] for _, y in runs]) y_mean = np.mean(y, axis=0) y_std = np.std(y, axis=0) y_median = np.median(y, axis=0) y_quantiles = np.quantile(y, [0.25, 0.75], axis=0) - # Record stats. processed_stats[name] = { "x": x, @@ -79,33 +73,28 @@ def plot_from_exps(legend_dir_specs, "median": y_median, "quantiles": y_quantiles, } - # Actual plot. fig = plt.figure(figsize=(10, 5)) plt.clf() for i, name in enumerate(sorted(processed_stats.keys())): color = COLORS[i] x = processed_stats[name]["x"] - if use_median_quantile: y_median = processed_stats[name]["median"] y_quantiles = processed_stats[name]["quantiles"] y_quant_1st = y_quantiles[0] y_quant_3rd = y_quantiles[1] - plt.plot(x, y_median, label=name, color=color) plt.fill_between(x, y_quant_3rd, y_quant_1st, alpha=0.3, color=color) else: y_mean = processed_stats[name]["mean"] y_std = processed_stats[name]["std"] - plt.plot(x, y_mean, label=name, color=color) plt.fill_between(x, y_mean + num_std * y_std, y_mean - num_std * y_std, alpha=0.3, color=color) plt.title(title) plt.xlabel(xlabel) plt.ylabel(ylabel) plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), fancybox=True, shadow=True, ncol=4) - plt.tight_layout() plt.savefig(out_path) plt.show() @@ -113,7 +102,9 @@ def plot_from_exps(legend_dir_specs, def reward_to_cost(y): - """Converts RL reward to control cost (for plotting).""" + """Converts RL reward to control cost (for plotting). + + """ return 250 - y @@ -121,7 +112,6 @@ def load_stats(legend_dir_specs, scalar_names=[], window=None, x_num_max=None): """Gets all processed stats for multiple scalars.""" scalar_stats = {} for scalar_name in scalar_names: - # Get all stats. 
stats = defaultdict(list) for l, dirs in legend_dir_specs.items(): @@ -129,7 +119,6 @@ def load_stats(legend_dir_specs, scalar_names=[], window=None, x_num_max=None): # Pick from either log source (tensorboard or log text files). path = os.path.join(d, "logs", scalar_name + ".log") _, x, _, y = load_from_log_file(path) - # Smoothing. x, y = np.asarray(x), np.asarray(y) if "return" in scalar_name.lower() or "reward" in scalar_name.lower(): @@ -137,7 +126,6 @@ def load_stats(legend_dir_specs, scalar_names=[], window=None, x_num_max=None): if window: x, y = window_func(x, y, window, np.mean) stats[l].append([x, y]) - # Post-processing. x_max = float("inf") for _, runs in stats.items(): @@ -146,19 +134,16 @@ def load_stats(legend_dir_specs, scalar_names=[], window=None, x_num_max=None): x_max = min(x_max, len(x)) if x_num_max: x_max = min(x_max, x_num_max) - processed_stats = {} for name, runs in stats.items(): # Use same x for all runs to an algo. x = np.array([x[:x_max] for x, _ in runs])[0] - # Different y for different runs. y = np.stack([y[:x_max] for _, y in runs]) y_mean = np.mean(y, axis=0) y_std = np.std(y, axis=0) y_median = np.median(y, axis=0) y_quantiles = np.quantile(y, [0.25, 0.75], axis=0) - # Record stats. processed_stats[name] = { "x": x, @@ -167,10 +152,8 @@ def load_stats(legend_dir_specs, scalar_names=[], window=None, x_num_max=None): "median": y_median, "quantiles": y_quantiles, } - # Copy over stats. scalar_stats[scalar_name] = deepcopy(processed_stats) - return scalar_stats @@ -186,18 +169,17 @@ def plot_from_exps2(legend_dir_specs, num_std=1, use_median_quantile=False, cols_per_row=3): - """Plots 1 stat figure at a time.""" + """Plots 1 stat figure at a time. + + """ # Get all stats. scalar_stats = load_stats(legend_dir_specs, scalar_names=scalar_names, window=window, x_num_max=x_num_max) - # Make plots. num_plots = len(scalar_stats) num_rows = math.ceil(num_plots / cols_per_row) num_cols = min(num_plots, cols_per_row) - fig = plt.figure(figsize=(12, 6)) axes = fig.subplots(nrows=num_rows, ncols=num_cols) - for idx, scalar_name in enumerate(scalar_names): row_idx = idx // num_rows col_idx = idx % num_cols @@ -206,38 +188,30 @@ def plot_from_exps2(legend_dir_specs, else: ax = axes[col_idx] processed_stats = scalar_stats[scalar_name] - for i, name in enumerate(sorted(processed_stats.keys())): color_i = (i + 0) % len(COLORS) color = COLORS[color_i] - line_i = (i + 0) % len(LINE_STYLES) linestyle = LINE_STYLES[line_i][-1] - x = processed_stats[name]["x"] - if use_median_quantile: y_median = processed_stats[name]["median"] y_quantiles = processed_stats[name]["quantiles"] y_quant_1st = y_quantiles[0] y_quant_3rd = y_quantiles[1] - ax.plot(x, y_median, label=name, color=color, alpha=0.7, linestyle=linestyle) ax.fill_between(x, y_quant_3rd, y_quant_1st, alpha=0.1, color=color) else: y_mean = processed_stats[name]["mean"] y_std = processed_stats[name]["std"] - ax.plot(x, y_mean, label=name, color=color, alpha=0.7, linestyle=linestyle) ax.fill_between(x, y_mean + num_std * y_std, y_mean - num_std * y_std, alpha=0.1, color=color) ax.set_xlabel(xlabel) ax.set_ylabel(ylabels[idx]) ax.set_ylim((-10, None)) - # Postprocess plot. 
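An aside on the aggregation performed by `load_stats`/`plot_from_exps2` above: each algorithm's seed runs are truncated to a common length and reduced to a median curve with an inter-quartile band. A minimal, self-contained sketch of that reduction, with synthetic run data standing in for the logged scalars:

```python
import numpy as np
import matplotlib.pyplot as plt

# Illustrative data: 5 seed runs of one logged scalar over 100 common steps.
rng = np.random.default_rng(0)
x = np.arange(100)
runs = np.stack([np.cumsum(rng.normal(1.0, 0.5, size=100)) for _ in range(5)])  # (n_seeds, n_steps)

# Truncate all runs to the shortest common length (mirrors the x_max logic above).
x_max = min(len(x), runs.shape[1])
y = runs[:, :x_max]

# Median and inter-quartile band across seeds, as used when use_median_quantile=True.
y_median = np.median(y, axis=0)
y_quartiles = np.quantile(y, [0.25, 0.75], axis=0)

plt.plot(x[:x_max], y_median, label="median over seeds")
plt.fill_between(x[:x_max], y_quartiles[1], y_quartiles[0], alpha=0.3, label="inter-quartile range")
plt.xlabel("Step")
plt.legend()
plt.show()
```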
fig.suptitle(title) fig.subplots_adjust(bottom=0.15) - lines = [] labels = [] for ax in fig.axes: @@ -245,21 +219,19 @@ def plot_from_exps2(legend_dir_specs, lines.extend(axLine) labels.extend(axLabel) break - fig.legend( - lines, - labels, - loc='lower center', - fancybox=True, - shadow=True, - borderaxespad=0.1, - ncol=7) + fig.legend(lines, + labels, + loc='lower center', + fancybox=True, + shadow=True, + borderaxespad=0.1, + ncol=7) plt.savefig(out_path) plt.show() return scalar_stats xabs_legend_map = { - # baseline "ppo": "ppo", "ppo_rs_0.15": "ppo_rs_slack0.15", "ppo_rs_0.2": "ppo_rs_slack0.2", @@ -267,16 +239,17 @@ def plot_from_exps2(legend_dir_specs, "safe_exp_slack0.2": "se_0.2", } + xlh2_legend_map = { - # baseline "ppo": "ppo", } + thetalh2_legend_map = { - # baseline "ppo": "ppo", } + legend_maps = { "xabs": xabs_legend_map, "xlh2": xlh2_legend_map, @@ -284,26 +257,23 @@ def plot_from_exps2(legend_dir_specs, } xabs_name_map = { - # baseline "ppo": "PPO", - # reward shaping "ppo_rs_slack0.15": "PPO+cost shaping 0.15", "ppo_rs_slack0.2": "PPO+cost shaping 0.2", - # se "se_0.15": "PPO+safety layer 0.15", "se_0.2": "PPO+safety layer 0.2", } def plot(config): - """Central plot function.""" - legend_map = legend_maps[config.constraint] + """Central plot function. + """ + legend_map = legend_maps[config.constraint] # Collect results. dirs = os.listdir(config.plot_dir) - # remove pretrain dirs + # Remove pretrain dirs. dirs = sorted([d for d in dirs if d in legend_map]) - # Make spec. spec = {} for d in dirs: @@ -311,32 +281,27 @@ def plot(config): legend = legend_map[d_name] seed_dirs = os.listdir(os.path.join(config.plot_dir, d)) spec[legend] = [os.path.join(config.plot_dir, d, sd) for sd in seed_dirs] - # Collective plot (mean only). - scalar_stats = plot_from_exps2( - spec, - out_path=os.path.join(config.plot_dir, "performance.jpg"), - scalar_names=["stat/ep_return", "stat/constraint_violation"], - title="Learning Curves", - sub_titles=["Average Costs", "Total Constraint Violations"], - xlabel="Step", - ylabels=["Cost", "Constraint Violations"], - window=10, - x_num_max=None, - num_std=0, - use_median_quantile=True, - cols_per_row=3) - - # Save stats to csv. + scalar_stats = plot_from_exps2(spec, + out_path=os.path.join(config.plot_dir, "performance.jpg"), + scalar_names=["stat/ep_return", "stat/constraint_violation"], + title="Learning Curves", + sub_titles=["Average Costs", "Total Constraint Violations"], + xlabel="Step", + ylabels=["Cost", "Constraint Violations"], + window=10, + x_num_max=None, + num_std=0, + use_median_quantile=True, + cols_per_row=3) + # Save stats to CSV. curves = ["median", "top_quartile", "bottom_quartile"] header = [] stat_rows = [] x_already = False - for scalar_name in scalar_stats: stats = scalar_stats[scalar_name] true_scalar_name = {"stat/ep_return": "Cost", "stat/constraint_violation": "Constraint Violations"}[scalar_name] - # Collect stats. for algo_name in sorted(stats.keys()): true_algo_name = xabs_name_map[algo_name] @@ -351,27 +316,21 @@ def plot(config): stat_rows.append(stat["median"]) stat_rows.append(stat["quantiles"][1]) stat_rows.append(stat["quantiles"][0]) - # Make rows. stat_mtx = np.array(stat_rows).transpose() rows = stat_mtx.tolist() - # Write to csv. 
csv_path = os.path.join(config.plot_dir, "performance.csv") with open(csv_path, "w") as f: writer = csv.writer(f) writer.writerow(header) writer.writerows(rows) - print("Plotting done.") -# ----------------------------------------------------------------------------------- -# Main -# ----------------------------------------------------------------------------------- - MAIN_FUNCS = {"plot": plot} + if __name__ == "__main__": # Make config. fac = ConfigFactory() diff --git a/experiments/figure8/create_fig8.sh b/experiments/figure8/create_fig8.sh index 8ff3aaa3e..07d2ac910 100755 --- a/experiments/figure8/create_fig8.sh +++ b/experiments/figure8/create_fig8.sh @@ -1,3 +1,4 @@ #!/bin/bash +# Model-predictive safety certification of unsafe PPO controller. python3 ./mpsc_experiment.py --task cartpole --algo mpsc --overrides ./config_overrides/config_mpsc_cartpole.yaml diff --git a/experiments/figure8/create_unsafe_ppo_model.sh b/experiments/figure8/create_unsafe_ppo_model.sh index 5b614e630..66bd309f1 100755 --- a/experiments/figure8/create_unsafe_ppo_model.sh +++ b/experiments/figure8/create_unsafe_ppo_model.sh @@ -14,4 +14,4 @@ cp ./unsafe_ppo_model/unsafe_ppo_model_30000.pt ./unsafe_ppo_model/bak_unsafe_pp mv ./unsafe_ppo_temp_data/seed2_*/model_latest.pt ./unsafe_ppo_model/unsafe_ppo_model_30000.pt # Removed the temporary data used to train the new unsafe PPO model. -rm -r -f ./unsafe_ppo_temp_data/ \ No newline at end of file +rm -r -f ./unsafe_ppo_temp_data/ diff --git a/experiments/main.py b/experiments/main.py index 1c10870dd..d371d11b4 100644 --- a/experiments/main.py +++ b/experiments/main.py @@ -1,4 +1,4 @@ -"""Vanilla training/plotting/testing script. +"""Template training/plotting/testing script. """ import os @@ -15,27 +15,22 @@ from safe_control_gym.utils.utils import mkdirs, set_dir_from_config, set_device_from_config, set_seed_from_config, save_video -# ----------------------------------------------------------------------------------- -# Funcs -# ----------------------------------------------------------------------------------- - def train(config): - """General training template. + """Training template. Usage: * to start training, use with `--func train`. * to restore from a previous training, additionally use `--restore {dir_path}` where `dir_path` is the output folder from previous training. + """ # Experiment setup. if not config.restore: set_dir_from_config(config) set_seed_from_config(config) set_device_from_config(config) - # Define function to create task/env. env_func = partial(make, config.task, output_dir=config.output_dir, **config.task_config) - # Create the controller/control_agent. control_agent = make(config.algo, env_func, @@ -48,7 +43,6 @@ def train(config): control_agent.reset() if config.restore: control_agent.load(os.path.join(config.restore, "model_latest.pt")) - # Training. control_agent.learn() control_agent.close() @@ -62,12 +56,12 @@ def make_plots(config): * use with `--func plot` and `--restore {dir_path}` where `dir_path` is the experiment folder containing the logs. * save figures under `dir_path/plots/`. + """ # Define source and target log locations. log_dir = os.path.join(config.output_dir, "logs") plot_dir = os.path.join(config.output_dir, "plots") mkdirs(plot_dir) - plot_from_logs(log_dir, plot_dir, window=3) print("Plotting done.") @@ -80,13 +74,12 @@ def test_policy(config): * to test policy from a trained model checkpoint, additionally use `--restore {dir_path}` where `dir_path` is folder to the trained model. 
* to test un-trained policy (e.g. non-learning based), use as it is. + """ # Evaluation setup. set_device_from_config(config) - # Define function to create task/env. env_func = partial(make, config.task, output_dir=config.output_dir, **config.task_config) - # Create the controller/control_agent. control_agent = make(config.algo, env_func, @@ -99,35 +92,31 @@ def test_policy(config): control_agent.reset() if config.restore: control_agent.load(os.path.join(config.restore, "model_latest.pt")) - # Test controller. - results = control_agent.run(n_episodes=config.algo_config.eval_batch_size, render=config.render, verbose=config.verbose, use_adv=config.use_adv) + results = control_agent.run(n_episodes=config.algo_config.eval_batch_size, + render=config.render, + verbose=config.verbose, + use_adv=config.use_adv) control_agent.close() - # Save evalution results. eval_path = os.path.join(config.output_dir, "eval", config.eval_output_path) eval_dir = os.path.dirname(eval_path) mkdirs(eval_dir) with open(eval_path, "wb") as f: pickle.dump(results, f) - ep_lengths = results["ep_lengths"] ep_returns = results["ep_returns"] msg = "eval_ep_length {:.2f} +/- {:.2f}\n".format(ep_lengths.mean(), ep_lengths.std()) msg += "eval_ep_return {:.3f} +/- {:.3f}\n".format(ep_returns.mean(), ep_returns.std()) print(msg) - if "frames" in results: save_video(os.path.join(eval_dir, "video.gif"), results["frames"]) print("Evaluation done.") -# ----------------------------------------------------------------------------------- -# Main -# ----------------------------------------------------------------------------------- - MAIN_FUNCS = {"train": train, "plot": make_plots, "test": test_policy} + if __name__ == "__main__": # Make config. fac = ConfigFactory() @@ -138,12 +127,10 @@ def test_policy(config): fac.add_argument("--use_adv", action="store_true", help="if to evaluate against adversary.") fac.add_argument("--eval_output_path", type=str, default="test_results.pkl", help="file path to save evaluation results.") config = fac.merge() - # System settings. if config.thread > 0: # E.g. set single thread for less context switching torch.set_num_threads(config.thread) - # Execute. func = MAIN_FUNCS.get(config.func, None) if func is None: diff --git a/safe_control_gym/__init__.py b/safe_control_gym/__init__.py index 7a1f443e2..e797863f6 100644 --- a/safe_control_gym/__init__.py +++ b/safe_control_gym/__init__.py @@ -1,4 +1,3 @@ -"""Register all configurable objects by importing.""" from safe_control_gym.controllers import * from safe_control_gym.envs import * from safe_control_gym.envs.env_wrappers import * diff --git a/safe_control_gym/controllers/__init__.py b/safe_control_gym/controllers/__init__.py index 2dfe5c151..90d6fa60d 100644 --- a/safe_control_gym/controllers/__init__.py +++ b/safe_control_gym/controllers/__init__.py @@ -1,7 +1,8 @@ -"""Register controllers.""" +"""Register controllers. 
+ +""" from safe_control_gym.utils.registration import register -####################### MPC register(id="mpc", entry_point="safe_control_gym.controllers.mpc.mpc:MPC", config_entry_point="safe_control_gym.controllers.mpc:mpc.yaml") @@ -10,22 +11,18 @@ entry_point="safe_control_gym.controllers.mpc.linear_mpc:LinearMPC", config_entry_point="safe_control_gym.controllers.mpc:linear_mpc.yaml") -####################### Learning-based MPC register(id="gp_mpc", entry_point="safe_control_gym.controllers.mpc.gp_mpc:GPMPC", config_entry_point="safe_control_gym.controllers.mpc:gp_mpc.yaml") -####################### Model Predictive Safety Certification register(id="mpsc", entry_point="safe_control_gym.controllers.mpsc.mpsc:MPSC", config_entry_point="safe_control_gym.controllers.mpsc:mpsc.yaml") -####################### RL Baselines register(id="ppo", entry_point="safe_control_gym.controllers.ppo.ppo:PPO", config_entry_point="safe_control_gym.controllers.ppo:ppo.yaml") -####################### Soft register(id="safe_explorer_ppo", entry_point="safe_control_gym.controllers.safe_explorer.safe_ppo:SafeExplorerPPO", config_entry_point="safe_control_gym.controllers.safe_explorer:safe_ppo.yaml") diff --git a/safe_control_gym/controllers/base_controller.py b/safe_control_gym/controllers/base_controller.py index 37e6d2e5b..627f11341 100644 --- a/safe_control_gym/controllers/base_controller.py +++ b/safe_control_gym/controllers/base_controller.py @@ -1,8 +1,12 @@ -"""Base classes.""" +"""Base classes. + +""" class BaseController: - """Template for controller/agent, implement the following methods as needed.""" + """Template for controller/agent, implement the following methods as needed. + + """ def __init__(self, env_func, @@ -11,7 +15,8 @@ def __init__(self, output_dir="temp", device="cpu", seed=0, - **kwargs): + **kwargs + ): """Initializes controller agent. Args: @@ -21,37 +26,54 @@ def __init__(self, output_dir (str): folder to write outputs. device (str): cpu or cuda. seed (int): random seed. + """ - # base args + # Base args. self.env_func = env_func self.training = training self.checkpoint_path = checkpoint_path self.output_dir = output_dir self.device = device self.seed = seed - - # algo specific args + # Algorithm specific args. for k, v in kwargs.items(): self.__dict__[k] = v def reset(self): - """Do initializations for training or evaluation.""" + """Do initializations for training or evaluation. + + """ pass def close(self): - """Shuts down and cleans up lingering resources.""" + """Shuts down and cleans up lingering resources. + + """ pass - def save(self, path): - """Saves model params and experiment state to checkpoint path.""" + def save(self, + path + ): + """Saves model params and experiment state to checkpoint path. + + """ pass - def load(self, path): - """Restores model and experiment given checkpoint path.""" + def load(self, + path + ): + """Restores model and experiment given checkpoint path. + + """ pass - def learn(self, env=None, **kwargs): - """Performs learning (pre-training, training, fine-tuning, etc).""" + def learn(self, + env=None, + **kwargs + ): + """Performs learning (pre-training, training, fine-tuning, etc). + + """ pass def run(self, @@ -59,6 +81,9 @@ def run(self, render=False, n_episodes=10, verbose=False, - **kwargs): - """Runs evaluation with current policy.""" + **kwargs + ): + """Runs evaluation with current policy. 
+ + """ pass diff --git a/safe_control_gym/controllers/mpc/gp_mpc.py b/safe_control_gym/controllers/mpc/gp_mpc.py index f854c2d6b..2248a0ac8 100644 --- a/safe_control_gym/controllers/mpc/gp_mpc.py +++ b/safe_control_gym/controllers/mpc/gp_mpc.py @@ -1,19 +1,30 @@ -"""Model Predictive Control with Gaussian Process - -Example: - run mpc on cartpole balance - - $ python tests/test_main.py --mode test_policy --exp_id mpc_cartpole \ - --algo mpc --task cartpole --env_wraps time_limit --max_episode_steps 200 +"""Model Predictive Control with a Gaussian Process model. + +Based on: + * L. Hewing, J. Kabzan and M. N. Zeilinger, "Cautious Model Predictive Control Using Gaussian Process Regression," + in IEEE Transactions on Control Systems Technology, vol. 28, no. 6, pp. 2736-2743, Nov. 2020, doi: 10.1109/TCST.2019.2949757. + +Implementation details: + 1. The previous time step MPC solution is used to compute the set constraints and GP dynamics rollout. + Here, the dynamics are rolled out using the Mean Equivelence method, the fastest, but least accurate. + 2. The GP is approximated using the Fully Independent Training Conditional (FITC) outlined in + * J. Quinonero-Candela, C. E. Rasmussen, and R. Herbrich, “A unifying view of sparse approximate Gaussian process regression,” + Journal of Machine Learning Research, vol. 6, pp. 1935–1959, 2005. + https://www.jmlr.org/papers/volume6/quinonero-candela05a/quinonero-candela05a.pdf + * E. Snelson and Z. Ghahramani, “Sparse gaussian processes using pseudo-inputs,” in Advances in Neural Information Processing + Systems, Y. Weiss, B. Scholkopf, and J. C. Platt, Eds., 2006, pp. 1257–1264. + and the inducing points are the previous MPC solution. + 3. Each dimension of the learned error dynamics is an independent Zero Mean SE Kernel GP. """ import scipy import numpy as np import casadi as cs -from copy import deepcopy import time import torch import gpytorch + +from copy import deepcopy from skopt.sampler import Lhs from functools import partial @@ -22,37 +33,9 @@ from safe_control_gym.controllers.mpc.gp_utils import GaussianProcessCollection, ZeroMeanIndependentGPModel, covSEard from safe_control_gym.envs.benchmark_env import Task -# ----------------------------------------------------------------------------------- -# Gaussian Process MPC -# ----------------------------------------------------------------------------------- - class GPMPC(MPC): - """MPC with Gaussian Process as dynamics residual. This implementation is based on the paper - - L. Hewing, J. Kabzan and M. N. Zeilinger, "Cautious Model Predictive Control Using Gaussian Process Regression," - in IEEE Transactions on Control Systems Technology, vol. 28, no. 6, pp. 2736-2743, Nov. 2020, - doi: 10.1109/TCST.2019.2949757. - (https://ieeexplore.ieee.org/abstract/document/8909368) or (https://arxiv.org/pdf/1705.10702.pdf). - - A Gaussian process learns the error dynamics, and state and input constraints are to ensure, with some - probability, that the error dynamics will remain within constraint boundaires. Note that there are a lot of - approximations made on the GPs, as outlined in the papers. Namely, - 1. The previous time step mpc solution is used to compute the set constraints and gp dynamics rollout. - Here, the dynamics are rolled out using the Mean Equivelence method, the fastest, but least accurate. - 2. The GP is approximated using the Fully Independent Training Conditional (FITC) outlined in - J. Quinonero-Candela, C. E. Rasmussen, and R. 
Herbrich, “A unifying ˜ - view of sparse approximate Gaussian process regression,” Journal of - Machine Learning Research, vol. 6, pp. 1935–1959, 2005. - (https://www.jmlr.org/papers/volume6/quinonero-candela05a/quinonero-candela05a.pdf) - and - E. Snelson and Z. Ghahramani, “Sparse gaussian processes using - pseudo-inputs,” in Advances in Neural Information Processing Systems, - Y. Weiss, B. Scholkopf, and J. C. Platt, Eds., 2006, pp. 1257–1264. - (http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.514.540&rep=rep1&type=pdf) - The inducing points are the previous mpc solution. These approximations are critical to making GP-MPC - work. - 3. Each dimension of the learned error dynamics is an independent Zero Mean SE Kernel GP. + """MPC with Gaussian Process as dynamics residual. """ @@ -79,37 +62,36 @@ def __init__( inertial_prop: list = [1.0], prior_param_coeff: float = 1.0, output_dir: str = "results/temp", - **kwargs): - """ - Initialize GP-MPC. + **kwargs + ): + """Initialize GP-MPC. Args: - env_func (gym.Env): Functionalized initialization of the environment. - seed (int): Random seed (currently not used). - horizon (int): mpc planning horizon. + env_func (gym.Env): functionalized initialization of the environment. + seed (int): random seed. + horizon (int): MPC planning horizon. Q, R (np.array): cost weight matrix. use_prev_start (bool): Warmstart mpc with the previous solution. - train_iterations (int): The number of training examples to use for each dimension of the GP. - optimization_iterations (list): The number of optimization iterations for each dimension of the GP. - learning_rate (list): The learning rate for training each dimension of the GP. - normalize_training_data (bool): Normalize the training data (NOT WORKING WELL). - use_gpu (bool): Use GPU while training the gp. - gp_model_path (str): Path to a pretrained GP model. If None, will train a new one. - output_dir (str): Directory to store model and results. - prob (float): Desired probabilistic safety level. - inertial_prop (list): To initialize the inertial properties of the prior model. - prior_param_coeff (float): Constant multiplying factor to adjust the prior model intertial properties. - input_mask (list): List of which input dimensions to use in GP model. If None, all are used. - target_mask (list): List of which output dimensions to use in the GP model. If None, all are used. + train_iterations (int): the number of training examples to use for each dimension of the GP. + optimization_iterations (list): the number of optimization iterations for each dimension of the GP. + learning_rate (list): the learning rate for training each dimension of the GP. + normalize_training_data (bool): Normalize the training data. + use_gpu (bool): use GPU while training the gp. + gp_model_path (str): path to a pretrained GP model. If None, will train a new one. + output_dir (str): directory to store model and results. + prob (float): desired probabilistic safety level. + inertial_prop (list): to initialize the inertial properties of the prior model. + prior_param_coeff (float): constant multiplying factor to adjust the prior model intertial properties. + input_mask (list): list of which input dimensions to use in GP model. If None, all are used. + target_mask (list): list of which output dimensions to use in the GP model. If None, all are used. gp_approx (str): 'mean_eq' used mean equivalence rollout for the GP dynamics. Only one that works currently. 
- online_learning (bool): If true, GP kernel values will be updated using past trajectory values. - additional_constraints (list): List of Constraint objects defining additional constraints to be used. + online_learning (bool): if true, GP kernel values will be updated using past trajectory values. + additional_constraints (list): list of Constraint objects defining additional constraints to be used. """ self.prior_env_func = partial(env_func, inertial_prop=np.array(inertial_prop)*prior_param_coeff) self.prior_param_coeff = prior_param_coeff - # Initialize the method using linear MPC. self.prior_ctrl = LinearMPC( self.prior_env_func, @@ -117,36 +99,27 @@ def __init__( q_mpc=q_mpc, r_mpc=r_mpc, use_prev_start=use_prev_start, - # runner args - # shared/base args output_dir=output_dir, additional_constraints=additional_constraints, ) self.prior_ctrl.reset() - super().__init__( self.prior_env_func, - # model args horizon=horizon, q_mpc=q_mpc, r_mpc=r_mpc, use_prev_start=use_prev_start, - # runner args - # shared/base args output_dir=output_dir, additional_constraints=additional_constraints, **kwargs) - - # setup envs + # Setup environments. self.env_func = env_func self.env = env_func(randomized_init=False) self.env_training = env_func(randomized_init=True) - # No training data accumulated yet so keep the dynamics function as linear prior. self.train_data = None self.prior_dynamics_func = self.prior_ctrl.linear_dynamics_func - - # GP and training parameters + # GP and training parameters. self.gaussian_process = None self.train_iterations = train_iterations self.optimization_iterations = optimization_iterations @@ -156,7 +129,6 @@ def __init__( self.use_gpu = use_gpu self.seed = seed self.prob = prob - if input_mask is None: self.input_mask = np.arange(self.model.nx + self.model.nu).tolist() else: @@ -167,80 +139,72 @@ def __init__( self.target_mask = target_mask Bd = np.eye(self.model.nx) self.Bd = Bd[:, self.target_mask] - self.gp_approx = gp_approx self.online_learning = online_learning self.last_obs = None self.last_action = None - ## logging - #self.logger = ExperimentLogger(output_dir, log_file_out=True) - def setup_prior_dynamics(self): - """Computes the LQR gain used for propograting GP uncertainty from the prior model dynamics.""" - # Determine the LQR gain K to propogate the input uncertainty. - # It may be better to do this at each timestep, but that will increase comp complexity. + """Computes the LQR gain used for propograting GP uncertainty from the prior model dynamics. + + """ + # Determine the LQR gain K to propogate the input uncertainty (doing this at each timestep will increase complexity). A, B = discretize_linear_system(self.prior_ctrl.dfdx, self.prior_ctrl.dfdu, self.dt) Q_lqr = self.Q R_lqr = self.R P = scipy.linalg.solve_discrete_are(A, B, Q_lqr, R_lqr) btp = np.dot(B.T, P) - - # The sign below is the correct sign (with -). I have triple checked. self.lqr_gain = -np.dot(np.linalg.inv(self.R + np.dot(btp, B)), np.dot(btp, A)) self.discrete_dfdx = A self.discrete_dfdu = B def set_gp_dynamics_func(self): - """Updates symbolic dynamics with actual control frequency, - initialize GP model and add to the combined dynamics. + """Updates symbolic dynamics. + + With actual control frequency, initialize GP model and add to the combined dynamics. + """ self.setup_prior_dynamics() - # Compute the probabilistic constraint inverse CDF according to section III.D.b in (Hewing 2019) + # Compute the probabilistic constraint inverse CDF according to section III.D.b in Hewing 2019. 
self.inverse_cdf = scipy.stats.norm.ppf(1 - (1/self.model.nx - (self.prob + 1)/(2*self.model.nx))) - self.create_sparse_GP_machinery() def create_sparse_GP_machinery(self): - """This setups the gaussian process approximations for FITC formulation. """ - lengthscales, signal_var, noise_var, gp_K_plus_noise = self.gaussian_process.get_hyperparameters( - as_numpy=True) + """This setups the gaussian process approximations for FITC formulation. + + """ + lengthscales, signal_var, noise_var, gp_K_plus_noise = self.gaussian_process.get_hyperparameters(as_numpy=True) self.length_scales = lengthscales.squeeze() self.signal_var = signal_var.squeeze() self.noise_var = noise_var.squeeze() self.gp_K_plus_noise = gp_K_plus_noise - Nx = len(self.input_mask) Ny = len(self.target_mask) N = self.gaussian_process.n_training_samples - - # Create CasADI function for computing the kernel K_z_zind with parameters for z, z_ind, length scales - # and signal variance. The method used here is similar to that done in safe_control_gym.math_and_models.gp_functions - # build_gp(). - # We need the CasADI version of this so that it can by symbolically differentiated in in the mpc optimization. + # Create CasADI function for computing the kernel K_z_zind with parameters for z, z_ind, length scales and signal variance. + # We need the CasADI version of this so that it can by symbolically differentiated in in the MPC optimization. z1 = cs.SX.sym('z1', Nx) z2 = cs.SX.sym('z2', Nx) ell_s = cs.SX.sym('ell', Nx) sf2_s = cs.SX.sym('sf2') z_ind = cs.SX.sym('z_ind', self.T, Nx) - covSE = cs.Function('covSE', [z1, z2, ell_s, sf2_s], [covSEard(z1, z2, ell_s, sf2_s)]) - ks = cs.SX.zeros(1, self.T) for i in range(self.T): ks[i] = covSE(z1, z_ind[i, :], ell_s, sf2_s) ks_func = cs.Function('K_s', [z1, z_ind, ell_s, sf2_s], [ks]) - K_z_zind = cs.SX.zeros(Ny, self.T) for i in range(Ny): K_z_zind[i,:] = ks_func(z1, z_ind, self.length_scales[i,:], self.signal_var[i]) - - # This will be mulitplied by the mean_post_factor computed at every time step - # to compute the approximate mean. + # This will be mulitplied by the mean_post_factor computed at every time step to compute the approximate mean. self.K_z_zind_func = cs.Function('K_z_zind', [z1, z_ind],[K_z_zind],['z1', 'z2'],['K']) - def preprocess_training_data(self, x_seq, u_seq, x_next_seq): + def preprocess_training_data(self, + x_seq, + u_seq, + x_next_seq + ): """Converts trajectory data for GP trianing. Args: @@ -251,33 +215,29 @@ def preprocess_training_data(self, x_seq, u_seq, x_next_seq): Returns: np.array: inputs for GP training, (N, nx+nu). np.array: targets for GP training, (N, nx). + """ - # Get the predicted dynamics. Recall that this is a linear prior, which means - # we need to account for the fact that it is linearized about an eq using - # self.X_GOAL and self.U_GOAL. + # Get the predicted dynamics. This is a linear prior, thus we need to account for the fact that + # it is linearized about an eq using self.X_GOAL and self.U_GOAL. x_pred_seq = self.prior_dynamics_func(x0=x_seq.T - self.prior_ctrl.X_LIN[:, None], p=u_seq.T - self.prior_ctrl.U_LIN[:,None])['xf'].toarray() - - targets = (x_next_seq.T - (x_pred_seq+self.prior_ctrl.X_LIN[:,None])).transpose() # (N, nx) - #targets = x_next_seq - x_pred_seq.T # (N, nx) - inputs = np.hstack([x_seq, u_seq]) # (N, nx+nu) + targets = (x_next_seq.T - (x_pred_seq+self.prior_ctrl.X_LIN[:,None])).transpose() # (N, nx). + inputs = np.hstack([x_seq, u_seq]) # (N, nx+nu). 
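`preprocess_training_data` above builds the GP training set from trajectory data: inputs are (state, input) pairs and targets are the residual between the measured next state and the linear prior's prediction. A small numpy sketch of the same construction, assuming a hypothetical discrete-time prior (`A`, `B`, `x_lin`, `u_lin`, and the trajectory values are illustrative, not values from the repository):

```python
import numpy as np

# Illustrative discrete-time linear prior x_{k+1} = A (x_k - x_lin) + B (u_k - u_lin) + x_lin.
A = np.array([[1.0, 0.1], [0.0, 1.0]])
B = np.array([[0.005], [0.1]])
x_lin = np.zeros(2)
u_lin = np.zeros(1)

def prior_predict(x, u):
    """Prior prediction in absolute coordinates, mirroring the prior_dynamics_func usage."""
    return A @ (x - x_lin) + B @ (u - u_lin) + x_lin

# Trajectory data: states x_k, inputs u_k, and measured next states x_{k+1}.
x_seq = np.array([[0.0, 0.0], [0.01, 0.05]])          # (N, nx)
u_seq = np.array([[0.5], [0.4]])                       # (N, nu)
x_next_seq = np.array([[0.012, 0.06], [0.03, 0.11]])   # (N, nx)

# GP inputs are (state, input) pairs; targets are the unmodeled residual dynamics.
inputs = np.hstack([x_seq, u_seq])                                                     # (N, nx + nu)
targets = x_next_seq - np.array([prior_predict(x, u) for x, u in zip(x_seq, u_seq)])   # (N, nx)
print(inputs.shape, targets.shape)
```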
return inputs, targets - def precompute_probabilistic_limits(self, print_sets=True): + def precompute_probabilistic_limits(self, + print_sets=True + ): """This updates the constraint value limits to account for the uncertainty in the dynamics rollout. Args: print_sets (bool): True to print out the sets for debugging purposes. """ - # TODO: This should be divided into multiple functions. One for rolling out covaiance and another for - # computing the probabilistic constraint limits (at the least). nx, nu = self.model.nx, self.model.nu T = self.T - state_covariances = np.zeros((self.T+1, nx, nx)) input_covariances = np.zeros((self.T, nu, nu)) - # Initilize lists for the tightening of each constraint. state_constraint_set = [] for state_constraint in self.constraints.state_constraints: @@ -285,7 +245,6 @@ def precompute_probabilistic_limits(self, print_sets=True): input_constraint_set = [] for input_constraint in self.constraints.input_constraints: input_constraint_set.append(np.zeros((input_constraint.num_constraints, T))) - if self.x_prev is not None and self.u_prev is not None: cov_x = np.zeros((nx, nx)) for i in range(T): @@ -293,7 +252,6 @@ def precompute_probabilistic_limits(self, print_sets=True): cov_u = self.lqr_gain @ cov_x @ self.lqr_gain.T input_covariances[i] = cov_u cov_xu = cov_x @ self.lqr_gain.T - z = np.hstack((self.x_prev[:,i], self.u_prev[:,i])) if self.gp_approx == 'taylor': raise NotImplementedError("Taylor GP approximation is currently not working.") @@ -302,16 +260,13 @@ def precompute_probabilistic_limits(self, print_sets=True): cov_d = cov_d_tensor.detach().numpy() else: raise NotImplementedError('gp_approx method is incorrect or not implemented') - - # loop through input constraints and tighten by the required ammount + # Loop through input constraints and tighten by the required ammount. for ui, input_constraint in enumerate(self.constraints.input_constraints): input_constraint_set[ui][:, i] = -1*self.inverse_cdf * \ np.absolute(input_constraint.A) @ np.sqrt(np.diag(cov_u)) for si, state_constraint in enumerate(self.constraints.state_constraints): state_constraint_set[si][:, i] = -1*self.inverse_cdf * \ np.absolute(state_constraint.A) @ np.sqrt(np.diag(cov_x)) - - if self.gp_approx == 'taylor': raise NotImplementedError("Taylor GP rollout not implemented.") elif self.gp_approx == 'mean_eq': @@ -323,44 +278,38 @@ def precompute_probabilistic_limits(self, print_sets=True): self.Bd @ cov_d @ self.Bd.T else: raise NotImplementedError('gp_approx method is incorrect or not implemented') - # Udate Final covariance. 
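For reference, the uncertainty propagation and constraint tightening in `precompute_probabilistic_limits` can be summarized in a few lines. The sketch below is a simplification under the LQR feedback assumption u = K x, so the joint state-input covariance update collapses to a closed-loop form; all matrices and the probability level are illustrative placeholders, not repository values:

```python
import numpy as np
from scipy.stats import norm

# Illustrative prior model and LQR feedback gain (as computed in setup_prior_dynamics).
A = np.array([[1.0, 0.1], [0.0, 1.0]])
B = np.array([[0.005], [0.1]])
K = np.array([[-1.2, -0.8]])      # lqr_gain, so that u ~ K x
Bd = np.eye(2)                    # residual dynamics enter on all states here
cov_d = np.diag([1e-4, 2e-4])     # GP predictive covariance at the current rollout point

# One mean-equivalence step: with u = K x, the state covariance propagates as
# cov_next = (A + B K) cov_x (A + B K)^T + Bd cov_d Bd^T, and cov_u = K cov_x K^T.
cov_x = np.diag([1e-3, 1e-3])
cov_u = K @ cov_x @ K.T
cov_x_next = (A + B @ K) @ cov_x @ (A + B @ K).T + Bd @ cov_d @ Bd.T

# Constraint tightening: for a linear constraint A_con x <= b, shrink the bound by
# inverse_cdf * |A_con| sqrt(diag(cov_x)) so it holds with the desired probability.
prob, nx = 0.95, 2
inverse_cdf = norm.ppf(1 - (1 / nx - (prob + 1) / (2 * nx)))
A_con = np.array([[1.0, 0.0]])    # e.g. a bound on the first state
tightening = inverse_cdf * np.absolute(A_con) @ np.sqrt(np.diag(cov_x_next))
print(cov_u, cov_x_next, tightening)
```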
for si, state_constraint in enumerate(self.constraints.state_constraints): state_constraint_set[si][:,-1] = -1 * self.inverse_cdf * \ np.absolute(state_constraint.A) @ np.sqrt(np.diag(cov_x)) - state_covariances[-1] = cov_x if print_sets: print("Probabilistic State Constraint values along Horizon:") print(state_constraint_set) print("Probabilistic Input Constraint values along Horizon:") print(input_constraint_set) - self.results_dict['input_constraint_set'].append(input_constraint_set) self.results_dict['state_constraint_set'].append(state_constraint_set) self.results_dict['state_horizon_cov'].append(state_covariances) self.results_dict['input_horizon_cov'].append(input_covariances) - return state_constraint_set, input_constraint_set def precompute_sparse_gp_values(self): - """Uses the last MPC solution to precomupte values associated with the FITC GP approximation.""" - # TODO: Move to GP utils + """Uses the last MPC solution to precomupte values associated with the FITC GP approximation. + + """ n_data_points = self.gaussian_process.n_training_samples dim_gp_inputs = len(self.input_mask) dim_gp_outputs = len(self.target_mask) - - # Get the inducing points + # Get the inducing points. if self.x_prev is not None and self.u_prev is not None: - # Use the previous MPC soln (as per Hewing 2019) + # Use the previous MPC solution as in Hewing 2019. z_ind = np.hstack((self.x_prev[:,:-1].T, self.u_prev.T)) z_ind = z_ind[:,self.input_mask] else: - # If there is no previous solution. Choose T random training set points + # If there is no previous solution. Choose T random training set points. inds = self.env.np_random.choice(range(n_data_points), size=self.T) - z_ind = self.data_inputs[inds][:, self.input_mask] - K_zind_zind = self.gaussian_process.kernel(torch.Tensor(z_ind).double()) K_zind_zind_inv = self.gaussian_process.kernel_inv(torch.Tensor(z_ind).double()) K_x_zind = self.gaussian_process.kernel(torch.from_numpy(self.data_inputs[:, self.input_mask]).double(), @@ -373,44 +322,38 @@ def precompute_sparse_gp_values(self): for i in range(dim_gp_outputs): mean_post_factor[i] = Sigma[i] @ K_x_zind[i].T @ Gamma_inv[i] @ \ torch.from_numpy(self.data_targets[:,self.target_mask[i]]).double() - return mean_post_factor.detach().numpy(), Sigma.detach().numpy(), K_zind_zind_inv.detach().numpy(), z_ind def setup_gp_optimizer(self): - """Sets up nonlinear optimization problem including - cost objective, variable bounds and dynamics constraints. - """ + """Sets up nonlinear optimization problem including cost objective, variable bounds and dynamics constraints. + """ nx, nu = self.model.nx, self.model.nu T = self.T - - # define optimizer and variables + # Define optimizer and variables. opti = cs.Opti() - # states + # States. x_var = opti.variable(nx, T + 1) - # inputs + # Inputs. u_var = opti.variable(nu, T) - # initial state + # Initial state. x_init = opti.parameter(nx, 1) - # reference (equilibrium point or trajectory, last step for terminal cost) + # Reference (equilibrium point or trajectory, last step for terminal cost). x_ref = opti.parameter(nx, T + 1) - # Chance constraint limits + # Chance constraint limits. 
state_constraint_set = [] for state_constraint in self.constraints.state_constraints: state_constraint_set.append(opti.parameter(state_constraint.num_constraints, T+1)) input_constraint_set = [] for input_constraint in self.constraints.input_constraints: input_constraint_set.append(opti.parameter(input_constraint.num_constraints, T)) - - # Sparse GP mean postfactor matrix + # Sparse GP mean postfactor matrix. mean_post_factor = opti.parameter(len(self.target_mask), T) - # Sparse GP inducing points + # Sparse GP inducing points. z_ind = opti.parameter(T, len(self.input_mask)) - - # cost (cumulative) + # Cost (cumulative). cost = 0 cost_func = self.model.loss - for i in range(T): cost += cost_func(x=x_var[:, i], u=u_var[:, i], @@ -418,7 +361,7 @@ def setup_gp_optimizer(self): Ur=np.zeros((nu, 1)), Q=self.Q, R=self.R)["l"] - # terminal cost + # Terminal cost. cost += cost_func(x=x_var[:, -1], u=np.zeros((nu, 1)), Xr=x_ref[:, -1], @@ -426,35 +369,28 @@ def setup_gp_optimizer(self): Q=self.Q, R=self.R)["l"] opti.minimize(cost) - z = cs.vertcat(x_var[:,:-1], u_var) z = z[self.input_mask,:] for i in range(self.T): - # Dynamics constraints using the dynamics of the prior and the mean of the GP. - # This follows the tractable dynamics formulation in Section III.B in (Hewing 2019). + # This follows the tractable dynamics formulation in Section III.B in Hewing 2019. # Note that for the GP approximation, we are purposely using elementwise multiplication *. next_state = self.prior_dynamics_func(x0=x_var[:, i]-self.prior_ctrl.X_LIN[:,None], p=u_var[:, i]-self.prior_ctrl.U_LIN[:,None])['xf'] + \ self.prior_ctrl.X_LIN[:,None]+ self.Bd @ cs.sum2(self.K_z_zind_func(z1=z[:,i].T, z2=z_ind)['K'] * mean_post_factor) opti.subject_to(x_var[:, i + 1] == next_state) - - # Probabilistic state and input constraints according to Hewing 2019 constraint tightening + # Probabilistic state and input constraints according to Hewing 2019 constraint tightening. for s_i, state_constraint in enumerate(self.state_constraints_sym): opti.subject_to(state_constraint(x_var[:, i]) <= state_constraint_set[s_i][:,i]) for u_i, input_constraint in enumerate(self.input_constraints_sym): opti.subject_to(input_constraint(u_var[:, i]) <= input_constraint_set[u_i][:,i]) - - # final state constraints + # Final state constraints. for s_i, state_constraint in enumerate(self.state_constraints_sym): opti.subject_to(state_constraint(x_var[:, -1]) <= state_constraint_set[s_i][:,-1]) - - - # initial condition constraints + # Initial condition constraints. opti.subject_to(x_var[:, 0] == x_init) - - # create solver (IPOPT solver for now) + # Create solver (IPOPT solver in this version). opts = {"ipopt.print_level": 4, "ipopt.sb": "yes", "ipopt.max_iter": 100, @@ -473,8 +409,10 @@ def setup_gp_optimizer(self): "cost": cost } - def select_action_with_gp(self, obs): - """Solves nonlinear mpc problem to get next action. + def select_action_with_gp(self, + obs + ): + """Solves nonlinear MPC problem to get next action. Args: obs (np.array): current state/observation. @@ -494,39 +432,32 @@ def select_action_with_gp(self, obs): mean_post_factor = opti_dict["mean_post_factor"] z_ind = opti_dict["z_ind"] cost = opti_dict["cost"] - - # assign the initial state + # Assign the initial state. opti.set_value(x_init, obs) - - # assign reference trajectory within horizon + # Assign reference trajectory within horizon. 
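The GP contribution added to the prior dynamics above is `K(z, z_ind) @ mean_post_factor`, where `mean_post_factor` comes from the FITC approximation with the previous MPC solution as inducing points. Below is a hedged numpy sketch of those quantities, following the Quinonero-Candela & Rasmussen (2005) formulas cited in the module docstring; the kernel helper, data, and hyperparameters are toy stand-ins rather than the repository's gpytorch objects:

```python
import numpy as np

def se_kernel(X1, X2, ell, sf2):
    """Squared-exponential (ARD) kernel, the same form as covSEard."""
    d2 = ((X1[:, None, :] - X2[None, :, :]) ** 2 / ell**2).sum(-1)
    return sf2 * np.exp(-0.5 * d2)

rng = np.random.default_rng(1)
X = rng.uniform(-1, 1, size=(50, 3))               # training inputs (N, nx+nu)
y = np.sin(X[:, 0]) + 0.01 * rng.normal(size=50)   # one target dimension
Z = X[rng.choice(50, size=10, replace=False)]      # inducing points (e.g. previous MPC solution)
ell, sf2, noise = np.ones(3), 1.0, 1e-2

K_uu = se_kernel(Z, Z, ell, sf2) + 1e-8 * np.eye(len(Z))
K_fu = se_kernel(X, Z, ell, sf2)
K_uu_inv = np.linalg.inv(K_uu)

# FITC: diagonal correction Gamma and the posterior factor reused at every MPC step.
q_ff_diag = np.einsum("ij,jk,ik->i", K_fu, K_uu_inv, K_fu)   # diag of K_fu K_uu^-1 K_fu^T
gamma = sf2 - q_ff_diag + noise                              # diag of (K_ff - Q_ff) + noise
gamma_inv = np.diag(1.0 / gamma)
sigma = np.linalg.inv(K_uu + K_fu.T @ gamma_inv @ K_fu)
mean_post_factor = sigma @ K_fu.T @ gamma_inv @ y            # (M,)

# Approximate posterior mean at a query point z*: k(z*, Z) @ mean_post_factor.
z_star = np.zeros((1, 3))
mean_star = se_kernel(z_star, Z, ell, sf2) @ mean_post_factor
print(mean_star)
```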
goal_states = self.get_references() opti.set_value(x_ref, goal_states) if self.mode == "tracking": self.traj_step += 1 - - # set the probabilistic state and input constraint set limits + # Set the probabilistic state and input constraint set limits. state_constraint_set_prev, input_constraint_set_prev = self.precompute_probabilistic_limits() for si in range(len(self.constraints.state_constraints)): opti.set_value(state_constraint_set[si], state_constraint_set_prev[si]) for ui in range(len(self.constraints.input_constraints)): opti.set_value(input_constraint_set[ui], input_constraint_set_prev[ui]) - mean_post_factor_val, Sigma, K_zind_zind_inv, z_ind_val = self.precompute_sparse_gp_values() opti.set_value(mean_post_factor, mean_post_factor_val) opti.set_value(z_ind, z_ind_val) - - # initial guess for optim problem + # Initial guess for the optimization problem. if self.warmstart and self.x_prev is not None and self.u_prev is not None: # shift previous solutions by 1 step x_guess = deepcopy(self.x_prev) u_guess = deepcopy(self.u_prev) x_guess[:, :-1] = x_guess[:, 1:] u_guess[:-1] = u_guess[1:] - opti.set_initial(x_var, x_guess) opti.set_initial(u_var, u_guess) - - # solve the optimization problem + # Solve the optimization problem. try: sol = opti.solve() x_val, u_val = sol.value(x_var), sol.value(u_var) @@ -535,52 +466,49 @@ def select_action_with_gp(self, obs): u_val = np.atleast_2d(u_val) self.x_prev = x_val self.u_prev = u_val - self.results_dict['horizon_states'].append(deepcopy(self.x_prev)) self.results_dict['horizon_inputs'].append(deepcopy(self.u_prev)) - zi = np.hstack((x_val[:,0], u_val[:,0])) zi = zi[self.input_mask] gp_contribution = np.sum(self.K_z_zind_func(z1=zi, z2=z_ind_val)['K'].toarray() * mean_post_factor_val,axis=1) print("GP Contribution: %s" % gp_contribution) - # take the first one from solved action sequence + # Take the first one from solved action sequence. if u_val.ndim > 1: action = u_val[:, 0] else: action = np.array([u_val[0]]) - self.prev_action = action, return action - def learn(self, input_data=None, target_data=None, gp_model=None, plot=False): + def learn(self, + input_data=None, + target_data=None, + gp_model=None, + plot=False + ): """Performs GP training. Args: - input_data, target_data (np.array): (OPTIONAL) data to use for training - gp_model (str): If not None, this is the path to pretrained models to use instead of training new ones. - plot (bool): To plot validation trajectories or not. + input_data, target_data (optiona, np.array): data to use for training + gp_model (str): if not None, this is the path to pretrained models to use instead of training new ones. + plot (bool): to plot validation trajectories or not. Returns: training_results (dict): Dictionary of the training results. """ - if gp_model is None: gp_model = self.gp_model_path - - # TODO: Until sampling within constraints is implemented, do this and fix later self.prior_ctrl.remove_constraints(self.prior_ctrl.additional_constraints) self.reset() - if self.online_learning: input_data = np.zeros((self.train_iterations, len(self.input_mask))) target_data = np.zeros((self.train_iterations, len(self.target_mask))) - if input_data is None and target_data is None: train_inputs = [] train_targets = [] train_info = [] - # Use Latin Hypercube Sampling to generate states withing environment bounds + # Use Latin Hypercube Sampling to generate states withing environment bounds. 
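The training-data collection that follows draws initial states by Latin Hypercube Sampling within the environment's randomization bounds (the repository uses skopt's `Lhs` sampler). A dependency-free numpy sketch of classic LHS with made-up cartpole-like bounds, for illustration only:

```python
import numpy as np

def latin_hypercube(limits, n_samples, rng):
    """Classic Latin Hypercube Sampling within per-dimension (low, high) bounds."""
    dim = len(limits)
    # One stratum index per sample and dimension, permuted independently per dimension.
    strata = rng.permuted(np.tile(np.arange(n_samples), (dim, 1)), axis=1).T   # (n_samples, dim)
    u = (strata + rng.uniform(size=(n_samples, dim))) / n_samples              # one point per stratum
    lows = np.array([lo for lo, _ in limits])
    highs = np.array([hi for _, hi in limits])
    return lows + u * (highs - lows)

rng = np.random.default_rng(0)
# e.g. bounds on (x, x_dot, theta, theta_dot) initial states; values are illustrative.
limits = [(-0.1, 0.1), (-0.2, 0.2), (-0.05, 0.05), (-0.1, 0.1)]
init_state_samples = latin_hypercube(limits, n_samples=20, rng=rng)
print(init_state_samples.shape)  # (20, 4)
```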
lhs_sampler = Lhs(lhs_type='classic', criterion='maximin') limits = [(self.env.INIT_STATE_RAND_INFO[key].low, self.env.INIT_STATE_RAND_INFO[key].high) for key in self.env.INIT_STATE_RAND_INFO] @@ -593,7 +521,6 @@ def learn(self, input_data=None, target_data=None, gp_model=None, plot=False): for i in range(self.train_iterations): # For random initial state training. init_state = init_state_samples[i,:] - # Collect data with prior controller. run_env = self.env_func(init_state=init_state, randomized_init=False) episode_results = self.prior_ctrl.run(env=run_env, max_steps=1) @@ -601,7 +528,6 @@ def learn(self, input_data=None, target_data=None, gp_model=None, plot=False): x_obs = episode_results['obs'][-3:,:] u_seq = episode_results['action'][-1:,:] run_env.close() - x_seq = x_obs[:-1,:] x_next_seq = x_obs[1:,:] train_inputs_i, train_targets_i = self.preprocess_training_data(x_seq, u_seq, x_next_seq) @@ -610,12 +536,10 @@ def learn(self, input_data=None, target_data=None, gp_model=None, plot=False): else: train_inputs = input_data train_targets = target_data - train_inputs = np.vstack(train_inputs) train_targets = np.vstack(train_targets) self.data_inputs = train_inputs self.data_targets = train_targets - train_inputs_tensor = torch.Tensor(train_inputs).double() train_targets_tensor = torch.Tensor(train_targets).double() if plot: @@ -626,7 +550,7 @@ def learn(self, input_data=None, target_data=None, gp_model=None, plot=False): u_seq = validation_results['action'] x_seq = x_obs[:-1, :] x_next_seq = x_obs[1:, :] - # Define likelihood + # Define likelihood. likelihood = gpytorch.likelihoods.GaussianLikelihood( noise_constraint=gpytorch.constraints.GreaterThan(1e-9), ).double() @@ -649,22 +573,18 @@ def learn(self, input_data=None, target_data=None, gp_model=None, plot=False): learning_rate=self.learning_rate, gpu=self.use_gpu, dir=self.output_dir) - # Plot Validation + # Plot validation. if plot: - validation_inputs, validation_targets = self.preprocess_training_data(x_seq, u_seq, x_next_seq) fig_count = 0 fig_count = self.gaussian_process.plot_trained_gp(torch.Tensor(validation_inputs).double(), torch.Tensor(validation_targets).double(), fig_count=fig_count) - self.set_gp_dynamics_func() self.setup_gp_optimizer() - # TODO: Until sampling within constraints is fixed, need to add constraint back in self.prior_ctrl.add_constraints(self.prior_ctrl.additional_constraints) self.prior_ctrl.reset() - - # collect training results + # Collect training results. training_results = {} training_results['train_targets'] = train_targets training_results['train_inputs'] = train_inputs @@ -672,21 +592,20 @@ def learn(self, input_data=None, target_data=None, gp_model=None, plot=False): training_results['info'] = train_info except UnboundLocalError: training_results['info'] = None - return training_results - - def select_action(self, obs): + def select_action(self, + obs + ): """Select the action based on the given observation. Args: - obs (np.array): Current observed state + obs (np.array): current observed state. Returns: - action (np.array): Desired policy action. + action (np.array): desired policy action. 
""" - # todo: Modify this to handle a passed in prior controller if self.gaussian_process is None: action = self.prior_ctrl.select_action(obs) else: @@ -695,7 +614,6 @@ def select_action(self, obs): exit() t1 = time.perf_counter() action = self.select_action_with_gp(obs) - #action = self.apply_linear_ancilliary_control(obs, action) t2 = time.perf_counter() print("GP SELECT ACTION TIME: %s" %(t2 - t1)) self.last_obs = obs @@ -703,12 +621,16 @@ def select_action(self, obs): return action def close(self): - """Cleans up resources.""" + """Clean up. + + """ self.env_training.close() self.env.close() - #self.logger.close() def reset_results_dict(self): + """ + + """ "Result the results_dict before running." super().reset_results_dict() self.results_dict['input_constraint_set'] = [] @@ -717,8 +639,10 @@ def reset_results_dict(self): self.results_dict['input_horizon_cov'] = [] def reset(self): - """Reset the controller before running.""" - # setup reference input + """Reset the controller before running. + + """ + # Setup reference input. if self.env.TASK == Task.STABILIZATION: self.mode = "stabilization" self.x_goal = self.env.X_GOAL @@ -726,15 +650,12 @@ def reset(self): self.mode = "tracking" self.traj = self.env.X_GOAL.T self.traj_step = 0 - - # dynamics model + # Dynamics model. if self.gaussian_process is not None: self.set_gp_dynamics_func() - - # casadi optimizer + # CasADi optimizer. self.setup_gp_optimizer() self.prior_ctrl.reset() - - # previously solved states & inputs, useful for warm start + # Previously solved states & inputs, useful for warm start. self.x_prev = None self.u_prev = None diff --git a/safe_control_gym/controllers/mpc/gp_mpc.yaml b/safe_control_gym/controllers/mpc/gp_mpc.yaml index a6c9d5256..cff27d17b 100644 --- a/safe_control_gym/controllers/mpc/gp_mpc.yaml +++ b/safe_control_gym/controllers/mpc/gp_mpc.yaml @@ -1,7 +1,7 @@ # Seed for LHS sampling seed: 1337 -# mpc args +# MPC args horizon: 10 r_mpc: - 1. @@ -30,7 +30,7 @@ inertial_prop: - 1.0 prior_param_coeff: 1. -# runner args +# Runner args deque_size: 10 eval_batch_size: 10 diff --git a/safe_control_gym/controllers/mpc/gp_utils.py b/safe_control_gym/controllers/mpc/gp_utils.py index 7dad37e2a..bf9a8b0e8 100644 --- a/safe_control_gym/controllers/mpc/gp_utils.py +++ b/safe_control_gym/controllers/mpc/gp_utils.py @@ -1,43 +1,60 @@ +"""Utility functions for Gaussian Processes. + +""" import os.path import numpy as np import gpytorch import torch import matplotlib.pyplot as plt -from copy import deepcopy import casadi as ca +from copy import deepcopy + from safe_control_gym.utils.utils import mkdirs -def covSEard(x, z, ell, sf2): - """ GP squared exponential kernel - This function is copied from GP-MPC library by - Copyright (c) 2018, Helge-André Langåker +def covSEard(x, + z, + ell, + sf2 + ): + """GP squared exponential kernel. - Args: - x (np.array or casadi.MX/SX): First vector. - z (np.array or casadi.MX/SX): Second vector. - ell (np.array or casadi.MX/SX): Length scales. - sf2 (float or casadi.MX/SX): output scale parameter. + This function is based on the 2018 GP-MPC library by Helge-André Langåker - Returns: - SE kernel (casadi.MX/SX): SE kernel. + Args: + x (np.array or casadi.MX/SX): First vector. + z (np.array or casadi.MX/SX): Second vector. + ell (np.array or casadi.MX/SX): Length scales. + sf2 (float or casadi.MX/SX): output scale parameter. + + Returns: + SE kernel (casadi.MX/SX): SE kernel. 
""" dist = ca.sum1((x - z)**2 / ell**2) return sf2 * ca.SX.exp(-.5 * dist) + class ZeroMeanIndependentMultitaskGPModel(gpytorch.models.ExactGP): - def __init__(self, train_x, train_y, likelihood, nx): - """ - Multidimension Gaussian Process model with zero mean function (or - constant mean...)and radial basis function kernel (SE). + """Multidimensional Gaussian Process model with zero mean function. + + Or constant mean and radial basis function kernel (SE). + + """ + + def __init__(self, + train_x, + train_y, + likelihood, + nx + ): + """Initialize a multidimensional Gaussian Process model with zero mean function. Args: - train_x (torch.Tensor): input training data (input_dim X N samples) - train_y (torch.Tensor): output training data (output dim x N samples) - likelihood (gpytorch.likelihood): Likelihood function - (gpytorch.likelihoods.MultitaskGaussianLikelihood) + train_x (torch.Tensor): input training data (input_dim X N samples). + train_y (torch.Tensor): output training data (output dim x N samples). + likelihood (gpytorch.likelihood): Likelihood function (gpytorch.likelihoods.MultitaskGaussianLikelihood). nx (int): dimension of the target output (output dim) """ @@ -54,7 +71,12 @@ def __init__(self, train_x, train_y, likelihood, nx): ard_num_dims=train_x.shape[1] ) - def forward(self, x): + def forward(self, + x + ): + """ + + """ mean_x = self.mean_module(x) covar_x = self.covar_module(x) return gpytorch.distributions.MultitaskMultivariateNormal.from_batch_mvn( @@ -63,16 +85,22 @@ def forward(self, x): class ZeroMeanIndependentGPModel(gpytorch.models.ExactGP): - def __init__(self, train_x, train_y, likelihood): - """ - Single dimensional output Gaussian Process model with zero mean function (or - constant mean...) and radial basis function kernel (SE). + """Single dimensional output Gaussian Process model with zero mean function. + + Or constant mean and radial basis function kernel (SE). + + """ + def __init__(self, + train_x, + train_y, + likelihood + ): + """Initialize a single dimensional Gaussian Process model with zero mean function. Args: - train_x (torch.Tensor): input training data (input_dim X N samples) - train_y (torch.Tensor): output training data (output dim x N samples) - likelihood (gpytorch.likelihood): Likelihood function - (gpytorch.likelihoods.GaussianLikelihood) + train_x (torch.Tensor): input training data (input_dim X N samples). + train_y (torch.Tensor): output training data (output dim x N samples). + likelihood (gpytorch.likelihood): Likelihood function (gpytorch.likelihoods.GaussianLikelihood). """ super().__init__(train_x, train_y, likelihood) @@ -84,29 +112,38 @@ def __init__(self, train_x, train_y, likelihood): ard_num_dims=train_x.shape[1] ) - def forward(self, x): + def forward(self, + x + ): + """ + + """ mean_x = self.mean_module(x) covar_x = self.covar_module(x) return gpytorch.distributions.MultivariateNormal(mean_x, covar_x) + class GaussianProcessCollection: - """Collection of GaussianProcesses for multioutput GPs""" + """Collection of GaussianProcesses for multioutput GPs. + + """ + def __init__(self, model_type, likelihood, target_dim, input_mask=None, target_mask=None, - normalize=False): - """ - Creates a single GaussianProcess for each output dimension + normalize=False + ): + """Creates a single GaussianProcess for each output dimension. 
Args: - model_type (gpytorch model class): Model class for the GP (ZeroMeanIndependentGPModel) - likelihood (gpytorch.likelihood): likelihood function + model_type (gpytorch model class): Model class for the GP (ZeroMeanIndependentGPModel). + likelihood (gpytorch.likelihood): likelihood function. target_dim (int): Dimension of the output (how many GPs to make). input_mask (list): Input dimensions to keep. If None, use all input dimensions. target_mask (list): Target dimensions to keep. If None, use all target dimensions. - normalize (bool): If True, scale all data between -1 and 1. (prototype and not fully operational) + normalize (bool): If True, scale all data between -1 and 1. """ self.gp_list = [] @@ -123,7 +160,10 @@ def __init__(self, model_type, input_mask=input_mask, normalize=normalize)) - def _init_properties(self, train_inputs, train_targets): + def _init_properties(self, + train_inputs, + train_targets + ): """Initialize useful properties. Args: @@ -135,7 +175,11 @@ def _init_properties(self, train_inputs, train_targets): self.output_dimension = target_dimension self.n_training_samples = train_inputs.shape[0] - def init_with_hyperparam(self, train_inputs, train_targets, path_to_statedicts): + def init_with_hyperparam(self, + train_inputs, + train_targets, + path_to_statedicts + ): """Load hyperparameters from a state_dict. Args: @@ -160,8 +204,12 @@ def init_with_hyperparam(self, train_inputs, train_targets, path_to_statedicts): gp_K_plus_noise = torch.stack(gp_K_plus_noise_list) self.K_plus_noise = gp_K_plus_noise - def get_hyperparameters(self, as_numpy=False): - """Get the outputscale and lengthscale from the kernel matrices of the GPs.""" + def get_hyperparameters(self, + as_numpy=False + ): + """Get the outputscale and lengthscale from the kernel matrices of the GPs. + + """ lengthscale_list = [] output_scale_list = [] noise_list = [] @@ -169,7 +217,6 @@ def get_hyperparameters(self, as_numpy=False): lengthscale_list.append(gp.model.covar_module.base_kernel.lengthscale.detach()) output_scale_list.append(gp.model.covar_module.outputscale.detach()) noise_list.append(gp.model.likelihood.noise.detach()) - lengthscale = torch.cat(lengthscale_list) outputscale = torch.Tensor(output_scale_list) noise = torch.Tensor(noise_list) @@ -178,11 +225,19 @@ def get_hyperparameters(self, as_numpy=False): else: return lengthscale, outputscale, noise, self.K_plus_noise - def train(self, train_x_raw, train_y_raw, n_train=[500], learning_rate=[0.01], gpu=False, dir='results'): - """ - Train the GP using Train_x and Train_y - train_x: Torch tensor ( N samples [rows] by input dim [cols]) - train_y: Torch tensor (N samples [rows] by target dim [cols] ) + def train(self, + train_x_raw, + train_y_raw, + n_train=[500], + learning_rate=[0.01], + gpu=False, + dir='results' + ): + """Train the GP using Train_x and Train_y. + + Args: + train_x: Torch tensor (N samples [rows] by input dim [cols]) + train_y: Torch tensor (N samples [rows] by target dim [cols]) """ self._init_properties(train_x_raw, train_y_raw) @@ -208,19 +263,26 @@ def train(self, train_x_raw, train_y_raw, n_train=[500], learning_rate=[0.01], g gp_K_plus_noise = torch.stack(gp_K_plus_noise_list) self.K_plus_noise = gp_K_plus_noise - def predict(self, x, requires_grad=False, return_pred=True): + def predict(self, + x, + requires_grad=False, + return_pred=True + ): """ - x : torch.Tensor (N_samples x input DIM) + + Args: + x : torch.Tensor (N_samples x input DIM). 
+ Return - Predicitons - mean : torch.tensor (nx X N_samples) - lower : torch.tensor (nx X N_samples) - upper : torch.tensor (nx X N_samples) + Predictions + mean : torch.tensor (nx X N_samples). + lower : torch.tensor (nx X N_samples). + upper : torch.tensor (nx X N_samples). + """ means_list = [] cov_list = [] pred_list = [] - for gp in self.gp_list: if return_pred: mean, cov, pred = gp.predict(x, requires_grad=requires_grad, return_pred=return_pred) @@ -236,45 +298,58 @@ def predict(self, x, requires_grad=False, return_pred=True): else: return means, cov - def prediction_jacobian(self, query): - """Return Jacobian""" + def prediction_jacobian(self, + query + ): + """Return Jacobian. + + """ return NotImplementedError - def plot_trained_gp(self, inputs, targets, fig_count=0): - """Plot the trained GP given the input and target data. Useful for validation. + def plot_trained_gp(self, + inputs, + targets, + fig_count=0 + ): + """Plot the trained GP given the input and target data. + """ for gp_ind, gp in enumerate(self.gp_list): fig_count = gp.plot_trained_gp(inputs, targets[:,gp_ind,None], fig_count) fig_count += 1 - def _kernel_list(self, x1, x2=None): + def _kernel_list(self, + x1, + x2=None + ): """Evaluate the kernel given vectors x1 and x2. Args: - x1 (torch.Tensor): First vector - x2 (torch.Tensor): Second vector + x1 (torch.Tensor): First vector. + x2 (torch.Tensor): Second vector. - Return: + Returns: list of LazyTensor Kernels. """ if x2 is None: x2 = x1 - k_list = [] for gp in self.gp_list: k_list.append(gp.model.covar_module(x1, x2)) - return k_list - def kernel(self, x1, x2=None): + def kernel(self, + x1, + x2=None + ): """Evaluate the kernel given vectors x1 and x2. Args: - x1 (torch.Tensor): First vector - x2 (torch.Tensor): Second vector + x1 (torch.Tensor): First vector. + x2 (torch.Tensor): Second vector. - Return: + Returns: Torch tensor of the non-lazy kernel matrices. """ @@ -282,14 +357,19 @@ def kernel(self, x1, x2=None): non_lazy_tensors = [k.evaluate() for k in k_list] return torch.stack(non_lazy_tensors) - def kernel_inv(self, x1, x2=None): - """Evaluate the inverse kernel given vectors x1 and x2. Only works for square kernel. + def kernel_inv(self, + x1, + x2=None + ): + """Evaluate the inverse kernel given vectors x1 and x2. + + Only works for square kernel. Args: - x1 (torch.Tensor): First vector - x2 (torch.Tensor): Second vector + x1 (torch.Tensor): First vector. + x2 (torch.Tensor): Second vector. - Return: + Returns: Torch tensor of the non-lazy inverse kernel matrices. """ @@ -298,19 +378,28 @@ def kernel_inv(self, x1, x2=None): assert x1.shape == x2.shape, ValueError("x1 and x2 need to have the same shape.") k_list = self._kernel_list(x1, x2) num_of_points = x1.shape[0] - # Efficient inversion is performed VIA inv_matmul on the laze tensor with Identity. non_lazy_tensors = [k.inv_matmul(torch.eye(num_of_points).double()) for k in k_list] return torch.stack(non_lazy_tensors) class GaussianProcess: - def __init__(self, model_type, likelihood, input_mask=None, target_mask=None, normalize=False): - """ - Gaussian Process decorator for gpytorch + """Gaussian Process decorator for gpytorch. + + """ + + def __init__(self, + model_type, + likelihood, + input_mask=None, + target_mask=None, + normalize=False + ): + """Initialize Gaussian Process. 
+ Args: - model_type (gpytorch model class): Model class for the GP (ZeroMeanIndependentMultitaskGPModel) - likelihood (gpytorch.likelihood): likelihood function - normalize (bool): If True, scale all data between -1 and 1. (prototype and not fully operational) + model_type (gpytorch model class): Model class for the GP (ZeroMeanIndependentMultitaskGPModel). + likelihood (gpytorch.likelihood): likelihood function. + normalize (bool): If True, scale all data between -1 and 1. (prototype and not fully operational). """ self.model_type = model_type @@ -321,8 +410,13 @@ def __init__(self, model_type, likelihood, input_mask=None, target_mask=None, no self.input_mask = input_mask self.target_mask = target_mask - def _init_model(self, train_inputs, train_targets): - """Init GP model from train inputs and train_targets""" + def _init_model(self, + train_inputs, + train_targets + ): + """Init GP model from train inputs and train_targets. + + """ if train_targets.ndim > 1: target_dimension = train_targets.shape[1] else: @@ -331,24 +425,32 @@ def _init_model(self, train_inputs, train_targets): self.model = self.model_type(train_inputs, train_targets, self.likelihood) - # Extract dimensions for external use. self.input_dimension = train_inputs.shape[1] self.output_dimension = target_dimension self.n_training_samples = train_inputs.shape[0] - def _compute_GP_covariances(self, train_x): - """Compute K(X,X) + sigma*I and it's inverse for later use.""" + def _compute_GP_covariances(self, + train_x + ): + """Compute K(X,X) + sigma*I and its inverse. - # compute inverse covariance plus noise for faster computation later + """ + # Pre-compute inverse covariance plus noise to speed-up computation. K_lazy = self.model.covar_module(train_x.double()) K_lazy_plus_noise = K_lazy.add_diag(self.model.likelihood.noise) n_samples = train_x.shape[0] self.model.K_plus_noise = K_lazy_plus_noise.matmul(torch.eye(n_samples).double()) self.model.K_plus_noise_inv = K_lazy_plus_noise.inv_matmul(torch.eye(n_samples).double()) - def init_with_hyperparam(self, train_inputs, train_targets, path_to_statedict): - """Load hyperparameters from a state_dict.""" + def init_with_hyperparam(self, + train_inputs, + train_targets, + path_to_statedict + ): + """Load hyperparameters from a state_dict. + + """ if self.input_mask is not None: train_inputs = train_inputs[:, self.input_mask] if self.target_mask is not None: @@ -360,19 +462,25 @@ def init_with_hyperparam(self, train_inputs, train_targets, path_to_statedict): self.model.double() # needed otherwise loads state_dict as float32 self._compute_GP_covariances(train_inputs) - def train(self, train_x_raw, train_y_raw, n_train=500, learning_rate=0.01, gpu=False, fname='best_model.pth'): - """ - Train the GP using Train_x and Train_y - train_x: Torch tensor ( N samples [rows] by input dim [cols]) - train_y: Torch tensor (N samples [rows] by target dim [cols] ) + def train(self, + train_x_raw, + train_y_raw, + n_train=500, + learning_rate=0.01, + gpu=False, + fname='best_model.pth' + ): + """Train the GP using Train_x and Train_y. 
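The _compute_GP_covariances step referenced here caches K(X, X) + sigma^2 * I and its inverse so later predictions avoid repeated solves. A small dense NumPy sketch of the same quantities (the repository builds them lazily through gpytorch's add_diag/inv_matmul):

    import numpy as np

    def se_ard_gram(X, ell, sf2):
        # Dense SE-ARD Gram matrix K(X, X) for a small data set.
        diff = X[:, None, :] - X[None, :, :]
        return sf2 * np.exp(-0.5 * np.sum(diff**2 / ell**2, axis=-1))

    X = np.random.rand(10, 3)            # 10 samples, 3 input dimensions
    ell = np.array([1.0, 0.7, 2.0])
    sf2, noise = 1.5, 1e-2

    K_plus_noise = se_ard_gram(X, ell, sf2) + noise * np.eye(len(X))
    K_plus_noise_inv = np.linalg.inv(K_plus_noise)  # computed once, reused at prediction time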
- """ + Args: + train_x: Torch tensor (N samples [rows] by input dim [cols]) + train_y: Torch tensor (N samples [rows] by target dim [cols]) + """ if self.input_mask is not None: train_x_raw = train_x_raw[:, self.input_mask] if self.target_mask is not None: train_y_raw = train_y_raw[:, self.target_mask] - if self.NORMALIZE: self.scale_normalization = 2/(train_x_raw.max(0)[0] - train_x_raw.min(0)[0]) self.scale_shift = (-train_x_raw.max(0)[0] - train_x_raw.min(0)[0])/(train_x_raw.max(0)[0] - @@ -382,19 +490,16 @@ def train(self, train_x_raw, train_y_raw, n_train=500, learning_rate=0.01, gpu=F else: train_x = train_x_raw train_y = train_y_raw - self._init_model(train_x, train_y) if gpu: train_x = train_x.cuda() train_y = train_y.cuda() self.model = self.model.cuda() self.likelihood = self.likelihood.cuda() - self.model.double() self.likelihood.double() self.model.train() self.likelihood.train() - # Uncomment two lines below to fix likelihood noise self.optimizer = torch.optim.Adam(self.model.parameters(), lr=learning_rate) mll = gpytorch.mlls.ExactMarginalLogLikelihood(self.likelihood, self.model) last_loss = 99999999 @@ -415,7 +520,6 @@ def train(self, train_x_raw, train_y_raw, n_train=500, learning_rate=0.01, gpu=F best_epoch = i i+=1 - print("Training Complete") print("Lowest epoch: %s" %best_epoch) print("Lowest Loss: %s" % best_loss) @@ -426,28 +530,31 @@ def train(self, train_x_raw, train_y_raw, n_train=500, learning_rate=0.01, gpu=F self.model.load_state_dict(torch.load(fname)) self._compute_GP_covariances(train_x) - def predict(self, x, requires_grad=False, return_pred=True): - """ - x : torch.Tensor (N_samples x input DIM) - Return - Predicitons - mean : torch.tensor (nx X N_samples) - lower : torch.tensor (nx X N_samples) - upper : torch.tensor (nx X N_samples) + def predict(self, + x, + requires_grad=False, + return_pred=True + ): """ + Args: + x : torch.Tensor (N_samples x input DIM). + + Returns: + Predictions + mean : torch.tensor (nx X N_samples). + lower : torch.tensor (nx X N_samples). + upper : torch.tensor (nx X N_samples). 
+ """ self.model.eval() self.likelihood.eval() if type(x) is np.ndarray: x = torch.from_numpy(x).double() - if self.input_mask is not None: x = x[:,self.input_mask] - if self.NORMALIZE: x = self.normalize(x) - if requires_grad: predictions = self.likelihood(self.model(x)) mean = predictions.mean @@ -462,17 +569,21 @@ def predict(self, x, requires_grad=False, return_pred=True): else: return mean, cov - def prediction_jacobian(self, query): + def prediction_jacobian(self, + query + ): mean_der, cov_der = torch.autograd.functional.jacobian( lambda x: self.predict(x, requires_grad=True, return_pred=False), query.double()) return mean_der.detach().squeeze() - def plot_trained_gp(self, inputs, targets, fig_count=0): - + def plot_trained_gp(self, + inputs, + targets, + fig_count=0 + ): if self.target_mask is not None: targets = targets[:, self.target_mask] - means, covs, preds = self.predict(inputs) t = np.arange(inputs.shape[0]) lower, upper = preds.confidence_region() @@ -492,24 +603,34 @@ def plot_trained_gp(self, inputs, targets, fig_count=0): plt.xlabel('Time (s)') plt.ylabel('v') plt.show() - return fig_count - def normalize(self, vector): + def normalize(self, + vector + ): dim = vector.shape[1] return normalize(vector, self.scale_normalization[:dim], self.scale_shift[:dim]) - def unnormalize(self, vector): + def unnormalize(self, + vector + ): dim = vector.shape[1] return unnormalize(vector, self.scale_normalization[:dim], self.scale_shift[:dim]) -def normalize(vector, a, b): +def normalize(vector, + a, + b + ): return vector*a + b -def unnormalize(vector, a, b): + +def unnormalize(vector, + a, + b + ): return (vector - b)/a diff --git a/safe_control_gym/controllers/mpc/linear_mpc.py b/safe_control_gym/controllers/mpc/linear_mpc.py index ec7d21674..5694f2b7a 100644 --- a/safe_control_gym/controllers/mpc/linear_mpc.py +++ b/safe_control_gym/controllers/mpc/linear_mpc.py @@ -1,53 +1,33 @@ -"""Linear Model Predictive Control +"""Linear Model Predictive Control. -Example: - run linear mpc on cartpole balance:: - - $ python tests/test_main.py --mode test_policy --exp_id linear_mpc_cartpole \ - --algo linear_mpc --task cartpole_200 --horizon 50 --q_mpc 100 0 10 0 - - run linear mpc on cartpole balance:: - - $ python tests/test_main.py --mode test_policy --exp_id linear_mpc_quad \ - --algo linear_mpc --task quadrotor --ctrl_time_mult 10 --horizon 50 \ - --use_prev_start {--shooting} - -Todo: - * better initialization of operating points per mpc step. +Based on: + * https://people.eecs.berkeley.edu/~pabbeel/cs287-fa12/slides/LQR.pdf + * https://pythonrobotics.readthedocs.io/en/latest/modules/path_tracking.html#mpc-modeling + * https://github.com/AtsushiSakai/PythonRobotics/blob/master/PathTracking/model_predictive_speed_and_steer_control/model_predictive_speed_and_steer_control.py """ -from sys import platform import numpy as np import casadi as cs + +from sys import platform from copy import deepcopy from safe_control_gym.controllers.mpc.mpc import MPC from safe_control_gym.envs.benchmark_env import Task -# ----------------------------------------------------------------------------------- -# Linear MPC -# ----------------------------------------------------------------------------------- - class LinearMPC(MPC): """ Simple linear MPC. 
- - reference: - - https://people.eecs.berkeley.edu/~pabbeel/cs287-fa12/slides/LQR.pdf - - https://pythonrobotics.readthedocs.io/en/latest/modules/path_tracking.html#mpc-modeling - - https://github.com/AtsushiSakai/PythonRobotics/blob/d391cdbb8c82f9c3e643f4c2fd95e85d03b47c9e/PathTracking/model_predictive_speed_and_steer_control/model_predictive_speed_and_steer_control.py + """ def __init__( self, env_func, - # model args horizon=5, q_mpc=[1], r_mpc=[1], warmstart=True, - # runner args - # shared/base args output_dir="results/temp", additional_constraints=[], **kwargs): @@ -60,14 +40,13 @@ def __init__( r_mpc (list): diagonals of input/action cost weight. warmstart (bool): if to initialize from previous iteration. output_dir (str): output directory to write logs and results. - additional_constraints (list): List of constraints + additional_constraints (list): list of constraints. """ - # all params/args (lazy hack) + # Store all params/args. for k, v in locals().items(): if k != "self" and k != "kwargs" and "__" not in k: self.__dict__[k] = v - super().__init__( env_func, horizon=horizon, @@ -78,21 +57,19 @@ def __init__( additional_constraints=additional_constraints, **kwargs ) - - # todo: setup environment equilibrium self.X_LIN = np.atleast_2d(self.env.X_GOAL)[0,:].T self.U_LIN = np.atleast_2d(self.env.U_GOAL)[0,:] - def set_dynamics_func(self): - """Updates symbolic dynamics with actual control frequency.""" - # original version, used in shooting + """Updates symbolic dynamics with actual control frequency. + + """ + # Original version, used in shooting. dfdxdfdu = self.model.df_func(x=self.X_LIN, u=self.U_LIN) dfdx = dfdxdfdu['dfdx'].toarray() dfdu = dfdxdfdu['dfdu'].toarray() delta_x = cs.MX.sym('delta_x', self.model.nx,1) delta_u = cs.MX.sym('delta_u', self.model.nu,1) - x_dot_lin_vec = dfdx @ delta_x + dfdu @ delta_u self.linear_dynamics_func = cs.integrator( 'linear_discrete_dynamics', self.model.integration_algo, @@ -102,30 +79,28 @@ def set_dynamics_func(self): 'ode': x_dot_lin_vec }, {'tf': self.dt} ) - self.dfdx = dfdx self.dfdu = dfdu - def setup_optimizer(self): - """Sets up convex optimization problem including - cost objective, variable bounds and dynamics constraints + """Sets up convex optimization problem. + + Including cost objective, variable bounds and dynamics constraints. + """ nx, nu = self.model.nx, self.model.nu T = self.T - - # define optimizer and variables + # Define optimizer and variables. opti = cs.Opti() - # states + # States. x_var = opti.variable(nx, T + 1) - # inputs + # Inputs. u_var = opti.variable(nu, T) - # initial state + # Initial state. x_init = opti.parameter(nx, 1) - # reference (equilibrium point or trajectory, last step for terminal cost) + # Reference (equilibrium point or trajectory, last step for terminal cost). x_ref = opti.parameter(nx, T + 1) - - # cost (cumulative) + # Cost (cumulative). cost = 0 cost_func = self.model.loss for i in range(T): @@ -135,7 +110,7 @@ def setup_optimizer(self): Ur=np.zeros((nu, 1)), Q=self.Q, R=self.R)["l"] - # terminal cost + # Terminal cost. cost += cost_func(x=x_var[:, -1]+self.X_LIN[:,None], u=np.zeros((nu, 1))+self.U_LIN[:, None], Xr=x_ref[:, -1], @@ -143,26 +118,21 @@ def setup_optimizer(self): Q=self.Q, R=self.R)["l"] opti.minimize(cost) - for i in range(self.T): - # dynamics constraints + # Dynamics constraints. next_state = self.linear_dynamics_func(x0=x_var[:, i], p=u_var[:,i])['xf'] opti.subject_to(x_var[:, i + 1] == next_state) - - # State and input constraints + # State and input constraints. 
for state_constraint in self.state_constraints_sym: opti.subject_to(state_constraint(x_var[:,i] + self.X_LIN.T) < 0) for input_constraint in self.input_constraints_sym: opti.subject_to(input_constraint(u_var[:,i] + self.U_LIN.T) < 0) - - # final state constraints + # Final state constraints. for state_constraint in self.state_constraints_sym: opti.subject_to(state_constraint(x_var[:,-1] + self.X_LIN.T) < 0) - - # initial condition constraints + # Initial condition constraints. opti.subject_to(x_var[:, 0] == x_init) - - # create solver (IPOPT solver for now ) + # Create solver (IPOPT solver in this version). opts = {} if platform == "linux": opti.solver('sqpmethod', opts) @@ -181,18 +151,20 @@ def setup_optimizer(self): "cost": cost } - def select_action(self, obs): - """Solve nonlinear mpc problem to get next action + def select_action(self, + obs + ): + """Solve nonlinear mpc problem to get next action. Args: obs (np.array): current state/observation. Returns: action (np.array): input/action to the task/env. + """ nx, nu = self.model.nx, self.model.nu T = self.T - opti_dict = self.opti_dict opti = opti_dict["opti"] x_var = opti_dict["x_var"] @@ -200,30 +172,24 @@ def select_action(self, obs): x_init = opti_dict["x_init"] x_ref = opti_dict["x_ref"] cost = opti_dict["cost"] - - # assign the initial state + # Assign the initial state. opti.set_value(x_init, obs-self.X_LIN) - - # assign reference trajectory within horizon + # Assign reference trajectory within horizon. goal_states = self.get_references() opti.set_value(x_ref, goal_states) if self.env.TASK == Task.TRAJ_TRACKING: self.traj_step += 1 - if self.warmstart and self.u_prev is not None and self.x_prev is not None: opti.set_initial(x_var, self.x_prev) opti.set_initial(u_var, self.u_prev) - - # solve the optimization problem + # Solve the optimization problem. try: sol = opti.solve() x_val, u_val = sol.value(x_var), sol.value(u_var) self.x_prev = x_val self.u_prev = u_val - self.results_dict['horizon_states'].append(deepcopy(self.x_prev) + self.X_LIN[:, None]) self.results_dict['horizon_inputs'].append(deepcopy(self.u_prev) + self.U_LIN[:, None]) - except RuntimeError as e: print(e) return_status = opti.return_status() @@ -233,14 +199,11 @@ def select_action(self, obs): elif return_status == 'Maximum_Iterations_Exceeded': self.terminate_loop = True u_val = opti.debug.value(u_var) - - # take first one from solved action sequence + # Take first one from solved action sequence. if u_val.ndim > 1: action = u_val[:, 0] else: action = np.array([u_val[0]]) action += self.U_LIN self.prev_action = action - return action - diff --git a/safe_control_gym/controllers/mpc/linear_mpc.yaml b/safe_control_gym/controllers/mpc/linear_mpc.yaml index 3bca31ecf..59663e3d7 100644 --- a/safe_control_gym/controllers/mpc/linear_mpc.yaml +++ b/safe_control_gym/controllers/mpc/linear_mpc.yaml @@ -1,4 +1,4 @@ -# mpc args +# MPC args horizon: 10 r_mpc: - 1. @@ -6,8 +6,6 @@ q_mpc: - 1. use_prev_start: True -# runner args +# Runner args deque_size: 10 eval_batch_size: 10 - - diff --git a/safe_control_gym/controllers/mpc/mpc.py b/safe_control_gym/controllers/mpc/mpc.py index a4b79d597..aca2c31c6 100644 --- a/safe_control_gym/controllers/mpc/mpc.py +++ b/safe_control_gym/controllers/mpc/mpc.py @@ -1,23 +1,9 @@ -"""Model Predictive Control +"""Model Predictive Control. 
-Example: - run mpc on cartpole balance:: - - $ python tests/test_main.py --mode test_policy --exp_id mpc_cartpole \ - --algo mpc --task cartpole_200 --horizon 50 --q_mpc 100 0 10 0 - - run mpc on quadrotor hovering:: - - $ python tests/test_main.py --mode test_policy --exp_id mpc_quad \ - --algo mpc --task quadrotor --ctrl_time_mult 10 --horizon 50 - -Todo: - * All for additional constraints to be added externally? - * Remove eval batch? - * Terminal Constraints? Talk to Lukas about how best to do this. """ import numpy as np import casadi as cs + from copy import deepcopy from safe_control_gym.controllers.base_controller import BaseController @@ -26,26 +12,22 @@ from safe_control_gym.envs.constraints import ConstraintList, GENERAL_CONSTRAINTS, create_ConstraintList_from_dict -# ----------------------------------------------------------------------------------- -# (Nonlinear) MPC -# ----------------------------------------------------------------------------------- +class MPC(BaseController): + """MPC with full nonlinear model. + """ -class MPC(BaseController): - """MPC with full nonlinear model.""" def __init__( self, env_func, - # model args horizon=5, q_mpc=[1], r_mpc=[1], warmstart=True, - # runner args - # shared/base args output_dir="results/temp", additional_constraints=None, - **kwargs): + **kwargs + ): """Creates task and controller. Args: @@ -61,10 +43,8 @@ def __init__( for k, v in locals().items(): if k != "self" and k != "kwargs" and "__" not in k: self.__dict__.update({k: v}) - - # task + # Task. self.env = env_func() - if additional_constraints is not None: additional_ConstraintsList = create_ConstraintList_from_dict(additional_constraints, GENERAL_CONSTRAINTS, @@ -74,23 +54,21 @@ def __init__( else: self.reset_constraints(self.env.constraints.constraints) self.additional_constraints = [] - - # model params + # Model parameters self.model = self.env.symbolic self.dt = self.model.dt - self.T = horizon self.Q = get_cost_weight_matrix(self.q_mpc, self.model.nx) self.R = get_cost_weight_matrix(self.r_mpc, self.model.nu) - # logging - #self.logger = ExperimentLogger(output_dir) - - def reset_constraints(self, constraints): + def reset_constraints(self, + constraints + ): """ Setup the constraints list. Args: - constraints (list): List of constraints controller is subject too. + constraints (list): List of constraints controller is subject to. + """ self.constraints = ConstraintList(constraints) self.state_constraints_sym = self.constraints.get_state_constraint_symbolic_models() @@ -98,93 +76,102 @@ def reset_constraints(self, constraints): if len(self.constraints.input_state_constraints) > 0: raise NotImplementedError('MPC cannot handle combined state input constraints yet.') - def add_constraints(self, constraints): - """Add the constraints in provided list to the system + def add_constraints(self, + constraints + ): + """Add the constraints (from a list) to the system. Args: constraints (list): List of constraints controller is subject too. + """ self.reset_constraints(constraints + self.constraints.constraints) - def remove_constraints(self, constraints): + def remove_constraints(self, + constraints + ): """Remove constraints from the current constraint list. Args: - constraints (list): list of Constraints to be removed + constraints (list): list of constraints to be removed. 
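The constructor above expands the q_mpc/r_mpc lists into diagonal weight matrices, and the optimizer then accumulates a quadratic tracking cost over the horizon. A plain NumPy sketch of that cost with hypothetical dimensions and weights:

    import numpy as np

    nx, nu, T = 4, 1, 10
    Q = np.diag([5., 0.1, 5., 0.1])  # stand-in for get_cost_weight_matrix(q_mpc, nx)
    R = np.diag([0.1])               # stand-in for get_cost_weight_matrix(r_mpc, nu)

    x = np.zeros((nx, T + 1))        # planned states over the horizon
    u = np.zeros((nu, T))            # planned inputs
    x_ref = np.ones((nx, T + 1))     # reference (goal state repeated, or a trajectory slice)

    cost = 0.0
    for i in range(T):               # stage costs
        dx = x[:, i] - x_ref[:, i]
        cost += dx @ Q @ dx + u[:, i] @ R @ u[:, i]
    dx_T = x[:, -1] - x_ref[:, -1]   # terminal cost
    cost += dx_T @ Q @ dx_T
    print(cost)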
+ """ old_constraints_list = self.constraints.constraints - for constraint in constraints: assert constraint in self.constraints.constraints,\ ValueError("This constraint is not in the current list of constraints") old_constraints_list.remove(constraint) self.reset_constraints(old_constraints_list) - def close(self): - """Cleans up resources.""" + """Cleans up resources. + + """ self.env.close() - #self.logger.close() def reset(self): - """Prepares for training or evaluation.""" - # setup reference input + """Prepares for training or evaluation. + + """ + # Setup reference input. if self.env.TASK == Task.STABILIZATION: self.mode = "stabilization" self.x_goal = self.env.X_GOAL elif self.env.TASK == Task.TRAJ_TRACKING: self.mode = "tracking" self.traj = self.env.X_GOAL.T - # step along the reference + # Step along the reference. self.traj_step = 0 - - # dynamics model + # Dynamics model. self.set_dynamics_func() - - # casadi optimizer + # CasADi optimizer. self.setup_optimizer() - # previously solved states & inputs, useful for warm start + # Previously solved states & inputs, useful for warm start. self.x_prev = None self.u_prev = None self.reset_results_dict() def set_dynamics_func(self): - """Updates symbolic dynamics with actual control frequency.""" - self.dynamics_func = cs.integrator('fd', self.model.integration_algo, { - 'x': self.model.x_sym, - 'p': self.model.u_sym, - 'ode': self.model.x_dot - }, {'tf': self.dt}) + """Updates symbolic dynamics with actual control frequency. + + """ + self.dynamics_func = cs.integrator('fd', self.model.integration_algo, + { + 'x': self.model.x_sym, + 'p': self.model.u_sym, + 'ode': self.model.x_dot + }, + {'tf': self.dt}) def setup_optimizer(self): - """Sets up nonlinear optimization problem.""" + """Sets up nonlinear optimization problem. + + """ nx, nu = self.model.nx, self.model.nu T = self.T - - # define optimizer and variables + # Define optimizer and variables. opti = cs.Opti() - # states + # States. x_var = opti.variable(nx, T + 1) - # inputs + # Inputs. u_var = opti.variable(nu, T) - # initial state + # Initial state. x_init = opti.parameter(nx, 1) - # reference (equilibrium point or trajectory, last step for terminal cost) + # Reference (equilibrium point or trajectory, last step for terminal cost). x_ref = opti.parameter(nx, T + 1) - - # cost (cumulative) + # Cost (cumulative). cost = 0 cost_func = self.model.loss for i in range(T): - # can ignore first state cost (since fist x_var == x_init) + # Can ignore the first state cost since fist x_var == x_init. cost += cost_func(x=x_var[:, i], u=u_var[:, i], Xr=x_ref[:, i], Ur=np.zeros((nu, 1)), Q=self.Q, R=self.R)["l"] - # terminal cost + # Terminal cost. cost += cost_func(x=x_var[:, -1], u=np.zeros((nu, 1)), Xr=x_ref[:, -1], @@ -192,28 +179,21 @@ def setup_optimizer(self): Q=self.Q, R=self.R)["l"] opti.minimize(cost) - - # constraints + # Constraints for i in range(self.T): - # dynamics constraints + # Dynamics constraints. next_state = self.dynamics_func(x0=x_var[:, i], p=u_var[:, i])['xf'] opti.subject_to(x_var[:, i + 1] == next_state) - for state_constraint in self.state_constraints_sym: opti.subject_to(state_constraint(x_var[:,i]) < 0) for input_constraint in self.input_constraints_sym: opti.subject_to(input_constraint(u_var[:,i]) < 0) - - # final state constraints - # todo: Make this optional Terminal State Constraint + # Final state constraints. 
for state_constraint in self.state_constraints_sym: opti.subject_to(state_constraint(x_var[:, i]) < 0) - - # initial condition constraints + # Initial condition constraints. opti.subject_to(x_var[:, 0] == x_init) - - # create solver (IPOPT solver for now) - # reference: https://groups.google.com/g/casadi-users/c/a9OGbF7R4ss + # Create solver (IPOPT solver in this version) opts = {"ipopt.print_level": 0, "ipopt.sb": "yes", "print_time": 0} opti.solver('ipopt', opts) self.opti_dict = { @@ -225,7 +205,9 @@ def setup_optimizer(self): "cost": cost } - def select_action(self, obs): + def select_action(self, + obs + ): """Solves nonlinear mpc problem to get next action. Args: @@ -242,37 +224,30 @@ def select_action(self, obs): x_init = opti_dict["x_init"] x_ref = opti_dict["x_ref"] cost = opti_dict["cost"] - - # assign the initial state + # Assign the initial state. opti.set_value(x_init, obs) - - # assign reference trajectory within horizon + # Assign reference trajectory within horizon. goal_states = self.get_references() opti.set_value(x_ref, goal_states) if self.mode == "tracking": self.traj_step += 1 - - # initial guess for optim problem + # Initial guess for optimization problem. if self.warmstart and self.x_prev is not None and self.u_prev is not None: # shift previous solutions by 1 step x_guess = deepcopy(self.x_prev) u_guess = deepcopy(self.u_prev) x_guess[:, :-1] = x_guess[:, 1:] u_guess[:-1] = u_guess[1:] - opti.set_initial(x_var, x_guess) opti.set_initial(u_var, u_guess) - - # solve the optimization problem + # Solve the optimization problem. sol = opti.solve() x_val, u_val = sol.value(x_var), sol.value(u_var) self.x_prev = x_val self.u_prev = u_val - self.results_dict['horizon_states'].append(deepcopy(self.x_prev)) self.results_dict['horizon_inputs'].append(deepcopy(self.u_prev)) - - # take the first one from solved action sequence + # Take the first action from the solved action sequence. if u_val.ndim > 1: action = u_val[:, 0] else: @@ -281,30 +256,33 @@ def select_action(self, obs): return action def get_references(self): - """Constructs reference states along mpc horizon.(nx, T+1)""" + """Constructs reference states along mpc horizon.(nx, T+1). + + """ if self.env.TASK == Task.STABILIZATION: - # repeat goal state for horizon steps + # Repeat goal state for horizon steps. goal_states = np.tile(self.env.X_GOAL.reshape(-1, 1), (1, self.T + 1)) elif self.env.TASK == Task.TRAJ_TRACKING: - # slice trajectory for horizon steps, - # if not long enough, repeat last state for remaining steps + # Slice trajectory for horizon steps, if not long enough, repeat last state. start = min(self.traj_step, self.traj.shape[-1]) end = min(self.traj_step + self.T + 1, self.traj.shape[-1]) remain = max(0, self.T + 1 - (end - start)) - goal_states = np.concatenate([ self.traj[:, start:end], np.tile(self.traj[:, -1:], (1, remain)) ], -1) else: raise Exception("Reference for this mode is not implemented.") - return goal_states # (nx, T+1) + return goal_states # (nx, T+1). def reset_results_dict(self): + """ + + """ self.results_dict = { 'obs': [], 'reward': [], 'done': [], - 'info': [], # ignore initial info so we can save easier + 'info': [], 'action': [], 'horizon_inputs': [], 'horizon_states': [] @@ -314,7 +292,8 @@ def run(self, env=None, render=False, logging=False, - max_steps=100): + max_steps=100 + ): """Runs evaluation with current policy. 
Args: @@ -327,10 +306,8 @@ def run(self, """ if env is None: env = self.env - self.x_prev = None self.u_prev = None - obs, info = env.reset() print("Init State:") print(obs) @@ -338,7 +315,6 @@ def run(self, frames = [] self.reset_results_dict() self.results_dict['obs'].append(obs) - i = 0 if self.env.TASK == Task.STABILIZATION: MAX_STEPS = max_steps @@ -346,7 +322,6 @@ def run(self, MAX_STEPS = self.traj.shape[1] else: raise("Undefined Task") - self.terminate_loop = False while np.linalg.norm(obs - env.X_GOAL) > 1e-3 and\ i < MAX_STEPS and\ @@ -355,15 +330,13 @@ def run(self, if self.terminate_loop: print("Infeasible MPC Problem") break - - # repeat input for more efficient control + # Repeat input for more efficient control. obs, reward, done, info = env.step(action) self.results_dict['obs'].append(obs) self.results_dict['reward'].append(reward) self.results_dict['done'].append(done) self.results_dict['info'].append(info) self.results_dict['action'].append(action) - print(i, '-th step.') print(action) print(obs) @@ -371,14 +344,11 @@ def run(self, print(done) print(info) print() - if render: env.render() frames.append(env.render("rgb_array")) - i += 1 - - # collect evaluation results + # Collect evaluation results. ep_lengths = np.asarray(ep_lengths) ep_returns = np.asarray(ep_returns) if logging: @@ -386,10 +356,7 @@ def run(self, msg += "eval_ep_length {:.2f} +/- {:.2f} | eval_ep_return {:.3f} +/- {:.3f}\n".format( ep_lengths.mean(), ep_lengths.std(), ep_returns.mean(), ep_returns.std()) - #self.logger.info(msg + "\n") - self.results_dict['obs'] = np.vstack(self.results_dict['obs']) self.results_dict['reward'] = np.vstack(self.results_dict['reward']) self.results_dict['action'] = np.vstack(self.results_dict['action']) - return deepcopy(self.results_dict) diff --git a/safe_control_gym/controllers/mpc/mpc.yaml b/safe_control_gym/controllers/mpc/mpc.yaml index 3bca31ecf..59663e3d7 100644 --- a/safe_control_gym/controllers/mpc/mpc.yaml +++ b/safe_control_gym/controllers/mpc/mpc.yaml @@ -1,4 +1,4 @@ -# mpc args +# MPC args horizon: 10 r_mpc: - 1. @@ -6,8 +6,6 @@ q_mpc: - 1. use_prev_start: True -# runner args +# Runner args deque_size: 10 eval_batch_size: 10 - - diff --git a/safe_control_gym/controllers/mpc/mpc_utils.py b/safe_control_gym/controllers/mpc/mpc_utils.py index 19c1a650e..25f70874f 100644 --- a/safe_control_gym/controllers/mpc/mpc_utils.py +++ b/safe_control_gym/controllers/mpc/mpc_utils.py @@ -1,14 +1,15 @@ -"""Module descripion. +"""General MPC utility functions. """ import numpy as np import scipy -def get_cost_weight_matrix(weights, dim): +def get_cost_weight_matrix(weights, + dim + ): """Gets weight matrix from input args. - """ if len(weights) == dim: W = np.diag(weights) @@ -19,28 +20,31 @@ def get_cost_weight_matrix(weights, dim): return W -def discretize_linear_system(A, B, dt, exact=False): - """ discretization of a linear system +def discretize_linear_system(A, + B, + dt, + exact=False + ): + """Discretize a linear system. - dx/dt = A x + B u + dx/dt = A x + B u --> xd[k+1] = Ad xd[k] + Bd ud[k] where xd[k] = x(k*dt) Args: - A: np.array, system transition matrix - B: np.array, input matrix - dt: scalar, step time interval - exact: bool, if to use exact discretization + A: np.array, system transition matrix. + B: np.array, input matrix. + dt: scalar, step time interval. + exact: bool, if to use exact discretization. Returns: - discretized matrices Ad, Bd + Discretized matrices Ad, Bd. 
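discretize_linear_system supports an exact matrix-exponential discretization and a forward-Euler fallback. A double-integrator check of both, using the block-matrix trick the exact branch relies on:

    import numpy as np
    from scipy.linalg import expm

    A = np.array([[0., 1.], [0., 0.]])  # double integrator: x = [position, velocity]
    B = np.array([[0.], [1.]])
    dt = 0.1

    # Exact: exponentiate the block matrix [[A, B], [0, 0]] * dt.
    M = np.zeros((3, 3))
    M[:2, :2] = A
    M[:2, 2:] = B
    Md = expm(M * dt)
    Ad_exact, Bd_exact = Md[:2, :2], Md[:2, 2:]

    # Forward Euler: first-order approximation.
    Ad_euler = np.eye(2) + A * dt
    Bd_euler = B * dt

    print(Ad_exact)  # [[1., 0.1], [0., 1.]]
    print(Bd_exact)  # [[0.005], [0.1]]; Euler gives [[0.], [0.1]]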
+ """ state_dim, input_dim = A.shape[1], B.shape[1] - if exact: M = np.zeros((state_dim + input_dim, state_dim + input_dim)) M[:state_dim, :state_dim] = A M[:state_dim, state_dim:] = B - Md = scipy.linalg.expm(M * dt) Ad = Md[:state_dim, :state_dim] Bd = Md[:state_dim, state_dim:] @@ -48,5 +52,4 @@ def discretize_linear_system(A, B, dt, exact=False): I = np.eye(state_dim) Ad = I + A * dt Bd = B * dt - return Ad, Bd diff --git a/safe_control_gym/controllers/mpsc/mpsc.py b/safe_control_gym/controllers/mpsc/mpsc.py index 9791d1323..a3e9f4908 100644 --- a/safe_control_gym/controllers/mpsc/mpsc.py +++ b/safe_control_gym/controllers/mpsc/mpsc.py @@ -1,21 +1,21 @@ -"""Model Predictive Safety Certification (MPSC) +"""Model Predictive Safety Certification (MPSC). -Based on -K.P. Wabsersich and M.N. Zeilinger "Linear model predictive safety certification for learning-based control" 2019 -https://arxiv.org/pdf/1803.08552.pdf - -The general idea is that any learning controller input can be certefied as safe, and if not safe, can be corrected +The core idea is that any learning controller input can be either certificated as safe or, if not safe, corrected using an MPC controller based on Tube MPC. +Based on + * K.P. Wabsersich and M.N. Zeilinger "Linear model predictive safety certification for learning-based control" 2019 + https://arxiv.org/pdf/1803.08552.pdf + """ +import numpy as np +import casadi as cs +import torch from copy import deepcopy from itertools import product -import numpy as np -import casadi as cs from munch import munchify from scipy.linalg import solve_discrete_are -import torch from safe_control_gym.controllers.base_controller import BaseController from safe_control_gym.controllers.mpc.mpc_utils import get_cost_weight_matrix, discretize_linear_system @@ -25,7 +25,10 @@ class MPSC(BaseController): - """Model Predictive Safety Certification Class.""" + """Model Predictive Safety Certification Class. + + """ + def __init__(self, env_func, rl_controller=None, @@ -37,8 +40,9 @@ def __init__(self, warmstart: bool = True, run_length: int = 200, additional_constraints: list = None, - **kwargs): - """Initialize the MPSC + **kwargs + ): + """Initialize the MPSC. Args: env_func (partial gym.Env): Environment for the task. @@ -51,12 +55,11 @@ def __init__(self, additional_constraints (list): List of additional constraints to consider. """ - # Setup the Environments + # Setup the Environments. self.env_func = env_func self.env = env_func(randomized_init=False) self.training_env = env_func(randomized_init=True) - - # Setup attributes + # Setup attributes. self.model = self.env.symbolic self.dt = self.model.dt self.Q = get_cost_weight_matrix(q_lin, self.model.nx) @@ -71,23 +74,23 @@ def __init__(self, self.tau = tau self.run_length = run_length self.rl_controller = rl_controller - self.omega_AABB_verts = None self.z_prev = None self.v_prev = None - if additional_constraints is None: additional_constraints = [] self.constraints, self.state_constraints_sym, self.input_constraints_sym = self.reset_constraints( self.env.constraints.constraints + - additional_constraints - ) + additional_constraints) - def reset_constraints(self, constraints): + def reset_constraints(self, + constraints + ): """ Setup the constraints list. Args: constraints (list): List of constraints controller is subject too. 
+ """ constraints_list = ConstraintList(constraints) state_constraints_sym = constraints_list.get_state_constraint_symbolic_models() @@ -97,14 +100,15 @@ def reset_constraints(self, constraints): return constraints_list, state_constraints_sym, input_constraints_sym def set_linear_dynamics(self): - """Compute the linear dynamics""" - # original version, used in shooting + """Compute the linear dynamics + + """ + # Original version, used in shooting. dfdxdfdu = self.model.df_func(x=self.X_LIN, u=self.U_LIN) dfdx = dfdxdfdu['dfdx'].toarray() dfdu = dfdxdfdu['dfdu'].toarray() delta_x = cs.MX.sym('delta_x', self.model.nx,1) delta_u = cs.MX.sym('delta_u', self.model.nu,1) - x_dot_lin_vec = dfdx @ delta_x + dfdu @ delta_u linear_dynamics_func = cs.integrator( 'linear_discrete_dynamics', self.model.integration_algo, @@ -114,12 +118,13 @@ def set_linear_dynamics(self): 'ode': x_dot_lin_vec }, {'tf': self.dt} ) - discrete_dfdx, discrete_dfdu = discretize_linear_system(dfdx, dfdu, self.dt) return linear_dynamics_func, discrete_dfdx, discrete_dfdu def compute_lqr_gain(self): - """Compute LQR gain by solving the DARE.""" + """Compute LQR gain by solving the DARE. + + """ P = solve_discrete_are(self.discrete_dfdx, self.discrete_dfdu, self.Q, @@ -129,7 +134,8 @@ def compute_lqr_gain(self): self.discrete_dfdx)) def learn(self, - env=None): + env=None + ): """Compute the Robust Positively Invariant (RPI) set that the MPSC used. Args: @@ -138,144 +144,118 @@ def learn(self, """ if env is None: env = self.training_env - - # create set of error residuals + # Create set of error residuals. w = np.zeros((self.model.nx, self.n_samples)) next_true_states = np.zeros((self.model.nx, self.n_samples)) next_pred_states = np.zeros((self.model.nx, self.n_samples)) actions = np.zeros((self.model.nu, self.n_samples)) - # Use uniform sampling of control inputs and states. for i in range(self.n_samples): init_state, info = env.reset() - u = env.action_space.sample() # will give a random action within action space + u = env.action_space.sample() # Will yield a random action within action space. actions[:,i] = u x_next_obs, _, _, _ = env.step(u) x_next_linear = self.linear_dynamics_func(x0=init_state - self.X_LIN, p=u - self.U_LIN)['xf'].toarray() - next_true_states[:,i] = x_next_obs next_pred_states[:,i] = x_next_linear[:,0] w[:,i] = x_next_obs - x_next_linear[:,0] - A_cl = self.discrete_dfdx + self.discrete_dfdu @ self.lqr_gain - P = compute_RPI_set(A_cl, w, self.tau) - self.learn_actions = actions self.learn_next_true_states = next_true_states self.learn_next_pred_states = next_pred_states self.w = w self.A_cl = A_cl self.omega_P = P - self.omega_AABB_verts = ellipse_bounding_box(P) self.tighten_state_and_input_constraints() self.omega_constraint = QuadraticContstraint(self.env, P, 1.0, constraint_input_type=ConstraintInputType.STATE) - - # Now that constraints are defined, setup the optimizer + # Now that constraints are defined, setup the optimizer. self.setup_optimizer() def tighten_state_and_input_constraints(self): - """Tigthen the state and input constraints based on the RPI.""" + """Tigthen the state and input constraints based on the RPI. + + """ if self.omega_AABB_verts is not None: K_omega_AABB_verts_raw = (self.lqr_gain @ self.omega_AABB_verts.T).T - # take the outermost values + # Take the outermost values. self.K_omega_AABB_verts = np.vstack((np.max(K_omega_AABB_verts_raw), np.min(K_omega_AABB_verts_raw))) - - # get the current input constraint vertices + # Get the current input constraint vertices. 
input_constraint = self.constraints.input_constraints if len(input_constraint) > 1: raise NotImplementedError("MPSC currently can't handle more than 1 constraint") input_constraint = input_constraint[0] self.U_vertices = np.array([[input_constraint.high, input_constraint.low]]).T - - self.tightened_input_constraint_verts, tightened_input_constr_func\ = pontryagin_difference_AABB( self.U_vertices, self.K_omega_AABB_verts) self.tightened_input_constraint = tightened_input_constr_func(env=self.env, - constraint_input_type=ConstraintInputType.INPUT) - # get the state constraint vertices + constraint_input_type=ConstraintInputType.INPUT) + # Get the state constraint vertices. state_constraints = self.constraints.state_constraints if len(state_constraints) > 1: raise NotImplementedError("MPSC currently can't handle more than 1 constraint") state_constraints = state_constraints[0] X_vertices_raw = [(state_constraints.high[i],state_constraints.low[i]) for i in range(self.model.nx)] self.X_vertices = np.clip(np.vstack(list(product(*X_vertices_raw))), -100, 100) - - self.tightened_state_constraint_verts, tightened_state_constraint_func = pontryagin_difference_AABB( - self.X_vertices, - self.omega_AABB_verts) + self.tightened_state_constraint_verts, tightened_state_constraint_func = pontryagin_difference_AABB(self.X_vertices, + self.omega_AABB_verts) self.tightened_state_constraint = tightened_state_constraint_func(env=self.env, constraint_input_type=ConstraintInputType.STATE) else: raise ValueError("") def setup_optimizer(self): - """Setup the certifying MPC problem.""" - # horizon is a parameter + """Setup the certifying MPC problem. + + """ + # Horizon parameter. horizon = self.horizon nx, nu = self.model.nx, self.model.nu - - # define optimizer and variables + # Define optimizer and variables. opti = cs.Opti() - # states + # States. z_var = opti.variable(nx, horizon + 1) - # inputs + # Inputs. v_var = opti.variable(nu, horizon) - # Certified Input + # Certified input. u_tilde = opti.variable(nu, 1) - # Desired Input + # Desired input. u_L = opti.parameter(nu, 1) - # Current observed state + # Current observed state. x = opti.parameter(nx, 1) - - # CONSTRAINTS - # Currently only handles a single constraint for state and input + # Constraints (currently only handles a single constraint for state and input). state_constraints = self.tightened_state_constraint.get_symbolic_model() input_constraints = self.tightened_input_constraint.get_symbolic_model() omega_constraint = self.omega_constraint.get_symbolic_model() - - # constraints for i in range(self.horizon): - # dynamics constraints eqn 5.b + # Dynamics constraints (eqn 5.b). next_state = self.linear_dynamics_func(x0=z_var[:,i], p=v_var[:,i])['xf'] opti.subject_to(z_var[:, i + 1] == next_state) - - # Eqn 5.c - # input constraints + # Input constraints (eqn 5.c). opti.subject_to( input_constraints(v_var[:,i]) <= 0) - # State Constraints opti.subject_to(state_constraints(z_var[:,i]) <= 0) - - # final state constraints (5.d) + # Final state constraints (5.d). opti.subject_to(z_var[:, -1] == 0 ) - - # Initial state constraints (5.e) + # Initial state constraints (5.e). opti.subject_to(omega_constraint(x - z_var[:, 0]) <= 0) - - # Real input (5.f) + # Real input (5.f). opti.subject_to(u_tilde == v_var[:,0] + self.lqr_gain @ (x - z_var[:,0])) - - ## COST - # Note: Using 2norm or sqrt makes this infeasible - cost = (u_L - u_tilde).T @ (u_L - u_tilde) # eqn 5.a + # Cost (# eqn 5.a, note: using 2norm or sqrt makes this infeasible). 
+ cost = (u_L - u_tilde).T @ (u_L - u_tilde) opti.minimize(cost) - - # create solver (IPOPT solver for now) + # Create solver (IPOPT solver as of this version). opts = {"ipopt.print_level": 4, "ipopt.sb": "yes", "ipopt.max_iter": 50, "print_time": 1} opti.solver('ipopt', opts) - - - self.opti_dict = { "opti": opti, "z_var": z_var, @@ -286,8 +266,13 @@ def setup_optimizer(self): "cost": cost } - def solve_optimization(self, obs, uncertified_input): - """Solve the MPC optimization problem for a given observation and uncertified input.""" + def solve_optimization(self, + obs, + uncertified_input + ): + """Solve the MPC optimization problem for a given observation and uncertified input. + + """ opti_dict = self.opti_dict opti = opti_dict["opti"] z_var = opti_dict["z_var"] @@ -296,33 +281,29 @@ def solve_optimization(self, obs, uncertified_input): u_L = opti_dict["u_L"] x = opti_dict["x"] cost = opti_dict["cost"] - opti.set_value(x, obs) opti.set_value(u_L, uncertified_input) - - # initial guess for optim problem + # Initial guess for optimization problem. if (self.warmstart and self.z_prev is not None and self.v_prev is not None and self.u_tilde_prev is not None): - # shift previous solutions by 1 step + # Shift previous solutions by 1 step. z_guess = deepcopy(self.x_prev) v_guess = deepcopy(self.u_prev) z_guess[:, :-1] = z_guess[:, 1:] v_guess[:-1] = v_guess[1:] - opti.set_initial(z_var, z_guess) opti.set_initial(v_var, v_guess) opti.set_initial(u_tilde, deepcopy(self.u_tilde_prev)) - - # solve the optimization problem + # Solve the optimization problem. try: sol = opti.solve() x_val, u_val, u_tilde_val = sol.value(z_var), sol.value(v_var), sol.value(u_tilde) self.z_prev = x_val self.v_prev = u_val self.u_tilde_prev = u_tilde_val - # take the first one from solved action sequence + # Take the first one from solved action sequence. if u_val.ndim > 1: action = u_tilde_val else: @@ -332,11 +313,15 @@ def solve_optimization(self, obs, uncertified_input): except RuntimeError: feasible = False action = None - return action, feasible - def certify_action(self, obs, u_L): - """Algorithm 1 from [1].""" + def certify_action(self, + obs, + u_L + ): + """Algorithm 1 from Wabsersich 2019. + + """ action, feasible = self.solve_optimization(obs, u_L) self.results_dict['feasible'].append(feasible) if feasible: @@ -356,7 +341,9 @@ def certify_action(self, obs, u_L): action = self.lqr_gain @ obs return action - def select_action(self, obs): + def select_action(self, + obs + ): """Selection feedback action. Args: @@ -369,22 +356,21 @@ def select_action(self, obs): """ if self.rl_controller is not None: with torch.no_grad(): - u_L, v, logp = self.rl_controller.agent.ac.step( - torch.FloatTensor(obs).to(self.rl_controller.device)) + u_L, v, logp = self.rl_controller.agent.ac.step(torch.FloatTensor(obs).to(self.rl_controller.device)) else: u_L = 2*np.sin(0.01*np.pi*self.time_step) + 0.5*np.sin(0.12*np.pi*self.time_step) self.results_dict['learning_actions'].append(u_L) action = self.certify_action(obs, u_L) action_diff = np.linalg.norm(u_L - action) self.results_dict['corrections'].append(action_diff) - return action, u_L def run(self, env=None, uncertified_env=None, run_length=None, - **kwargs): + **kwargs + ): """Run the simulation. 
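The certification objective above (eqn 5.a) keeps the learning-based input u_L whenever it is feasible and otherwise returns the closest feasible u_tilde. A toy one-dimensional illustration of that projection with cvxpy, ignoring the tube dynamics and using a single hypothetical input bound:

    import cvxpy as cp

    u_max = 1.0   # hypothetical tightened input bound
    u_L = 1.7     # desired (uncertified) input from the learning controller

    u_tilde = cp.Variable()
    problem = cp.Problem(cp.Minimize(cp.square(u_L - u_tilde)),
                         [u_tilde <= u_max, u_tilde >= -u_max])
    problem.solve()
    print(u_tilde.value)  # 1.0: projected onto the feasible set
    # If u_L were already inside the bounds, the minimizer would simply return u_L.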
Args: @@ -402,11 +388,9 @@ def run(self, run_length = self.run_length if uncertified_env is None: uncertified_env = self.env_func(randomized_init=False) - self.setup_results_dict() obs, _ = env.reset() self.results_dict['obs'].append(obs) - self.kinf = self.horizon - 1 self.time_step = 0 for i in range(run_length): @@ -415,7 +399,6 @@ def run(self, self.results_dict['obs'].append(obs) self.results_dict['actions'].append(action) self.time_step += 1 - uncertified_obs, _ = uncertified_env.reset() self.results_dict['uncertified_obs'].append(uncertified_obs) for i in range(run_length): @@ -427,13 +410,14 @@ def run(self, uncertified_obs, _, _, _ = uncertified_env.step(uncertified_action) self.results_dict['uncertified_actions'].append(uncertified_action) self.results_dict['uncertified_obs'].append(uncertified_obs) - self.close_results_dict() uncertified_env.close() return self.results_dict def setup_results_dict(self): - """Setup the results dictionary to store run information.""" + """Setup the results dictionary to store run information. + + """ self.results_dict = {} self.results_dict['obs'] = [] self.results_dict['actions'] = [] @@ -446,7 +430,9 @@ def setup_results_dict(self): self.results_dict['kinf'] = [] def close_results_dict(self): - """Cleanup the rtesults dict and munchify it.""" + """Cleanup the rtesults dict and munchify it. + + """ self.results_dict['obs'] = np.vstack(self.results_dict['obs']) self.results_dict['uncertified_obs'] = np.vstack(self.results_dict['uncertified_obs']) self.results_dict['uncertified_actions'] = np.vstack(self.results_dict['uncertified_actions']) @@ -457,12 +443,15 @@ def close_results_dict(self): self.results_dict = munchify(self.results_dict) def close(self): - """Cleans up resources.""" + """Cleans up resources. + + """ self.env.close() - #self.logger.close() def reset(self): - """Prepares for training or evaluation.""" + """Prepares for training or evaluation. + + """ # setup reference input if self.env.TASK == Task.STABILIZATION: self.mode = "stabilization" diff --git a/safe_control_gym/controllers/mpsc/mpsc.yaml b/safe_control_gym/controllers/mpsc/mpsc.yaml index b67af61be..11967bfcb 100644 --- a/safe_control_gym/controllers/mpsc/mpsc.yaml +++ b/safe_control_gym/controllers/mpsc/mpsc.yaml @@ -6,11 +6,11 @@ q_lin: - 1. warmstart: False -# safe set calculation +# Safe set calculation n_samples: 600 -# tau parameter for the calcuation of the RPI +# Tau parameter for the calcuation of the RPI tau: 0.95 -# how long to run the mpsc for +# How long to run the mpsc for run_length: 30 diff --git a/safe_control_gym/controllers/mpsc/mpsc_utils.py b/safe_control_gym/controllers/mpsc/mpsc_utils.py index fc56039b1..6e341f95d 100644 --- a/safe_control_gym/controllers/mpsc/mpsc_utils.py +++ b/safe_control_gym/controllers/mpsc/mpsc_utils.py @@ -1,21 +1,29 @@ -"""Utility functions for Model Predictive Safety Certification +"""Utility functions for Model Predictive Safety Certification. """ -from itertools import product -from functools import partial import cvxpy as cp import numpy as np import pytope as pt + +from itertools import product +from functools import partial from matplotlib.patches import Ellipse from safe_control_gym.envs.constraints import BoundedConstraint, LinearConstraint -def compute_RPI_set(Acl, w, tau): + +def compute_RPI_set(Acl, + w, + tau + ): """Compute a Robust positively invariant set (RPI). This follows the method in sec IV A from [1] to compute the robust positively invartiant set through a convex optimzation problem (LMI?) 
(equation 8a and 8b). + [1] K.P. Wabsersich and M.N. Zeilinger "Linear model predictive safety certification for learning-based control" + 2019. https://arxiv.org/pdf/1803.08552.pdf + Args: Acl (np.array): Closed loop gain matrix A+BK (nx by nx) w (np.array): Collection of dynamics error residuals with dim (nx by n_samples) @@ -24,9 +32,6 @@ def compute_RPI_set(Acl, w, tau): Returns: P (np.array): P from eqn 8 that defines the ellipsoidal RPI set. - [1] K.P. Wabsersich and M.N. Zeilinger "Linear model predictive safety certification for learning-based control" - 2019. https://arxiv.org/pdf/1803.08552.pdf - """ n_samples = w.shape[1] P = cp.Variable(Acl.shape, symmetric=True) @@ -41,24 +46,22 @@ def compute_RPI_set(Acl, w, tau): con_22 = w_i.T @ P @ w_i + tau - 1 constraints += [cp.bmat([[con_11, con_12], [con_21, con_22]]) << 0] - prob = cp.Problem(cp.Minimize(-cp.log_det(P)), constraints) try: results = prob.solve(solver='MOSEK', verbose=True) except cp.SolverError: print("[ERROR] RPI computation requires the MOSEK solver.") exit() - return P.value def ellipse_bounding_box(P): - """Finds the bounding box of an ellipse defined by x^T P x <= 1 + """Finds the bounding box of an ellipse defined by x^T P x <= 1. Args: P (np.array): n by n array defining the ellipse. Returns: - vertices (np.array): An vertical of the vertices (number of verts by dim of space) + vertices (np.array): An vertical of the vertices (number of verts by dim of space). """ c = np.eye(P.shape[0]) @@ -66,9 +69,7 @@ def ellipse_bounding_box(P): for i in range(P.shape[0]): extremes.append((np.sqrt(c[:,i, None].T @ np.linalg.inv(P) @ c[:,i, None])[0,0], -np.sqrt(c[:,i, None].T @ np.linalg.inv(P) @ c[:,i, None])[0,0])) - vertices = list(product(*extremes)) - return np.vstack(vertices) def get_ellipse_eig_decomp(P): @@ -87,15 +88,15 @@ def get_ellipse_eig_decomp(P): evals, evecs = np.linalg.eig(P) major_axis_ind = np.argmin(evals) minor_axis_ind = 0 if major_axis_ind == 1 else 1 - major_eval = evals[major_axis_ind] minor_eval = evals[minor_axis_ind] major_evec = evecs[:, major_axis_ind] minor_evec = evecs[:, minor_axis_ind] - return minor_eval, major_eval, minor_evec, major_evec -def get_ellipse_angle_rep(P, rads=True): +def get_ellipse_angle_rep(P, + rads=True + ): """Gets the angle representation of the ellipse which is required for plotting. Args: @@ -110,16 +111,18 @@ def get_ellipse_angle_rep(P, rads=True): """ minor_eval, major_eval, minor_evec, major_evec = get_ellipse_eig_decomp(P) alpha = np.arctan2(major_evec[1], major_evec[0]) - major_axis_length = 1/np.sqrt(major_eval) minor_axis_length = 1/np.sqrt(minor_eval) - if rads: return minor_axis_length, major_axis_length, alpha else: return minor_axis_length, major_axis_length, alpha*180/np.pi -def add_2d_ellipse(position, cov, ax, legend=None): +def add_2d_ellipse(position, + cov, + ax, + legend=None + ): """Add a 2D Ellipse patch to an axis. Args: @@ -130,7 +133,6 @@ def add_2d_ellipse(position, cov, ax, legend=None): """ minor_axis_length, major_axis_length, alpha = get_ellipse_angle_rep(cov, rads=False) - if legend: ellipse = Ellipse(position, 2*major_axis_length, @@ -144,16 +146,16 @@ def add_2d_ellipse(position, cov, ax, legend=None): 2*minor_axis_length, angle=alpha, alpha=0.5) - - ax.add_artist(ellipse) -def pontryagin_difference_AABB(verts1, verts2): - """Computre verts1 (-) verts2 +def pontryagin_difference_AABB(verts1, + verts2 + ): + """Computre verts1 (-) verts2. 
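ellipse_bounding_box uses the fact that, for the set x^T P x <= 1, the extent along coordinate axis i is sqrt((P^{-1})_{ii}), while the semi-axis lengths are 1 / sqrt(eig(P)). A small numeric check on a diagonal P where both values are easy to read off:

    import numpy as np
    from itertools import product

    P = np.diag([4.0, 1.0])  # the ellipse 4*x1^2 + x2^2 <= 1

    # Axis-aligned extents: sqrt of the diagonal of P^{-1} -> |x1| <= 0.5, |x2| <= 1.0.
    extents = np.sqrt(np.diag(np.linalg.inv(P)))
    vertices = np.vstack(list(product(*[(e, -e) for e in extents])))
    print(vertices)              # the four corners of the bounding box

    # Semi-axis lengths from the eigenvalues of P.
    evals, _ = np.linalg.eig(P)
    print(1.0 / np.sqrt(evals))  # [0.5, 1.0]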
Args: verts1, verts2 (np.array): Array of vertices ( n vertices by space dimension) stacked vertically - so that every vertix is a horizonal array + so that every vertix is a horizonal array: vertsi = [v1, v2, ..., @@ -164,7 +166,7 @@ def pontryagin_difference_AABB(verts1, verts2): """ if verts1.shape[1] > 1: - # If greater than 2 dimensions + # If greater than 2 dimensions. poly1 = pt.Polytope(verts1) poly2 = pt.Polytope(verts2) diff = poly1 - poly2 @@ -174,7 +176,7 @@ def pontryagin_difference_AABB(verts1, verts2): const_func = partial(LinearConstraint, A=A, b=b) return verts, const_func else: - # If 1D data. Only handles closed compact sets + # If 1D data. Only handles closed compact sets. vert2_range = np.ptp(verts2) vert_min = np.min(verts1) + vert2_range/2 vert_max = np.max(verts1) - vert2_range/2 diff --git a/safe_control_gym/controllers/ppo/ppo.py b/safe_control_gym/controllers/ppo/ppo.py index 48872091e..8e166f465 100644 --- a/safe_control_gym/controllers/ppo/ppo.py +++ b/safe_control_gym/controllers/ppo/ppo.py @@ -1,36 +1,22 @@ """Proximal Policy Optimization (PPO) -Adapted from https://github.com/openai/spinningup/blob/master/spinup/algos/pytorch/ppo/ppo.py -Adapted hyperparameters from [here](https://github.com/DLR-RM/rl-baselines3-zoo/blob/master/hyperparams/ppo.yml) - -Reference papers & code: - * [Proximal Policy Optimization Algorithms](https://arxiv.org/pdf/1707.06347.pdf) - * [Implementation Matters in Deep Policy Gradients: A Case Study on PPO and TRPO](https://arxiv.org/pdf/2005.12729.pdf) - * [pytorch-a2c-ppo-acktr-gail](https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail) - * [openai spinning up - ppo](https://github.com/openai/spinningup/tree/master/spinup/algos/pytorch/ppo) - * [stable baselines3 - ppo](https://github.com/DLR-RM/stable-baselines3/tree/master/stable_baselines3/ppo) - -Example: - train on cartpole balance:: - - $ python mains/main.py --algo ppo --task cartpole --overrides ppo_cartpole.yaml - - test on cartpole balance:: - - $ python mains/main.py --func test --restore {...} - - train on quadrotor stabilization:: - - $ - -Todo: - * +Based on: + * https://github.com/openai/spinningup/blob/master/spinup/algos/pytorch/ppo/ppo.py + * (hyperparameters) https://github.com/DLR-RM/rl-baselines3-zoo/blob/master/hyperparams/ppo.yml + +Additional references: + * Proximal Policy Optimization Algorithms - https://arxiv.org/pdf/1707.06347.pdf + * Implementation Matters in Deep Policy Gradients: A Case Study on PPO and TRPO - https://arxiv.org/pdf/2005.12729.pdf + * pytorch-a2c-ppo-acktr-gail - https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail + * openai spinning up - ppo - https://github.com/openai/spinningup/tree/master/spinup/algos/pytorch/ppo + * stable baselines3 - ppo - https://github.com/DLR-RM/stable-baselines3/tree/master/stable_baselines3/ppo """ import os import time import numpy as np import torch + from collections import defaultdict from safe_control_gym.utils.logging import ExperimentLogger @@ -43,24 +29,32 @@ class PPO(BaseController): - """proximal policy optimization.""" - - def __init__(self, env_func, training=True, checkpoint_path="model_latest.pt", output_dir="temp", device="cpu", seed=0, **kwargs): + """Proximal policy optimization. + + """ + + def __init__(self, + env_func, + training=True, + checkpoint_path="model_latest.pt", + output_dir="temp", + device="cpu", + seed=0, + **kwargs + ): super().__init__(env_func, training, checkpoint_path, output_dir, device, seed, **kwargs) - - # task + # Task. 
if self.training: - # training (+ evaluation) + # Training and testing. self.env = make_vec_envs(env_func, None, self.rollout_batch_size, self.num_workers, seed) self.env = VecRecordEpisodeStatistics(self.env, self.deque_size) self.eval_env = env_func() self.eval_env = RecordEpisodeStatistics(self.eval_env, self.deque_size) else: - # testing only + # Testing only. self.env = env_func() self.env = RecordEpisodeStatistics(self.env) - - # agent + # Agent. self.agent = PPOAgent(self.env.observation_space, self.env.action_space, hidden_dim=self.hidden_dim, @@ -73,52 +67,55 @@ def __init__(self, env_func, training=True, checkpoint_path="model_latest.pt", o opt_epochs=self.opt_epochs, mini_batch_size=self.mini_batch_size) self.agent.to(device) - - # pre-/post-processing + # Pre-/post-processing. self.obs_normalizer = BaseNormalizer() if self.norm_obs: self.obs_normalizer = MeanStdNormalizer(shape=self.env.observation_space.shape, clip=self.clip_obs, epsilon=1e-8) - self.reward_normalizer = BaseNormalizer() if self.norm_reward: self.reward_normalizer = RewardStdNormalizer(gamma=self.gamma, clip=self.clip_reward, epsilon=1e-8) - - # logging + # Logging. if self.training: log_file_out = True use_tensorboard = self.tensorboard else: - # disable logging to texts and tfboard for evaluation + # Disable logging to file and tfboard for evaluation. log_file_out = False use_tensorboard = False self.logger = ExperimentLogger(output_dir, log_file_out=log_file_out, use_tensorboard=use_tensorboard) def reset(self): - """Do initializations for training or evaluation.""" + """Do initializations for training or evaluation. + + """ if self.training: # set up stats tracking self.env.add_tracker("constraint_violation", 0) self.eval_env.add_tracker("constraint_values", 0, mode="queue") - self.total_steps = 0 obs, _ = self.env.reset() self.obs = self.obs_normalizer(obs) else: - # add eposodic stats to be tracked + # Add episodic stats to be tracked. self.env.add_tracker("constraint_values", 0, mode="queue") def close(self): - """Shuts down and cleans up lingering resources.""" + """Shuts down and cleans up lingering resources. + + """ self.env.close() if self.training: self.eval_env.close() self.logger.close() - def save(self, path): - """Saves model params and experiment state to checkpoint path.""" + def save(self, + path + ): + """Saves model params and experiment state to checkpoint path. + + """ path_dir = os.path.dirname(path) os.makedirs(path_dir, exist_ok=True) - state_dict = { "agent": self.agent.state_dict(), "obs_normalizer": self.obs_normalizer.state_dict(), @@ -134,16 +131,18 @@ def save(self, path): state_dict.update(exp_state) torch.save(state_dict, path) - def load(self, path): - """Restores model and experiment given checkpoint path.""" - state = torch.load(path) + def load(self, + path + ): + """Restores model and experiment given checkpoint path. - # restore policy + """ + state = torch.load(path) + # Restore policy. self.agent.load_state_dict(state["agent"]) self.obs_normalizer.load_state_dict(state["obs_normalizer"]) self.reward_normalizer.load_state_dict(state["reward_normalizer"]) - - # restore experiment state + # Restore experiment state. 
if self.training: self.total_steps = state["total_steps"] self.obs = state["obs"] @@ -151,22 +150,25 @@ def load(self, path): self.env.set_env_random_state(state["env_random_state"]) self.logger.load(self.total_steps) - def learn(self, env=None, **kwargs): - """Performs learning (pre-training, training, fine-tuning, etc).""" + def learn(self, + env=None, + **kwargs + ): + """Performs learning (pre-training, training, fine-tuning, etc). + + """ while self.total_steps < self.max_env_steps: results = self.train_step() - - # checkpoint + # Checkpoint. if self.total_steps >= self.max_env_steps or (self.save_interval and self.total_steps % self.save_interval == 0): - # latest/final checkpoint + # Latest/final checkpoint. self.save(self.checkpoint_path) self.logger.info("Checkpoint | {}".format(self.checkpoint_path)) if self.num_checkpoints and self.total_steps % (self.max_env_steps // self.num_checkpoints) == 0: - # intermediate checkpoint + # Intermediate checkpoint. path = os.path.join(self.output_dir, "checkpoints", "model_{}.pt".format(self.total_steps)) self.save(path) - - # eval + # Evaluation. if self.eval_interval and self.total_steps % self.eval_interval == 0: eval_results = self.run(env=self.eval_env, n_episodes=self.eval_batch_size) results["eval"] = eval_results @@ -174,19 +176,26 @@ def learn(self, env=None, **kwargs): eval_results["ep_lengths"].std(), eval_results["ep_returns"].mean(), eval_results["ep_returns"].std())) - # save best model + # Save best model. eval_score = eval_results["ep_returns"].mean() eval_best_score = getattr(self, "eval_best_score", -np.infty) if self.eval_save_best and eval_best_score < eval_score: self.eval_best_score = eval_score self.save(os.path.join(self.output_dir, "model_best.pt")) - - # logging + # Logging. if self.log_interval and self.total_steps % self.log_interval == 0: self.log_step(results) - def run(self, env=None, render=False, n_episodes=10, verbose=False, **kwargs): - """Runs evaluation with current policy.""" + def run(self, + env=None, + render=False, + n_episodes=10, + verbose=False, + **kwargs + ): + """Runs evaluation with current policy. + + """ self.agent.eval() self.obs_normalizer.set_read_only() if env is None: @@ -194,66 +203,59 @@ def run(self, env=None, render=False, n_episodes=10, verbose=False, **kwargs): else: if not is_wrapped(env, RecordEpisodeStatistics): env = RecordEpisodeStatistics(env, n_episodes) - # add eposodic stats to be tracked + # Add eposodic stats to be tracked. env.add_tracker("constraint_values", 0, mode="queue") - obs, info = env.reset() obs = self.obs_normalizer(obs) ep_returns, ep_lengths = [], [] frames = [] - while len(ep_returns) < n_episodes: with torch.no_grad(): obs = torch.FloatTensor(obs).to(self.device) action = self.agent.ac.act(obs) - obs, reward, done, info = env.step(action) if render: env.render() frames.append(env.render("rgb_array")) if verbose: print("obs {} | act {}".format(obs, action)) - if done: assert "episode" in info ep_returns.append(info["episode"]["r"]) ep_lengths.append(info["episode"]["l"]) obs, _ = env.reset() obs = self.obs_normalizer(obs) - - # collect evaluation results + # Collect evaluation results. ep_lengths = np.asarray(ep_lengths) ep_returns = np.asarray(ep_returns) eval_results = {"ep_returns": ep_returns, "ep_lengths": ep_lengths} if len(frames) > 0: eval_results["frames"] = frames - # other episodic stats from evaluation env + # Other episodic stats from evaluation env. 
if len(env.queued_stats) > 0: queued_stats = {k: np.asarray(v) for k, v in env.queued_stats.items()} eval_results.update(queued_stats) return eval_results def train_step(self): - """Performs a training/fine-tuning step.""" + """Performs a training/fine-tuning step. + + """ self.agent.train() self.obs_normalizer.unset_read_only() rollouts = PPOBuffer(self.env.observation_space, self.env.action_space, self.rollout_steps, self.rollout_batch_size) obs = self.obs start = time.time() - for step in range(self.rollout_steps): with torch.no_grad(): act, v, logp = self.agent.ac.step(torch.FloatTensor(obs).to(self.device)) next_obs, rew, done, info = self.env.step(act) - next_obs = self.obs_normalizer(next_obs) rew = self.reward_normalizer(rew, done) mask = 1 - done.astype(float) - - # time truncation is not true termination + # Time truncation is not the same as true termination. terminal_v = np.zeros_like(v) for idx, inf in enumerate(info["n"]): - # if "TimeLimit.truncated" in inf and inf["TimeLimit.truncated"]: if "terminal_info" not in inf: continue inff = inf["terminal_info"] @@ -262,14 +264,11 @@ def train_step(self): terminal_obs_tensor = torch.FloatTensor(terminal_obs).unsqueeze(0).to(self.device) terminal_val = self.agent.ac.critic(terminal_obs_tensor).squeeze().detach().numpy() terminal_v[idx] = terminal_val - rollouts.push({"obs": obs, "act": act, "rew": rew, "mask": mask, "v": v, "logp": logp, "terminal_v": terminal_v}) obs = next_obs - self.obs = obs self.total_steps += self.rollout_batch_size * self.rollout_steps - - # learn from rollout batch + # Learn from rollout batch. last_val = self.agent.ac.critic(torch.FloatTensor(obs).to(self.device)).detach().numpy() ret, adv = compute_returns_and_advantages(rollouts.rew, rollouts.v, @@ -280,15 +279,18 @@ def train_step(self): use_gae=self.use_gae, gae_lambda=self.gae_lambda) rollouts.ret = ret - # prevent divide-by-0 for repetitive tasks + # Prevent divide-by-0 for repetitive tasks. rollouts.adv = (adv - adv.mean()) / (adv.std() + 1e-6) - results = self.agent.update(rollouts, self.device) results.update({"step": self.total_steps, "elapsed_time": time.time() - start}) return results - def log_step(self, results): - """Does logging after a training step.""" + def log_step(self, + results + ): + """Does logging after a training step. + + """ step = results["step"] # runner stats self.logger.add_scalars( @@ -301,11 +303,9 @@ def log_step(self, results): prefix="time", write=False, write_tb=False) - - # learning stats + # Learning stats. self.logger.add_scalars({k: results[k] for k in ["policy_loss", "value_loss", "entropy_loss", "approx_kl"]}, step, prefix="loss") - - # performance stats + # Performance stats. ep_lengths = np.asarray(self.env.length_queue) ep_returns = np.asarray(self.env.return_queue) self.logger.add_scalars( @@ -316,11 +316,9 @@ def log_step(self, results): }, step, prefix="stat") - - # total constraint violation during learning + # Total constraint violation during learning. total_violations = self.env.accumulated_stats["constraint_violation"] self.logger.add_scalars({"constraint_violation": total_violations}, step, prefix="stat") - if "eval" in results: eval_ep_lengths = results["eval"]["ep_lengths"] eval_ep_returns = results["eval"]["ep_returns"] @@ -332,21 +330,19 @@ def log_step(self, results): }, step, prefix="stat_eval") - - # each episodic constraint + # Each episodic constraint. 
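Aside: a small numeric sketch of the time-truncation bootstrap used in train_step() above. When an episode is cut by the time limit rather than by a true failure, gamma * V(terminal_obs) is folded into the reward at the cutoff so the discounted return does not treat the truncation as zero future reward; all numbers below are made up.

import numpy as np

gamma = 0.99
rews = np.array([1.0, 1.0, 1.0])   # rewards of a 3-step episode cut by the time limit (made up)
terminal_value = 50.0              # hypothetical critic estimate V(terminal_obs)

# Fold the bootstrap into the last reward, as done per-step via terminal_v above.
rews_bootstrapped = rews.copy()
rews_bootstrapped[-1] += gamma * terminal_value

def discounted_return(r, gamma):
    return sum(gamma ** t * r_t for t, r_t in enumerate(r))

print(discounted_return(rews, gamma))               # pretends no reward exists past the cutoff
print(discounted_return(rews_bootstrapped, gamma))  # accounts for the value beyond the cutoff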
if "constraint_values" in results["eval"]: con_dict = {} con_vals = results["eval"]["constraint_values"] if len(con_vals.shape) == 2: - # multiple constraints + # Multiple constraints. for i in range(con_vals.shape[-1]): con_name = "constraint_value_{}".format(i) con_dict[con_name] = con_vals[:, i].mean() else: - # single constraint + # Single constraint. con_name = "constraint_value_0" con_dict[con_name] = con_vals.mean() self.logger.add_scalars(con_dict, step, prefix="stat_eval") - - # print summary table - self.logger.dump_scalars() \ No newline at end of file + # Print summary table + self.logger.dump_scalars() diff --git a/safe_control_gym/controllers/ppo/ppo.yaml b/safe_control_gym/controllers/ppo/ppo.yaml index 277170a3e..a5567a791 100644 --- a/safe_control_gym/controllers/ppo/ppo.yaml +++ b/safe_control_gym/controllers/ppo/ppo.yaml @@ -1,11 +1,11 @@ -# model args +# Model args hidden_dim: 64 norm_obs: False norm_reward: False clip_obs: 10. clip_reward: 10. -# loss args +# Loss args gamma: 0.99 use_gae: False gae_lambda: 0.95 @@ -14,14 +14,14 @@ clip_param: 0.2 target_kl: 0.01 entropy_coef: 0.01 -# optim args +# Optim args opt_epochs: 10 mini_batch_size: 64 actor_lr: 0.0003 critic_lr: 0.001 max_grad_norm: 0.5 -# runner args +# Runner args max_env_steps: 1000000 num_workers: 1 rollout_batch_size: 4 @@ -29,7 +29,7 @@ rollout_steps: 100 deque_size: 10 eval_batch_size: 10 -# misc +# Misc log_interval: 0 save_interval: 0 num_checkpoints: 0 diff --git a/safe_control_gym/controllers/ppo/ppo_utils.py b/safe_control_gym/controllers/ppo/ppo_utils.py index 086a8cfaa..baeb25d90 100644 --- a/safe_control_gym/controllers/ppo/ppo_utils.py +++ b/safe_control_gym/controllers/ppo/ppo_utils.py @@ -1,20 +1,22 @@ -from collections import defaultdict -from copy import deepcopy +"""PPO utilities. + +""" import numpy as np import torch import torch.nn as nn + from gym.spaces import Box +from collections import defaultdict +from copy import deepcopy from safe_control_gym.math_and_models.neural_networks import MLP, CNN, RNN, init_ from safe_control_gym.math_and_models.distributions import Normal, Categorical -# ----------------------------------------------------------------------------------- -# Agent -# ----------------------------------------------------------------------------------- - class PPOAgent: - """A PPO class that encapsulates model, optimizer and update functions.""" + """A PPO class that encapsulates models, optimizers and update functions. + + """ def __init__(self, obs_space, @@ -28,83 +30,94 @@ def __init__(self, critic_lr=0.001, opt_epochs=10, mini_batch_size=64, - **kwargs): - # params + **kwargs + ): + # Parameters. self.obs_space = obs_space self.act_space = act_space - self.use_clipped_value = use_clipped_value self.clip_param = clip_param self.target_kl = target_kl self.entropy_coef = entropy_coef - self.opt_epochs = opt_epochs self.mini_batch_size = mini_batch_size - - # model + # Model. self.ac = MLPActorCritic(obs_space, act_space, hidden_dims=[hidden_dim] * 2, activation="tanh") - - # optimizers + # Optimizers. self.actor_opt = torch.optim.Adam(self.ac.actor.parameters(), actor_lr) - self.critic_opt = torch.optim.Adam(self.ac.critic.parameters(), - critic_lr) + self.critic_opt = torch.optim.Adam(self.ac.critic.parameters(), critic_lr) - def to(self, device): - """Puts agent to device.""" + def to(self, + device + ): + """Puts agent to device. + + """ self.ac.to(device) def train(self): - """Sets training mode.""" + """Sets training mode. 
+ + """ self.ac.train() def eval(self): - """Sets evaluation mode.""" + """Sets evaluation mode. + + """ self.ac.eval() def state_dict(self): - """Snapshots agent state.""" + """Snapshots agent state. + + """ return { "ac": self.ac.state_dict(), "actor_opt": self.actor_opt.state_dict(), "critic_opt": self.critic_opt.state_dict() } - def load_state_dict(self, state_dict): - """Restores agent state.""" + def load_state_dict(self, + state_dict + ): + """Restores agent state. + + """ self.ac.load_state_dict(state_dict["ac"]) self.actor_opt.load_state_dict(state_dict["actor_opt"]) self.critic_opt.load_state_dict(state_dict["critic_opt"]) - def compute_policy_loss(self, batch): - """Returns policy loss(es) given batch of data.""" - obs, act, logp_old, adv = batch["obs"], batch["act"], batch[ - "logp"], batch["adv"] - dist, logp = self.ac.actor(obs, act) + def compute_policy_loss(self, + batch + ): + """Returns policy loss(es) given batch of data. - # policy + """ + obs, act, logp_old, adv = batch["obs"], batch["act"], batch["logp"], batch["adv"] + dist, logp = self.ac.actor(obs, act) + # Policy. ratio = torch.exp(logp - logp_old) - clip_adv = torch.clamp(ratio, 1 - self.clip_param, - 1 + self.clip_param) * adv + clip_adv = torch.clamp(ratio, 1 - self.clip_param, 1 + self.clip_param) * adv policy_loss = -torch.min(ratio * adv, clip_adv).mean() - - # entropy + # Entropy. entropy_loss = -dist.entropy().mean() - - # kl/trust region + # KL/trust region. approx_kl = (logp_old - logp).mean() return policy_loss, entropy_loss, approx_kl - def compute_value_loss(self, batch): - """Returns value loss(es) given batch of data.""" + def compute_value_loss(self, + batch + ): + """Returns value loss(es) given batch of data. + + """ obs, ret, v_old = batch["obs"], batch["ret"], batch["v"] v_cur = self.ac.critic(obs) - if self.use_clipped_value: - v_old_clipped = v_old + (v_cur - v_old).clamp( - -self.clip_param, self.clip_param) + v_old_clipped = v_old + (v_cur - v_old).clamp(-self.clip_param, self.clip_param) v_loss = (v_cur - ret).pow(2) v_loss_clipped = (v_old_clipped - ret).pow(2) value_loss = 0.5 * torch.max(v_loss, v_loss_clipped).mean() @@ -112,62 +125,60 @@ def compute_value_loss(self, batch): value_loss = 0.5 * (v_cur - ret).pow(2).mean() return value_loss - def update(self, rollouts, device="cpu"): - """Updates model parameters based on current training batch.""" + def update(self, + rollouts, + device="cpu" + ): + """Updates model parameters based on current training batch. + + """ results = defaultdict(list) num_mini_batch = rollouts.max_length * rollouts.batch_size // self.mini_batch_size - for i in range(self.opt_epochs): p_loss_epoch, v_loss_epoch, e_loss_epoch, kl_epoch = 0, 0, 0, 0 - for batch in rollouts.sampler(self.mini_batch_size, device): - # actor update - policy_loss, entropy_loss, approx_kl = self.compute_policy_loss( - batch) - # update only when no kl constraint or constraint is satisfied - if (not self.target_kl > 0) or (self.target_kl > 0 and approx_kl - <= 1.5 * self.target_kl): + # Actor update. + policy_loss, entropy_loss, approx_kl = self.compute_policy_loss(batch) + # Update only when no KL constraint or constraint is satisfied. + if (not self.target_kl > 0) or (self.target_kl > 0 and approx_kl <= 1.5 * self.target_kl): self.actor_opt.zero_grad() (policy_loss + self.entropy_coef * entropy_loss).backward() self.actor_opt.step() - - # critic update + # Critic update. 
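Aside: a minimal PyTorch sketch of the clipped surrogate objective and the approximate KL used in compute_policy_loss() and update() above, on made-up log-probabilities and advantages.

import torch

clip_param = 0.2
logp_old = torch.tensor([-1.0, -0.5, -2.0])   # made-up old log-probabilities
logp = torch.tensor([-0.7, -0.9, -1.5])       # made-up new log-probabilities
adv = torch.tensor([1.0, -0.5, 2.0])          # made-up advantages

# Clipped surrogate: pessimistic minimum of the unclipped and clipped objectives.
ratio = torch.exp(logp - logp_old)
clip_adv = torch.clamp(ratio, 1 - clip_param, 1 + clip_param) * adv
policy_loss = -torch.min(ratio * adv, clip_adv).mean()

# Approximate KL used above to skip actor updates once approx_kl > 1.5 * target_kl.
approx_kl = (logp_old - logp).mean()
print(policy_loss.item(), approx_kl.item())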
value_loss = self.compute_value_loss(batch) self.critic_opt.zero_grad() value_loss.backward() self.critic_opt.step() - p_loss_epoch += policy_loss.item() v_loss_epoch += value_loss.item() e_loss_epoch += entropy_loss.item() kl_epoch += approx_kl.item() - results["policy_loss"].append(p_loss_epoch / num_mini_batch) results["value_loss"].append(v_loss_epoch / num_mini_batch) results["entropy_loss"].append(e_loss_epoch / num_mini_batch) results["approx_kl"].append(kl_epoch / num_mini_batch) - results = {k: sum(v) / len(v) for k, v in results.items()} return results -# ----------------------------------------------------------------------------------- -# Models -# ----------------------------------------------------------------------------------- - - class MLPActor(nn.Module): + """Actor MLP model. + + """ def __init__(self, obs_dim, act_dim, hidden_dims, activation, - discrete=False): + discrete=False + ): + """ + + """ super().__init__() self.pi_net = MLP(obs_dim, act_dim, hidden_dims, activation) - - # construct output action distribution + # Construct output action distribution. self.discrete = discrete if discrete: self.dist_fn = lambda x: Categorical(logits=x) @@ -175,7 +186,13 @@ def __init__(self, self.logstd = nn.Parameter(-0.5 * torch.ones(act_dim)) self.dist_fn = lambda x: Normal(x, self.logstd.exp()) - def forward(self, obs, act=None): + def forward(self, + obs, + act=None + ): + """ + + """ dist = self.dist_fn(self.pi_net(obs)) logp_a = None if act is not None: @@ -184,12 +201,27 @@ def forward(self, obs, act=None): class MLPCritic(nn.Module): + """Critic MLP model. + + """ + + def __init__(self, + obs_dim, + hidden_dims, + activation + ): + """ - def __init__(self, obs_dim, hidden_dims, activation): + """ super().__init__() self.v_net = MLP(obs_dim, 1, hidden_dims, activation) - def forward(self, obs): + def forward(self, + obs + ): + """ + + """ return self.v_net(obs) @@ -197,17 +229,21 @@ class MLPActorCritic(nn.Module): """Model for the actor-critic agent. Attributes: - actor (MLPActor): policy network. - critic (MLPCritic): value network. + actor (MLPActor): policy network. + critic (MLPCritic): value network. + """ def __init__(self, obs_space, act_space, hidden_dims=(64, 64), - activation="tanh"): - super().__init__() + activation="tanh" + ): + """ + """ + super().__init__() obs_dim = obs_space.shape[0] if isinstance(act_space, Box): act_dim = act_space.shape[0] @@ -215,31 +251,34 @@ def __init__(self, else: act_dim = act_space.n discrete = True - - # policy - self.actor = MLPActor(obs_dim, act_dim, hidden_dims, activation, - discrete) - # value function + # Policy. + self.actor = MLPActor(obs_dim, act_dim, hidden_dims, activation, discrete) + # Value function. self.critic = MLPCritic(obs_dim, hidden_dims, activation) - def step(self, obs): + def step(self, + obs + ): + """ + + """ dist, _ = self.actor(obs) a = dist.sample() logp_a = dist.log_prob(a) v = self.critic(obs) return a.numpy(), v.numpy(), logp_a.numpy() - def act(self, obs): + def act(self, + obs + ): + """ + + """ dist, _ = self.actor(obs) a = dist.mode() return a.cpu().numpy() -# ----------------------------------------------------------------------------------- -# Storage -# ----------------------------------------------------------------------------------- - - class PPOBuffer(object): """Storage for a batch of episodes during training. @@ -248,20 +287,24 @@ class PPOBuffer(object): batch_size (int): number of episodes per batch. scheme (dict): describs shape & other info of data to be stored. 
keys (list): names of all data from scheme. + """ - def __init__(self, obs_space, act_space, max_length, batch_size): + def __init__(self, + obs_space, + act_space, + max_length, + batch_size + ): super().__init__() self.max_length = max_length self.batch_size = batch_size - T, N = max_length, batch_size obs_dim = obs_space.shape if isinstance(act_space, Box): act_dim = act_space.shape[0] else: act_dim = act_space.n - self.scheme = { "obs": { "vshape": (T, N, *obs_dim) @@ -296,30 +339,37 @@ def __init__(self, obs_space, act_space, max_length, batch_size): self.reset() def reset(self): - """Allocates space for containers.""" + """Allocates space for containers. + + """ for k, info in self.scheme.items(): - assert "vshape" in info, "Scheme must define vshape for {}".format( - k) + assert "vshape" in info, "Scheme must define vshape for {}".format(k) vshape = info["vshape"] dtype = info.get("dtype", np.float32) init = info.get("init", np.zeros) self.__dict__[k] = init(vshape, dtype=dtype) - self.t = 0 - def push(self, batch): - """Inserts transition step data (as dict) to storage.""" + def push(self, + batch + ): + """Inserts transition step data (as dict) to storage. + + """ for k, v in batch.items(): assert k in self.keys shape = self.scheme[k]["vshape"][1:] dtype = self.scheme[k].get("dtype", np.float32) v_ = np.asarray(deepcopy(v), dtype=dtype).reshape(shape) self.__dict__[k][self.t] = v_ - self.t = (self.t + 1) % self.max_length - def get(self, device="cpu"): - """Returns all data.""" + def get(self, + device="cpu" + ): + """Returns all data. + + """ batch = {} for k, info in self.scheme.items(): shape = info["vshape"][2:] @@ -327,20 +377,28 @@ def get(self, device="cpu"): batch[k] = torch.as_tensor(data, device=device) return batch - def sample(self, indices): - """Returns partial data.""" + def sample(self, + indices + ): + """Returns partial data. + + """ batch = {} for k, info in self.scheme.items(): shape = info["vshape"][2:] batch[k] = self.__dict__[k].reshape(-1, *shape)[indices] return batch - def sampler(self, mini_batch_size, device="cpu", drop_last=True): - """Makes sampler to loop through all data.""" - total_steps = self.max_length * self.batch_size - sampler = random_sample(np.arange(total_steps), mini_batch_size, - drop_last) + def sampler(self, + mini_batch_size, + device="cpu", + drop_last=True + ): + """Makes sampler to loop through all data. + """ + total_steps = self.max_length * self.batch_size + sampler = random_sample(np.arange(total_steps), mini_batch_size, drop_last) for indices in sampler: batch = self.sample(indices) batch = { @@ -349,13 +407,13 @@ def sampler(self, mini_batch_size, device="cpu", drop_last=True): yield batch -# ----------------------------------------------------------------------------------- -# Misc -# ----------------------------------------------------------------------------------- - +def random_sample(indices, + batch_size, + drop_last=True + ): + """Returns index batches to iterate over. -def random_sample(indices, batch_size, drop_last=True): - """Returns index batches to iterave over""" + """ indices = np.asarray(np.random.permutation(indices)) batches = indices[:len(indices) // batch_size * batch_size].reshape( -1, batch_size) @@ -374,17 +432,18 @@ def compute_returns_and_advantages(rews, last_val=0, gamma=0.99, use_gae=False, - gae_lambda=0.95): - """Useful for policy-gradient algos.""" + gae_lambda=0.95 + ): + """Useful for policy-gradient algorithms. 
+ + """ T, N = rews.shape[:2] rets, advs = np.zeros((T, N, 1)), np.zeros((T, N, 1)) ret, adv = last_val, np.zeros((N, 1)) vals = np.concatenate([vals, last_val[np.newaxis, ...]], 0) - - # compensate for time truncation + # Compensate for time truncation. rews += gamma * terminal_vals - - # cumulative discounted sums + # Cumulative discounted sums. for i in reversed(range(T)): ret = rews[i] + gamma * masks[i] * ret if not use_gae: @@ -394,5 +453,4 @@ def compute_returns_and_advantages(rews, adv = adv * gae_lambda * gamma * masks[i] + td_error rets[i] = deepcopy(ret) advs[i] = deepcopy(adv) - return rets, advs diff --git a/safe_control_gym/controllers/safe_explorer/safe_explorer_utils.py b/safe_control_gym/controllers/safe_explorer/safe_explorer_utils.py index 26d5f2356..8c480f9e2 100644 --- a/safe_control_gym/controllers/safe_explorer/safe_explorer_utils.py +++ b/safe_control_gym/controllers/safe_explorer/safe_explorer_utils.py @@ -1,23 +1,25 @@ -from gym.spaces import Box -from collections import defaultdict -from copy import deepcopy +"""Utility function for a generic safe explorer. + +""" import numpy as np import torch import torch.nn as nn import torch.nn.functional as F +from gym.spaces import Box +from collections import defaultdict +from copy import deepcopy + from safe_control_gym.envs.env_wrappers.vectorized_env.vec_env_utils import _flatten_obs from safe_control_gym.math_and_models.neural_networks import MLP, CNN, RNN, init_ from safe_control_gym.math_and_models.distributions import Normal, Categorical from safe_control_gym.controllers.ppo.ppo_utils import random_sample -# ----------------------------------------------------------------------------------- -# Safety Layer -# ----------------------------------------------------------------------------------- - class SafetyLayer: - """Layer to learn constraint models and to impose action projection.""" + """Layer to learn constraint models and to impose action projection. + + """ def __init__(self, obs_space, @@ -27,61 +29,76 @@ def __init__(self, lr=0.001, slack=None, **kwargs): - # params + # Parameters. self.num_constraints = num_constraints - - # seperate model per constraint + # Seperate model per constraint. input_dim = obs_space.shape[0] output_dim = act_space.shape[0] self.constraint_models = nn.ModuleList([ MLP(input_dim, output_dim, hidden_dims=[hidden_dim]) for _ in range(self.num_constraints) ]) - # constraint slack variables/values + # Constraint slack variables/values. assert slack is not None and isinstance(slack, (int, float, list)) if isinstance(slack, (int, float)): slack = [slack] self.slack = np.array(slack) - - # optimizers + # Optimizers. self.optimizers = [ torch.optim.Adam(model.parameters(), lr=lr) for model in self.constraint_models ] - def to(self, device): - """Puts agent to device.""" + def to(self, + device + ): + """Puts agent to device. + + """ self.constraint_models.to(device) def train(self): - """Sets training mode.""" + """Sets training mode. + + """ self.constraint_models.train() def eval(self): - """Sets evaluation mode.""" + """Sets evaluation mode. + + """ self.constraint_models.eval() def state_dict(self): - """Snapshots agent state.""" + """Snapshots agent state. + + """ return { "constraint_models": self.constraint_models.state_dict(), "optimizers": [opt.state_dict() for opt in self.optimizers] } - def load_state_dict(self, state_dict): - """Restores agent state.""" + def load_state_dict(self, + state_dict + ): + """Restores agent state. 
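Aside: a single-environment numpy sketch of the return/advantage recursion in compute_returns_and_advantages() above, with made-up rewards, values and masks; with use_gae the advantage follows A_t = delta_t + gamma*lambda*mask_t*A_{t+1}, where delta_t = r_t + gamma*mask_t*V_{t+1} - V_t.

import numpy as np

gamma, gae_lambda = 0.99, 0.95
rews = np.array([0.0, 1.0, 0.0, 1.0])    # made-up rewards
vals = np.array([0.5, 0.6, 0.7, 0.8])    # made-up value predictions V(s_t)
masks = np.array([1.0, 1.0, 1.0, 0.0])   # last step truly terminates the episode
last_val = 0.9                           # critic value at the final observation (masked out here)

T = len(rews)
vals_ext = np.append(vals, last_val)
rets, advs = np.zeros(T), np.zeros(T)
ret, adv = last_val, 0.0
for t in reversed(range(T)):
    ret = rews[t] + gamma * masks[t] * ret
    td_error = rews[t] + gamma * masks[t] * vals_ext[t + 1] - vals_ext[t]
    adv = td_error + gamma * gae_lambda * masks[t] * adv
    rets[t], advs[t] = ret, adv
print(rets, advs)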
+ + """ self.constraint_models.load_state_dict(state_dict["constraint_models"]) for i, opt_state_dict in enumerate(state_dict["optimizers"]): self.optimizers[i].load_state_dict(opt_state_dict) - def compute_loss(self, batch): - """Gets constraint value L2 loss for each constraint.""" + def compute_loss(self, + batch + ): + """Gets constraint value L2 loss for each constraint. + + """ obs, act = batch["obs"], batch["act"] c, c_next = batch["c"], batch["c_next"] gs = [model(obs) for model in self.constraint_models] - - # each is (N,1,A) x (N,A,1) -> (N,), so [(N,)]_{n_constriants} + # Each is (N,1,A) x (N,A,1) -> (N,), so [(N,)]_{n_constriants} c_next_pred = [ c[:, i] + torch.bmm(g.view(g.shape[0], 1, -1), act.view(act.shape[0], -1, 1)).view(-1) @@ -94,26 +111,30 @@ def compute_loss(self, batch): return losses def update(self, batch): - """Updates the constraint models from data batch.""" - losses = self.compute_loss(batch) + """Updates the constraint models from data batch. + """ + losses = self.compute_loss(batch) for loss, opt in zip(losses, self.optimizers): opt.zero_grad() loss.backward() opt.step() - - resutls = { + results = { "constraint_{}_loss".format(i): loss.item() for i, loss in enumerate(losses) } - return resutls + return results - def get_safe_action(self, obs, act, c): + def get_safe_action(self, + obs, + act, + c + ): """Does action projection with the trained safety layer. - According to the paper, this simple projection works when only 1 constraint - is active at a time; for multiple active constriants, either consult in-graph - QP solver like in OptLayer or consider cvxpylayers (https://github.com/cvxgrp/cvxpylayers) + According to Dalal 2018, this simple projection works when only 1 constraint at a time + is active; for multiple active constriants, either resort to in-graph QP solver such as + OptLayer or see cvxpylayers (https://github.com/cvxgrp/cvxpylayers). Args: obs (torch.FloatTensor): observations, shape (B,O). @@ -127,7 +148,6 @@ def get_safe_action(self, obs, act, c): self.eval() # [(B,A)]_C g = [model(obs) for model in self.constraint_models] - # Find the lagrange multipliers [(B,)]_C multipliers = [] for i in range(len(g)): @@ -138,12 +158,11 @@ def get_safe_action(self, obs, act, c): act.unsqueeze(2)).view(-1) + c_i + self.slack[i] denomin = torch.bmm(g_i.unsqueeze(1), g_i.unsqueeze(2)).view(-1) + 1e-8 - # formula (5) from paper + # Equation (5) from Dalal 2018. mult = F.relu(numer / denomin) # (B,) multipliers.append(mult) multipliers = torch.stack(multipliers, -1) # (B,C) - - # Calculate correction, formula (6) from paper + # Calculate correction, equation (6) from Dalal 2018. max_mult, max_idx = torch.topk(multipliers, 1, dim=-1) # (B,1) max_idx = max_idx.view(-1).tolist() # []_B # [(A,)]_B -> (B,A) @@ -154,11 +173,6 @@ def get_safe_action(self, obs, act, c): return action_new -# ----------------------------------------------------------------------------------- -# Storage -# ----------------------------------------------------------------------------------- - - class ConstraintBuffer(object): """Storage for replay buffer during training. @@ -167,6 +181,7 @@ class ConstraintBuffer(object): batch_size (int): number of samples (steps) per batch. scheme (dict): describs shape & other info of data to be stored. keys (list): names of all data from scheme. 
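Aside: a minimal sketch of the closed-form projection in get_safe_action() above (equations (5) and (6) of Dalal et al. 2018) for a single linearized constraint c_next ~ c + g^T a, using made-up numbers for g, a, c and slack.

import numpy as np

g = np.array([0.8, -0.3])   # made-up learned constraint gradient w.r.t. the action
a = np.array([0.5, 0.2])    # made-up unconstrained policy action
c, slack = 0.1, 0.05        # made-up current constraint value and slack margin

# Equation (5): Lagrange multiplier of the single active constraint.
multiplier = max(0.0, (g @ a + c + slack) / (g @ g + 1e-8))
# Equation (6): correct the action along the constraint gradient.
a_safe = a - multiplier * g

# The predicted next constraint value c + g^T a_safe is pushed down to -slack here.
print(c + g @ a_safe, a_safe)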
+ """ def __init__(self, @@ -174,17 +189,16 @@ def __init__(self, act_space, num_constraints, max_size, - batch_size=None): + batch_size=None + ): super().__init__() self.max_size = max_size self.batch_size = batch_size - obs_dim = obs_space.shape if isinstance(act_space, Box): act_dim = act_space.shape[0] else: act_dim = act_space.n - N = max_size self.scheme = { "obs": { @@ -204,7 +218,9 @@ def __init__(self, self.reset() def reset(self): - """Allocate space for containers.""" + """Allocate space for containers. + + """ for k, info in self.scheme.items(): assert "vshape" in info, "Scheme must define vshape for {}".format( k) @@ -212,16 +228,19 @@ def reset(self): dtype = info.get("dtype", np.float32) init = info.get("init", np.zeros) self.__dict__[k] = init(vshape, dtype=dtype) - self.pos = 0 self.buffer_size = 0 def __len__(self): - """Returns current size of the buffer.""" + """Returns current size of the buffer. + + """ return self.buffer_size def state_dict(self): - """Returns a snapshot of current buffer.""" + """Returns a snapshot of current buffer. + + """ state = dict( pos=self.pos, buffer_size=self.buffer_size, @@ -231,47 +250,61 @@ def state_dict(self): state[k] = v return state - def load_state_dict(self, state): - """Restores buffer from previous state.""" + def load_state_dict(self, + state + ): + """Restores buffer from previous state. + + """ for k, v in state.items(): self.__dict__[k] = v - def push(self, batch): - """Inserts transition step data (as dict) to storage.""" - # batch size + def push(self, + batch + ): + """Inserts transition step data (as dict) to storage. + + """ + # Batch size. k = list(batch.keys())[0] n = batch[k].shape[0] - for k, v in batch.items(): shape = self.scheme[k]["vshape"][1:] dtype = self.scheme[k].get("dtype", np.float32) v_ = np.asarray(v, dtype=dtype).reshape((n,) + shape) - if self.pos + n <= self.max_size: self.__dict__[k][self.pos:self.pos + n] = v_ else: - # wrap around + # Wrap. remain_n = self.pos + n - self.max_size self.__dict__[k][self.pos:self.max_size] = v_[:-remain_n] self.__dict__[k][:remain_n] = v_[-remain_n:] - self.pos = (self.pos + n) % self.max_size if self.buffer_size < self.max_size: self.buffer_size = min(self.max_size, self.pos + n) - def sample(self, indices): - """Returns partial data.""" + def sample(self, + indices + ): + """Returns partial data. + + """ batch = {} for k, info in self.scheme.items(): shape = info["vshape"][1:] batch[k] = self.__dict__[k].reshape(-1, *shape)[indices] return batch - def sampler(self, batch_size, device="cpu", drop_last=True): - """Makes sampler to loop through all data.""" + def sampler(self, + batch_size, + device="cpu", + drop_last=True + ): + """Makes sampler to loop through all data. + + """ total_steps = len(self) sampler = random_sample(np.arange(total_steps), batch_size, drop_last) - for indices in sampler: batch = self.sample(indices) batch = { diff --git a/safe_control_gym/controllers/safe_explorer/safe_ppo.py b/safe_control_gym/controllers/safe_explorer/safe_ppo.py index 3d728bf04..011644958 100644 --- a/safe_control_gym/controllers/safe_explorer/safe_ppo.py +++ b/safe_control_gym/controllers/safe_explorer/safe_ppo.py @@ -1,9 +1,11 @@ -"""[summary] +"""PPO-based safe explorer. 
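Aside: a small sketch of the wrap-around insertion in ConstraintBuffer.push() above, with made-up sizes; when an incoming batch does not fit before the end of the storage, its tail wraps to the front and the write position advances modulo max_size.

import numpy as np

max_size, obs_dim = 5, 1
storage = np.zeros((max_size, obs_dim))
pos = 3                                                  # current write position (made up)
batch = np.arange(4, dtype=float).reshape(4, obs_dim)    # 4 new samples, only 2 slots left

n = batch.shape[0]
if pos + n <= max_size:
    storage[pos:pos + n] = batch
else:
    remain_n = pos + n - max_size
    storage[pos:max_size] = batch[:-remain_n]   # fill up to the end of the storage
    storage[:remain_n] = batch[-remain_n:]      # wrap the tail to the front
pos = (pos + n) % max_size

print(storage.ravel(), pos)   # [2. 3. 0. 0. 1.] and pos == 2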
+ """ import os import time import numpy as np import torch + from collections import defaultdict from safe_control_gym.utils.logging import ExperimentLogger @@ -12,7 +14,6 @@ from safe_control_gym.envs.env_wrappers.vectorized_env.vec_env_utils import _flatten_obs from safe_control_gym.envs.env_wrappers.record_episode_statistics import RecordEpisodeStatistics, VecRecordEpisodeStatistics from safe_control_gym.math_and_models.normalization import BaseNormalizer, MeanStdNormalizer, RewardStdNormalizer - from safe_control_gym.controllers.base_controller import BaseController from safe_control_gym.controllers.ppo.ppo_utils import compute_returns_and_advantages from safe_control_gym.controllers.safe_explorer.safe_explorer_utils import SafetyLayer, ConstraintBuffer @@ -20,26 +21,34 @@ class SafeExplorerPPO(BaseController): - """Safety layer for constraint satisfaction for RL.""" - - def __init__(self, env_func, training=True, checkpoint_path="model_latest.pt", output_dir="temp", device="cpu", seed=0, **kwargs): + """Safety layer for constraint satisfaction for RL. + + """ + + def __init__(self, + env_func, + training=True, + checkpoint_path="model_latest.pt", + output_dir="temp", + device="cpu", + seed=0, + **kwargs + ): super().__init__(env_func, training, checkpoint_path, output_dir, device, seed, **kwargs) - - # task + # Task. if self.training: - # training (+ evaluation) + # Training and testing. self.env = make_vec_envs(env_func, None, self.rollout_batch_size, self.num_workers, seed) self.env = VecRecordEpisodeStatistics(self.env, self.deque_size) self.eval_env = env_func() self.eval_env = RecordEpisodeStatistics(self.eval_env, self.deque_size) self.num_constraints = self.env.envs[0].num_constraints else: - # testing only + # Testing only. self.env = env_func() self.env = RecordEpisodeStatistics(self.env) self.num_constraints = self.env.num_constraints - - # safety layer + # Safety layer. self.safety_layer = SafetyLayer(self.env.observation_space, self.env.action_space, hidden_dim=self.constraint_hidden_dim, @@ -47,8 +56,7 @@ def __init__(self, env_func, training=True, checkpoint_path="model_latest.pt", o lr=self.constraint_lr, slack=self.constraint_slack) self.safety_layer.to(device) - - # agent + # Agent. self.agent = SafePPOAgent( self.env.observation_space, self.env.action_space, @@ -64,66 +72,67 @@ def __init__(self, env_func, training=True, checkpoint_path="model_latest.pt", o action_modifier=self.safety_layer.get_safe_action, ) self.agent.to(device) - - # pre-/post-processing + # Pre-/post-processing. self.obs_normalizer = BaseNormalizer() if self.norm_obs: self.obs_normalizer = MeanStdNormalizer(shape=self.env.observation_space.shape, clip=self.clip_obs, epsilon=1e-8) - self.reward_normalizer = BaseNormalizer() if self.norm_reward: self.reward_normalizer = RewardStdNormalizer(gamma=self.gamma, clip=self.clip_reward, epsilon=1e-8) - - # logging + # Logging. if self.training: log_file_out = True use_tensorboard = self.tensorboard else: - # disable logging to texts and tfboard for evaluation + # Disable logging to texts and tfboard for evaluation. log_file_out = False use_tensorboard = False self.logger = ExperimentLogger(output_dir, log_file_out=log_file_out, use_tensorboard=use_tensorboard) def reset(self): - """Do initializations for training or evaluation.""" + """Do initializations for training or evaluation. 
+ + """ if self.training: if self.pretraining: self.constraint_buffer = ConstraintBuffer(self.env.observation_space, self.env.action_space, self.num_constraints, self.constraint_buffer_size) else: - # load safety layer for 2nd stage training + # Load safety layer for 2nd stage training. assert self.pretrained, "Must provide a pre-trained model for adaptation." if os.path.isdir(self.pretrained): self.pretrained = os.path.join(self.pretrained, "model_latest.pt") state = torch.load(self.pretrained) self.safety_layer.load_state_dict(state["safety_layer"]) - - # set up stats tracking + # Set up stats tracking. self.env.add_tracker("constraint_violation", 0) - self.total_steps = 0 obs, info = self.env.reset() self.obs = self.obs_normalizer(obs) self.c = np.array([inf["constraint_values"] for inf in info["n"]]) def close(self): - """Shuts down and cleans up lingering resources.""" + """Shuts down and cleans up lingering resources. + + """ self.env.close() if self.training: self.eval_env.close() self.logger.close() - def save(self, path): - """Saves model params and experiment state to checkpoint path.""" + def save(self, + path + ): + """Saves model params and experiment state to checkpoint path. + + """ path_dir = os.path.dirname(path) os.makedirs(path_dir, exist_ok=True) - state_dict = { "agent": self.agent.state_dict(), "safety_layer": self.safety_layer.state_dict(), "obs_normalizer": self.obs_normalizer.state_dict(), "reward_normalizer": self.reward_normalizer.state_dict(), } - if self.training: exp_state = { "total_steps": self.total_steps, @@ -137,17 +146,19 @@ def save(self, path): state_dict["constraint_buffer"] = self.constraint_buffer.state_dict() torch.save(state_dict, path) - def load(self, path): - """Restores model and experiment given checkpoint path.""" - state = torch.load(path) + def load(self, + path + ): + """Restores model and experiment given checkpoint path. - # restore policy + """ + state = torch.load(path) + # Restore policy. self.agent.load_state_dict(state["agent"]) self.safety_layer.load_state_dict(state["safety_layer"]) self.obs_normalizer.load_state_dict(state["obs_normalizer"]) self.reward_normalizer.load_state_dict(state["reward_normalizer"]) - - # restore experiment state + # Restore experiment state. if self.training: self.total_steps = state["total_steps"] self.obs = state["obs"] @@ -158,34 +169,35 @@ def load(self, path): if self.pretraining: self.constraint_buffer.load_state_dict(state["constraint_buffer"]) - def learn(self, env=None, **kwargs): - """Performs learning (pre-training, training, fine-tuning, etc).""" + def learn(self, + env=None, + **kwargs + ): + """Performs learning (pre-training, training, fine-tuning, etc). + + """ if self.pretraining: final_step = self.constraint_epochs train_func = self.pretrain_step else: final_step = self.max_env_steps train_func = self.train_step - while self.total_steps < final_step: results = train_func() - - # checkpoint + # Checkpoint. if self.total_steps >= final_step or (self.save_interval and self.total_steps % self.save_interval == 0): - # latest/final checkpoint + # Latest/final checkpoint. self.save(self.checkpoint_path) self.logger.info("Checkpoint | {}".format(self.checkpoint_path)) if self.num_checkpoints and self.total_steps % (final_step // self.num_checkpoints) == 0: - # intermediate checkpoint + # Intermediate checkpoint. path = os.path.join(self.output_dir, "checkpoints", "model_{}.pt".format(self.total_steps)) self.save(path) - - # eval + # Evaluation. 
if self.eval_interval and self.total_steps % self.eval_interval == 0: if self.pretraining: eval_results = self.eval_constraint_models() results["eval"] = eval_results - # TODO: no single criterion to determine best eval model else: eval_results = self.run(env=self.eval_env, n_episodes=self.eval_batch_size) results["eval"] = eval_results @@ -193,19 +205,26 @@ def learn(self, env=None, **kwargs): eval_results["ep_lengths"].std(), eval_results["ep_returns"].mean(), eval_results["ep_returns"].std())) - # save best model + # Save the best model. eval_score = eval_results["ep_returns"].mean() eval_best_score = getattr(self, "eval_best_score", -np.infty) if self.eval_save_best and eval_best_score < eval_score: self.eval_best_score = eval_score self.save(os.path.join(self.output_dir, "model_best.pt")) - - # logging + # Logging. if self.log_interval and self.total_steps % self.log_interval == 0: self.log_step(results) - def run(self, env=None, render=False, n_episodes=10, verbose=False, **kwargs): - """Runs evaluation with current policy.""" + def run(self, + env=None, + render=False, + n_episodes=10, + verbose=False, + **kwargs + ): + """Runs evaluation with current policy. + + """ self.agent.eval() self.obs_normalizer.set_read_only() if env is None: @@ -213,26 +232,22 @@ def run(self, env=None, render=False, n_episodes=10, verbose=False, **kwargs): else: if not is_wrapped(env, RecordEpisodeStatistics): env = RecordEpisodeStatistics(env) - obs, info = env.reset() obs = self.obs_normalizer(obs) c = info["constraint_values"] ep_returns, ep_lengths = [], [] frames = [] - while len(ep_returns) < n_episodes: with torch.no_grad(): obs = torch.FloatTensor(obs).to(self.device) c = torch.FloatTensor(c).to(self.device) action = self.agent.ac.act(obs, c=c) - obs, reward, done, info = env.step(action) if render: env.render() frames.append(env.render("rgb_array")) if verbose: print("obs {} | act {}".format(obs, action)) - if done: assert "episode" in info ep_returns.append(info["episode"]["r"]) @@ -240,8 +255,7 @@ def run(self, env=None, render=False, n_episodes=10, verbose=False, **kwargs): obs, info = env.reset() obs = self.obs_normalizer(obs) c = info["constraint_values"] - - # collect evaluation results + # Collect evaluation results. ep_lengths = np.asarray(ep_lengths) ep_returns = np.asarray(ep_returns) eval_results = {"ep_returns": ep_returns, "ep_lengths": ep_lengths} @@ -250,50 +264,50 @@ def run(self, env=None, render=False, n_episodes=10, verbose=False, **kwargs): return eval_results def pretrain_step(self): - """Performs a pre-trianing step.""" + """Performs a pre-trianing step. + + """ results = defaultdict(list) start = time.time() self.safety_layer.train() self.obs_normalizer.unset_read_only() - - # Just sample episodes for the whole epoch + # Just sample episodes for the whole epoch. self.collect_constraint_data(self.constraint_steps_per_epoch) self.total_steps += 1 - - # Do the update from memory + # Do the update from memory. for batch in self.constraint_buffer.sampler(self.constraint_batch_size): res = self.safety_layer.update(batch) for k, v in res.items(): results[k].append(v) - - self.constraint_buffer.reset() # TODO: check perf. w/ & wo/ it. - + self.constraint_buffer.reset() results = {k: sum(v) / len(v) for k, v in results.items()} results.update({"step": self.total_steps, "elapsed_time": time.time() - start}) return results def train_step(self): - """Performs a training/fine-tuning step.""" + """Performs a training/fine-tuning step. 
+ + """ self.agent.train() self.obs_normalizer.unset_read_only() - rollouts = SafePPOBuffer(self.env.observation_space, self.env.action_space, self.num_constraints, self.rollout_steps, self.rollout_batch_size) + rollouts = SafePPOBuffer(self.env.observation_space, + self.env.action_space, + self.num_constraints, + self.rollout_steps, + self.rollout_batch_size) obs = self.obs c = self.c start = time.time() - for step in range(self.rollout_steps): with torch.no_grad(): act, v, logp = self.agent.ac.step(torch.FloatTensor(obs).to(self.device), c=torch.FloatTensor(c).to(self.device)) next_obs, rew, done, info = self.env.step(act) - next_obs = self.obs_normalizer(next_obs) rew = self.reward_normalizer(rew, done) mask = 1 - done.astype(float) - - # time truncation is not true termination + # Time truncation is not the same as the true termination. terminal_v = np.zeros_like(v) for idx, inf in enumerate(info["n"]): - # if "TimeLimit.truncated" in inf and inf["TimeLimit.truncated"]: if "terminal_info" not in inf: continue inff = inf["terminal_info"] @@ -302,7 +316,6 @@ def train_step(self): terminal_obs_tensor = torch.FloatTensor(terminal_obs).unsqueeze(0).to(self.device) terminal_val = self.agent.ac.critic(terminal_obs_tensor).squeeze().detach().numpy() terminal_v[idx] = terminal_val - rollouts.push({ "obs": obs, "act": act, @@ -315,12 +328,10 @@ def train_step(self): }) obs = next_obs c = np.array([inf["constraint_values"] for inf in info["n"]]) - self.obs = obs self.c = c self.total_steps += self.rollout_batch_size * self.rollout_steps - - # learn from rollout batch + # Learn from rollout batch. last_val = self.agent.ac.critic(torch.FloatTensor(obs).to(self.device)).detach().numpy() ret, adv = compute_returns_and_advantages(rollouts.rew, rollouts.v, @@ -331,84 +342,82 @@ def train_step(self): use_gae=self.use_gae, gae_lambda=self.gae_lambda) rollouts.ret = ret - # prevent divide-by-0 for repetitive tasks + # Prevent divide-by-0 for repetitive tasks. rollouts.adv = (adv - adv.mean()) / (adv.std() + 1e-6) - results = self.agent.update(rollouts, self.device) results.update({"step": self.total_steps, "elapsed_time": time.time() - start}) return results - def log_step(self, results): - """Does logging after a training step.""" + def log_step(self, + results + ): + """Does logging after a training step. + + """ step = results["step"] final_step = self.constraint_epochs if self.pretraining else self.max_env_steps - # runner stats + # Runner stats. self.logger.add_scalars({ - "step": step, - "time": results["elapsed_time"], - "progress": step / final_step - }, + "step": step, + "time": results["elapsed_time"], + "progress": step / final_step + }, step, prefix="time", write=False, write_tb=False) - if self.pretraining: - # constraint learning stats + # Constraint learning stats. for i in range(self.safety_layer.num_constraints): name = "constraint_{}_loss".format(i) self.logger.add_scalars({name: results[name]}, step, prefix="constraint_loss") if "eval" in results: self.logger.add_scalars({name: results["eval"][name]}, step, prefix="constraint_loss_eval") else: - # learning stats + # Learning stats. self.logger.add_scalars({k: results[k] for k in ["policy_loss", "value_loss", "entropy_loss", "approx_kl"]}, step, prefix="loss") - - # performance stats + # Performance stats. 
ep_lengths = np.asarray(self.env.length_queue) ep_returns = np.asarray(self.env.return_queue) self.logger.add_scalars({ - "ep_length": ep_lengths.mean(), - "ep_return": ep_returns.mean(), - "ep_reward": (ep_returns / ep_lengths).mean() - }, + "ep_length": ep_lengths.mean(), + "ep_return": ep_returns.mean(), + "ep_reward": (ep_returns / ep_lengths).mean() + }, step, prefix="stat") - - # total constraint violation during learning + # Total constraint violation during learning. total_violations = self.env.accumulated_stats["constraint_violation"] self.logger.add_scalars({"constraint_violation": total_violations}, step, prefix="stat") - if "eval" in results: eval_ep_lengths = results["eval"]["ep_lengths"] eval_ep_returns = results["eval"]["ep_returns"] - self.logger.add_scalars( - { - "ep_length": eval_ep_lengths.mean(), - "ep_return": eval_ep_returns.mean(), - "ep_reward": (eval_ep_returns / eval_ep_lengths).mean() - }, - step, - prefix="stat_eval") - - # print summary table + self.logger.add_scalars({ + "ep_length": eval_ep_lengths.mean(), + "ep_return": eval_ep_returns.mean(), + "ep_reward": (eval_ep_returns / eval_ep_lengths).mean() + }, + step, + prefix="stat_eval") + + # Print summary table. self.logger.dump_scalars() - def collect_constraint_data(self, num_steps): - """Uses random policy to collect data for pre-training constriant models.""" + def collect_constraint_data(self, + num_steps + ): + """Uses random policy to collect data for pre-training constriant models. + + """ step = 0 obs, info = self.env.reset() obs = self.obs_normalizer(obs) c = np.array([inf["constraint_values"] for inf in info["n"]]) - while step < num_steps: - # action = self.env.action_space.sample() action_spaces = self.env.get_attr("action_space") action = np.array([space.sample() for space in action_spaces]) - obs_next, _, done, info = self.env.step(action) obs_next = self.obs_normalizer(obs_next) - c_next = [] for i, d in enumerate(done): if d: @@ -417,27 +426,24 @@ def collect_constraint_data(self, num_steps): c_next_i = info["n"][i]["constraint_values"] c_next.append(c_next_i) c_next = np.array(c_next) - self.constraint_buffer.push({"act": action, "obs": obs, "c": c, "c_next": c_next}) - obs = obs_next c = np.array([inf["constraint_values"] for inf in info["n"]]) step += self.rollout_batch_size def eval_constraint_models(self): - """Runs evaluation for the constraint models.""" + """Runs evaluation for the constraint models. + + """ eval_resutls = defaultdict(list) self.safety_layer.eval() self.obs_normalizer.set_read_only() - - # collect evaluation data + # Collect evaluation data. self.collect_constraint_data(self.constraint_eval_steps) - for batch in self.constraint_buffer.sampler(self.constraint_batch_size): losses = self.safety_layer.compute_loss(batch) for i, loss in enumerate(losses): eval_resutls["constraint_{}_loss".format(i)].append(loss.item()) - - self.constraint_buffer.reset() # TODO: use a different buffer for evaluation + self.constraint_buffer.reset() eval_resutls = {k: sum(v) / len(v) for k, v in eval_resutls.items()} return eval_resutls diff --git a/safe_control_gym/controllers/safe_explorer/safe_ppo.yaml b/safe_control_gym/controllers/safe_explorer/safe_ppo.yaml index 56ab772d5..c82b355b2 100644 --- a/safe_control_gym/controllers/safe_explorer/safe_ppo.yaml +++ b/safe_control_gym/controllers/safe_explorer/safe_ppo.yaml @@ -1,11 +1,11 @@ -# model args +# Model args hidden_dim: 64 norm_obs: False norm_reward: False clip_obs: 10. clip_reward: 10. 
-# safety layer args +# Safety layer args pretraining: True pretrained: null constraint_hidden_dim: 10 @@ -18,7 +18,7 @@ constraint_eval_interval: 5 constraint_buffer_size: 1000000 constraint_slack: null -# loss args +# Loss args gamma: 0.99 use_gae: False gae_lambda: 0.95 @@ -27,14 +27,14 @@ clip_param: 0.2 target_kl: 0.01 entropy_coef: 0.01 -# optim args +# Optim args opt_epochs: 10 mini_batch_size: 64 actor_lr: 0.0003 critic_lr: 0.001 max_grad_norm: 0.5 -# runner args +# Runner args max_env_steps: 1000000 num_workers: 1 rollout_batch_size: 4 @@ -42,7 +42,7 @@ rollout_steps: 100 deque_size: 10 eval_batch_size: 10 -# misc +# Misc log_interval: 0 save_interval: 0 num_checkpoints: 0 diff --git a/safe_control_gym/controllers/safe_explorer/safe_ppo_utils.py b/safe_control_gym/controllers/safe_explorer/safe_ppo_utils.py index fb0fcb10b..79bf561c7 100644 --- a/safe_control_gym/controllers/safe_explorer/safe_ppo_utils.py +++ b/safe_control_gym/controllers/safe_explorer/safe_ppo_utils.py @@ -1,20 +1,21 @@ -from gym.spaces import Box +"""Utility function for the PPO-based safe explorer. + +""" import numpy as np import torch import torch.nn as nn +from gym.spaces import Box + from safe_control_gym.math_and_models.neural_networks import MLP, CNN, RNN, init_ from safe_control_gym.math_and_models.distributions import Normal, Categorical - import safe_control_gym.controllers.ppo.ppo_utils as ppo_utils -# ----------------------------------------------------------------------------------- -# Agent -# ----------------------------------------------------------------------------------- - class SafePPOAgent(ppo_utils.PPOAgent): - """A PPO class that encapsulates model, optimizer and update functions.""" + """A PPO class that encapsulates models, optimizers and update functions. + + """ def __init__(self, obs_space, @@ -29,57 +30,48 @@ def __init__(self, opt_epochs=10, mini_batch_size=64, action_modifier=None, - **kwargs): - # params + **kwargs + ): + # Parameters. self.obs_space = obs_space self.act_space = act_space - self.use_clipped_value = use_clipped_value self.clip_param = clip_param self.target_kl = target_kl self.entropy_coef = entropy_coef - self.opt_epochs = opt_epochs self.mini_batch_size = mini_batch_size - - # model + # Model. self.ac = MLPActorCritic(obs_space, act_space, hidden_dims=[hidden_dim] * 2, activation="tanh", action_modifier=action_modifier) - - # optimizers + # Optimizers. self.actor_opt = torch.optim.Adam(self.ac.actor.parameters(), actor_lr) - self.critic_opt = torch.optim.Adam(self.ac.critic.parameters(), - critic_lr) + self.critic_opt = torch.optim.Adam(self.ac.critic.parameters(), critic_lr) def compute_policy_loss(self, batch): - """Returns policy loss(es) given batch of data.""" - obs, act, logp_old, adv, c = batch["obs"], batch["act"], batch[ - "logp"], batch["adv"], batch["c"] - dist, logp = self.ac.actor(obs, act, c=c) + """Returns policy loss(es) given batch of data. - # policy + """ + obs, act, logp_old, adv, c = batch["obs"], batch["act"], batch["logp"], batch["adv"], batch["c"] + dist, logp = self.ac.actor(obs, act, c=c) + # Policy. ratio = torch.exp(logp - logp_old) - clip_adv = torch.clamp(ratio, 1 - self.clip_param, - 1 + self.clip_param) * adv + clip_adv = torch.clamp(ratio, 1 - self.clip_param, 1 + self.clip_param) * adv policy_loss = -torch.min(ratio * adv, clip_adv).mean() - - # entropy + # Entropy. entropy_loss = -dist.entropy().mean() - - # kl/trust region + # KL/trust region. 
approx_kl = (logp_old - logp).mean() return policy_loss, entropy_loss, approx_kl -# ----------------------------------------------------------------------------------- -# Models -# ----------------------------------------------------------------------------------- - - class MLPActor(nn.Module): + """Actor model. + + """ def __init__(self, obs_dim, @@ -87,36 +79,44 @@ def __init__(self, hidden_dims, activation, discrete=False, - action_modifier=None): + action_modifier=None + ): + """ + + """ super().__init__() self.pi_net = MLP(obs_dim, act_dim, hidden_dims, activation) - - # construct output action distribution + # Construct output action distribution. self.discrete = discrete if discrete: self.dist_fn = lambda x: Categorical(logits=x) else: self.logstd = nn.Parameter(-0.5 * torch.ones(act_dim)) self.dist_fn = lambda x: Normal(x, self.logstd.exp()) - - # safety filter + # Safety filter. self.action_modifier = action_modifier - def forward(self, obs, act=None, c=None): + def forward(self, + obs, + act=None, + c=None + ): + """ + + """ mu = self.pi_net(obs) - # filter action if needed + # Filter action if needed. if self.action_modifier: if len(mu.shape) == 1: - # during evalution or single env runs + # During evalution or single env runs. mu_safe = self.action_modifier(obs.unsqueeze(0), mu.unsqueeze(0), c.unsqueeze(0)).view(-1) else: - # during training or vectorized runs + # During training or vectorized runs. mu_safe = self.action_modifier(obs, mu, c) else: mu_safe = mu - dist = self.dist_fn(mu_safe) logp_a = None if act is not None: @@ -130,6 +130,7 @@ class MLPActorCritic(ppo_utils.MLPActorCritic): Attributes: actor (MLPActor): policy network. critic (MLPCritic): value network. + """ def __init__(self, @@ -137,9 +138,12 @@ def __init__(self, act_space, hidden_dims=(64, 64), activation="tanh", - action_modifier=None): - nn.Module.__init__(self) + action_modifier=None + ): + """ + """ + nn.Module.__init__(self) obs_dim = obs_space.shape[0] if isinstance(act_space, Box): act_dim = act_space.shape[0] @@ -147,31 +151,36 @@ def __init__(self, else: act_dim = act_space.n discrete = True - - # policy - self.actor = MLPActor(obs_dim, act_dim, hidden_dims, activation, - discrete, action_modifier) - # value function + # Policy. + self.actor = MLPActor(obs_dim, act_dim, hidden_dims, activation, discrete, action_modifier) + # Value function. self.critic = ppo_utils.MLPCritic(obs_dim, hidden_dims, activation) - def step(self, obs, c=None): + def step(self, + obs, + c=None + ): + """ + + """ dist, _ = self.actor(obs, c=c) a = dist.sample() logp_a = dist.log_prob(a) v = self.critic(obs) return a.numpy(), v.numpy(), logp_a.numpy() - def act(self, obs, c=None): + def act(self, + obs, + c=None + ): + """ + + """ dist, _ = self.actor(obs, c=c) a = dist.mode() return a.numpy() -# ----------------------------------------------------------------------------------- -# Storage -# ----------------------------------------------------------------------------------- - - class SafePPOBuffer(ppo_utils.PPOBuffer): """Storage for a batch of episodes during training. @@ -180,20 +189,27 @@ class SafePPOBuffer(ppo_utils.PPOBuffer): batch_size (int): number of episodes per batch. scheme (dict): describs shape & other info of data to be stored. keys (list): names of all data from scheme. 
+ """ - def __init__(self, obs_space, act_space, num_constraints, max_length, - batch_size): + def __init__(self, + obs_space, + act_space, + num_constraints, + max_length, + batch_size + ): + """ + + """ self.max_length = max_length self.batch_size = batch_size - T, N = max_length, batch_size obs_dim = obs_space.shape if isinstance(act_space, Box): act_dim = act_space.shape[0] else: act_dim = act_space.n - self.scheme = { "obs": { "vshape": (T, N, *obs_dim) @@ -228,4 +244,4 @@ def __init__(self, obs_space, act_space, num_constraints, max_length, }, } self.keys = list(self.scheme.keys()) - self.reset() \ No newline at end of file + self.reset() diff --git a/safe_control_gym/envs/benchmark_env.py b/safe_control_gym/envs/benchmark_env.py index 39989f4b4..e9779da83 100644 --- a/safe_control_gym/envs/benchmark_env.py +++ b/safe_control_gym/envs/benchmark_env.py @@ -1,7 +1,6 @@ """Base environment class module. -This module also contains enumerations for cost functions, tasks, -disturbances, and quadrotor types. +This module also contains enumerations for cost functions, tasks, disturbances, and quadrotor types. """ import os @@ -14,14 +13,18 @@ class Cost(str, Enum): - """Reward/cost functions enumeration class.""" + """Reward/cost functions enumeration class. + + """ RL_REWARD = "rl_reward" # Default RL reward function. QUADRATIC = "quadratic" # Quadratic cost. class Task(str, Enum): - """Environment tasks enumeration class.""" + """Environment tasks enumeration class. + + """ STABILIZATION = "stabilization" # Stabilization task. TRAJ_TRACKING = "traj_tracking" # Trajectory tracking task. @@ -32,16 +35,17 @@ class BenchmarkEnv(gym.Env): Attributes: id (int): unique identifier of the current env instance (among other instances). + """ - # class variable, count env instance in current process - _count = 0 + _count = 0 # Class variable, count env instance in current process. def __init__(self, output_dir=None, seed: int = 0, info_in_reset: bool = False, episode_len_sec: int = 5, - cost: Cost = Cost.RL_REWARD): + cost: Cost = Cost.RL_REWARD + ): """Initialization method for BenchmarkEnv. Args: @@ -50,32 +54,28 @@ def __init__(self, environment's symbolic model. episode_len_sec (int, optional): Maximum episode duration in seconds. cost (Cost, optional): Cost function choice used to compute the reward in .step(). + """ - # assign unique ID based on env instance count + # Assign unique ID based on env instance count. self.id = self.__class__._count self.__class__._count += 1 - - # directory to save any env output + # Directory to save any env output. if output_dir is None: output_dir = os.getcwd() self.output_dir = output_dir - - # default seed None means pure randomness/no seeding + # Default seed None means pure randomness/no seeding. self.seed(seed) self.INFO_IN_RESET = info_in_reset self.initial_reset = False - - # maximum episode length in seconds + # Maximum episode length in seconds. self.EPISODE_LEN_SEC = episode_len_sec - - # define cost-related quantities + # Define cost-related quantities. self.COST = Cost(cost) # Default Q and R matrices for quadratic cost. if self.COST == Cost.QUADRATIC: self.Q = np.eye(self.observation_space.shape[0]) self.R = np.eye(self.action_space.shape[0]) - - # custom setups + # Custom setups. self._setup_symbolic() self._setup_disturbances() self._setup_constraints() @@ -87,7 +87,10 @@ def _check_initial_reset(self): "[ERROR] You must call env.reset() at least once before using env.step()." 
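Note on the `Cost.QUADRATIC` branch above: it only instantiates identity `Q`/`R`; the cost such weights imply is the usual x^T Q x + u^T R u form. A tiny numeric sketch (the `quadratic_cost` helper is hypothetical, not an env method, and the env's actual reward computation is not shown in this hunk):

# Hypothetical helper illustrating the quadratic cost weights set up above
# (Q and R default to identity matrices of the obs/act dimensions).
import numpy as np

def quadratic_cost(x, u, Q, R, x_goal=None):
    # Penalize deviation from a goal state plus control effort.
    x_err = x if x_goal is None else x - x_goal
    return float(x_err.T @ Q @ x_err + u.T @ R @ u)

Q = np.eye(4)        # e.g. cart-pole observation dimension
R = np.eye(1)        # single control input
x = np.array([0.1, 0.0, 0.05, 0.0])
u = np.array([0.2])
print(quadratic_cost(x, u, Q, R))  # 0.1^2 + 0.05^2 + 0.2^2 = 0.0525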
) - def _randomize_values_by_info(self, original_values, randomization_info): + def _randomize_values_by_info(self, + original_values, + randomization_info + ): """Randomizes a list of values according to desired distributions. Args: @@ -97,30 +100,33 @@ def _randomize_values_by_info(self, original_values, randomization_info): Returns: dict: A dict of randomized values. + """ - randomized_values = copy.deepcopy( - original_values) # Start from a copy of the original values - rand_info_copy = copy.deepcopy( - randomization_info) # Copy the info dict to parse it with "pop" - # Randomized and replace values for which randomization info are given + # Start from a copy of the original values. + randomized_values = copy.deepcopy(original_values) + # Copy the info dict to parse it with "pop". + rand_info_copy = copy.deepcopy(randomization_info) + # Randomized and replace values for which randomization info are given. for key in original_values: if key in rand_info_copy: - # Get distribution removing it from info dict + # Get distribution removing it from info dict. distrib = getattr(self.np_random, rand_info_copy[key].pop("distrib")) - # Pop positional args + # Pop positional args. d_args = rand_info_copy[key].pop("args", []) - # Keyword args are just anything left + # Keyword args are just anything left. d_kwargs = rand_info_copy[key] - # Randomize + # Randomize. randomized_values[key] = distrib(*d_args, **d_kwargs) return randomized_values - def seed(self, seed=None): + def seed(self, + seed=None + ): """Sets up a random number generator for a given seed. - Current repo convention: non-positive seed same as None - (might not be the best, subject to discussion). + Current convention: non-positive seed same as None + """ if seed is not None: seed = seed if seed > 0 else None @@ -167,8 +173,7 @@ def _generate_trajectory(self, # Get trajectory type. valid_traj_type = ["circle", "square", "figure8"] if traj_type not in valid_traj_type: - raise ValueError( - "Trajectory type should be one of [circle, square, figure8].") + raise ValueError("Trajectory type should be one of [circle, square, figure8].") traj_period = traj_length / num_cycles direction_list = ["x", "y", "z"] # Get coordinates indexes. @@ -177,9 +182,7 @@ def _generate_trajectory(self, coord_index_a = direction_list.index(traj_plane[0]) coord_index_b = direction_list.index(traj_plane[1]) else: - raise ValueError("Trajectory plane should be in form of " - "ab" - ", where a and b can be {x, y, z}.") + raise ValueError("Trajectory plane should be in form of ab, where a and b can be {x, y, z}.") # Generate time stamps. times = np.arange(0, traj_length, sample_time) pos_ref_traj = np.zeros((len(times), 3)) @@ -187,15 +190,27 @@ def _generate_trajectory(self, speed_traj = np.zeros((len(times), 1)) # Compute trajectory points. 
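The distribution-spec pattern used by `_randomize_values_by_info` above (look up a distribution by name on the RNG, pop positional args, pass the remainder as keyword args) also works as a standalone helper; a sketch with a hypothetical function name and example entries:

# Standalone sketch of the spec format parsed by _randomize_values_by_info:
# {"distrib": <np_random method>, "args": [...], <remaining keyword args>}.
import copy
import numpy as np

def randomize_by_info(values, randomization_info, rng):
    randomized = copy.deepcopy(values)
    info = copy.deepcopy(randomization_info)   # parsed destructively with pop()
    for key in values:
        if key in info:
            distrib = getattr(rng, info[key].pop("distrib"))
            d_args = info[key].pop("args", [])
            d_kwargs = info[key]               # whatever is left are keyword args
            randomized[key] = distrib(*d_args, **d_kwargs)
    return randomized

rng = np.random.RandomState(0)
nominal = {"pole_length": 1.0, "cart_mass": 1.0}
rand_info = {
    "pole_length": {"distrib": "choice", "args": [[1, 5, 10]]},
    "cart_mass": {"distrib": "uniform", "low": 0.5, "high": 1.5},
}
print(randomize_by_info(nominal, rand_info, rng))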
for t in enumerate(times): - pos_ref_traj[t[0]], vel_ref_traj[t[0]] = self._get_coordinates( - t[1], traj_type, traj_period, coord_index_a, coord_index_b, - position_offset[0], position_offset[1], scaling) + pos_ref_traj[t[0]], vel_ref_traj[t[0]] = self._get_coordinates(t[1], + traj_type, + traj_period, + coord_index_a, + coord_index_b, + position_offset[0], + position_offset[1], + scaling) speed_traj[t[0]] = np.linalg.norm(vel_ref_traj[t[0]]) return pos_ref_traj, vel_ref_traj, speed_traj - def _get_coordinates(self, t, traj_type, traj_period, coord_index_a, - coord_index_b, position_offset_a, position_offset_b, - scaling): + def _get_coordinates(self, + t, + traj_type, + traj_period, + coord_index_a, + coord_index_b, + position_offset_a, + position_offset_b, + scaling + ): """Computes the coordinates of a specified trajectory at time t. Args: @@ -233,7 +248,11 @@ def _get_coordinates(self, t, traj_type, traj_period, coord_index_a, vel_ref[coord_index_b] = coords_b_dot return pos_ref, vel_ref - def _figure8(self, t, traj_period, scaling): + def _figure8(self, + t, + traj_period, + scaling + ): """Computes the coordinates of a figure8 trajectory at time t. Args: @@ -252,11 +271,14 @@ def _figure8(self, t, traj_period, scaling): coords_a = scaling * np.sin(traj_freq * t) coords_b = scaling * np.sin(traj_freq * t) * np.cos(traj_freq * t) coords_a_dot = scaling * traj_freq * np.cos(traj_freq * t) - coords_b_dot = scaling * traj_freq * (np.cos(traj_freq * t)**2 - - np.sin(traj_freq * t)**2) + coords_b_dot = scaling * traj_freq * (np.cos(traj_freq * t)**2 - np.sin(traj_freq * t)**2) return coords_a, coords_b, coords_a_dot, coords_b_dot - def _circle(self, t, traj_period, scaling): + def _circle(self, + t, + traj_period, + scaling + ): """Computes the coordinates of a circle trajectory at time t. Args: @@ -278,7 +300,11 @@ def _circle(self, t, traj_period, scaling): coords_b_dot = scaling * traj_freq * np.cos(traj_freq * t) return coords_a, coords_b, coords_a_dot, coords_b_dot - def _square(self, t, traj_period, scaling): + def _square(self, + t, + traj_period, + scaling + ): """Computes the coordinates of a square trajectory at time t. Args: @@ -330,8 +356,15 @@ def _square(self, t, traj_period, scaling): coords_b_dot = 0.0 return coords_a, coords_b, coords_a_dot, coords_b_dot - def _plot_trajectory(self, traj_type, traj_plane, traj_length, num_cycles, - pos_ref_traj, vel_ref_traj, speed_traj): + def _plot_trajectory(self, + traj_type, + traj_plane, + traj_length, + num_cycles, + pos_ref_traj, + vel_ref_traj, + speed_traj + ): """Plots a trajectory along x, y, z, and in a 3D projection. Args: @@ -350,8 +383,7 @@ def _plot_trajectory(self, traj_type, traj_plane, traj_length, num_cycles, print("Trajectory length: %s sec" % traj_length) print("Number of cycles: %d" % num_cycles) print("Trajectory period: %.2f sec" % (traj_length / num_cycles)) - print("Angular speed: %.2f rad/sec" % (2.0 * np.pi / - (traj_length / num_cycles))) + print("Angular speed: %.2f rad/sec" % (2.0 * np.pi / (traj_length / num_cycles))) print( "Position bounds: x [%.2f, %.2f] m, y [%.2f, %.2f] m, z [%.2f, %.2f] m" % (min(pos_ref_traj[:, 0]), max(pos_ref_traj[:, 0]), @@ -382,7 +414,7 @@ def _plot_trajectory(self, traj_type, traj_plane, traj_length, num_cycles, axs[2, 1].set_ylabel('vel z (m)') axs[2, 1].set_xlabel('time (s)') plt.show() - # Plot in 3D + # Plot in 3D. 
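For reference, the figure-8 branch of `_get_coordinates`/`_figure8` above reduces to a short closed form; a standalone sketch (hypothetical function name, example period and scaling):

# Closed-form figure-8 reference, mirroring the _figure8 equations above:
# a(t) = A sin(w t), b(t) = A sin(w t) cos(w t), with analytic velocities.
import numpy as np

def figure8_point(t, traj_period, scaling):
    w = 2.0 * np.pi / traj_period
    coords_a = scaling * np.sin(w * t)
    coords_b = scaling * np.sin(w * t) * np.cos(w * t)
    coords_a_dot = scaling * w * np.cos(w * t)
    coords_b_dot = scaling * w * (np.cos(w * t) ** 2 - np.sin(w * t) ** 2)
    return coords_a, coords_b, coords_a_dot, coords_b_dot

times = np.arange(0.0, 6.0, 0.02)   # e.g. 6 s sampled at 50 Hz
points = np.array([figure8_point(t, traj_period=3.0, scaling=0.5) for t in times])
print(points.shape)                  # (300, 4): a, b, a_dot, b_dot per step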
fig = plt.figure() ax = fig.gca(projection='3d') ax.plot(pos_ref_traj[:, 0], pos_ref_traj[:, 1], pos_ref_traj[:, 2]) diff --git a/safe_control_gym/envs/constraints.py b/safe_control_gym/envs/constraints.py index 33554e8de..a0d6f8a56 100644 --- a/safe_control_gym/envs/constraints.py +++ b/safe_control_gym/envs/constraints.py @@ -1,23 +1,20 @@ """Constraints module. -A base class for constraints and a class of a list of constraints. +Classes for constraints and lists of constraints. -Reference implementations: - * github.com/google-research/realworldrl_suite/blob/master/docs/README.md - * github.com/google-research/realworldrl_suite/blob/be7a51cffa7f5f9cb77a387c16bad209e0f851f8/realworldrl_suite/environments/cartpole.py#L38 - -TODO: - * Allow for states/inputs to be randomly sampled within constraints limits - * Constraint List have combined representation, and vertex representation """ import numpy as np -from enum import Enum import casadi as cs + +from enum import Enum + from safe_control_gym.envs.benchmark_env import BenchmarkEnv class ConstraintInputType(str, Enum): - """Allowable constraint type specifiers.""" + """Allowable constraint type specifiers. + + """ STATE = 'STATE' # Constraints who are a function of the state X. INPUT = 'INPUT' # Constraints who are a function of the input U. @@ -32,19 +29,26 @@ class Constraint: Attributes: dim (int): The number of inequalities representing the constraint. + """ dim = 0 # todo: why is this a class attribute? - def __init__(self, env: BenchmarkEnv, constraint_input_type: ConstraintInputType, active_dims=None, tolerance=None, **kwargs): + def __init__(self, + env: BenchmarkEnv, + constraint_input_type: ConstraintInputType, + active_dims=None, + tolerance=None, + **kwargs + ): """Defines params (e.g. bounds) and state. Args: env (safe_control_gym.envs.bechmark_env.BenchmarkEnv): The environment the constraint is for. constraint_input_type (ConstraintInputType): Specifies the input type to the constraint as a constraint - that acts on the state, input, or both - active_dims (list of ints): Filters the constraint to only act only select certian dimensions - tolerance (list or np.array): The distance from the constraint at which is_almost_active returns True + that acts on the state, input, or both. + active_dims (list of ints): Filters the constraint to only act only select certian dimensions. + tolerance (list or np.array): The distance from the constraint at which is_almost_active returns True. """ self.constraint_input_type = ConstraintInputType(constraint_input_type) @@ -54,8 +58,7 @@ def __init__(self, env: BenchmarkEnv, constraint_input_type: ConstraintInputType self.dim = env.action_space.shape[0] else: self.dim = env.observation_space.shape[0] + env.action_space.shape[0] - - # Only want to select specific dimensions. Implemented via a filter matrix + # Only want to select specific dimensions, implemented via a filter matrix. if active_dims is not None: if isinstance(active_dims, int): active_dims = [active_dims] @@ -63,20 +66,22 @@ def __init__(self, env: BenchmarkEnv, constraint_input_type: ConstraintInputType self.dim = len(active_dims) else: self.constraint_filter = np.eye(self.dim) - if tolerance is not None: self.tolerance = np.array(tolerance, ndmin=1) assert self.tolerance.shape[0] == len(active_dims) else: self.tolerance = np.array(np.zeros(self.dim), ndmin=1) - self.num_input_constraints = None def reset(self): - """Clears up the constraint state (if any).""" + """Clears up the constraint state (if any). 
+ + """ pass - def get_symbolic_model(self, env): + def get_symbolic_model(self, + env + ): """Gets the symbolic form of the constraint function. Args: @@ -84,10 +89,13 @@ def get_symbolic_model(self, env): Returns: obj: The symbolic form of the constraint. + """ return NotImplementedError - def get_value(self, env): + def get_value(self, + env + ): """Gets the constraint function value. Args: @@ -100,7 +108,9 @@ def get_value(self, env): env_value = self.get_env_constraint_var(env) return np.squeeze(self.sym_func(np.array(env_value, ndmin=1))) - def is_violated(self, env): + def is_violated(self, + env + ): """Checks if constraint is violated. Args: @@ -108,25 +118,30 @@ def is_violated(self, env): Returns: bool: Whether the constraint was violeted. + """ c_value = self.get_value(env) flag = np.any(np.greater(c_value, 0.)) return bool(flag) - def is_almost_active(self, env): + def is_almost_active(self, + env + ): """Checks if constraint is nearly violated. This is checked by using a slack variable (from init args). This can be used for reward shaping/constraint penalty in RL methods. - """ + """ if not hasattr(self, "tolerance") or self.tolerance is None: return False c_value = self.get_value(env) flag = np.any(np.greater(c_value + self.tolerance, 0.)) return bool(flag) - def get_env_constraint_var(self, env: BenchmarkEnv): + def get_env_constraint_var(self, + env: BenchmarkEnv + ): if self.constraint_input_type == ConstraintInputType.STATE: return env.state elif self.constraint_input_type == ConstraintInputType.INPUT: @@ -137,11 +152,19 @@ def get_env_constraint_var(self, env: BenchmarkEnv): raise NotImplementedError("Constraint input type not implemented.") -# TODO: Add these constraints to benchmark constraints class QuadraticContstraint(Constraint): - """Constraint class for constraints of the form x.T @ P @ x <= b.""" + """Constraint class for constraints of the form x.T @ P @ x <= b. + + """ - def __init__(self, env: BenchmarkEnv, P: np.ndarray, b: float, constraint_input_type: ConstraintInputType, active_dims=None, tolerance=None): + def __init__(self, + env: BenchmarkEnv, + P: np.ndarray, + b: float, + constraint_input_type: ConstraintInputType, + active_dims=None, + tolerance=None + ): """Initializes the class. An example of how to specify in YAML, one would add: @@ -160,9 +183,9 @@ def __init__(self, env: BenchmarkEnv, P: np.ndarray, b: float, constraint_input_ P (np.array): The square matrix representing the quadratic. b (float): The scalar limit for the quadatic constraint. constraint_input_type (ConstraintInputType): Specifies the input type to the constraint as a constraint - that acts on the state, input, or both - active_dims (list of ints): Filters the constraint to only act only select certian dimensions - tolerance (list or np.array): The distance from the constraint at which is_almost_active returns True + that acts on the state, input, or both. + active_dims (list of ints): Filters the constraint to only act only select certian dimensions. + tolerance (list or np.array): The distance from the constraint at which is_almost_active returns True. """ super().__init__(env, constraint_input_type, active_dims=active_dims, tolerance=tolerance) @@ -170,8 +193,7 @@ def __init__(self, env: BenchmarkEnv, P: np.ndarray, b: float, constraint_input_ self.P = P assert isinstance(b, float), ValueError("b is not a scalar!") self.b = b - self.num_constraints = 1 # always scalar - + self.num_constraints = 1 # Always scalar. 
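Numerically, `QuadraticContstraint.get_value()`/`is_violated()` above amount to evaluating c(x) = (F x)^T P (F x) - b and checking its sign, with F the `active_dims` row-selection filter. A plain-NumPy sketch (the CasADi symbolic path is omitted, and the helper name is hypothetical):

# Minimal numeric version of the quadratic constraint check:
# c(x) = (F x)^T P (F x) - b <= 0, with F selecting the active dimensions.
import numpy as np

def quadratic_constraint_value(x, P, b, active_dims=None):
    dim = x.shape[0]
    F = np.eye(dim) if active_dims is None else np.atleast_2d(np.eye(dim)[active_dims])
    z = F @ x
    return float(z.T @ P @ z - b)

x = np.array([0.3, 1.2, -0.1, 0.0])
P = np.eye(2)                       # constrain only dims 0 and 2
value = quadratic_constraint_value(x, P, b=0.25, active_dims=[0, 2])
print(value, value > 0.0)           # -0.15 False -> not violated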
self.sym_func = lambda x: x.T @ self.constraint_filter.T @ self.P @ self.constraint_filter @ x - self.b def get_symbolic_model(self): @@ -185,12 +207,21 @@ def get_symbolic_model(self): class LinearConstraint(Constraint): - """Constraint class for constraints of the form A @ x <= b""" + """Constraint class for constraints of the form A @ x <= b. + + """ - def __init__(self, env: BenchmarkEnv, A: np.ndarray, b: np.ndarray, constraint_input_type: ConstraintInputType, active_dims=None, tolerance=None): + def __init__(self, + env: BenchmarkEnv, + A: np.ndarray, + b: np.ndarray, + constraint_input_type: ConstraintInputType, + active_dims=None, + tolerance=None + ): """Initialize the class. - An example of how to specify in YAML, one would add: + An example of how to specify this in YAML: constraints: linear_constraint: @@ -206,7 +237,7 @@ def __init__(self, env: BenchmarkEnv, A: np.ndarray, b: np.ndarray, constraint_i env (BenchmarkEnv): The environment to constraint. A (np.array or list): A matrix of the constraint (self.num_constraints by self.dim). b (np.array or list): b matrix of the constraint (1D array self.num_constraints) - constraint_input_type (ConstraintInputType): Type of constraint. + constraint_input_type (ConstraintInputType): Type of constraint. active_dims (list or int): List specifying which dimensions the constraint is active for. tolerance (float): The distance at which is_almost_active(env) triggers. @@ -219,7 +250,6 @@ def __init__(self, env: BenchmarkEnv, A: np.ndarray, b: np.ndarray, constraint_i assert b.shape[0] == A.shape[0], ValueError("Dimension 0 of b does not match A!") self.b = b self.num_constraints = A.shape[0] - self.sym_func = lambda x: self.A @ self.constraint_filter @ x - self.b def get_symbolic_model(self): @@ -233,7 +263,9 @@ def get_symbolic_model(self): class BoundedConstraint(LinearConstraint): - """ Class for bounded constraints lb <= x <= ub as polytopic constraints -Ix + b <= 0 and Ix - b <= 0 """ + """ Class for bounded constraints lb <= x <= ub as polytopic constraints -Ix + b <= 0 and Ix - b <= 0. + + """ def __init__(self, env: BenchmarkEnv, @@ -244,7 +276,7 @@ def __init__(self, tolerance=None): """Initialize the constraint. - An example of how to specify in YAML, one would add: + An example of how to specify this in YAML: constraints: bounded_constraint: @@ -276,22 +308,28 @@ def __init__(self, def get_symbolic_constraint_models(constraint_list): - """Create list of symbolic models from list of constraints.""" + """Create list of symbolic models from list of constraints. + + """ symbolic_models = [con.get_symbolic_model() for con in constraint_list] return symbolic_models class ConstraintList: - """Collates a (ordered) list of constraints.""" + """Collates a (ordered) list of constraints. - def __init__(self, constraints): + """ + + def __init__(self, + constraints + ): """Initialize the constraint list. Args: constraints: The list of constraints. 
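Box bounds lb <= x <= ub reduce to the `LinearConstraint` form by stacking the two one-sided inequalities; a hedged sketch of that standard reduction (not necessarily byte-for-byte what `BoundedConstraint` builds internally):

# lb <= x <= ub rewritten as A x - b <= 0 with A = [[-I], [I]], b = [-lb, ub].
import numpy as np

def bounds_to_linear(lower, upper):
    lower, upper = np.asarray(lower, dtype=float), np.asarray(upper, dtype=float)
    dim = lower.shape[0]
    A = np.vstack([-np.eye(dim), np.eye(dim)])
    b = np.concatenate([-lower, upper])
    return A, b

A, b = bounds_to_linear(lower=[-2.0, -0.2], upper=[2.0, 0.2])
x = np.array([1.5, 0.1])
c = A @ x - b                        # all entries <= 0 -> x is inside the box
print(c, bool(np.any(c > 0.0)))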
""" - self.constraints = constraints # Todo: this could probably be names better since we often see constraints.constraints + self.constraints = constraints self.num_constraints = sum([con.num_constraints for con in self.constraints]) self.state_constraints = [con for con in self.constraints if con.constraint_input_type == ConstraintInputType.STATE] self.num_state_constraints = sum([con.num_constraints for con in self.state_constraints]) @@ -310,23 +348,30 @@ def __len__(self): return len(self.constraints) def get_all_symbolic_models(self): - """Return all the symbolic models the constraints.""" + """Return all the symbolic models the constraints. + + """ return get_symbolic_constraint_models(self.constraints) def get_state_constraint_symbolic_models(self): - """Return only the constraints that act on the state.""" + """Return only the constraints that act on the state. + + """ return get_symbolic_constraint_models(self.state_constraints) def get_input_constraint_symbolic_models(self): - """Return only the constraints that act on the input.""" + """Return only the constraints that act on the input. + + """ return get_symbolic_constraint_models(self.input_constraints) def get_input_and_state_constraint_symbolic_models(self): - """Return only the constraints that act on both state and inputs simultaneously.""" + """Return only the constraints that act on both state and inputs simultaneously. + + """ return get_symbolic_constraint_models(self.input_state_constraints) def get_stacked_symbolic_model(self, env): - # Todo: Can this be removed? """Gets the symbolic form of all constraints. Args: @@ -334,16 +379,19 @@ def get_stacked_symbolic_model(self, env): Returns: obj: The symbolic form of the constraint. + """ symbolic_models = [con.get_symbolic_model() for con in self.constraints] - X = env.symbolic.x_sym U = env.symbolic.u_sym stack_c_sym = cs.vertcat(*[func(X, U) for func in symbolic_models]) sym_func = cs.Function("constraints", [X, U], [stack_c_sym]) return sym_func - def get_values(self, env, only_state=False): + def get_values(self, + env, + only_state=False + ): """Gets all constraint function values. Args: @@ -359,7 +407,10 @@ def get_values(self, env, only_state=False): con_values = np.concatenate([con.get_value(env) for con in self.constraints]) return con_values - def get_violations(self, env, only_state=False): + def get_violations(self, + env, + only_state=False + ): """Gets all constraint violations. Args: @@ -375,7 +426,9 @@ def get_violations(self, env, only_state=False): flags = [con.is_violated(env) for con in self.constraints] return flags - def is_violated(self, env): + def is_violated(self, + env + ): """Checks if any of the constraints is violated. Args: @@ -388,11 +441,14 @@ def is_violated(self, env): flag = any([con.is_violated(env) for con in self.constraints]) return flag - def is_almost_active(self, env): + def is_almost_active(self, + env + ): """Checks if constraint is nearly violated. - This is checked by using a slack variable (from init args). - This can be used for reward shaping/constraint penalty in RL methods. + This is checked by using a slack variable (from init args) and can be used + for reward shaping/constraint penalty in RL methods. 
+ """ flag = any([con.is_almost_active(env) for con in self.constraints]) return flag @@ -405,18 +461,21 @@ def is_almost_active(self, env): } -def create_ConstraintList_from_dict(constraint_dict, available_constraints, env): +def create_ConstraintList_from_dict(constraint_dict, + available_constraints, + env + ): """Create a constraint list from a dict (or YAML output). Args: - constraint_dict (dict): Dict specifying the constraint parameters - available_constraints (dict): Dict of the constraints that are available - env (BenchmarkEnv): Env for which the constraints will be applied + constraint_dict (dict): Dict specifying the constraint parameters. + available_constraints (dict): Dict of the constraints that are available. + env (BenchmarkEnv): Env for which the constraints will be applied. + """ constraint_list = [] for name, cfg in constraint_dict.items(): assert name in available_constraints, "[ERROR]. constraint not in list of available constraints" con_class = available_constraints[name] constraint_list.append(con_class(env, **cfg)) - return ConstraintList(constraint_list) diff --git a/safe_control_gym/envs/disturbances.py b/safe_control_gym/envs/disturbances.py index f5412ea6e..f4d2d6506 100644 --- a/safe_control_gym/envs/disturbances.py +++ b/safe_control_gym/envs/disturbances.py @@ -1,13 +1,21 @@ """Disturbances. + """ import numpy as np from enum import Enum class Disturbance: - """Base class for disturbance or noise applied to inputs or dyanmics.""" + """Base class for disturbance or noise applied to inputs or dyanmics. + + """ - def __init__(self, env, dim, mask=None, **kwargs): + def __init__(self, + env, + dim, + mask=None, + **kwargs + ): self.np_random = env.np_random self.dim = dim self.mask = mask @@ -15,29 +23,50 @@ def __init__(self, env, dim, mask=None, **kwargs): self.mask = np.asarray(mask) assert self.dim == len(self.mask) - def reset(self, env): - """Default nothing.""" + def reset(self, + env + ): pass - def apply(self, target, env): - """Default is identity.""" + def apply(self, + target, + env + ): + """Default is identity. + + """ return target class DisturbanceList: - """Combine list of disturbances as one.""" + """Combine list of disturbances as one. + + """ + + def __init__(self, + disturbances + ): + """Initialization of the list of disturbances. - def __init__(self, disturbances): - """Initialization of the list of disturbances.""" + """ self.disturbances = disturbances - def reset(self, env): - """Sequentially reset disturbances.""" + def reset(self, + env + ): + """Sequentially reset disturbances. + + """ for disturb in self.disturbances: disturb.reset(env) - def apply(self, target, env): - """Sequentially (in order) apply disturbances.""" + def apply(self, + target, + env + ): + """Sequentially apply disturbances. + + """ disturbed = target for disturb in self.disturbances: disturbed = disturb.apply(disturbed, env) @@ -47,31 +76,46 @@ def apply(self, target, env): class ImpulseDisturbance(Disturbance): """Impulse applied during a short time interval. 
- Example impulses: - * single step, square (duration=1, decay_rate=1): ______|-|_______ - * multiple step, square (duration>1, decay_rate=1): ______|-----|_____ - * multiple step, triangle (duration>1, decay_rate<1): ______/\_____ + Examples: + * single step, square (duration=1, decay_rate=1): ______|-|_______ + * multiple step, square (duration>1, decay_rate=1): ______|-----|_____ + * multiple step, triangle (duration>1, decay_rate<1): ______/\_____ + """ - def __init__(self, env, dim, mask=None, magnitude=1, step_offset=None, duration=1, decay_rate=1, **kwargs): + def __init__(self, + env, + dim, + mask=None, + magnitude=1, + step_offset=None, + duration=1, + decay_rate=1, + **kwargs + ): super().__init__(env, dim, mask) self.magnitude = magnitude self.step_offset = step_offset self.max_step = int(env.EPISODE_LEN_SEC / env.CTRL_TIMESTEP) - # specify shape of impulse + # Specify shape of the impulse. assert duration > 1 assert decay_rate > 0 and decay_rate <= 1 self.duration = duration self.decay_rate = decay_rate - def reset(self, env): + def reset(self, + env + ): if self.step_offset is None: self.current_step_offset = self.np_random.randint(self.max_step) else: self.current_step_offset = self.step_offset self.current_peak_step = int(self.current_step_offset + self.duration / 2) - def apply(self, target, env): + def apply(self, + target, + env + ): noise = 0 if env.control_step_counter > self.step_offset: peak_offset = np.abs(env.control_step_counter - self.current_peak_step) @@ -89,22 +133,35 @@ def apply(self, target, env): class StepDisturbance(Disturbance): """Constant disturbance at all time steps (but after offset). - disturbance is applied after offset step (which can be randomized or given): _______|--------- + Applied after offset step (randomized or given): _______|--------- + """ - def __init__(self, env, dim, mask=None, magnitude=1, step_offset=None, **kwargs): + def __init__(self, + env, + dim, + mask=None, + magnitude=1, + step_offset=None, + **kwargs + ): super().__init__(env, dim, mask) self.magnitude = magnitude self.step_offset = step_offset self.max_step = int(env.EPISODE_LEN_SEC / env.CTRL_TIMESTEP) - def reset(self, env): + def reset(self, + env + ): if self.step_offset is None: self.current_step_offset = self.np_random.randint(self.max_step) else: self.current_step_offset = self.step_offset - def apply(self, target, env): + def apply(self, + target, + env + ): noise = 0 if env.control_step_counter > self.step_offset: noise = self.magnitude @@ -115,21 +172,31 @@ def apply(self, target, env): class WhiteNoise(Disturbance): - """i.i.d Gaussian noise per time step.""" + """I.i.d Gaussian noise per time step. + + """ - def __init__(self, env, dim, mask=None, std=1.0, **kwargs): + def __init__(self, + env, + dim, + mask=None, + std=1.0, + **kwargs + ): super().__init__(env, dim, mask) - # i.i.d gaussian variance + # I.i.d gaussian variance. if isinstance(std, float): self.std = np.asarray([std] * self.dim) elif isinstance(std, list): self.std = np.asarray(std) else: raise ValueError("[ERROR] WhiteNoise.__init__(): std must be specified as a float or list.") - assert self.dim == len(self.std), "std shape should be the same as dim." - def apply(self, target, env): + def apply(self, + target, + env + ): noise = self.np_random.normal(0, self.std, size=self.dim) if self.mask is not None: noise *= self.mask @@ -138,22 +205,36 @@ def apply(self, target, env): class BrownianNoise(Disturbance): - """Simple random walk noise.""" + """Simple random walk noise. 
+ + """ def __init__(self): super().__init__() class PeriodicNoise(Disturbance): - """Sinuisodal noise.""" + """Sinuisodal noise. + + """ - def __init__(self, env, dim, mask=None, scale=1.0, frequency=1.0, **kwargs): + def __init__(self, + env, + dim, + mask=None, + scale=1.0, + frequency=1.0, + **kwargs + ): super().__init__(env, dim) - # sin function parameters + # Sine function parameters. self.scale = scale self.frequency = frequency - def apply(self, target, env): + def apply(self, + target, + env + ): phase = self.np_random.uniform(low=-np.pi, high=np.pi, size=self.dim) t = env.step_counter noise = self.scale * self.np.sin(self.frequency * t + phase) @@ -167,9 +248,15 @@ class StateDependentDisturbance(Disturbance): """Time varying and state varying, e.g. friction. Here to provide an explicit form, can also enable friction in simulator directly. + """ - def __init__(self, env, dim, mask=None, **kwargs): + def __init__(self, + env, + dim, + mask=None, + **kwargs + ): super().__init__() diff --git a/safe_control_gym/envs/env_wrappers/record_episode_statistics.py b/safe_control_gym/envs/env_wrappers/record_episode_statistics.py index b80988d57..dbb8aa173 100644 --- a/safe_control_gym/envs/env_wrappers/record_episode_statistics.py +++ b/safe_control_gym/envs/env_wrappers/record_episode_statistics.py @@ -1,39 +1,48 @@ -from collections import deque -from copy import deepcopy import time - import gym import numpy as np +from collections import deque +from copy import deepcopy + from safe_control_gym.envs.env_wrappers.vectorized_env.vec_env import VecEnvWrapper class RecordEpisodeStatistics(gym.Wrapper): """ Keep track of episode length and returns per instantiated env - adapted from https://github.com/openai/gym/blob/38a1f630dc9815a567aaf299ae5844c8f8b9a6fa/gym/wrappers/record_episode_statistics.py + + Based on OpenAI's Gym wrapper record_episode_statistics.py + """ - def __init__(self, env, deque_size=None, **kwargs): + def __init__(self, + env, + deque_size=None, + **kwargs + ): super(RecordEpisodeStatistics, self).__init__(env) self.deque_size = deque_size - self.t0 = time.time() self.episode_return = 0.0 self.episode_length = 0 self.return_queue = deque(maxlen=deque_size) self.length_queue = deque(maxlen=deque_size) - - # other tracked stats + # Other tracked stats. self.episode_stats = {} self.accumulated_stats = {} self.queued_stats = {} - def add_tracker(self, name, init_value, mode="accumulate"): + def add_tracker(self, + name, + init_value, + mode="accumulate" + ): """Adds a specific stat to be tracked (accumulate|queue). - Different modes to track stats - - accumulate: rolling sum, e.g. total # of constraint violations during training. - - queue: finite, individual storage, e.g. returns, lengths, constraint costs. + Modes to track stats + * accumulate: rolling sum, e.g. total # of constraint violations during training. + * queue: finite, individual storage, e.g. returns, lengths, constraint costs. + """ self.episode_stats[name] = init_value if mode == "accumulate": @@ -43,30 +52,33 @@ def add_tracker(self, name, init_value, mode="accumulate"): else: raise Exception("Tracker mode not implemented.") - def reset(self, **kwargs): + def reset(self, + **kwargs + ): self.episode_return = 0.0 self.episode_length = 0 - # reset other tracked stats + # Reset other tracked stats. 
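The two `add_tracker` modes above are just two aggregation policies over per-episode stats; a compact standalone sketch of the same idea (the `StatTracker` class is hypothetical, not the wrapper itself):

# Two tracker modes, as in add_tracker(): "accumulate" keeps a rolling sum
# across episodes, "queue" keeps the last N per-episode values.
from collections import deque

class StatTracker:
    def __init__(self, deque_size=10):
        self.accumulated = {}
        self.queued = {}
        self.deque_size = deque_size

    def add_tracker(self, name, init_value, mode="accumulate"):
        if mode == "accumulate":
            self.accumulated[name] = init_value
        elif mode == "queue":
            self.queued[name] = deque(maxlen=self.deque_size)
        else:
            raise Exception("Tracker mode not implemented.")

    def end_episode(self, episode_stats):
        for name, value in episode_stats.items():
            if name in self.accumulated:
                self.accumulated[name] += value
            elif name in self.queued:
                self.queued[name].append(value)

tracker = StatTracker()
tracker.add_tracker("constraint_violation", 0, mode="accumulate")
tracker.add_tracker("constraint_values", 0.0, mode="queue")
tracker.end_episode({"constraint_violation": 3, "constraint_values": 0.7})
tracker.end_episode({"constraint_violation": 1, "constraint_values": 0.2})
print(tracker.accumulated, list(tracker.queued["constraint_values"]))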
for key in self.episode_stats: self.episode_stats[key] *= 0 return self.env.reset(**kwargs) - def step(self, action): + def step(self, + action + ): observation, reward, done, info = self.env.step(action) self.episode_return += reward self.episode_length += 1 - # add other tracked stats + # Add other tracked stats. for key in self.episode_stats: if key in info: self.episode_stats[key] += info[key] - if done: info['episode'] = {'r': self.episode_return, 'l': self.episode_length, 't': round(time.time() - self.t0, 6)} self.return_queue.append(self.episode_return) self.length_queue.append(self.episode_length) self.episode_return = 0.0 self.episode_length = 0 - # other tracked stats + # Other tracked stats. for key in self.episode_stats: info['episode'][key] = deepcopy(self.episode_stats[key]) if key in self.accumulated_stats: @@ -74,31 +86,40 @@ def step(self, action): elif key in self.queued_stats: self.queued_stats[key].append(deepcopy(self.episode_stats[key])) self.episode_stats[key] *= 0 - return observation, reward, done, info class VecRecordEpisodeStatistics(VecEnvWrapper): - """ A vectorized wrapper that records eposodic statistics - e.g. episode lengths, returns, constraint violations + """ A vectorized wrapper that records eposodic statistics. + + E.g. episode lengths, returns, constraint violations. + """ - def __init__(self, venv, deque_size=None, **kwargs): + def __init__(self, + venv, + deque_size=None, + **kwargs + ): super(VecRecordEpisodeStatistics, self).__init__(venv) self.deque_size = deque_size - self.episode_return = np.zeros(self.num_envs) self.episode_length = np.zeros(self.num_envs) self.return_queue = deque(maxlen=deque_size) self.length_queue = deque(maxlen=deque_size) - - # other tracked stats + # Other tracked stats. self.episode_stats = {} self.accumulated_stats = {} self.queued_stats = {} - def add_tracker(self, name, init_value, mode="accumulate"): - """Adds a specific stat to be tracked (accumulated).""" + def add_tracker(self, + name, + init_value, + mode="accumulate" + ): + """Adds a specific stat to be tracked (accumulated). + + """ self.episode_stats[name] = [init_value for _ in range(self.num_envs)] if mode == "accumulate": self.accumulated_stats[name] = init_value @@ -107,10 +128,12 @@ def add_tracker(self, name, init_value, mode="accumulate"): else: raise Exception("Tracker mode not implemented.") - def reset(self, **kwargs): + def reset(self, + **kwargs + ): self.episode_return = np.zeros(self.num_envs) self.episode_length = np.zeros(self.num_envs) - # reset other tracked stats + # Reset other tracked stats. for key in self.episode_stats: for i in range(self.num_envs): self.episode_stats[key][i] *= 0 @@ -121,7 +144,7 @@ def step_wait(self): for i, (r, d) in enumerate(zip(reward, done)): self.episode_return[i] += r self.episode_length[i] += 1 - # add other tracked stats + # Add other tracked stats. for key in self.episode_stats: if d: inf = info["n"][i]["terminal_info"] @@ -129,14 +152,13 @@ def step_wait(self): inf = info["n"][i] if key in inf: self.episode_stats[key][i] += inf[key] - if d: info["n"][i]['episode'] = {'r': self.episode_return[i], 'l': self.episode_length[i]} self.return_queue.append(deepcopy(self.episode_return[i])) self.length_queue.append(deepcopy(self.episode_length[i])) self.episode_return[i] = 0 self.episode_length[i] = 0 - # other tracked stats + # Other tracked stats. 
for key in self.episode_stats: info["n"][i]['episode'][key] = deepcopy(self.episode_stats[key][i]) if key in self.accumulated_stats: diff --git a/safe_control_gym/envs/env_wrappers/vectorized_env/__init__.py b/safe_control_gym/envs/env_wrappers/vectorized_env/__init__.py index 6c4e3b810..d4d01472f 100644 --- a/safe_control_gym/envs/env_wrappers/vectorized_env/__init__.py +++ b/safe_control_gym/envs/env_wrappers/vectorized_env/__init__.py @@ -1,5 +1,3 @@ -""" -""" import numpy as np import random import torch @@ -8,40 +6,45 @@ from safe_control_gym.envs.env_wrappers.vectorized_env.subproc_vec_env import SubprocVecEnv -def make_env_fn(env_func, env_config, seed, rank): - """higher-order function for env init func +def make_env_fn(env_func, + env_config, + seed, + rank): + """Higher-order function for env init func. Args: - env_func (function): partial function that can accept args - env_config (dict): env init args - seed (int): base seed - rank (int): unique seed for each parallel env + env_func (function): partial function that can accept args. + env_config (dict): env init args. + seed (int): base seed. + rank (int): unique seed for each parallel env. Returns: - _thunk: env-constructing func + _thunk: env-constructing func. + """ def _thunk(): - # do not set seed i if 0 (e.g. for evaluation) + # Do not set seed i if 0 (e.g. for evaluation). if seed > 0: e_seed = seed + rank - random.seed(e_seed) np.random.seed(e_seed) torch.manual_seed(e_seed) - env = env_func(**env_config) env.seed(e_seed) env.action_space.seed(e_seed) else: env = env_func(**env_config) return env - return _thunk -def make_vec_envs(env_func, env_configs=None, batch_size=1, n_processes=1, seed=0): - """Produce envs with parallel rollout abilities +def make_vec_envs(env_func, + env_configs=None, + batch_size=1, + n_processes=1, + seed=0): + """Produce envs with parallel rollout abilities. Args: env_func (function): partial function that can accept args. @@ -51,7 +54,8 @@ def make_vec_envs(env_func, env_configs=None, batch_size=1, n_processes=1, seed= seed (int): base seed for the run. Returns: - VecEnv: (wrapped) parallel envs + VecEnv: (wrapped) parallel envs. + """ if env_configs is None: env_configs = [{}] * batch_size @@ -59,5 +63,5 @@ def make_vec_envs(env_func, env_configs=None, batch_size=1, n_processes=1, seed= if n_processes > 1: return SubprocVecEnv(env_fns, n_workers=n_processes) else: - # e.g. can use in evaluation (with seed -1) - return DummyVecEnv(env_fns) \ No newline at end of file + # E.g. can use in evaluation (with seed -1). + return DummyVecEnv(env_fns) diff --git a/safe_control_gym/envs/env_wrappers/vectorized_env/dummy_vec_env.py b/safe_control_gym/envs/env_wrappers/vectorized_env/dummy_vec_env.py index dccefd714..51727f37d 100644 --- a/safe_control_gym/envs/env_wrappers/vectorized_env/dummy_vec_env.py +++ b/safe_control_gym/envs/env_wrappers/vectorized_env/dummy_vec_env.py @@ -1,32 +1,40 @@ import copy import numpy as np -from safe_control_gym.utils.utils import get_random_state, set_random_state from safe_control_gym.envs.env_wrappers.vectorized_env.vec_env import VecEnv from safe_control_gym.envs.env_wrappers.vectorized_env.vec_env_utils import _flatten_list, _flatten_obs +from safe_control_gym.utils.utils import get_random_state, set_random_state class DummyVecEnv(VecEnv): - """Single thread env (allow multiple envs sequentially) + """Single thread env (allow multiple envs sequentially). 
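The higher-order `make_env_fn` above exists so each parallel env gets a distinct but reproducible seed (`seed + rank`); a trimmed sketch of the same closure pattern, with a hypothetical `DummyEnv` standing in for `env_func`:

# Closure pattern from make_env_fn: bind (seed + rank) per worker so parallel
# envs are reproducible but decorrelated. DummyEnv is only a stand-in.
import random
import numpy as np

class DummyEnv:
    def __init__(self):
        self.rng = None
    def seed(self, s):
        self.rng = np.random.RandomState(s)
    def reset(self):
        return self.rng.uniform(-0.05, 0.05, size=4)

def make_env_fn(env_func, seed, rank):
    def _thunk():
        e_seed = seed + rank
        random.seed(e_seed)
        np.random.seed(e_seed)
        env = env_func()
        env.seed(e_seed)
        return env
    return _thunk

envs = [make_env_fn(DummyEnv, seed=2, rank=r)() for r in range(4)]
print([env.reset().round(3) for env in envs])   # distinct, reproducible initial states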
- original version from baseline is bad - adapted from https://github.com/ShangtongZhang/DeepRL/blob/master/deep_rl/component/envs.py """ - def __init__(self, env_fns): + def __init__(self, + env_fns + ): + """ + + """ self.envs = [fn() for fn in env_fns] env = self.envs[0] - - VecEnv.__init__(self, len(env_fns), env.observation_space, - env.action_space) - + VecEnv.__init__(self, len(env_fns), env.observation_space, env.action_space) self.actions = None self.closed = False - def step_async(self, actions): + def step_async(self, + actions + ): + """ + + """ self.actions = actions def step_wait(self): + """ + + """ results = [] for i in range(self.num_envs): obs, rew, done, info = self.envs[i].step(self.actions[i]) @@ -37,11 +45,13 @@ def step_wait(self): info["terminal_observation"] = end_obs info["terminal_info"] = end_info results.append([obs, rew, done, info]) - obs, rews, dones, infos = zip(*results) return _flatten_obs(obs), np.array(rews), np.array(dones), {"n": infos} def reset(self): + """ + + """ results = [] for env in self.envs: results.append(env.reset()) @@ -49,6 +59,9 @@ def reset(self): return _flatten_obs(obs), {"n": infos} def close(self): + """ + + """ for env in self.envs: env.close() if self.viewer is not None: @@ -56,27 +69,54 @@ def close(self): self.closed = True def get_images(self): + """ + + """ return [env.render(mode='rgb_array') for env in self.envs] - def render(self, mode='human'): + def render(self, + mode='human' + ): + """ + + """ if self.num_envs == 1: return self.envs[0].render(mode=mode) else: return super().render(mode=mode) def get_env_random_state(self): + """ + + """ return [get_random_state()] - def set_env_random_state(self, worker_random_states): + def set_env_random_state(self, + worker_random_states + ): + """ + + """ set_random_state(worker_random_states[0]) - def get_attr(self, attr_name, indices=None): - """Return attribute from vectorized environment (see base class).""" + def get_attr(self, + attr_name, + indices=None + ): + """Return attribute from vectorized environment (see base class). + + """ target_envs = self._get_target_envs(indices) return [getattr(env_i, attr_name) for env_i in target_envs] - def set_attr(self, attr_name, values, indices=None): - """Set attribute inside vectorized environments (see base class).""" + def set_attr(self, + attr_name, + values, + indices=None + ): + """Set attribute inside vectorized environments (see base class). + + """ target_envs = self._get_target_envs(indices) assert len(target_envs) == len(values) for env_i, val_i in zip(target_envs, values): @@ -87,7 +127,9 @@ def env_method(self, method_args=None, method_kwargs=None, indices=None): - """Call instance methods of vectorized environments.""" + """Call instance methods of vectorized environments. 
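The auto-reset logic in `step_wait` above (on `done`, stash the final observation in `info["terminal_observation"]` and immediately reset) is the part downstream bootstrapping relies on; a condensed sketch with a toy env (names are illustrative):

# Auto-reset on done: the true final observation is kept in
# info["terminal_observation"] while the env itself is reset.
import copy
import numpy as np

class ToyEnv:
    def __init__(self, horizon=3):
        self.horizon, self.t = horizon, 0
    def reset(self):
        self.t = 0
        return np.array([0.0]), {}
    def step(self, action):
        self.t += 1
        obs, done = np.array([float(self.t)]), self.t >= self.horizon
        return obs, 1.0, done, {}

def step_and_autoreset(env, action):
    obs, rew, done, info = env.step(action)
    if done:
        end_obs = copy.deepcopy(obs)
        obs, _ = env.reset()
        info["terminal_observation"] = end_obs
    return obs, rew, done, info

env = ToyEnv()
env.reset()
for _ in range(3):
    obs, rew, done, info = step_and_autoreset(env, action=None)
print(obs, done, info)   # obs is post-reset; info carries the terminal observation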
+ + """ target_envs = self._get_target_envs(indices) if method_args is None: method_args = [[]] * len(target_envs) @@ -100,7 +142,12 @@ def env_method(self, in zip(target_envs, method_args, method_kwargs) ] - def _get_target_envs(self, indices): + def _get_target_envs(self, + indices + ): + """ + + """ assert indices is None or sorted( indices) == indices, "Indices must be ordered" indices = self._get_indices(indices) diff --git a/safe_control_gym/envs/env_wrappers/vectorized_env/subproc_vec_env.py b/safe_control_gym/envs/env_wrappers/vectorized_env/subproc_vec_env.py index 88b85c021..ba9f1d755 100644 --- a/safe_control_gym/envs/env_wrappers/vectorized_env/subproc_vec_env.py +++ b/safe_control_gym/envs/env_wrappers/vectorized_env/subproc_vec_env.py @@ -1,3 +1,10 @@ +"""Subprocess vectorized environments. + +See also: + * https://github.com/openai/baselines/blob/master/baselines/common/vec_env/subproc_vec_env.py + * https://github.com/DLR-RM/stable-baselines3/blob/master/stable_baselines3/common/vec_env/subproc_vec_env.py + +""" import copy import numpy as np import multiprocessing as mp @@ -8,24 +15,18 @@ class SubprocVecEnv(VecEnv): - """ Multiprocess envs - - references - - https://github.com/openai/baselines/blob/master/baselines/common/vec_env/subproc_vec_env.py - - https://github.com/DLR-RM/stable-baselines3/blob/master/stable_baselines3/common/vec_env/subproc_vec_env.py + """Multiprocess envs. + """ def __init__(self, env_fns, spaces=None, context="spawn", n_workers=1): self.waiting = False self.closed = False - nenvs = len(env_fns) self.n_workers = n_workers assert nenvs % n_workers == 0, "Number of envs must be divisible by number of workers to run in series" env_fns = np.array_split(env_fns, self.n_workers) - - # context is necessary for multiprocessing with cuda - # see https://pytorch.org/docs/stable/notes/multiprocessing.html + # Context is necessary for multiprocessing with CUDA, see pytorch.org/docs/stable/notes/multiprocessing.html ctx = mp.get_context(context) self.remotes, self.work_remotes = zip( *[ctx.Pipe() for _ in range(self.n_workers)]) @@ -36,16 +37,14 @@ def __init__(self, env_fns, spaces=None, context="spawn", n_workers=1): env_fn) in zip(self.work_remotes, self.remotes, env_fns) ] for p in self.ps: - p.daemon = True # if the main process crashes, we should not cause things to hang + p.daemon = True # If the main process crashes, we should not cause things to hang. with clear_mpi_env_vars(): p.start() for remote in self.work_remotes: remote.close() - self.remotes[0].send(('get_spaces_spec', None)) observation_space, action_space = self.remotes[0].recv().x self.viewer = None - VecEnv.__init__(self, nenvs, observation_space, action_space) def step_async(self, actions): @@ -73,7 +72,9 @@ def reset(self): return _flatten_obs(obs), {"n": infos} def get_images(self): - """Called by parent `render` to support tiling images""" + """Called by parent `render` to support tiling images. + + """ self._assert_not_closed() for pipe in self.remotes: pipe.send(('render', None)) @@ -110,14 +111,18 @@ def set_env_random_state(self, worker_random_states): res = [remote.recv() for remote in self.remotes] def get_attr(self, attr_name, indices=None): - """Return attribute from vectorized environment (see base class).""" + """Return attribute from vectorized environment (see base class). 
+ + """ target_remotes, remote_env_indices = self._get_target_envs(indices) for remote, env_indices in zip(target_remotes, remote_env_indices): remote.send(("get_attr", (env_indices, attr_name))) return _flatten_list([remote.recv() for remote in target_remotes]) def set_attr(self, attr_name, values, indices=None): - """Set attribute inside vectorized environments (see base class).""" + """Set attribute inside vectorized environments (see base class). + + """ target_remotes, remote_env_indices, splits = self._get_target_envs( indices) value_splits = [] @@ -137,9 +142,10 @@ def env_method(self, method_args=None, method_kwargs=None, indices=None): - """Call instance methods of vectorized environments.""" - target_remotes, remote_env_indices, splits = self._get_target_envs( - indices) + """Call instance methods of vectorized environments. + + """ + target_remotes, remote_env_indices, splits = self._get_target_envs(indices) method_arg_splits, method_kwarg_splits = [], [] for i in range(len(splits) - 1): start, end = splits[i], splits[i + 1] @@ -161,6 +167,7 @@ def env_method(self, def _get_target_envs(self, indices): """ + Example: n_workers: 3 current envs: [0,1,2,3,4,5] @@ -169,28 +176,26 @@ def _get_target_envs(self, indices): remote_indices: [0,0,1,1] -> [0,1] splits: [0,2] -> [0,2,4] - remote_env_indices: [1,1,0,1] -> [1,1], [0,1] + remote_env_indices: [1,1,0,1] -> [1,1], [0,1] + """ assert indices is None or sorted( indices) == indices, "Indices must be ordered" indices = self._get_indices(indices) - remote_indices = [idx // self.n_workers for idx in indices] remote_env_indices = [idx % self.n_workers for idx in indices] - - remote_indices, splits = np.unique(np.array(remote_indices), - return_index=True) + remote_indices, splits = np.unique(np.array(remote_indices), return_index=True) target_remotes = [self.remotes[idx] for idx in remote_indices] remote_env_indices = np.split(np.array(remote_env_indices), splits[1:]) remote_env_indices = remote_env_indices.tolist() - splits = np.append(splits, [len(indices)]) return target_remotes, remote_env_indices, splits def worker(remote, parent_remote, env_fn_wrappers): - """worker func to execute vec_env commands""" + """Worker func to execute vec_env commands. + """ def step_env(env, action): ob, reward, done, info = env.step(action) if done: @@ -200,13 +205,12 @@ def step_env(env, action): info["terminal_observation"] = end_obs info["terminal_info"] = end_info return ob, reward, done, info - parent_remote.close() envs = [env_fn_wrapper() for env_fn_wrapper in env_fn_wrappers.x] try: while True: cmd, data = remote.recv() - # branch out for requests + # Branch out for requests. if cmd == 'step': remote.send( [step_env(env, action) for env, action in zip(envs, data)]) @@ -225,7 +229,7 @@ def step_env(env, action): remote.send(CloudpickleWrapper(get_random_state())) elif cmd == "set_random_state": set_random_state(data) - # just a placeholder to return something + # Placeholder for the return. remote.send(True) elif cmd == "get_attr": env_indices, attr_name = data diff --git a/safe_control_gym/envs/env_wrappers/vectorized_env/vec_env.py b/safe_control_gym/envs/env_wrappers/vectorized_env/vec_env.py index 3125329e0..449ea0658 100644 --- a/safe_control_gym/envs/env_wrappers/vectorized_env/vec_env.py +++ b/safe_control_gym/envs/env_wrappers/vectorized_env/vec_env.py @@ -1,12 +1,13 @@ -"""Adapted from OpenAI Baselines code +"""Adapted from OpenAI Baselines. 
-references -- https://github.com/openai/baselines/blob/master/baselines/common/vec_env/vec_env.py -- https://github.com/DLR-RM/stable-baselines3/blob/master/stable_baselines3/common/vec_env/base_vec_env.py +See also: + * https://github.com/openai/baselines/blob/master/baselines/common/vec_env/vec_env.py + * https://github.com/DLR-RM/stable-baselines3/blob/master/stable_baselines3/common/vec_env/base_vec_env.py """ import os import numpy as np + from abc import ABC, abstractmethod from safe_control_gym.envs.env_wrappers.vectorized_env.vec_env_utils import tile_images @@ -15,35 +16,42 @@ class VecEnv(ABC): """An abstract asynchronous, vectorized environment. - Used to batch data from multiple copies of an environment, - so that each observation becomes an batch of observations, - and expected action is a batch of actions to be applied per-environment. + Used to batch data from multiple copies of an environment, so that each observation becomes a + batch of observations, and expected action is a batch of actions to be applied per-environment. + """ closed = False viewer = None metadata = {'render.modes': ['human', 'rgb_array']} - def __init__(self, num_envs, observation_space, action_space): + def __init__(self, + num_envs, + observation_space, + action_space + ): self.num_envs = num_envs self.observation_space = observation_space self.action_space = action_space @abstractmethod def reset(self): - """Reset all the environments and return an array of observations, - or a dict of observation arrays. + """Reset all the environments and return an array of observations, or a dict of observation arrays. + + If step_async is still doing work, that work will be cancelled and step_wait() should not + be called until step_async() is invoked again. - If step_async is still doing work, that work will be cancelled - and step_wait() should not be called until step_async() is invoked again. """ pass @abstractmethod - def step_async(self, actions): + def step_async(self, + actions + ): """Tell all the environments to start taking a step with the given actions. Call step_wait() to get the results of the step. You should not call this if a step_async run is already pending. + """ pass @@ -52,15 +60,18 @@ def step_wait(self): """Wait for the step taken with step_async(). Returns (obs, rews, dones, infos): - - obs: an array of observations, or a dict of arrays of observations. - - rews: an array of rewards - - dones: an array of "episode done" booleans - - infos: a sequence of info objects + - obs: an array of observations, or a dict of arrays of observations. + - rews: an array of rewards + - dones: an array of "episode done" booleans + - infos: a sequence of info objects + """ pass def close_extras(self): - """Clean up the extra resources. Only runs when not self.closed.""" + """Clean up the extra resources. Only runs when not self.closed. + + """ pass def close(self): @@ -71,13 +82,21 @@ def close(self): self.close_extras() self.closed = True - def step(self, actions): - """Step the environments synchronously.""" + def step(self, + actions + ): + """Step the environments synchronously. + + """ self.step_async(actions) return self.step_wait() - def render(self, mode='human'): - """Display environment via a viewer""" + def render(self, + mode='human' + ): + """Display environment via a viewer. 
+ + """ imgs = self.get_images() bigimg = tile_images(imgs) if mode == 'human': @@ -89,7 +108,9 @@ def render(self, mode='human'): raise NotImplementedError def get_images(self): - """Return RGB images from each environment""" + """Return RGB images from each environment. + + """ raise NotImplementedError @property @@ -107,12 +128,16 @@ def get_viewer(self): @abstractmethod def get_attr(self, attr_name, indices=None): - """Return attribute from vectorized environment.""" + """Return attribute from vectorized environment. + + """ pass @abstractmethod def set_attr(self, attr_name, values, indices=None): - """Set attribute inside vectorized environments.""" + """Set attribute inside vectorized environments. + + """ pass @abstractmethod @@ -121,10 +146,14 @@ def env_method(self, method_args=None, method_kwargs=None, indices=None): - """Call instance methods of vectorized environments.""" + """Call instance methods of vectorized environments. + + """ raise NotImplementedError() - def _get_indices(self, indices): + def _get_indices(self, + indices + ): """Convert a flexibly-typed reference to environment indices to an implied list of indices.""" if indices is None: indices = range(self.num_envs) @@ -136,11 +165,14 @@ def _get_indices(self, indices): class VecEnvWrapper(VecEnv): """An environment wrapper that applies to an entire batch of environments at once.""" - def __init__(self, venv, observation_space=None, action_space=None): + def __init__(self, + venv, + observation_space=None, + action_space=None + ): self.venv = venv super().__init__(num_envs=venv.num_envs, - observation_space=observation_space or - venv.observation_space, + observation_space=observation_space or venv.observation_space, action_space=action_space or venv.action_space) def step_async(self, actions): @@ -163,16 +195,25 @@ def render(self, mode='human'): def get_images(self): return self.venv.get_images() - def __getattr__(self, name): + def __getattr__(self, + name + ): if name.startswith('_'): raise AttributeError( "attempted to get missing private attribute '{}'".format(name)) return getattr(self.venv, name) - def get_attr(self, attr_name, indices=None): + def get_attr(self, + attr_name, + indices=None + ): return self.venv.get_attr(attr_name, indices) - def set_attr(self, attr_name, values, indices=None): + def set_attr(self, + attr_name, + values, + indices=None + ): return self.venv.set_attr(attr_name, values, indices) def env_method(self, diff --git a/safe_control_gym/envs/env_wrappers/vectorized_env/vec_env_utils.py b/safe_control_gym/envs/env_wrappers/vectorized_env/vec_env_utils.py index e719476c6..a59a663b1 100644 --- a/safe_control_gym/envs/env_wrappers/vectorized_env/vec_env_utils.py +++ b/safe_control_gym/envs/env_wrappers/vectorized_env/vec_env_utils.py @@ -4,8 +4,8 @@ class CloudpickleWrapper(object): - """ Uses cloudpickle to serialize contents - (otherwise multiprocessing tries to use pickle) + """Uses cloudpickle to serialize contents and stop multiprocessing from using pickle. + """ def __init__(self, x): @@ -22,15 +22,14 @@ def __setstate__(self, ob): @contextlib.contextmanager def clear_mpi_env_vars(): - """ - from mpi4py import MPI will call MPI_Init by default. + """From mpi4py import MPI will call MPI_Init by default. - If the child process has MPI environment variables, - MPI will think that the child process is an MPI process - just like the parent and do bad things such as hang. 
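The env-var scrubbing that `clear_mpi_env_vars` performs is a generic stash-and-restore context manager; a sketch of the same pattern (the `scrub_env_vars` name and the exact prefix tuple are assumptions, though OMPI_/PMI_ are the usual MPI prefixes):

# Generic "temporarily remove matching environment variables" context manager,
# the same stash/restore pattern clear_mpi_env_vars uses.
import contextlib
import os

@contextlib.contextmanager
def scrub_env_vars(prefixes=("OMPI_", "PMI_")):
    removed = {}
    for key, value in list(os.environ.items()):
        if any(key.startswith(p) for p in prefixes):
            removed[key] = value
            del os.environ[key]
    try:
        yield
    finally:
        os.environ.update(removed)   # always restore, even on error

os.environ["OMPI_FAKE_VAR"] = "1"
with scrub_env_vars():
    print("inside:", "OMPI_FAKE_VAR" in os.environ)   # False
print("after:", os.environ["OMPI_FAKE_VAR"])           # restored to "1"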
+ If the child process has MPI environment variables, MPI will think that the child process + is an MPI process just like the parent and do bad things such as hang. - This context manager is a hacky way to clear those environment variables - temporarily such as when we are starting multiprocessing Processes. + This context manager is a hacky way to clear those environment variables temporarily such + as when we are starting multiprocessing Processes. + """ removed_environment = {} for k, v in list(os.environ.items()): @@ -45,16 +44,17 @@ def clear_mpi_env_vars(): def tile_images(img_nhwc): - """Tile N images into one big PxQ image + """Tile N images into one big PxQ image. (P,Q) are chosen to be as close as possible, and if N is square, then P=Q. Args: - img_nhwc: list or array of images, ndim=4 once turned into array - n = batch index, h = height, w = width, c = channel + img_nhwc: list or array of images, ndim=4 once turned into array, + n = batch index, h = height, w = width, c = channel. Returns: - img_Hh_Ww_c: ndarray with ndim=3 + img_Hh_Ww_c: ndarray with ndim=3. + """ img_nhwc = np.asarray(img_nhwc) N, h, w, c = img_nhwc.shape @@ -69,6 +69,9 @@ def tile_images(img_nhwc): def _flatten_obs(obs): + """ + + """ assert isinstance(obs, (list, tuple)) assert len(obs) > 0 @@ -80,11 +83,12 @@ def _flatten_obs(obs): def _unflatten_obs(obs): + """ + + """ assert isinstance(obs, (np.ndarray, dict)) - def split_batch(data): return [d[0] for d in np.split(data, len(data))] - if isinstance(obs, dict): keys = list(obs.keys()) unflat_obs = [split_batch(obs[k]) for k in keys] @@ -96,8 +100,10 @@ def split_batch(data): def _flatten_list(l): + """ + + """ assert isinstance(l, (list, tuple)) assert len(l) > 0 assert all([len(l_) > 0 for l_ in l]) - return [l__ for l_ in l for l__ in l_] diff --git a/safe_control_gym/envs/gym_control/cartpole.py b/safe_control_gym/envs/gym_control/cartpole.py index a81a2bc0c..bfc22f82d 100644 --- a/safe_control_gym/envs/gym_control/cartpole.py +++ b/safe_control_gym/envs/gym_control/cartpole.py @@ -3,12 +3,9 @@ Classic cart-pole system implemented by Rich Sutton et al. * http://incompleteideas.net/sutton/book/code/pole.c -Reference implementations: - * https://github.com/openai/gym/blob/master/gym/envs/classic_control/cartpole.py - * https://github.com/bulletphysics/bullet3/blob/master/examples/pybullet/gym/pybullet_envs/bullet/cartpole_bullet.py - -Todo: - * Should environment record all past values up to the last reset? 
+Also see: + * github.com/openai/gym/blob/master/gym/envs/classic_control/cartpole.py + * github.com/bulletphysics/bullet3/blob/master/examples/pybullet/gym/pybullet_envs/bullet/cartpole_bullet.py """ import os @@ -23,8 +20,7 @@ from safe_control_gym.envs.benchmark_env import BenchmarkEnv, Cost, Task from safe_control_gym.envs.disturbances import DisturbanceList, DISTURBANCE_TYPES from safe_control_gym.envs.constraints import create_ConstraintList_from_dict, GENERAL_CONSTRAINTS -from safe_control_gym.envs.gym_control.cartpole_constraints import CartPoleStateConstraint, CartPoleInputConstraint, \ - CartPoleSymmetricStateConstraint, CartPoleBoundConstraint +from safe_control_gym.envs.gym_control.cartpole_constraints import CartPoleStateConstraint, CartPoleInputConstraint, CartPoleSymmetricStateConstraint, CartPoleBoundConstraint from safe_control_gym.math_and_models.symbolic_systems import SymbolicModel @@ -78,16 +74,16 @@ class CartPole(BenchmarkEnv): DISTURBANCE_MODES = {"observation": {"dim": 4}, "action": {"dim": 1}, "dynamics": {"dim": 2}} INERTIAL_PROP_RAND_INFO = { - "pole_length": { # 1 + "pole_length": { # Nominal: 1 "distrib": "choice", "args": [[1, 5, 10]] }, - "cart_mass": { # 1 + "cart_mass": { # Nominal: 1 "distrib": "uniform", "low": 0.5, "high": 1.5 }, - "pole_mass": { # 0.1 + "pole_mass": { # Nominal: 0.1 "distrib": "uniform", "low": 0.05, "high": 0.15 @@ -198,12 +194,10 @@ def __init__(self, raise ValueError("[ERROR] in CartPole.__init__(), pyb_freq is not divisible by env_freq.") self.CTRL_TIMESTEP = 1. / self.CTRL_FREQ self.PYB_TIMESTEP = 1. / self.PYB_FREQ - # Set GUI and rendering constants. self.GUI = gui self.RENDER_HEIGHT = int(200) self.RENDER_WIDTH = int(320) - # Set the initial state. if init_state is None: self.INIT_X, self.INIT_X_DOT, self.INIT_THETA, self.INIT_THETA_DOT = np.zeros(4) @@ -220,11 +214,9 @@ def __init__(self, self.RANDOMIZED_INIT = randomized_init if init_state_randomization_info is not None: self.INIT_STATE_RAND_INFO = init_state_randomization_info - # Get physical properties from URDF (as default parameters). self.GRAVITY_ACC = 9.8 EFFECTIVE_POLE_LENGTH, POLE_MASS, CART_MASS = self._parse_urdf_parameters(self.URDF_PATH) - # Store prior parameters. if prior_prop is None: self.PRIOR_EFFECTIVE_POLE_LENGTH = EFFECTIVE_POLE_LENGTH @@ -236,7 +228,6 @@ def __init__(self, self.PRIOR_CART_MASS = prior_prop.get("cart_mass", CART_MASS) else: raise ValueError("[ERROR] in CartPole.__init__(), prior_prop incorrect format.") - # Store ground truth parameters. if inertial_prop is None: self.EFFECTIVE_POLE_LENGTH = EFFECTIVE_POLE_LENGTH @@ -252,20 +243,16 @@ def __init__(self, self.RANDOMIZED_INERTIAL_PROP = randomized_inertial_prop if inertial_prop_randomization_info is not None: self.INERTIAL_PROP_RAND_INFO = inertial_prop_randomization_info - # Store disturbance info. self.DISTURBANCES = disturbances self.adversary_disturbance = adversary_disturbance self.adversary_disturbance_scale = adversary_disturbance_scale - # Store constraint info self.CONSTRAINTS = constraints self.DONE_ON_VIOLATION = done_on_violation self.use_constraint_penalty = use_constraint_penalty self.constraint_penalty = constraint_penalty - self.VERBOSE = verbose - # Set up action and observation space. self._set_action_space() self._set_observation_space() @@ -277,10 +264,8 @@ def __init__(self, self.PYB_CLIENT = p.connect(p.DIRECT) # disable urdf caching for randomization via reloading urdf p.setPhysicsEngineParameter(enableFileCaching=0) - # Call BenchmarkEnv constructor. 
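For readers skimming this hunk, the INERTIAL_PROP_RAND_INFO table above drives domain randomization at reset. A minimal standalone sketch of how such a table can be sampled is given below; the real `_randomize_values_by_info` helper lives in BenchmarkEnv and may differ, so the function name and details here are illustrative only and not part of this patch.

    import numpy as np

    def sample_randomized_values(nominal, rand_info, rng=None):
        """Re-sample entries of `nominal` according to a randomization-info table."""
        rng = rng or np.random.default_rng()
        sampled = dict(nominal)
        for key, info in rand_info.items():
            if key not in sampled:
                continue
            if info["distrib"] == "uniform":
                sampled[key] = rng.uniform(info["low"], info["high"])
            elif info["distrib"] == "choice":
                sampled[key] = rng.choice(*info["args"])  # e.g. args=[[1, 5, 10]] for pole_length.
            else:
                raise ValueError("Unknown distribution: {}".format(info["distrib"]))
        return sampled

    # Example with the nominal values noted in the comments above:
    # sample_randomized_values({"pole_length": 1, "cart_mass": 1, "pole_mass": 0.1},
    #                          CartPole.INERTIAL_PROP_RAND_INFO)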
super().__init__(seed=seed, output_dir=output_dir, info_in_reset=info_in_reset, episode_len_sec=episode_len_sec, cost=cost) - # Create X_GOAL and U_GOAL references for the assigned task. self.TASK = Task(task) if task_info is not None: @@ -301,7 +286,7 @@ def __init__(self, ) self.X_GOAL = np.vstack([ - POS_REF[:, 0], # TODO Add offset? + POS_REF[:, 0], # TODO Add offset. VEL_REF[:, 0], np.zeros(POS_REF.shape[0]), np.zeros(VEL_REF.shape[0]) @@ -309,7 +294,9 @@ def __init__(self, @property def control_step_counter(self): - """Smiliar to `step_counter` but on control frequency.""" + """Similar to `step_counter` but at the control frequency. + + """ return self.step_counter // (self.PYB_FREQ // self.CTRL_FREQ) def step(self, action): @@ -363,40 +350,33 @@ def reset(self): self.current_preprocessed_action = None if self.adversary_disturbance is not None: self.adv_action = None - # PyBullet simulation reset. p.resetSimulation(physicsClientId=self.PYB_CLIENT) p.setGravity(0, 0, -self.GRAVITY_ACC, physicsClientId=self.PYB_CLIENT) p.setTimeStep(self.PYB_TIMESTEP, physicsClientId=self.PYB_CLIENT) p.setRealTimeSimulation(0, physicsClientId=self.PYB_CLIENT) - # Choose randomized or deterministic inertial properties. prop_values = {"pole_length": self.EFFECTIVE_POLE_LENGTH, "cart_mass": self.CART_MASS, "pole_mass": self.POLE_MASS} if self.RANDOMIZED_INERTIAL_PROP: prop_values = self._randomize_values_by_info(prop_values, self.INERTIAL_PROP_RAND_INFO) if any(phy_quantity < 0 for phy_quantity in prop_values.values()): raise ValueError("[ERROR] in CartPole.reset(), negative randomized inertial properties.") - self.OVERRIDDEN_EFFECTIVE_POLE_LENGTH = prop_values["pole_length"] self.OVERRIDDEN_CART_MASS = prop_values["cart_mass"] self.OVERRIDDEN_POLE_MASS = prop_values["pole_mass"] - - # refer to `slender rod` in https://en.wikipedia.org/wiki/List_of_moments_of_inertia + # See `slender rod`, https://en.wikipedia.org/wiki/List_of_moments_of_inertia. OVERRIDDEN_POLE_INERTIA = (1 / 12) * self.OVERRIDDEN_POLE_MASS * (2 * self.OVERRIDDEN_EFFECTIVE_POLE_LENGTH)**2 - # Load the cartpole with new urdf. override_urdf_tree = self._create_urdf(self.URDF_PATH, length=self.OVERRIDDEN_EFFECTIVE_POLE_LENGTH, inertia=OVERRIDDEN_POLE_INERTIA) self.override_path = os.path.join(self.output_dir, "pid-{}_id-{}_cartpole.urdf".format(os.getpid(), self.id)) override_urdf_tree.write(self.override_path) - self.CARTPOLE_ID = p.loadURDF( self.override_path, basePosition=[0, 0, 0], # flags = p.URDF_USE_INERTIA_FROM_FILE, physicsClientId=self.PYB_CLIENT) - # remove cache file after loading into pybullet + # Remove cache file after loading it into PyBullet. os.remove(self.override_path) - # Cartpole settings. for i in [-1, 0, 1]: # Slider, cart, and pole. p.changeDynamics(self.CARTPOLE_ID, linkIndex=i, linearDamping=0, angularDamping=0, physicsClientId=self.PYB_CLIENT) @@ -413,17 +393,14 @@ def reset(self): linkIndex=1, # Pole. mass=self.OVERRIDDEN_POLE_MASS, physicsClientId=self.PYB_CLIENT) - # Randomize initial state. init_values = {"init_x": self.INIT_X, "init_x_dot": self.INIT_X_DOT, "init_theta": self.INIT_THETA, "init_theta_dot": self.INIT_THETA_DOT} if self.RANDOMIZED_INIT: init_values = self._randomize_values_by_info(init_values, self.INIT_STATE_RAND_INFO) - OVERRIDDEN_INIT_X = init_values["init_x"] OVERRIDDEN_INIT_X_DOT = init_values["init_x_dot"] OVERRIDDEN_INIT_THETA = init_values["init_theta"] OVERRIDDEN_INIT_THETA_DOT = init_values["init_theta_dot"] - p.resetJointState( self.CARTPOLE_ID, jointIndex=0, # Slider-to-cart joint.
@@ -436,15 +413,12 @@ def reset(self): targetValue=OVERRIDDEN_INIT_THETA, targetVelocity=OVERRIDDEN_INIT_THETA_DOT, physicsClientId=self.PYB_CLIENT) - # Compute state (x, x_dot, theta, theta_dot). self.state = np.hstack( (p.getJointState(self.CARTPOLE_ID, jointIndex=0, physicsClientId=self.PYB_CLIENT)[0:2], p.getJointState(self.CARTPOLE_ID, jointIndex=1, physicsClientId=self.PYB_CLIENT)[0:2])) - # Debug visualization if GUI enabled self.line = None - # Return either an observation and dictionary or just the observation. if self.INFO_IN_RESET: return self._get_observation(), self._get_reset_info() @@ -483,20 +457,26 @@ def render(self, mode='human'): return np.reshape(rgb, (h, w, 4)) def close(self): - """Clean up the environment and PyBullet connection.""" + """Clean up the environment and PyBullet connection. + + """ if self.PYB_CLIENT >= 0: p.disconnect(physicsClientId=self.PYB_CLIENT) self.PYB_CLIENT = -1 def set_adversary_control(self, action): - """Sets disturbance by an adversary controller, called before (each) step().""" + """Sets disturbance by an adversary controller, called before (each) step(). + + """ if self.adversary_disturbance is not None: clipped_adv_action = np.clip(action, self.adversary_action_space.low, self.adversary_action_space.high) self.adv_action = clipped_adv_action * self.adversary_disturbance_scale def _setup_disturbances(self): - """Sets up scaling and actions space for an adversarial disturbance.""" - # default no passive disturbance + """Sets up scaling and actions space for an adversarial disturbance. + + """ + # Default: no passive disturbances. self.disturbances = {} if self.DISTURBANCES is not None: @@ -504,44 +484,41 @@ def _setup_disturbances(self): assert mode in self.DISTURBANCE_MODES, "[ERROR] in Cartpole._setup_disturbances(), disturbance mode not available." disturb_list = [] shared_args = self.DISTURBANCE_MODES[mode] - - # each disturbance for this mode + # Each disturbance for the mode. for name, cfg in disturbs.items(): assert name in DISTURBANCE_TYPES, "[ERROR] in Cartpole._setup_disturbances(), disturbance type not available." disturb_cls = DISTURBANCE_TYPES[name] disturb = disturb_cls(self, **shared_args, **cfg) disturb_list.append(disturb) - - # combine as one for this mode + # Combine as one for the mode. self.disturbances[mode] = DisturbanceList(disturb_list) - - # adversary disturbance (set from outside of env, non-passive) + # Adversary disturbance (set from outside of env, active/non-passive). if self.adversary_disturbance is not None: assert self.adversary_disturbance in self.DISTURBANCE_MODES, "[ERROR] in Cartpole._setup_disturbances()" - shared_args = self.DISTURBANCE_MODES[self.adversary_disturbance] dim = shared_args["dim"] - # TODO: symmetric space can be problematic for obs ? self.adversary_action_space = spaces.Box(low=-1, high=1, shape=(dim,)) - # adversary obs same as protagonist + # Adversary obs are the same as those of the protagonist. self.adversary_observation_space = self.observation_space def _setup_constraints(self): - """Sets up a list (ConstraintList) of constraints.""" + """Sets up a list (ConstraintList) of constraints. 
+ + """ self.constraints = None self.num_constraints = 0 - if self.CONSTRAINTS is not None: self.constraints = create_ConstraintList_from_dict(self.CONSTRAINTS, self.AVAILABLE_CONSTRAINTS, self) self.num_constraints = self.constraints.num_constraints def _setup_symbolic(self): - """Creates symbolic (CasADi) models for dynamics, observation, and cost.""" + """Creates symbolic (CasADi) models for dynamics, observation, and cost. + + """ l, m, M = self.PRIOR_EFFECTIVE_POLE_LENGTH, self.PRIOR_POLE_MASS, self.PRIOR_CART_MASS Mm, ml = m + M, m * l g = self.GRAVITY_ACC dt = self.CTRL_TIMESTEP - # Input variables. x = cs.MX.sym('x') x_dot = cs.MX.sym('x_dot') @@ -551,42 +528,40 @@ def _setup_symbolic(self): U = cs.MX.sym('U') nx = 4 nu = 1 - # Dynamics. temp_factor = (U + ml * theta_dot**2 * cs.sin(theta)) / Mm theta_dot_dot = ((g * cs.sin(theta) - cs.cos(theta) * temp_factor) / (l * (4.0 / 3.0 - m * cs.cos(theta)**2 / Mm))) X_dot = cs.vertcat(x_dot, temp_factor - ml * theta_dot_dot * cs.cos(theta) / Mm, theta_dot, theta_dot_dot) - # Observation. Y = cs.vertcat(x, x_dot, theta, theta_dot) - # Define cost (quadratic form). Q = cs.MX.sym('Q', nx, nx) R = cs.MX.sym('R', nu, nu) Xr = cs.MX.sym('Xr', nx, 1) Ur = cs.MX.sym('Ur', nu, 1) cost_func = 0.5 * (X - Xr).T @ Q @ (X - Xr) + 0.5 * (U - Ur).T @ R @ (U - Ur) - # Define dynamics and cost dictionaries. dynamics = {"dyn_eqn": X_dot, "obs_eqn": Y, "vars": {"X": X, "U": U}} cost = {"cost_func": cost_func, "vars": {"X": X, "U": U, "Xr": Xr, "Ur": Ur, "Q": Q, "R": R}} - # Setup symbolic model. self.symbolic = SymbolicModel(dynamics=dynamics, cost=cost, dt=dt) def _set_action_space(self): - """Returns the action space of the environment.""" + """Returns the action space of the environment. + + """ self.action_scale = 10 self.action_threshold = 1 if self.NORMALIZED_RL_ACTION_SPACE else self.action_scale self.action_space = spaces.Box(low=-self.action_threshold, high=self.action_threshold, shape=(1,)) def _set_observation_space(self): - """Returns the observation space of the environment.""" + """Returns the observation space of the environment. + + """ # Angle at which to fail the episode (0.20943951023931953). self.theta_threshold_radians = 12 * (2 * math.pi / 360) # NOTE: different value in PyBullet gym (0.4) and OpenAI gym (2.4). self.x_threshold = 2.4 - # Limit set to 2x: i.e. a failing observation is still within bounds. OBS_BOUND = np.array([self.x_threshold * 2, np.finfo(np.float32).max, self.theta_threshold_radians * 2, np.finfo(np.float32).max]) self.observation_space = spaces.Box(-OBS_BOUND, OBS_BOUND, dtype=np.float32) @@ -606,14 +581,12 @@ def _preprocess_control(self, action): print("[WARNING]: action was clipped in CartPole._preprocess_control().") if self.NORMALIZED_RL_ACTION_SPACE: force = self.action_scale * force - - # apply disturbances + # Apply disturbances. if "action" in self.disturbances: force = self.disturbances["action"].apply(force, self) if self.adversary_disturbance == "action": force = force + self.adv_action - - # only use the scalar value. + # Only use the scalar value. force = force[0] return force @@ -627,7 +600,7 @@ def _advance_simulation(self, force): """ tab_force = None - # determine disturbance force + # Determine the disturbance force. 
passive_disturb = "dynamics" in self.disturbances adv_disturb = self.adversary_disturbance == "dynamics" if passive_disturb or adv_disturb: @@ -636,13 +609,12 @@ def _advance_simulation(self, force): tab_force = self.disturbances["dynamics"].apply(tab_force, self) if adv_disturb and self.adv_action is not None: tab_force = tab_force + self.adv_action - # clear adversary action, wait for next one + # Clear adversary's action, wait for the next one. self.adv_action = None - for _ in range(int(self.PYB_FREQ / self.CTRL_FREQ)): - # apply disturbance (by tabbing pole on x-z plane) + # Apply disturbance (by tapping the pole on the x-z plane). if tab_force is not None: - # convert 2D force to 3D for pybullet + # Convert 2D force to 3D for PyBullet. tab_force_3d = [float(tab_force[0]), 0.0, float(tab_force[1])] p.applyExternalForce( self.CARTPOLE_ID, @@ -654,7 +626,6 @@ def _advance_simulation(self, force): physicsClientId=self.PYB_CLIENT)[0], # exert force on pole center flags=p.WORLD_FRAME, physicsClientId=self.PYB_CLIENT) - # Debug visualization if self.GUI: center = np.asarray(p.getLinkState(self.CARTPOLE_ID, linkIndex=1, physicsClientId=self.PYB_CLIENT)[0]) @@ -663,16 +634,14 @@ def _advance_simulation(self, force): self.line = p.addUserDebugLine(center.tolist(), (center - ff).tolist(), lineColorRGB=[0, 0, 0], lineWidth=1) else: p.addUserDebugLine(center.tolist(), (center - ff).tolist(), lineColorRGB=[0, 0, 0], lineWidth=1, replaceItemUniqueId=self.line) - - # apply main control + # Apply control. p.setJointMotorControl2( self.CARTPOLE_ID, jointIndex=0, # Slider-to-cart joint. controlMode=p.TORQUE_CONTROL, force=force, physicsClientId=self.PYB_CLIENT) - - # step simulation and counter. + # Step simulation and counter. p.stepSimulation(physicsClientId=self.PYB_CLIENT) self.step_counter += 1 @@ -685,7 +654,7 @@ def _get_observation(self): """ if not np.array_equal(self.state, np.clip(self.state, self.observation_space.low, self.observation_space.high)) and self.VERBOSE: print("[WARNING]: observation was clipped in CartPole._get_observation().") - # apply observation disturbance + # Apply observation disturbance. obs = deepcopy(self.state) if "observation" in self.disturbances: obs = self.disturbances["observation"].apply(obs, self) @@ -701,11 +670,8 @@ def _get_reward(self): if self.COST == Cost.RL_REWARD: if self.constraints is not None and self.use_constraint_penalty and self.constraints.is_almost_active(self): return self.constraint_penalty - # Constant reward if episode not done (pole stays upright).
return 1.0 - - # # a more control-oriented rl rewarc # state = self.state # x, theta = state[0], state[2] # length = self.OVERRIDDEN_EFFECTIVE_POLE_LENGTH @@ -715,7 +681,6 @@ def _get_reward(self): # # shape (*,) # reward = np.exp(-np.sum(np.square(ee_pos - goal_pos) * np.asarray([1.0, 1.0]), -1) / length**2) # return reward - if self.COST == Cost.QUADRATIC: if self.TASK == Task.STABILIZATION: return float(-1 * self.symbolic.loss(x=self.state, Xr=self.X_GOAL, u=self.current_preprocessed_action, Ur=self.U_GOAL, Q=self.Q, R=self.R)["l"]) @@ -752,10 +717,7 @@ def _get_info(self): info["constraint_values"] = self.constraints.get_values(self) violation = np.any(np.greater(info["constraint_values"], 0.)) info["constraint_violation"] = int(violation) - - # HACK: adding Timilimit flag if self.step_counter / self.PYB_FREQ >= self.EPISODE_LEN_SEC: - # should be `not done` instead of True, for hack just True x, _, theta, _ = self.state done = bool(x < -self.x_threshold or x > self.x_threshold or theta < -self.theta_threshold_radians or theta > self.theta_threshold_radians) if self.constraints is not None: @@ -776,7 +738,7 @@ def _get_reset_info(self): info["x_reference"] = self.X_GOAL info["u_reference"] = self.U_GOAL if self.constraints is not None: - # NOTE: Cannot evaluate constraints on reset because there are no inputs. How to handle this? + # NOTE: Cannot evaluate constraints on reset/without inputs. info["constraint_values"] = self.constraints.get_values(self, only_state=True) return info @@ -793,11 +755,9 @@ def _parse_urdf_parameters(self, file_name): """ URDF_TREE = (etxml.parse(file_name)).getroot() - EFFECTIVE_POLE_LENGTH = 0.5 * float(URDF_TREE[3][0][0][0].attrib["size"].split(" ")[-1]) # Note: HALF length of pole. POLE_MASS = float(URDF_TREE[3][1][1].attrib["value"]) CART_MASS = float(URDF_TREE[1][2][0].attrib["value"]) - return EFFECTIVE_POLE_LENGTH, POLE_MASS, CART_MASS def _create_urdf(self, file_name, length=None, inertia=None): @@ -814,32 +774,30 @@ def _create_urdf(self, file_name, length=None, inertia=None): """ tree = etxml.parse(file_name) root = tree.getroot() - - # overwrite pod length + # Overwrite pod length. if length is not None: - # pole visual geometry box + # Pole visual geometry box. out = root[3][0][0][0].attrib["size"] out = " ".join(out.split(" ")[:-1] + [str(2 * length)]) root[3][0][0][0].attrib["size"] = out - # pole visual origin + # Pole visual origin. out = root[3][0][1].attrib["xyz"] out = " ".join(out.split(" ")[:-1] + [str(length)]) root[3][0][1].attrib["xyz"] = out - # pole inertial origin + # Pole inertial origin. out = root[3][1][0].attrib["xyz"] out = " ".join(out.split(" ")[:-1] + [str(length)]) root[3][1][0].attrib["xyz"] = out - # pole inertia + # Pole inertia. root[3][1][2].attrib["ixx"] = str(inertia) root[3][1][2].attrib["iyy"] = str(inertia) root[3][1][2].attrib["izz"] = str(0.0) - # pole collision geometry box + # Pole collision geometry box. out = root[3][2][0][0].attrib["size"] out = " ".join(out.split(" ")[:-1] + [str(2 * length)]) root[3][2][0][0].attrib["size"] = out - # pole collision origin + # Pole collision origin. 
out = root[3][2][1].attrib["xyz"] out = " ".join(out.split(" ")[:-1] + [str(length)]) root[3][2][1].attrib["xyz"] = out - return tree diff --git a/safe_control_gym/envs/gym_control/cartpole.yaml b/safe_control_gym/envs/gym_control/cartpole.yaml index a295b4e2f..0f5cd107a 100644 --- a/safe_control_gym/envs/gym_control/cartpole.yaml +++ b/safe_control_gym/envs/gym_control/cartpole.yaml @@ -3,26 +3,26 @@ ctrl_freq: 50 pyb_freq: 50 gui: False normalized_rl_action_space: False -# state initialization +# State initialization init_state: null randomized_init: True init_state_randomization_info: null -# randomization +# Randomization inertial_prop: null randomized_inertial_prop: False inertial_prop_randomization_info: null -# task +# Task task: stabilization task_info: null cost: rl_reward -# disturbance +# Disturbance disturbances: null adversary_disturbance: null adversary_disturbance_scale: 0.01 -# constraints +# Constraints constraints: null done_on_violation: False use_constraint_penalty: False constraint_penalty: -1 -# misc +# Misc verbose: False \ No newline at end of file diff --git a/safe_control_gym/envs/gym_control/cartpole_constraints.py b/safe_control_gym/envs/gym_control/cartpole_constraints.py index 87279d719..20238c6ff 100644 --- a/safe_control_gym/envs/gym_control/cartpole_constraints.py +++ b/safe_control_gym/envs/gym_control/cartpole_constraints.py @@ -6,6 +6,7 @@ from safe_control_gym.envs.constraints import BoundedConstraint, ConstraintInputType, Constraint + class CartPoleStateConstraint(BoundedConstraint): """Constrain the cart's state to the observation space bounds. @@ -20,25 +21,22 @@ def __init__(self, Args: env (BenchmarkEnv): The environment to constrain. - low (list): to overwrite the environment minimums - high (list): To overwrite the environment maximums + low (list): to overwrite the environment minimums. + high (list): To overwrite the environment maximums. """ if high is None: self.high = np.array([env.x_threshold * 2, # Limit set to 2x: i.e. a failing observation is still within bounds. np.finfo(np.float32).max, - env.theta_threshold_radians * 2, # Limit set to 2x: i.e. a failing observation is - # still within bounds. + env.theta_threshold_radians * 2, # Limit set to 2x: i.e. a failing observation is still within bounds. np.finfo(np.float32).max]) else: assert len(high) == env.observation_space.shape[0] self.high = high - if low is None: self.low = -1 * np.array([env.x_threshold * 2, # Limit set to 2x: i.e. a failing observation is still within bounds. np.finfo(np.float32).max, - env.theta_threshold_radians * 2, # Limit set to 2x: i.e. a failing - # observation is still within bounds. + env.theta_threshold_radians * 2, # Limit set to 2x: i.e. a failing observation is still within bounds. np.finfo(np.float32).max]) else: assert len(low) == env.observation_space.shape[0] @@ -96,7 +94,6 @@ def __init__(self, else: assert isinstance(high, float) self.high = high - if low is None: self.low = -1 * env.action_threshold else: @@ -130,13 +127,11 @@ def is_violated(self, env): flag = np.any(np.greater(c_value, 0.)) return flag -# TODO: Can this be formulated as a BoundedConstraint type? + class CartPoleSymmetricStateConstraint(BoundedConstraint): """Symmetric state bound constraint. - Note: This only counts as a single constraint? - # todo: A bounded constraint is technically 2 cosntraints, but is considered a single constraints in se_ppo...how to handle? 
- """ + """ def __init__(self, env, @@ -144,7 +139,8 @@ def __init__(self, constraint_input_type, active_dims=None, tolerance=None, - **kwrags): + **kwrags + ): assert bound is not None self.bound = np.array(bound, ndmin=1) super().__init__(env, @@ -154,14 +150,13 @@ def __init__(self, active_dims=active_dims, tolerance=tolerance, **kwrags) - self.num_constraints = self.bound.shape[0] # todo: how to fix this? + self.num_constraints = self.bound.shape[0] def get_value(self, env): c_value = np.abs(self.constraint_filter @ env.state) - self.bound return c_value -# TODO: Can this be formulated as a bounded constraint type? class CartPoleBoundConstraint(Constraint): """Implements bound constraint for state or input as c(x, u) <= 0. @@ -181,35 +176,28 @@ def __init__(self, slack=None ): super().__init__() - assert var_name == "state" or var_name == "input", ( - "Must specify state or input to constrain.") - assert low is not None or high is not None, ( - "Must specify either lower or upper bound.") - - # which variable (state, input) to constrain + assert var_name == "state" or var_name == "input", ("Must specify state or input to constrain.") + assert low is not None or high is not None, ("Must specify either lower or upper bound.") + # Which variable (state, input) to constrain. self.var_name = var_name if var_name == "state": var_dim = env.observation_space.shape[0] else: var_dim = env.action_space.shape[0] - # which fields of the variable to bound + # Which fields of the variable to bound. self.index = index - - # construct A matrix + # Construct A matrix. weight = np.eye(var_dim) if isinstance(index, int): weight = weight[[index]] elif isinstance(index, list): weight = weight[index] - self.low = np.array(low, ndmin=1) self.high = np.array(high, ndmin=1) - - # construct A', b' + # Construct A', b'. dim = 0 full_weight = [] full_threshold = [] - if self.low is not None: dim += len(self.low) full_weight.append(-1 * weight) @@ -218,13 +206,15 @@ def __init__(self, dim += len(self.high) full_weight.append(weight) full_threshold.append(-1 * self.high) - self.dim = dim self.full_weight = np.concatenate(full_weight) self.full_threshold = np.concatenate(full_threshold) self.slack = np.array(slack, ndmin=1) def get_symbolic_model(self, env): + """ + + """ X = env.symbolic.x_sym U = env.symbolic.u_sym if self.var_name == "state": @@ -238,7 +228,9 @@ def get_symbolic_model(self, env): return sym_func def get_value(self, env): + """ + """ if self.var_name == "state": var = env.state else: diff --git a/safe_control_gym/envs/gym_pybullet_drones/base_aviary.py b/safe_control_gym/envs/gym_pybullet_drones/base_aviary.py index 21f7789dc..a6cb61944 100644 --- a/safe_control_gym/envs/gym_pybullet_drones/base_aviary.py +++ b/safe_control_gym/envs/gym_pybullet_drones/base_aviary.py @@ -37,7 +37,7 @@ class Physics(Enum): """ - PYB = "pyb" # Base PyBullet physics update + PYB = "pyb" # Base PyBullet physics update. DYN = "dyn" # Update with an explicit model of the dynamics. PYB_GND = "pyb_gnd" # PyBullet physics update with ground effect. PYB_DRAG = "pyb_drag" # PyBullet physics update with drag. 
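As a quick aside on CartPoleBoundConstraint above: assuming get_value evaluates full_weight @ v + full_threshold as the constructor suggests, a two-sided bound low <= v <= high is stacked into the form A'v + b' <= 0 with A' = [-I; I] and b' = [low; -high]. A small standalone check, not part of this patch:

    import numpy as np

    low, high = np.array([-2.4]), np.array([2.4])        # e.g. the cart position bound.
    weight = np.eye(1)
    full_weight = np.concatenate([-1 * weight, weight])  # A' = [-I; I]
    full_threshold = np.concatenate([low, -1 * high])    # b' = [low; -high]

    for v in (np.array([0.0]), np.array([2.5])):
        c_value = full_weight @ v + full_threshold
        print(v, np.any(np.greater(c_value, 0.)))        # Violated only when v is outside [-2.4, 2.4].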
diff --git a/safe_control_gym/envs/gym_pybullet_drones/quadrotor.py b/safe_control_gym/envs/gym_pybullet_drones/quadrotor.py index 8e5eb6036..ce0023e13 100644 --- a/safe_control_gym/envs/gym_pybullet_drones/quadrotor.py +++ b/safe_control_gym/envs/gym_pybullet_drones/quadrotor.py @@ -26,14 +26,12 @@ class Quadrotor(BaseAviary): Including symbolic model, constraints, randomization, adversarial disturbances, multiple cost functions, stabilization and trajectory tracking references. - TODO: - * Handling prior model param still out of sync with cartpole. """ AVAILABLE_CONSTRAINTS = { "quadrotor_state": QuadrotorStateConstraint, "quadrotor_input": QuadrotorInputConstraint, - "quadrotor_diag": QuadrotorDiagConstraint # Todo: Delete this. + "quadrotor_diag": QuadrotorDiagConstraint } AVAILABLE_CONSTRAINTS.update(deepcopy(GENERAL_CONSTRAINTS)) @@ -50,12 +48,12 @@ class Quadrotor(BaseAviary): } INERTIAL_PROP_RAND_INFO = { - "M": { # 0.27 + "M": { # Nominal: 0.27 'distrib': "uniform", 'low': 0.22, 'high': 0.32 }, - "Iyy": { # 1.4e-5 + "Iyy": { # Nominal: 1.4e-5 'distrib': "uniform", 'low': 1.3e-5, 'high': 1.5e-5 @@ -181,14 +179,11 @@ def __init__(self, ) self.CTRL_TIMESTEP = 1. / self.CTRL_FREQ self.PYB_TIMESTEP = 1. / self.PYB_FREQ - # Store initial state info. if init_state is None: - self.INIT_X, self.INIT_X_DOT, self.INIT_Z, self.INIT_Z_DOT, self.INIT_THETA, self.INIT_THETA_DOT = np.zeros( - 6) + self.INIT_X, self.INIT_X_DOT, self.INIT_Z, self.INIT_Z_DOT, self.INIT_THETA, self.INIT_THETA_DOT = np.zeros(6) elif self.QUAD_TYPE == QuadType.ONE_D: - self.INIT_X, self.INIT_X_DOT, self.INIT_THETA, self.INIT_THETA_DOT = np.zeros( - 4) + self.INIT_X, self.INIT_X_DOT, self.INIT_THETA, self.INIT_THETA_DOT = np.zeros(4) if isinstance(init_state, np.ndarray): self.INIT_Z, self.INIT_Z_DOT = init_state elif isinstance(init_state, dict): @@ -218,11 +213,8 @@ def __init__(self, self.INIT_STATE_RAND_INFO = init_state_randomization_info # Do NOT randomize x, x_dot, theta, theta_dot for the 1D quadrotor. if self.QUAD_TYPE == QuadType.ONE_D: - for init_name in [ - "init_x", "init_x_dot", "init_theta", "init_theta_dot" - ]: + for init_name in ["init_x", "init_x_dot", "init_theta", "init_theta_dot"]: self.INIT_STATE_RAND_INFO.pop(init_name, None) - # Decide whether to randomize the inertial properties and how (see info dictionary). self.RANDOMIZED_INERTIAL_PROP = randomized_inertial_prop if inertial_prop_randomization_info is not None: @@ -230,7 +222,6 @@ def __init__(self, # Do NOT randomize J for the 1D quadrotor. if self.QUAD_TYPE == QuadType.ONE_D: self.INERTIAL_PROP_RAND_INFO.pop("Iyy", None) - # Store disturbance info. self.DISTURBANCES = disturbances self.adversary_disturbance = adversary_disturbance @@ -240,11 +231,9 @@ def __init__(self, self.DISTURBANCE_MODES["observation"]["dim"] = 2 self.DISTURBANCE_MODES["action"]["dim"] = 1 self.DISTURBANCE_MODES["dynamics"]["dim"] = 1 - # Store constraint info self.CONSTRAINTS = constraints self.DONE_ON_VIOLATION = done_on_violation - self.VERBOSE = verbose # Call BaseAviary constructor. super().__init__(seed=seed, @@ -253,10 +242,8 @@ def __init__(self, cost=Cost(cost), gui=gui, freq=self.PYB_FREQ, - aggregate_phy_steps=int(self.PYB_FREQ - / self.CTRL_FREQ), + aggregate_phy_steps=int(self.PYB_FREQ / self.CTRL_FREQ), physics=Physics(physics)) - # Override inertial properties of passed as arguments. if inertial_prop is None: pass @@ -308,14 +295,14 @@ def __init__(self, # if self.QUAD_TYPE == QuadType.ONE_D: self.X_GOAL = np.vstack([ - POS_REF[:, 2], # TODO Add offset? 
+ POS_REF[:, 2], # TODO Add offset VEL_REF[:, 2] ]).transpose() elif self.QUAD_TYPE == QuadType.TWO_D: self.X_GOAL = np.vstack([ - POS_REF[:, 0], # TODO Add offset? + POS_REF[:, 0], # TODO Add offset VEL_REF[:, 0], - POS_REF[:, 2], # TODO Add offset? + POS_REF[:, 2], # TODO Add offset VEL_REF[:, 2], np.zeros(POS_REF.shape[0]), np.zeros(VEL_REF.shape[0]) @@ -362,7 +349,6 @@ def reset(self): self.current_preprocessed_action = None if self.adversary_disturbance is not None: self.adv_force = None - # Choose randomized or deterministic inertial properties. prop_values = { "M": self.MASS, @@ -373,11 +359,8 @@ def reset(self): prop_values, self.INERTIAL_PROP_RAND_INFO) if any(phy_quantity < 0 for phy_quantity in prop_values.values()): raise ValueError("[ERROR] in CartPole.reset(), negative randomized inertial properties.") - self.OVERRIDDEN_QUAD_MASS = prop_values["M"] - self.OVERRIDDEN_QUAD_INERTIA = [ - self.J[0, 0], prop_values["Iyy"], self.J[2, 2] - ] + self.OVERRIDDEN_QUAD_INERTIA = [self.J[0, 0], prop_values["Iyy"], self.J[2, 2]] # Override inertial properties. p.changeDynamics( self.DRONE_IDS[0], @@ -385,7 +368,6 @@ def reset(self): mass=self.OVERRIDDEN_QUAD_MASS, localInertiaDiagonal=self.OVERRIDDEN_QUAD_INERTIA, physicsClientId=self.PYB_CLIENT) - # Randomize initial state. init_values = { "init_x": self.INIT_X, @@ -396,28 +378,22 @@ def reset(self): "init_theta_dot": self.INIT_THETA_DOT, } if self.RANDOMIZED_INIT: - init_values = self._randomize_values_by_info( - init_values, self.INIT_STATE_RAND_INFO) - + init_values = self._randomize_values_by_info(init_values, self.INIT_STATE_RAND_INFO) OVERRIDDEN_INIT_X = init_values["init_x"] OVERRIDDEN_INIT_X_DOT = init_values["init_x_dot"] OVERRIDDEN_INIT_Z = init_values["init_z"] OVERRIDDEN_INIT_Z_DOT = init_values["init_z_dot"] OVERRIDDEN_INIT_THETA = init_values["init_theta"] OVERRIDDEN_INIT_THETA_DOT = init_values["init_theta_dot"] - - p.resetBasePositionAndOrientation( - self.DRONE_IDS[0], [OVERRIDDEN_INIT_X, 0, OVERRIDDEN_INIT_Z], - p.getQuaternionFromEuler([0, OVERRIDDEN_INIT_THETA, 0]), - physicsClientId=self.PYB_CLIENT) + p.resetBasePositionAndOrientation(self.DRONE_IDS[0], [OVERRIDDEN_INIT_X, 0, OVERRIDDEN_INIT_Z], + p.getQuaternionFromEuler([0, OVERRIDDEN_INIT_THETA, 0]), + physicsClientId=self.PYB_CLIENT) p.resetBaseVelocity(self.DRONE_IDS[0], [OVERRIDDEN_INIT_X_DOT, 0, OVERRIDDEN_INIT_Z_DOT], [0, OVERRIDDEN_INIT_THETA_DOT, 0], physicsClientId=self.PYB_CLIENT) - # Update BaseAviary internal variables before calling self._get_observation(). self._update_and_store_kinematic_information() - # Return either an observation and dictionary or just the observation. if self.INFO_IN_RESET: return self._get_observation(), self._get_reset_info() @@ -433,15 +409,14 @@ def render(self, mode='human'): ndarray: A multidimensional array with the RGB frame captured by PyBullet's camera. 
""" - [w, h, rgb, dep, seg - ] = p.getCameraImage(width=self.RENDER_WIDTH, - height=self.RENDER_HEIGHT, - shadow=1, - viewMatrix=self.CAM_VIEW, - projectionMatrix=self.CAM_PRO, - renderer=p.ER_TINY_RENDERER, - flags=p.ER_SEGMENTATION_MASK_OBJECT_AND_LINKINDEX, - physicsClientId=self.PYB_CLIENT) + [w, h, rgb, dep, seg] = p.getCameraImage(width=self.RENDER_WIDTH, + height=self.RENDER_HEIGHT, + shadow=1, + viewMatrix=self.CAM_VIEW, + projectionMatrix=self.CAM_PRO, + renderer=p.ER_TINY_RENDERER, + flags=p.ER_SEGMENTATION_MASK_OBJECT_AND_LINKINDEX, + physicsClientId=self.PYB_CLIENT) # Image.fromarray(np.reshape(rgb, (h, w, 4)), 'RGBA').show() return np.reshape(rgb, (h, w, 4)) @@ -467,48 +442,42 @@ def set_adversary_control(self, action): self.adv_action = clipped_adv_action * self.adversary_disturbance_scale def _setup_disturbances(self): - """Sets up scaling and actions space for an adversarial disturbance.""" - # default no passive disturbance + """Sets up scaling and actions space for an adversarial disturbance. + + """ + # Default: no passive disturbances. self.disturbances = {} if self.DISTURBANCES is not None: for mode, disturbs in self.DISTURBANCES.items(): - assert mode in self.DISTURBANCE_MODES, "[ERROR] in Quadrotor._setup_disturbances(), disturbance mode " \ - "not available." + assert mode in self.DISTURBANCE_MODES, "[ERROR] in Quadrotor._setup_disturbances(), disturbance mode not available." disturb_list = [] shared_args = self.DISTURBANCE_MODES[mode] - - # each disturbance for this mode + # Each disturbance for this mode. for name, cfg in disturbs.items(): - assert name in DISTURBANCE_TYPES, "[ERROR] in Quadrotor._setup_disturbances(), disturbance " \ - "type not available." + assert name in DISTURBANCE_TYPES, "[ERROR] in Quadrotor._setup_disturbances(), disturbance type not available." disturb_cls = DISTURBANCE_TYPES[name] disturb = disturb_cls(self, **shared_args, **cfg) disturb_list.append(disturb) - - # combine as one for this mode + # Combine as one for this mode. self.disturbances[mode] = DisturbanceList(disturb_list) - - # adversary disturbance (set from outside of env, non-passive) + # Adversary disturbance (set from outside of env, active/non-passive). if self.adversary_disturbance is not None: assert self.adversary_disturbance in self.DISTURBANCE_MODES, "[ERROR] in Cartpole._setup_disturbances()" - shared_args = self.DISTURBANCE_MODES[self.adversary_disturbance] dim = shared_args["dim"] - # TODO: symmetric space can be problematic for obs ? self.adversary_action_space = spaces.Box(low=-1, high=1, shape=(dim,)) - # adversary obs same as protagonist + # Adversary obs are the same as those of the protagonist. self.adversary_observation_space = self.observation_space def _setup_constraints(self): """Sets up a list (ConstraintList) of constraints. - # todo: Can we put this in BenchmarkEnv? + """ self.constraints = None self.num_constraints = 0 - if self.CONSTRAINTS is not None: self.constraints = create_ConstraintList_from_dict( self.CONSTRAINTS, self.AVAILABLE_CONSTRAINTS, self) @@ -536,7 +505,7 @@ def _setup_symbolic(self): U = cs.vertcat(T) # Define dynamics equations. X_dot = cs.vertcat(z_dot, T / m - g) - # Define observation equation + # Define observation equation. 
Y = cs.vertcat(z, z_dot) elif self.QUAD_TYPE == QuadType.TWO_D: nx, nu = 6, 2 @@ -562,8 +531,7 @@ def _setup_symbolic(self): R = cs.MX.sym('R', nu, nu) Xr = cs.MX.sym('Xr', nx, 1) Ur = cs.MX.sym('Ur', nu, 1) - cost_func = 0.5 * (X - Xr).T @ Q @ (X - Xr) + 0.5 * (U - Ur).T @ R @ ( - U - Ur) + cost_func = 0.5 * (X - Xr).T @ Q @ (X - Xr) + 0.5 * (U - Ur).T @ R @ (U - Ur) # Define dynamics and cost dictionaries. dynamics = {"dyn_eqn": X_dot, "obs_eqn": Y, "vars": {"X": X, "U": U}} cost = { @@ -577,8 +545,6 @@ def _setup_symbolic(self): "R": R } } - # todo: Add bound constraint attribute here from ConstraintList? - # Setup symbolic model. self.symbolic = SymbolicModel(dynamics=dynamics, cost=cost, dt=dt) @@ -586,8 +552,7 @@ def _set_action_space(self): """Returns the action space of the environment. Returns: - gym.spaces: The quadrotor environment's action space, - of size 1 or 2 depending on QUAD_TYPE. + gym.spaces: The quadrotor environment's action space, of size 1 or 2 depending on QUAD_TYPE. """ if self.NORMALIZED_RL_ACTION_SPACE: @@ -604,8 +569,7 @@ def _set_observation_space(self): """Returns the observation space of the environment. Returns: - gym.spaces: The bounded observation (state) space, - of size 2 or 6 depending on QUAD_TYPE. + gym.spaces: The bounded observation (state) space, of size 2 or 6 depending on QUAD_TYPE. """ self.x_threshold = 2 @@ -641,23 +605,19 @@ def _preprocess_control(self, action): """ if self.NORMALIZED_RL_ACTION_SPACE: - action = (1 + (0.1 * action)) * ( - (self.GRAVITY_ACC * self.MASS) / int(self.QUAD_TYPE)) + action = (1 + (0.1 * action)) * ((self.GRAVITY_ACC * self.MASS) / int(self.QUAD_TYPE)) thrust = np.clip(action, self.action_space.low, self.action_space.high) if not np.array_equal(thrust, np.array(action)) and self.VERBOSE: print( "[WARNING]: action was clipped in Quadrotor._preprocess_control()." ) self.current_preprocessed_action = thrust - - # apply disturbances + # Apply disturbances. if "action" in self.disturbances: thrust = self.disturbances["action"].apply(thrust, self) if self.adversary_disturbance == "action": thrust = thrust + self.adv_force - - pwm = cmd2pwm(thrust, self.PWM2RPM_SCALE, self.PWM2RPM_CONST, self.KF, - self.MIN_PWM, self.MAX_PWM) + pwm = cmd2pwm(thrust, self.PWM2RPM_SCALE, self.PWM2RPM_CONST, self.KF, self.MIN_PWM, self.MAX_PWM) rpm = pwm2rpm(pwm, self.PWM2RPM_SCALE, self.PWM2RPM_CONST) return rpm @@ -677,8 +637,7 @@ def _advance_simulation(self, action): """ disturb_force = None - - # determine disturbance force + # Determine disturbance force. passive_disturb = "dynamics" in self.disturbances adv_disturb = self.adversary_disturbance == "dynamics" if passive_disturb or adv_disturb: @@ -688,16 +647,15 @@ def _advance_simulation(self, action): disturb_force, self) if adv_disturb and self.adv_action is not None: disturb_force = disturb_force + self.adv_action - # clear adversary action, wait for next one + # Clear the adversary action, wait for the next one. self.adv_action = None - - # construct full disturbance force (3D) + # Construct full (3D) disturbance force. if disturb_force: if self.QUAD_TYPE == QuadType.ONE_D: - # only disturb on z direction + # Only disturb on z direction. disturb_force = [0, 0, float(disturb_force)] elif self.QUAD_TYPE == QuadType.TWO_D: - # only disturb on x-z plane + # Only disturb on x-z plane. disturb_force = [ float(disturb_force[0]), 0, float(disturb_force[1]) @@ -713,8 +671,7 @@ def _get_observation(self): """Returns the current observation (state) of the environment. 
Returns: - ndarray: The state of the quadrotor, - of size 2 or 6 depending on QUAD_TYPE. + ndarray: The state of the quadrotor, of size 2 or 6 depending on QUAD_TYPE. """ full_state = self._get_drone_state_vector(0) @@ -727,16 +684,13 @@ def _get_observation(self): self.state = np.hstack( [pos[0], vel[0], pos[2], vel[2], rpy[1], ang_v[1]]).reshape( (6,)) - if not np.array_equal( - self.state, - np.clip(self.state, self.observation_space.low, - self.observation_space.high)): + if not np.array_equal(self.state, + np.clip(self.state, self.observation_space.low, self.observation_space.high)): if self.GUI and self.VERBOSE: print( "[WARNING]: observation was clipped in Quadrotor._get_observation()." ) - - # apply observation disturbance + # Apply observation disturbance. obs = deepcopy(self.state) if "observation" in self.disturbances: obs = self.disturbances["observation"].apply(obs, self) @@ -751,29 +705,23 @@ def _get_reward(self): """ if self.COST == Cost.RL_REWARD: full_state = self._get_drone_state_vector(0) - pos, _, rpy, vel, ang_v, _ = np.split(full_state, - [3, 7, 10, 13, 16]) + pos, _, rpy, vel, ang_v, _ = np.split(full_state, [3, 7, 10, 13, 16]) if self.QUAD_TYPE == QuadType.ONE_D: - dist = np.linalg.norm( - np.array([0, 0, self.TASK_INFO["stabilization_goal"][1]]) - - pos)**2 + dist = np.linalg.norm(np.array([0, 0, self.TASK_INFO["stabilization_goal"][1]]) - pos)**2 elif self.QUAD_TYPE == QuadType.TWO_D: dist = np.linalg.norm( - np.array([ - self.TASK_INFO["stabilization_goal"][0], 0, - self.TASK_INFO["stabilization_goal"][1] - ]) - pos)**2 + np.array([self.TASK_INFO["stabilization_goal"][0], 0, + self.TASK_INFO["stabilization_goal"][1]]) - pos)**2 return -1 * dist if self.COST == Cost.QUADRATIC: state = self._get_observation() if self.TASK == Task.STABILIZATION: - return float( - -1 * self.symbolic.loss(x=state, - Xr=self.X_GOAL, - u=self.current_preprocessed_action, - Ur=self.U_GOAL, - Q=self.Q, - R=self.R)["l"]) + return float(-1 * self.symbolic.loss(x=state, + Xr=self.X_GOAL, + u=self.current_preprocessed_action, + Ur=self.U_GOAL, + Q=self.Q, + R=self.R)["l"]) if self.TASK == Task.TRAJ_TRACKING: return -1 @@ -834,12 +782,9 @@ def _get_reset_info(self): info["x_reference"] = self.X_GOAL info["u_reference"] = self.U_GOAL if self.constraints is not None: - info[ - "symbolic_constraints"] = self.constraints.get_all_symbolic_models( - ) + info["symbolic_constraints"] = self.constraints.get_all_symbolic_models() #info["constraint_values"] = self.constraints.get_values(self) #info["constraint_violations"] = self.constraints.get_violations(self) - return info def _parse_urdf_parameters(self, file_name: str = "cf2x.urdf"): diff --git a/safe_control_gym/envs/gym_pybullet_drones/quadrotor.yaml b/safe_control_gym/envs/gym_pybullet_drones/quadrotor.yaml index 69b881a8c..36148eafc 100644 --- a/safe_control_gym/envs/gym_pybullet_drones/quadrotor.yaml +++ b/safe_control_gym/envs/gym_pybullet_drones/quadrotor.yaml @@ -6,22 +6,22 @@ physics: pyb gui: False quad_type: 2 normalized_rl_action_space: False -# state initialization +# State initialization init_state: null randomized_init: True init_state_randomization_info: null -# randomization +# Randomization inertial_prop: null randomized_inertial_prop: False inertial_prop_randomization_info: null -# task +# Task task: stabilization task_info: null cost: rl_reward -# disturbance +# Disturbance disturbances: null -# constraints +# Constraints constraints: null done_on_violation: False -# misc +# Misc verbose: False diff --git 
a/safe_control_gym/envs/gym_pybullet_drones/quadrotor_constraints.py b/safe_control_gym/envs/gym_pybullet_drones/quadrotor_constraints.py index 00b81329c..a01393c32 100644 --- a/safe_control_gym/envs/gym_pybullet_drones/quadrotor_constraints.py +++ b/safe_control_gym/envs/gym_pybullet_drones/quadrotor_constraints.py @@ -21,8 +21,8 @@ def __init__(self, Args: env (BenchmarkEnv): The environment to constrain. - low (list): to overwrite the environment minimums - high (list): To overwrite the environment maximums + low (list): to overwrite the environment minimums. + high (list): To overwrite the environment maximums. """ if env.QUAD_TYPE == QuadType.ONE_D: @@ -45,7 +45,6 @@ def __init__(self, np.finfo(np.float32).max, env.theta_threshold_radians * 2, np.finfo(np.float32).max]) - # Overwrite with provided high and low if high is not None: assert len(high) == env.observation_space.shape[0] @@ -53,7 +52,6 @@ def __init__(self, if low is not None: assert len(low) == env.observation_space.shape[0] self.low = low - super().__init__(env, self.low, self.high, ConstraintInputType.STATE, **kwargs) def get_value(self, env): @@ -66,11 +64,9 @@ def get_value(self, env): ndarray: The evaluation of the constraint. """ - return np.squeeze(self.sym_func(env.state)) - class QuadrotorDiagConstraint(Constraint): """Constrain the quadrotor's commanded input to the action space bounds. @@ -91,7 +87,6 @@ def __init__(self, env): self.b = 1.1 else: raise NotImplementedError - self.sym_func = lambda x: self.h @ x - self.b def get_symbolic_model(self, env): @@ -132,6 +127,7 @@ def is_violated(self, env): flag = np.any(np.greater(c_value, 0.)) return flag + class QuadrotorInputConstraint(BoundedConstraint): """Constrain the quadrotor's commanded input to the action space bounds. @@ -146,8 +142,8 @@ def __init__(self, Args: env (BenchmarkEnv): The environment to constrain. - low (list): to overwrite the environment minimums - high (list): To overwrite the environment maximums + low (list): to overwrite the environment minimums. + high (list): To overwrite the environment maximums. """ if high is None: @@ -155,14 +151,11 @@ def __init__(self, else: assert len(high) == env.action_space.shape[0] self.high = high - if low is None: self.low = np.zeros(int(env.QUAD_TYPE)) else: - assert len(low) == env.action_space.shape[0] self.low = low - super().__init__(env, self.low, self.high, ConstraintInputType.INPUT, **kwargs) def get_value(self, env): diff --git a/safe_control_gym/math_and_models/distributions.py b/safe_control_gym/math_and_models/distributions.py index b0057128b..bf2507451 100644 --- a/safe_control_gym/math_and_models/distributions.py +++ b/safe_control_gym/math_and_models/distributions.py @@ -1,15 +1,15 @@ """Probability distributions for PyTorch. -Adapted from https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/blob/master/a2c_ppo_acktr/distributions.py +Based on https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/blob/master/a2c_ppo_acktr/distributions.py -Todo: - * missing SoftCategorical (differentiable Categorical). """ import torch class Normal(torch.distributions.Normal): - """Multivariate Gaussian distribution given mean and std tensors.""" + """Multivariate Gaussian distribution given mean and std tensors. + + """ def log_prob(self, actions): """Log probability of actions given current distribution. 
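The Normal wrapper in distributions.py above treats the last tensor dimension as independent action dimensions and reduces over it, so each batch element gets one joint log-probability and one entropy value. A short sketch with plain torch.distributions (the shapes are illustrative assumptions, not taken from this diff):

    import torch

    mean, std = torch.zeros(5, 3), torch.ones(5, 3)                 # Batch of 5, 3-dim diagonal Gaussian.
    dist = torch.distributions.Normal(mean, std)
    actions = dist.sample()                                         # Shape (5, 3).
    joint_log_prob = dist.log_prob(actions).sum(-1, keepdim=True)   # Shape (5, 1).
    entropy = dist.entropy().sum(-1)                                # Shape (5,), as in entropy() above.
    print(joint_log_prob.shape, entropy.shape)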
@@ -33,12 +33,16 @@ def entropy(self): return super().entropy().sum(-1) def mode(self): - """Mode (max probability point) of current distribution.""" + """Mode (max probability point) of current distribution. + + """ return self.mean class Categorical(torch.distributions.Categorical): - """Categorical distribution given class probabilities or logits, not differentiable.""" + """Categorical distribution given class probabilities or logits, not differentiable. + + """ def sample(self): """Sample from the current distribution. @@ -59,9 +63,10 @@ def log_prob(self, actions): (torch.FloatTensor): shape (batch, 1). """ - return (super().log_prob(actions.squeeze(-1)).view( - actions.size(0), -1).sum(-1).unsqueeze(-1)) + return (super().log_prob(actions.squeeze(-1)).view(actions.size(0), -1).sum(-1).unsqueeze(-1)) def mode(self): - """Mode (max probability point) of current distribution.""" + """Mode (max probability point) of current distribution. + + """ return self.probs.argmax(dim=-1, keepdim=True) diff --git a/safe_control_gym/math_and_models/neural_networks.py b/safe_control_gym/math_and_models/neural_networks.py index 4c28aaf9b..1b92c69cb 100644 --- a/safe_control_gym/math_and_models/neural_networks.py +++ b/safe_control_gym/math_and_models/neural_networks.py @@ -1,13 +1,5 @@ """Neural networks. -This module demonstrates documentation as specified by the `Google Python -Style Guide`_. Docstrings may extend over multiple lines. Sections are created -with a section header and a colon followed by a block of indented text. - -Todo: - * For module TODOs - * You have to also use ``sphinx.ext.todo`` extension - """ import numpy as np import torch @@ -16,37 +8,16 @@ def get_activation(name): - """Example function with types documented in the docstring. - - `PEP 484`_ type annotations are supported. If attribute, parameter, and - return types are annotated according to `PEP 484`_, they do not need to be - included in the docstring: - - Args: - name (int): The first parameter. - - Returns: - bool: The return value. True for success, False otherwise. + """ """ return getattr(F, name) if name else lambda x: x def init_(module): - """Example function with types documented in the docstring. - - `PEP 484`_ type annotations are supported. If attribute, parameter, and - return types are annotated according to `PEP 484`_, they do not need to be - included in the docstring: - - Args: - module (int): The first parameter. - - Returns: - bool: The return value. True for success, False otherwise. + """ """ - # could have different gains with different activations nn.init.orthogonal_(module.weight.data, gain=1) nn.init.constant_(module.bias.data, 0) return module @@ -55,18 +26,6 @@ class MLP(nn.Module): """MLP network (can be used as value or policy). - If the class has public attributes, they may be documented here - in an ``Attributes`` section and follow the same formatting as a - function's ``Args`` section. Alternatively, attributes may be documented - inline with the attribute's declaration (see __init__ method below). - - Properties created with the ``@property`` decorator should be documented - in the property's getter method. - - Attributes: - attr1 (str): Description of `attr1`. - attr2 (:obj:`int`, optional): Description of `attr2`. - """ def __init__(self, @@ -76,8 +35,9 @@ def __init__(self, act="relu", output_act=None, init_weights=False, - **kwargs): - """Mlti-layer perception / fully-connected network. + **kwargs + ): + """Multi-layer perceptron/fully-connected network.
Args: input_dim (int): input dimension. @@ -90,7 +50,6 @@ def __init__(self, super(MLP, self).__init__() dims = [input_dim] + hidden_dims + [output_dim] init_func = init_ if init_weights else lambda x: x - self.fcs = nn.ModuleList([ init_func(nn.Linear(dims[i], dims[i + 1])) for i in range(len(dims) - 1) @@ -99,17 +58,7 @@ def __init__(self, self.output_act = get_activation(output_act) def forward(self, x): - """Class methods are similar to regular functions. - - Note: - Do not include the `self` parameter in the ``Args`` section. - - Args: - param1: The first parameter. - param2: The second parameter. - - Returns: - True if successful, False otherwise. + """ """ out = x @@ -122,18 +71,6 @@ def forward(self, x): class CNN(nn.Module): """CNN network for encoding images. - If the class has public attributes, they may be documented here - in an ``Attributes`` section and follow the same formatting as a - function's ``Args`` section. Alternatively, attributes may be documented - inline with the attribute's declaration (see __init__ method below). - - Properties created with the ``@property`` decorator should be documented - in the property's getter method. - - Attributes: - attr1 (str): Description of `attr1`. - attr2 (:obj:`int`, optional): Description of `attr2`. - """ def __init__(self, @@ -141,14 +78,9 @@ def __init__(self, output_dim, act="relu", output_act="relu", - **kwargs): - """Example of docstring on the __init__ method. - - Note: - Do not include the `self` parameter in the ``Args`` section. - - Args: - param1 (str): Description of `param1`. + **kwargs + ): + """ """ super(CNN, self).__init__() @@ -157,24 +89,13 @@ def __init__(self, init_(nn.Conv2d(32, 64, 4, stride=2)), init_(nn.Conv2d(64, 32, 3, stride=1)) ]) - # too lazy for auto-calculate, do it with a test run instead conv_out_dim = 32 * 7 * 7 self.fc = init_(nn.Linear(conv_out_dim, output_dim)) self.act = get_activation(act) self.output_act = get_activation(output_act) def forward(self, x, normalize=False): - """Class methods are similar to regular functions. - - Note: - Do not include the `self` parameter in the ``Args`` section. - - Args: - param1: The first parameter. - param2: The second parameter. - - Returns: - True if successful, False otherwise. + """ """ out = x / 255.0 if normalize else x @@ -188,31 +109,14 @@ def forward(self, x, normalize=False): class RNN(nn.Module): """RNN network (can be used as value or policy). - If the class has public attributes, they may be documented here - in an ``Attributes`` section and follow the same formatting as a - function's ``Args`` section. Alternatively, attributes may be documented - inline with the attribute's declaration (see __init__ method below). - - Properties created with the ``@property`` decorator should be documented - in the property's getter method. - - Attributes: - attr1 (str): Description of `attr1`. - attr2 (:obj:`int`, optional): Description of `attr2`. - """ def __init__(self, input_dim, output_dim, - **kwargs): - """Example of docstring on the __init__ method. - - Note: - Do not include the `self` parameter in the ``Args`` section. - - Args: - param1 (str): Description of `param1`. + **kwargs + ): + """ """ super(RNN, self).__init__() @@ -224,20 +128,7 @@ def __init__(self, nn.init.orthogonal_(param) def forward(self, x, hxs, masks): - """Run the RNN taking account in masking. - - The first condition applies during exploraion or evaluation, the second - condition applies during training/action evaluation. 
- - Note: - Do not include the `self` parameter in the ``Args`` section. - - Args: - param1: The first parameter. - param2: The second parameter. - - Returns: - True if successful, False otherwise. + """ """ if x.size(0) == hxs.size(0): @@ -268,7 +159,6 @@ def forward(self, x, hxs, masks): # We can now process steps that don't have any zeros in masks together! start_idx = has_zeros[i] end_idx = has_zeros[i + 1] - rnn_scores, hxs = self.gru( x[start_idx:end_idx], (hxs * masks[start_idx]).unsqueeze(0)) outputs.append(rnn_scores) diff --git a/safe_control_gym/math_and_models/normalization.py b/safe_control_gym/math_and_models/normalization.py index 3c30072d0..4893210d0 100644 --- a/safe_control_gym/math_and_models/normalization.py +++ b/safe_control_gym/math_and_models/normalization.py @@ -1,29 +1,22 @@ """Perform normalization on inputs or rewards. -References: - * github.com/ShangtongZhang/DeepRL/blob/932ea88082e0194126b87742bd4a28c4599aa1b8/deep_rl/utils/normalizer.py#L69 - * github.com/DLR-RM/stable-baselines3/blob/d7c6aff2523122aebf1e70210b2c56642e0b2735/stable_baselines3/common/vec_env/vec_normalize.py#L138 - * github.com/openai/large-scale-curiosity/blob/e0a698676d19307a095cd4ac1991c4e4e70e56fb/cppo_agent.py#L226 - -Todo: - * - """ import numpy as np import torch + from gym.spaces import Box def normalize_angle(x): - """Wraps input angle to [-pi, pi].""" + """Wraps input angle to [-pi, pi]. + + """ return ((x + np.pi) % (2 * np.pi)) - np.pi class RunningMeanStd(): """Calulates the running mean and std of a data stream. - Reference: https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm - Attributes: mean (np.array): mean of data stream. var (np.array): variance of data stream. @@ -56,15 +49,15 @@ def update(self, arr): self.update_from_moments(batch_mean, batch_var, batch_count) def update_from_moments(self, batch_mean, batch_var, batch_count): - """Util function for `update` method.""" + """Util function for `update` method. + + """ delta = batch_mean - self.mean tot_count = self.count + batch_count - new_mean = self.mean + delta * batch_count / tot_count m_a = self.var * self.count m_b = batch_var * batch_count - m_2 = m_a + m_b + np.square(delta) * self.count * batch_count / ( - self.count + batch_count) + m_2 = m_a + m_b + np.square(delta) * self.count * batch_count / (self.count + batch_count) new_var = m_2 / (self.count + batch_count) new_count = batch_count + self.count self.mean = new_mean @@ -73,7 +66,7 @@ def update_from_moments(self, batch_mean, batch_var, batch_count): class BaseNormalizer(object): - """Template for normalizers, also default. + """Template/default normalizer. Attributes: read_only (bool): if to freeze the current stats being tracked. @@ -90,20 +83,28 @@ def unset_read_only(self): self.read_only = False def __call__(self, x, *args, **kwargs): - """Invokes normalization on the given input.""" + """Invokes normalization on the given input. + + """ return x def state_dict(self): - """Returns snapshot of current stats.""" + """Returns snapshot of current stats. + + """ return {} def load_state_dict(self, _): - """Restores the stats from a snapshot.""" + """Restores the stats from a snapshot. + + """ pass class MeanStdNormalizer(BaseNormalizer): - """Normalize by the running average.""" + """Normalize by the running average. + + """ def __init__(self, shape=(), read_only=False, clip=10.0, epsilon=1e-8): """Initializes the data stream tracker. 
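For context on RunningMeanStd.update_from_moments above: it merges per-batch moments with the parallel variance formula, so streaming updates match a direct computation over all samples seen so far. A small standalone check (population variance, ddof=0), not part of this patch:

    import numpy as np

    def merge_moments(mean_a, var_a, n_a, mean_b, var_b, n_b):
        """Combine the moments of two batches, mirroring update_from_moments."""
        delta = mean_b - mean_a
        n = n_a + n_b
        new_mean = mean_a + delta * n_b / n
        m_2 = var_a * n_a + var_b * n_b + np.square(delta) * n_a * n_b / n
        return new_mean, m_2 / n, n

    a, b = np.random.randn(100), np.random.randn(50) + 1.0
    mean, var, _ = merge_moments(a.mean(), a.var(), a.size, b.mean(), b.var(), b.size)
    full = np.concatenate([a, b])
    assert np.allclose([mean, var], [full.mean(), full.var()])   # Matches the direct computation.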
@@ -122,7 +123,9 @@ def __init__(self, shape=(), read_only=False, clip=10.0, epsilon=1e-8): self.epsilon = epsilon def __call__(self, x): - """Update tracker given data, optionally normalize the data.""" + """Update tracker given data, optionally normalize the data. + + """ x = np.asarray(x) if not self.read_only: self.rms.update(x) @@ -144,10 +147,9 @@ class RewardStdNormalizer(MeanStdNormalizer): Papers: * arxiv.org/pdf/1808.04355.pdf * arxiv.org/pdf/1810.12894.pdf - Implementations: - * github.com/openai/baselines/blob/ea25b9e8b234e6ee1bca43083f8f3cf974143998/baselines/common/vec_env/vec_normalize.py - * github.com/openai/large-scale-curiosity/blob/e0a698676d19307a095cd4ac1991c4e4e70e56fb/cppo_agent.py#L226 - See github.com/openai/baselines/issues/538 for a dicussion on its necessity + + Also see: + * github.com/openai/baselines/issues/538 """ @@ -167,25 +169,25 @@ def __init__(self, gamma=0.99, read_only=False, clip=10.0, epsilon=1e-8): self.ret = None def __call__(self, x, dones): - """Update tracker given reward, optionally normalize the reward (only scaling).""" - x = np.asarray(x) + """Update tracker given reward, optionally normalize the reward (only scaling). + """ + x = np.asarray(x) if not self.read_only: # Track running average of forward discounted returns. if self.ret is None: self.ret = np.zeros(x.shape[0]) - self.ret = self.ret * self.gamma + x self.rms.update(self.ret) # Prevent information leak from previous episodes. self.ret[dones.astype(np.long)] = 0 - - return np.clip(x / np.sqrt(self.rms.var + self.epsilon), -self.clip, - self.clip) + return np.clip(x / np.sqrt(self.rms.var + self.epsilon), -self.clip, self.clip) class RescaleNormalizer(BaseNormalizer): - """Apply constant scaling.""" + """Apply constant scaling. + + """ def __init__(self, coef=1.0): """Initializes with fixed scaling constant. @@ -198,32 +200,41 @@ def __init__(self, coef=1.0): self.coef = coef def __call__(self, x): - """Scale the input.""" + """Scale the input. + + """ if not isinstance(x, torch.Tensor): x = np.asarray(x) return self.coef * x class ImageNormalizer(RescaleNormalizer): - """Scale image pixles from [0,255] to [0,1]""" + """Scale image pixles from [0,255] to [0,1]. + + """ def __init__(self): super().__init__(self, 1.0 / 255) class ActionUnnormalizer(BaseNormalizer): - """Assumes policy output action is in [-1,1], unnormalize it for gym env.""" + """Assumes policy output action is in [-1,1], unnormalize it for gym env. + + """ def __init__(self, action_space): - """Defines the mean and std for the bounded action space.""" + """Defines the mean and std for the bounded action space. + + """ super().__init__() - assert isinstance(action_space, - Box), "action space must be gym.spaces.Box" + assert isinstance(action_space, Box), "action space must be gym.spaces.Box" low, high = action_space.low, action_space.high self.mean = (low + high) / 2.0 self.std = (high - low) / 2.0 def __call__(self, action): - """Unnormalizes given input action.""" + """Unnormalizes given input action. + + """ x = np.asarray(action) return self.mean + x * self.std diff --git a/safe_control_gym/math_and_models/symbolic_systems.py b/safe_control_gym/math_and_models/symbolic_systems.py index 71d70cf3e..286cffd7e 100644 --- a/safe_control_gym/math_and_models/symbolic_systems.py +++ b/safe_control_gym/math_and_models/symbolic_systems.py @@ -1,12 +1,4 @@ -"""Symbolic Model . - -This module demonstrates documentation as specified by the `Google Python -Style Guide`_. Docstrings may extend over multiple lines. 
Sections are created -with a section header and a colon followed by a block of indented text. - -Todo: - * For module TODOs - * You have to also use ``sphinx.ext.todo`` extension +"""Symbolic Models. """ import numpy as np @@ -17,16 +9,12 @@ class SymbolicModel(): """Implements the dynamics model with symbolic variables. x_dot = f(x,u), y = g(x,u), with other pre-defined, symbolic functions - (e.g. cost, constraints), can serve as priors for the controller. + (e.g. cost, constraints), serve as priors for the controllers. - Attributes: - attr1 (str): Description of `attr1`. - attr2 (:obj:`int`, optional): Description of `attr2`. - Notes: * naming convention on symbolic variable and functions. - * for single-letter symbol, use {}_sym, otherwise use underscore for delimiter - * for symbolic functions to be exposed, use {}_func + * for single-letter symbol, use {}_sym, otherwise use underscore for delimiter. + * for symbolic functions to be exposed, use {}_func. """ @@ -36,25 +24,10 @@ def __init__(self, dt=1e-3, integration_algo='cvodes', funcs=None): - """Example of docstring on the __init__ method. - - The __init__ method may be documented in either the class level - docstring, or as a docstring on the __init__ method itself. - - Either form is acceptable, but the two should not be mixed. Choose one - convention to document the __init__ method and be consistent with it. - - Note: - Do not include the `self` parameter in the ``Args`` section. - - Args: - param1 (str): Description of `param1`. - param2 (:obj:`int`, optional): Description of `param2`. Multiple - lines are supported. - param3 (:obj:`list` of :obj:`str`): Description of `param3`. + """ """ - # setup for dynamics + # Setup for dynamics. self.x_sym = dynamics["vars"]["X"] self.u_sym = dynamics["vars"]["U"] self.x_dot = dynamics["dyn_eqn"] @@ -62,57 +35,49 @@ def __init__(self, self.y_sym = self.x_sym else: self.y_sym = dynamics["obs_eqn"] - - # sampling time + # Sampling time. self.dt = dt - - # integration algorithm + # Integration algorithm. self.integration_algo = integration_algo - - # other symbolic functions + # Other symbolic functions. if funcs is not None: for name, func in funcs.items(): assert name not in self.__dict__ self.__dict__[name] = func - - # variable dimensions + # Variable dimensions. self.nx = self.x_sym.shape[0] self.nu = self.u_sym.shape[0] self.ny = self.y_sym.shape[0] - - # setup cost function + # Setup cost function. self.cost_func = cost["cost_func"] - print(self.cost_func) self.Q = cost["vars"]["Q"] self.R = cost["vars"]["R"] self.Xr = cost["vars"]["Xr"] self.Ur = cost["vars"]["Ur"] - - # setup symbolic model + # Setup symbolic model. self.setup_model() - - # setup Jacobian and Hessian of the dynamics and cost functions + # Setup Jacobian and Hessian of the dynamics and cost functions. self.setup_linearization() def setup_model(self): - """Exposes functions to evaluate the model.""" - # Continuous time dynamics. - self.fc_func = cs.Function('fc', [self.x_sym, self.u_sym], [self.x_dot], - ['x', 'u'], ['f']) - # discrete time dynamics - self.fd_func = cs.integrator('fd', self.integration_algo, { - 'x': self.x_sym, - 'p': self.u_sym, - 'ode': self.x_dot - }, {'tf': self.dt}) + """Exposes functions to evaluate the model. + """ + # Continuous time dynamics. + self.fc_func = cs.Function('fc', [self.x_sym, self.u_sym], [self.x_dot], ['x', 'u'], ['f']) + # Discrete time dynamics. 
+ self.fd_func = cs.integrator('fd', self.integration_algo, {'x': self.x_sym, + 'p': self.u_sym, + 'ode': self.x_dot}, {'tf': self.dt} + ) # Observation model. - self.g_func = cs.Function('g', [self.x_sym, self.u_sym], [self.y_sym], - ['x', 'u'], ['g']) + self.g_func = cs.Function('g', [self.x_sym, self.u_sym], [self.y_sym], ['x', 'u'], ['g']) def setup_linearization(self): - """Exposes functions for the linearized model.""" + """Exposes functions for the linearized model. + + """ # Jacobians w.r.t state & input. self.dfdx = cs.jacobian(self.x_dot, self.x_sym) self.dfdu = cs.jacobian(self.x_dot, self.u_sym) @@ -124,11 +89,9 @@ def setup_linearization(self): self.dg_func = cs.Function('dg', [self.x_sym, self.u_sym], [self.dgdx, self.dgdu], ['x', 'u'], ['dgdx', 'dgdu']) - # Evaluation point for linearization. self.x_eval = cs.MX.sym('x_eval', self.nx, 1) self.u_eval = cs.MX.sym('u_eval', self.nu, 1) - # Linearized dynamics model. self.x_dot_linear = self.x_dot + self.dfdx @ ( self.x_eval - self.x_sym) + self.dfdu @ (self.u_eval - self.u_sym) @@ -141,26 +104,20 @@ def setup_linearization(self): 'p': cs.vertcat(self.u_eval, self.x_sym, self.u_sym), 'ode': self.x_dot_linear }, {'tf': self.dt}) - # Linearized observation model. self.y_linear = self.y_sym + self.dgdx @ ( self.x_eval - self.x_sym) + self.dgdu @ (self.u_eval - self.u_sym) self.g_linear_func = cs.Function( 'g_linear', [self.x_eval, self.u_eval, self.x_sym, self.u_sym], [self.y_linear], ['x_eval', 'u_eval', 'x', 'u'], ['g_linear']) - - # Jacobian and Hessian of cost function + # Jacobian and Hessian of cost function. self.l_x = cs.jacobian(self.cost_func, self.x_sym) self.l_xx = cs.jacobian(self.l_x, self.x_sym) self.l_u = cs.jacobian(self.cost_func, self.u_sym) self.l_uu = cs.jacobian(self.l_u, self.u_sym) self.l_xu = cs.jacobian(self.l_x, self.u_sym) - l_inputs = [self.x_sym, self.u_sym, self.Xr, self.Ur, self.Q, self.R] l_inputs_str = ['x', 'u', 'Xr', 'Ur', 'Q', 'R'] - l_outputs = [ - self.cost_func, self.l_x, self.l_xx, self.l_u, self.l_uu, self.l_xu - ] + l_outputs = [self.cost_func, self.l_x, self.l_xx, self.l_u, self.l_uu, self.l_xu] l_outputs_str = ['l', 'l_x', 'l_xx', 'l_u', 'l_uu', 'l_xu'] - self.loss = cs.Function('loss', l_inputs, l_outputs, l_inputs_str, - l_outputs_str) + self.loss = cs.Function('loss', l_inputs, l_outputs, l_inputs_str, l_outputs_str) diff --git a/safe_control_gym/utils/configuration.py b/safe_control_gym/utils/configuration.py index 8907cf5f1..efcba660b 100644 --- a/safe_control_gym/utils/configuration.py +++ b/safe_control_gym/utils/configuration.py @@ -1,3 +1,6 @@ +"""Configuration utilities. + +""" import argparse from dict_deep import deep_set import os @@ -9,7 +12,9 @@ class ConfigFactory: - """Manager class that's in charge of experiment configs.""" + """Manager class that's in charge of experiment configs. + + """ def __init__(self): self.parser = argparse.ArgumentParser(description="Benchmark") @@ -23,18 +28,21 @@ def __init__(self): ) def add_argument(self, *args, **kwargs): - """Extends to new arguments.""" + """Extends to new arguments. + + """ self.parser.add_argument(*args, **kwargs) def add_arguments(self): - """Registers base arguments (for experiment bookkeeping).""" + """Registers base arguments (for experiment bookkeeping). 
+ + """ self.add_argument("--tag", type=str, help='id of the experiment') self.add_argument("--seed", type=int, help="random seed, 0 is no seed") self.add_argument("--device", type=str, help="cpu or cuda(gpu)") self.add_argument("--output_dir", type=str, help="output saving folder") self.add_argument("--restore", type=str, help='folder to reload from') - - # need to explicit provide from cmd (if train for the 1st time) + # Need to explicitly provide from command line (if training for the 1st time). self.add_argument("--algo", type=str, help='algorithm/controller') self.add_argument("--task", type=str, help='task/environment') self.add_argument("--overrides", @@ -47,23 +55,22 @@ def add_arguments(self): help="override key-value pairs") def merge(self): - """Creates experiment config object from command line and config files.""" + """Creates experiment config object from command line and config files. + + """ config_dict = self.base_dict args, _ = self.parser.parse_known_args() - if args.restore: - # restore for continual training or evaluation + # Restore for continual training or evaluation. restore_path = os.path.join(args.restore, "config.yaml") config_dict.update(read_file(restore_path)) elif args.algo and args.task: - # start fresh training + # Start fresh training. config_dict["algo_config"] = get_config(args.algo) config_dict["task_config"] = get_config(args.task) else: warnings.warn("No agent/task config given.") - - # experiment-specific overrides - # e.g. specific training hyperparameters or testing conditions + # Experiment-specific overrides, e.g. training hyperparameters. if args.overrides: for f in args.overrides: merge_dict(config_dict, read_file(f)) @@ -71,18 +78,14 @@ def merge(self): kv_dict = {} for kv in args.kv_overrides: k, v = kv.split("=") - try: - # string as a python expression - v = eval(v) + try: + v = eval(v) # String as a python expression. except: - # normal python string - pass + pass # Normal python string. deep_set(kv_dict, k.strip(), v) merge_dict(config_dict, kv_dict) - - # commind line overrides (e.g. retains `restore` field) + # Command line overrides (e.g. retains `restore` field). cmdline_dict = {k: v for k, v in args.__dict__.items() if v is not None} config_dict.update(cmdline_dict) - - # allow attribute-style access. + # Allow attribute-style access. return munch.munchify(config_dict) diff --git a/safe_control_gym/utils/logging.py b/safe_control_gym/utils/logging.py index 58e21c038..be73ab426 100644 --- a/safe_control_gym/utils/logging.py +++ b/safe_control_gym/utils/logging.py @@ -1,78 +1,70 @@ """Logging utilities. -Todo: - * - """ from collections import defaultdict import logging import os import numpy as np - import imageio -from torch.utils.tensorboard import SummaryWriter -# ----------------------------------------------------------------------------------- -# Terminal Logger -# ----------------------------------------------------------------------------------- +from torch.utils.tensorboard import SummaryWriter class StdoutLogger: - """Channel print content to std out and log file.""" + """Channel print content to std out and log file. + + """ def __init__(self, logger_name, log_dir, level=logging.INFO): logger = logging.getLogger(logger_name) formatter = logging.Formatter('%(asctime)s : %(message)s') - # Log to file ('w' to overwrite, 'a' to keep appending). log_file = os.path.join(log_dir, "std_out.txt") file_handler = logging.FileHandler(log_file, mode='a') file_handler.setFormatter(formatter) - # Log to std out. 
stream_handler = logging.StreamHandler() stream_handler.setFormatter(formatter) logger.setLevel(level) logger.addHandler(file_handler) logger.addHandler(stream_handler) - self.logger = logger self.file_handler = file_handler def info(self, msg): - """Chain print message to logger.""" + """Chain print message to logger. + + """ self.logger.info(msg) def close(self): - """Free log file.""" - self.file_handler.close() + """Free log file. - -# ----------------------------------------------------------------------------------- -# File Logger -# ----------------------------------------------------------------------------------- + """ + self.file_handler.close() class FileLogger: """Logger for saving statistics and other outputs to text files. - References: - * Michael Zhang's logger https://github.com/michaelrzhang/logger + Based on https://github.com/michaelrzhang/logger Initializes the log directory and creates log files given by name in arguments. Can be used to append future log values to each file. + """ def __init__(self, log_dir): # Creates folder for logging stats self.log_dir = os.path.join(log_dir, "logs") os.makedirs(self.log_dir, exist_ok=True) - # name of stats being tracked self.log_names = [] def init_logfile(self, name, xlabel="step"): - """Makes text file for logging the stat.""" + """Makes text file for logging the stat. + + """ fname = self.get_log_fname(name) # Already exist due to restore. if os.path.exists(fname): @@ -83,34 +75,37 @@ def init_logfile(self, name, xlabel="step"): log_file.write("{},{}\n".format(xlabel, name)) def get_log_fname(self, name): - """Gets log file name for the stat.""" + """Gets log file name for the stat. + + """ return os.path.join(self.log_dir, '{}.log'.format(name)) def log(self, name, value, step): - """Logs the stat to its corresponding text file.""" + """Logs the stat to its corresponding text file. + + """ if name not in self.log_names: # Initialize only if not done so already. self.init_logfile(name) self.log_names.append(name) - fname = self.get_log_fname(name) with open(fname, 'a') as log_file: log_file.write("{},{}\n".format(step, value)) def restore(self, step): - """Resets all log files to ignore lines after `step`.""" - # find all stats log files + """Resets all log files to ignore lines after `step`. + + """ + # Find all stats log files. log_files = [] for res, _, files in os.walk(self.log_dir): for each_file in files: if ".log" in files: log_files.append(os.path.join(res, each_file)) - for fname in log_files: with open(fname, "r") as file: lines = file.readlines() - - # find which line to start purging + # Find which line to start purging. stop_idx = None for i, each_line in enumerate(lines): temp = each_line.strip().split(',') @@ -119,42 +114,27 @@ def restore(self, step): # Skip header. if i == 0: continue - # first invalid line + # First invalid line. if idx > step: break - - # overwrite log file with only valid lines + # Overwrite log file with only valid lines. 
lines = lines[:stop_idx] with open(fname, "w") as file: for each_line in lines: file.write(each_line) -# ----------------------------------------------------------------------------------- -# CSV Logger -# ----------------------------------------------------------------------------------- - - -class CSVLogger: - """Logger for saving periodic stats to as csv file.""" - - def __init__(self): - pass - - -# ----------------------------------------------------------------------------------- -# Main Logger -# ----------------------------------------------------------------------------------- - - class ExperimentLogger: - """A hybrid logger.""" + """A hybrid logger. + + """ def __init__(self, log_dir, log_std_out=True, log_file_out=False, - use_tensorboard=False): + use_tensorboard=False + ): """Initializes loggers. Args: @@ -162,29 +142,29 @@ def __init__(self, log_std_out (bool): if to save terminal logs. log_file_out (bool): if to write data logs to text files. use_tensorboard (bool): if to use tensorboard. + """ self.log_dir = log_dir os.makedirs(log_dir, exist_ok=True) - # container for a log period + # Container for a log period. self.stats_buffer = defaultdict(list) - # Terminal logging. self.log_std_out = log_std_out if log_std_out: self.std_out_logger = StdoutLogger("Benchmark", log_dir) - # Text file logging. self.log_file_out = log_file_out if log_file_out: self.file_logger = FileLogger(log_dir) - # Tensorboard logging. self.use_tensorboard = use_tensorboard if use_tensorboard: self.tb_logger = SummaryWriter(log_dir=log_dir) def load(self, step): - """Resume from experiment, but ignores any logs after `step`.""" + """Resume from experiment, but ignores any logs after `step`. + + """ if self.log_file_out: self.file_logger.restore(step) if self.use_tensorboard: @@ -192,14 +172,18 @@ def load(self, step): purge_step=step) def close(self): - """Cleans up logging resources.""" + """Cleans up logging resources. + + """ if self.log_std_out: self.std_out_logger.close() if self.use_tensorboard: self.tb_logger.close() def info(self, msg): - """Logs a message to std output.""" + """Logs a message to std output. + + """ if self.log_std_out: self.std_out_logger.info(msg) else: @@ -212,14 +196,16 @@ def add_scalar(self, store=True, write=True, write_tb=True): - """Logs a scalar data.""" - # add to buffer (to be logged to terminal) + """Logs a scalar data. + + """ + # Add to buffer (to be logged to terminal). if store: self.stats_buffer[name].append(val) - # log to text file + # Log to text file. if self.log_file_out and write: self.file_logger.log(name, val, step) - # log to tb + # Log to tensorboard. if self.use_tensorboard and write_tb: self.tb_logger.add_scalar(name, val, step) @@ -230,69 +216,60 @@ def add_scalars(self, store=True, write=True, write_tb=True): - """Logs a group of scalars.""" + """Logs a group of scalars. + + """ assert isinstance(data, dict) for name, val in data.items(): - # scalars under the same name group + # Scalars under the same name group. full_name = prefix + "/" + name if prefix else name self.add_scalar(full_name, val, step, store, write, write_tb) def dump_scalars(self): """Produce a summary of stats within the log period (from buffer). 
- References: - * https://github.com/DLR-RM/stable-baselines3/blob/master/stable_baselines3/common/logger.py - * https://github.com/openai/spinningup/blob/master/spinup/utils/logx.py - Currently only dump to terminal as a table summary, can dump to a CSV file in the future, but feels repetitive & less flexible than `add_scalar(..., write=True)`. + """ keys, values = [], [] tag = None - - # sorted keys are important for consistency betwen log steps !!! + # Important: sorted keys are important for consistency betwen log steps. for key, val_list in sorted(self.stats_buffer.items()): if len(val_list) == 1: - # align left + # Left align. val_str = "{:<8.3g}".format(val_list[0]) else: val_np = np.asarray(val_list) - val_str = "{:.3f} +/- {:.3f}".format(val_np.mean(), - val_np.std()) - - # Find tag and add it to the dict + val_str = "{:.3f} +/- {:.3f}".format(val_np.mean(), val_np.std()) + # Find tag and add it to the dict. if key.find("/") > 0: tag = key[:key.find("/") + 1] trunc_tag = self._truncate(tag) if trunc_tag not in keys: keys.append(trunc_tag) values.append("") - # Remove tag from key + # Remove tag from key. if tag is not None and tag in key: key = str(" " + key[len(tag):]) - keys.append(self._truncate(key)) values.append(self._truncate(val_str)) - - # Find max widths + # Find max widths. if len(keys) == 0: print("Tried to write empty key-value dict") return else: key_width = max(map(len, keys)) val_width = max(map(len, values)) - - # Write out the data + # Write out the data. dashes = "-" * (key_width + val_width + 7) lines = [dashes] for key, value in zip(keys, values): key_space = " " * (key_width - len(key)) val_space = " " * (val_width - len(value)) - lines.append("| {}{} | {}{} |".format(key, key_space, value, - val_space)) + lines.append("| {}{} | {}{} |".format(key, key_space, value, val_space)) lines.append(dashes) - summary = "\n" + "\n".join(lines) + "\n" self.info(summary) self.stats_buffer.clear() @@ -305,7 +282,7 @@ def _truncate(self, string, max_length=23): def log_video(self, name, video, fps=20): """Saves a video for evaluation, video: list of np.arrays of shape (H,W,C). - Reference: https://imageio.readthedocs.io/en/stable/format_gif-pil.html + """ vid_kargs = {'fps': fps} vid_name = '{}/{}'.format(self.log_dir, name) diff --git a/safe_control_gym/utils/plotting.py b/safe_control_gym/utils/plotting.py index 4182b10d7..5895f4e41 100644 --- a/safe_control_gym/utils/plotting.py +++ b/safe_control_gym/utils/plotting.py @@ -1,8 +1,5 @@ """Plotting utilities. -Todo: - * - """ from collections import defaultdict import os @@ -13,12 +10,10 @@ from tensorboard.backend.event_processing.event_accumulator import EventAccumulator -# ----------------------------------------------------------------------------------- -# Utils -# ----------------------------------------------------------------------------------- DIV_LINE_WIDTH = 50 + COLORS = [ "blue", "green", @@ -46,14 +41,15 @@ "darkblue", ] -# reference: https://matplotlib.org/stable/gallery/lines_bars_and_markers/linestyles.html + LINE_STYLES = [ - ('solid', 'solid'), # Same as (0, ()) or '-' - ('dotted', 'dotted'), # Same as (0, (1, 1)) or '.' 
- ('dashed', 'dashed'), # Same as '--' + ('solid', 'solid'), + ('dotted', 'dotted'), + ('dashed', 'dashed'), ('dashdot', 'dashdot'), ] + LINE_STYLES2 = [('loosely dotted', (0, (1, 10))), ('dotted', (0, (1, 1))), ('densely dotted', (0, (1, 1))), ('loosely dashed', (0, (5, 10))), ('dashed', (0, (5, 5))), @@ -63,25 +59,32 @@ ('densely dashdotted', (0, (3, 1, 1, 1))), ('dashdotdotted', (0, (3, 5, 1, 5, 1, 5))), ('loosely dashdotdotted', (0, (3, 10, 1, 10, 1, 10))), - ('densely dashdotdotted', (0, (3, 1, 1, 1, 1, 1)))] + ('densely dashdotdotted', (0, (3, 1, 1, 1, 1, 1))) +] def rolling_window(a, window): - """""" + """Window data. + + """ shape = a.shape[:-1] + (a.shape[-1] - window + 1, window) strides = a.strides + (a.strides[-1],) return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides) def window_func(x, y, window, func): - """""" + """Evaluate a function on windowed data. + + """ yw = rolling_window(y, window) yw_func = func(yw, axis=-1) return x[window - 1:], yw_func def filter_log_dirs(pattern, negative_pattern=" ", root="./log", **kwargs): - """Gets list of experiment folders as specified.""" + """Gets list of experiment folders as specified. + + """ dirs = [item[0] for item in os.walk(root)] leaf_dirs = [] for i in range(len(dirs)): @@ -100,10 +103,12 @@ def filter_log_dirs(pattern, negative_pattern=" ", root="./log", **kwargs): def align_runs(xy_list, x_num_max=None): - """Aligns the max of the x data across runs""" + """Aligns the max of the x data across runs. + + """ x_max = float("inf") for x, y in xy_list: - # align length of x data (get min across all runs) + # Align length of x data (get min across all runs). x_max = min(x_max, len(x)) if x_num_max: x_max = min(x_max, x_num_max) @@ -112,7 +117,9 @@ def align_runs(xy_list, x_num_max=None): def smooth_runs(xy_list, window=10): - """Smooth the data curves by mean filtering.""" + """Smooth the data curves by mean filtering. + + """ smoothed_list = [ window_func(np.asarray(x), np.asarray(y), window, np.mean) for x, y in xy_list @@ -121,7 +128,9 @@ def smooth_runs(xy_list, window=10): def select_runs(xy_list, criterion, top_k=0): - """Pickes the top k runs based on a criterion.""" + """Pickes the top k runs based on a criterion. + + """ perf = [criterion(y) for _, y in xy_list] top_k_runs = np.argsort(perf)[-top_k:] selected_list = [] @@ -132,32 +141,31 @@ def select_runs(xy_list, criterion, top_k=0): def interpolate_runs(xy_list, interp_interval=100): - """Uses the same x data by interpolation across runs.""" + """Uses the same x data by interpolation across runs. + + """ x_right = float("inf") for x, y in xy_list: x_right = min(x_right, x[-1]) - # shape (data_len,) + # Shape: (data_len,). x = np.arange(0, x_right, interp_interval) y = [] for x_, y_ in xy_list: y.append(np.interp(x, x_, y_)) - # shape (num_runs, data_len) + # Shape: (num_runs, data_len). y = np.asarray(y) return x, y -# ----------------------------------------------------------------------------------- -# Single Experiment Plotting -# ----------------------------------------------------------------------------------- - - def load_from_log_file(path): - """Return x, y sequence data from the stat csv.""" + """Return x, y sequence data from the stat csv. + + """ with open(path, "r") as f: lines = f.readlines() - # x, y labels + # Labels. xk, yk = [k.strip() for k in lines[0].strip().split(",")] - # x, y values + # Values. 
x, y = [], [] for l in lines[1:]: data = l.strip().split(",") @@ -169,14 +177,16 @@ def load_from_log_file(path): def load_from_logs(log_dir): - """Return dict of stats under log_dir folder (`exp_dir/logs/`)""" + """Return dict of stats under log_dir folder (`exp_dir/logs/`). + + """ log_files = [] - # fetch all log files + # Fetch all log files. for r, _, f in os.walk(log_dir): for file in f: if ".log" in file: log_files.append(os.path.join(r, file)) - # fetch all stats from log files + # Fetch all stats from log files. data = {} for path in log_files: name = path.split(log_dir)[-1].replace(".log", "") @@ -193,18 +203,17 @@ def plot_from_logs(src_dir, out_dir, window=None, keys=None): out_dir (str): folder to save figures. window (int): window size for smoothing. keys (list): specify name of stats to plot, None means plot all. + """ - # find all logs + # Find all logs. log_files = [] for r, _, f in os.walk(src_dir): for file in f: if ".log" in file: log_files.append(os.path.join(r, file)) - - # make a figure for each log file + # Make a figure for each log file. stats = {} for path in log_files: - # name = os.path.basename(path).split(".")[0] name = path.split(src_dir)[-1].replace(".log", "") if keys: if name not in keys: @@ -229,15 +238,11 @@ def plot_from_tensorboard_log(src_dir, xlabel="step"): """Generates a plot for each stat from tfb log file in source folder. - References: - * https://stackoverflow.com/questions/36700404/tensorflow-opening-log-data-written-by-summarywriter - * https://github.com/tensorflow/tensorboard/blob/master/tensorboard/backend/event_processing/event_accumulator.py """ event_acc = EventAccumulator(src_dir) event_acc.Reload() if not keys: keys = event_acc.Tags()["scalars"] - stats = {} for k in keys: _, x, y = zip(*event_acc.Scalars(k)) @@ -250,17 +255,12 @@ def plot_from_tensorboard_log(src_dir, plt.title(k) plt.xlabel(xlabel) plt.ylabel(k) - # use "-" instead of "/" to connect group and stat name + # Use "-" instead of "/" to connect group and stat name. out_path = os.path.join(out_dir, k.replace("/", "-") + ".jpg") plt.savefig(out_path) return stats -# ----------------------------------------------------------------------------------- -# Cross Experiment Plotting -# ----------------------------------------------------------------------------------- - - def plot_from_experiments(legend_dir_specs, out_path="temp.jpg", scalar_name=None, @@ -270,7 +270,8 @@ def plot_from_experiments(legend_dir_specs, window=None, x_num_max=None, num_std=1, - use_tb_log=True): + use_tb_log=True + ): """Generates plot among algos, each with several seed runs. Example: @@ -297,14 +298,14 @@ def plot_from_experiments(legend_dir_specs, ylabel="Reward", window=10 ) + """ assert scalar_name is not None, "Must provide a scalar name to plot" - - # get all stats + # Get all stats. stats = defaultdict(list) for l, dirs in legend_dir_specs.items(): for d in dirs: - # pick from either log source (tensorboard or log text files) + # Pick from either log source (tensorboard or log text files). if use_tb_log: event_acc = EventAccumulator(d) event_acc.Reload() @@ -313,33 +314,29 @@ def plot_from_experiments(legend_dir_specs, else: path = os.path.join(d, "logs", scalar_name + ".log") _, x, _, y = load_from_log_file(path) - - # smoothing + # Smoothing. x, y = np.asarray(x), np.asarray(y) if window: x, y = window_func(x, y, window, np.mean) stats[l].append([x, y]) - - # post-processing + # Post-processing. 
x_max = float("inf") for _, runs in stats.items(): for x, y in runs: - # align length of x data (get min across all runs & all algos) + # Align length of x data (get min across all runs & all algos). x_max = min(x_max, len(x)) if x_num_max: x_max = min(x_max, x_num_max) - processed_stats = {} for name, runs in stats.items(): - # use same x for all runs to an algo + # Use same x for all runs to an algorithm. x = np.array([x[:x_max] for x, _ in runs])[0] - # different y for different runs + # Different y for different runs. y = np.stack([y[:x_max] for _, y in runs]) y_mean = np.mean(y, axis=0) y_std = np.std(y, axis=0) processed_stats[name] = [x, y_mean, y_std] - - # actual plot + # Actual plot. plt.clf() for i, name in enumerate(processed_stats.keys()): color = COLORS[i] @@ -354,18 +351,17 @@ def plot_from_experiments(legend_dir_specs, plt.xlabel(xlabel) plt.ylabel(ylabel) plt.legend() - plt.tight_layout() plt.savefig(out_path) plt.show() return stats, processed_stats -def get_log_dirs(all_logdirs, select=None, exclude=None): +def get_log_dirs(all_logdirs, + select=None, + exclude=None + ): """Find all folders for plotting. - - Reference: - * https://github.com/openai/spinningup/blob/038665d62d569055401d91856abb287263096178/spinup/utils/plot.py All 3 arguments can be exposed as list args from command line. @@ -374,6 +370,7 @@ def get_log_dirs(all_logdirs, select=None, exclude=None): pull data from it; 2) if not, check to see if the entry is a prefix for a real directory, and pull data from that. + """ logdirs = [] for logdir in all_logdirs: @@ -385,22 +382,17 @@ def get_log_dirs(all_logdirs, select=None, exclude=None): prefix = logdir.split(os.sep)[-1] listdir = os.listdir(basedir) logdirs += sorted([fulldir(x) for x in listdir if prefix in x]) - """ - Enforce selection rules, which check logdirs for certain substrings. - Makes it easier to look at graphs from particular ablations, if you - launch many jobs at once with similar names. - """ + # Enforce selection rules, which check logdirs for certain substrings. Makes it easier to look + # at graphs from particular ablations, if you launch many jobs at once with similar names. if select is not None: logdirs = [log for log in logdirs if all(x in log for x in select)] if exclude is not None: logdirs = [ log for log in logdirs if all(not (x in log) for x in exclude) ] - - # Verify logdirs + # Verify logdirs. print('Plotting from...\n' + '=' * DIV_LINE_WIDTH + '\n') for logdir in logdirs: print(logdir) print('\n' + '=' * DIV_LINE_WIDTH) - return logdirs diff --git a/safe_control_gym/utils/registration.py b/safe_control_gym/utils/registration.py index 71da1e8c9..d63e9560b 100644 --- a/safe_control_gym/utils/registration.py +++ b/safe_control_gym/utils/registration.py @@ -1,6 +1,6 @@ -"""Example Google style docstrings. +"""Environments and controllers registration. -Reference to https://github.com/openai/gym/blob/master/gym/envs/registration.py +Based on https://github.com/openai/gym/blob/master/gym/envs/registration.py """ import copy @@ -13,7 +13,9 @@ def load(name): - """Loads a callable from a string with format `module_path:callable_name`.""" + """Loads a callable from a string with format `module_path:callable_name`. + + """ mod_name, attr_name = name.split(":") mod = importlib.import_module(mod_name) fn = getattr(mod, attr_name) @@ -21,7 +23,9 @@ def load(name): class Spec(): - """ A specification for a particular instance of the environment.""" + """ A specification for a particular instance of the environment. 
+ + """ def __init__(self, id, entry_point=None, config_entry_point=None): """Used in function "register". @@ -37,11 +41,15 @@ def __init__(self, id, entry_point=None, config_entry_point=None): self.config_entry_point = config_entry_point def __repr__(self): - """Defines a class instance's string representation.""" + """Defines a class instance's string representation. + + """ return "Spec({})".format(self.id) def get_config(self): - """Fetches config (as dict) for the registered class.""" + """Fetches config (as dict) for the registered class. + + """ if isinstance(self.config_entry_point, str): if self.config_entry_point.endswith(".yaml"): # Specified as file path. @@ -52,18 +60,18 @@ def get_config(self): # Specified as "module_path:config_dict_name". config = load(self.config_entry_point) elif self.config_entry_point is None: - # no default config (provided as overwrites) + # No default config. config = {} else: - raise Exception("Config type {} is not supported.".format( - self.config_entry_point)) + raise Exception("Config type {} is not supported.".format(self.config_entry_point)) return config def make(self, *args, **kwargs): - """Instantiates an instance of the registered class with appropriate kwargs.""" + """Instantiates an instance of the registered class with appropriate kwargs. + + """ if self.entry_point is None: - raise Exception('Attempting to make deprecated env {}.'.format( - self.id)) + raise Exception('Attempting to make deprecated env {}.'.format(self.id)) if callable(self.entry_point): obj = self.entry_point(*args, **kwargs) else: @@ -79,67 +87,81 @@ def make(self, *args, **kwargs): class Registry(): - """Register a callable by ID.""" + """Register a callable by ID. + + """ def __init__(self): self.specs = {} def make(self, path, *args, **kwargs): - """Create an instance of the registered callable by id `path`.""" + """Create an instance of the registered callable by id `path`. + + """ spec = self.specs[path] obj = spec.make(*args, **kwargs) return obj def all(self): - """Returns all registered callables.""" + """Returns all registered callables. + + """ return self.specs.values() def spec(self, path): - """Returns spec of the registered callable by id.""" + """Returns spec of the registered callable by id. + + """ if ':' in path: mod_name, id = path.split(':') try: importlib.import_module(mod_name) except: - raise Exception( - '''A module ({}) was specified for the environment but was not - found, make sure the package is installed with `pip install` - before calling `gym.make()`'''.format(mod_name)) + raise Exception('''A module ({}) was specified for the environment but was not found,make sure the + package is installed with `pip install` before calling `gym.make()`'''.format(mod_name)) else: id = path - try: return self.specs[id] except: raise Exception("Key not found in registry.") def register(self, id, **kwargs): - """Saves a reference to the callable with a id.""" + """Saves a reference to the callable with a id. + + """ if id in self.specs: raise Exception('Cannot re-register id: {}'.format(id)) self.specs[id] = Spec(id, **kwargs) def register(id, **kwargs): - """Saves the callable to the global registry.""" + """Saves the callable to the global registry. + + """ return registry.register(id, **kwargs) def make(id, *args, **kwargs): - """Creates an instance of the callable from global registry.""" + """Creates an instance of the callable from global registry. 
+ + """ return registry.make(id, *args, **kwargs) def spec(id): - """Gets the spec of the callable from global registry.""" + """Gets the spec of the callable from global registry. + + """ return registry.spec(id) def get_config(id): - """Gets the config of the callable from global registry.""" + """Gets the config of the callable from global registry. + + """ return registry.spec(id).get_config() -# Registry: global registry (must be a singleton). -# used by functions `register`, `make`, `spec` and `get_config`. +# Registry: global registry (singleton) used by functions `register`, `make`, `spec` and `get_config`. registry = Registry() diff --git a/safe_control_gym/utils/utils.py b/safe_control_gym/utils/utils.py index d10d41e58..15cacbb76 100644 --- a/safe_control_gym/utils/utils.py +++ b/safe_control_gym/utils/utils.py @@ -1,4 +1,6 @@ -"""Collection of utility functions.""" +"""Miscellaneous utility functions. + +""" import argparse import datetime import gym @@ -9,21 +11,24 @@ import sys import munch import yaml - import imageio import numpy as np import torch def mkdirs(*paths): - """Makes a list of directories.""" + """Makes a list of directories. + + """ for path in paths: if not os.path.exists(path): os.makedirs(path) def eval_token(token): - """Converts string token to int, float or str.""" + """Converts string token to int, float or str. + + """ if token.isnumeric(): return int(token) try: @@ -33,10 +38,11 @@ def eval_token(token): def read_file(file_path, sep=","): - """Loads content from a file (json, yaml, csv, txt) + """Loads content from a file (json, yaml, csv, txt). - For json & yaml files returns a dict, - for csv & txt returns list of lines. + For json & yaml files returns a dict. + Ror csv & txt returns list of lines. + """ if len(file_path) < 1 or not os.path.exists(file_path): return None @@ -61,7 +67,9 @@ def read_file(file_path, sep=","): def merge_dict(source_dict, update_dict): - """Merges updates into source recursively.""" + """Merges updates into source recursively. + + """ for k, v in update_dict.items(): if k in source_dict and isinstance(source_dict[k], dict) and isinstance( v, dict): @@ -71,14 +79,18 @@ def merge_dict(source_dict, update_dict): def get_time(): - """Gets current timestamp (as string).""" + """Gets current timestamp (as string). + + """ start_time = datetime.datetime.now() time = str(start_time.strftime("%Y_%m_%d-%X")) return time def get_random_state(): - """Snapshots the random state at any moment.""" + """Snapshots the random state at any moment. + + """ return { "random": random.getstate(), "numpy": np.random.get_state(), @@ -87,14 +99,18 @@ def get_random_state(): def set_random_state(state_dict): - """Resets the random state for experiment restore.""" + """Resets the random state for experiment restore. + + """ random.setstate(state_dict["random"]) np.random.set_state(state_dict["numpy"]) torch.torch.set_rng_state(state_dict["torch"]) def set_seed(seed, cuda=False): - """General seeding function for reproducibility.""" + """General seeding function for reproducibility. + + """ random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) @@ -106,29 +122,31 @@ def set_seed(seed, cuda=False): def set_dir_from_config(config): """Creates a output folder for experiment (and save config files). - naming format: - {root (e.g. results)}/{tag (exp id)}/{seed}_{timestamp}_{git commit id} + Naming format: {root (e.g. results)}/{tag (exp id)}/{seed}_{timestamp}_{git commit id} + """ # Make output folder. 
timestamp = datetime.datetime.now().strftime("%b-%d-%H-%M") commit_id = subprocess.check_output( ["git", "describe", "--tags", "--always"]).decode("utf-8").strip() - run_dir = "seed{}_{}_{}".format(str(config.seed), str(timestamp), - str(commit_id)) + run_dir = "seed{}_{}_{}".format(str(config.seed), + str(timestamp), + str(commit_id) + ) config.output_dir = os.path.join(config.output_dir, config.tag, run_dir) mkdirs(config.output_dir) - # Save config. with open(os.path.join(config.output_dir, 'config.yaml'), "w") as file: yaml.dump(munch.unmunchify(config), file, default_flow_style=False) - # Save command. with open(os.path.join(config.output_dir, 'cmd.txt'), 'a') as file: file.write(" ".join(sys.argv) + "\n") def set_seed_from_config(config): - """Sets seed, only set if nonzero, 0 seed default to no seeding.""" + """Sets seed, only set if nonzero, 0 seed default to no seeding. + + """ seed = config.seed use_cuda = True if "cuda" in config.device else False if seed > 0: @@ -136,7 +154,9 @@ def set_seed_from_config(config): def set_device_from_config(config): - """Sets device, using GPU is set to `cuda` for now, no specific GPU yet.""" + """Sets device, using GPU is set to `cuda` for now, no specific GPU yet. + + """ use_cuda = (config.device == "cuda") and torch.cuda.is_available() config.device = "cuda" if use_cuda else "cpu" @@ -148,6 +168,7 @@ def save_video(name, frames, fps=20): name (str): path name to save the video. frames (list): frames of the video as list of np.arrays. fps (int, optional): frames per second. + """ assert ".gif" in name or ".mp4" in name, "invalid video name" vid_kwargs = {'fps': fps} @@ -164,6 +185,7 @@ def str2bool(val): Returns: bool: Interpretation of `val` as True or False. + """ if isinstance(val, bool): return val @@ -172,15 +194,12 @@ def str2bool(val): elif val.lower() in ('no', 'false', 'f', 'n', '0'): return False else: - raise argparse.ArgumentTypeError( - "[ERROR] in str2bool(), a Boolean value is expected") + raise argparse.ArgumentTypeError("[ERROR] in str2bool(), a Boolean value is expected") def unwrap_wrapper(env, wrapper_class): """Retrieve a ``VecEnvWrapper`` object by recursively searching. - Reference: - * https://github.com/DLR-RM/stable-baselines3/blob/ddbe0e93f9fe55152f2354afd058b28e6ccc3345/stable_baselines3/common/env_util.py """ env_tmp = env while isinstance(env_tmp, gym.Wrapper): @@ -193,7 +212,5 @@ def unwrap_wrapper(env, wrapper_class): def is_wrapped(env, wrapper_class): """Check if a given environment has been wrapped with a given wrapper. - Reference: - * https://github.com/DLR-RM/stable-baselines3/blob/ddbe0e93f9fe55152f2354afd058b28e6ccc3345/stable_baselines3/common/env_util.py """ - return unwrap_wrapper(env, wrapper_class) is not None \ No newline at end of file + return unwrap_wrapper(env, wrapper_class) is not None
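One usage note on `str2bool` above: it exists because `argparse` with `type=bool` would turn any non-empty string, including "false", into True. A short sketch (the `--use_gpu` flag is only an example, not an argument defined in this repository):

```python
import argparse
from safe_control_gym.utils.utils import str2bool

parser = argparse.ArgumentParser()
# With plain type=bool, "--use_gpu false" would parse as True; str2bool avoids that.
parser.add_argument("--use_gpu", type=str2bool, default=False)

print(parser.parse_args(["--use_gpu", "false"]).use_gpu)  # False
print(parser.parse_args(["--use_gpu", "yes"]).use_gpu)    # True
print(parser.parse_args([]).use_gpu)                      # False (default)
```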