# Imports

In [None]:
# general imports
import sys                       
import numpy as np                

# gym for spaces and environment definitions
import gym
# env checker
from stable_baselines3.common import env_checker

# stable baselines3 -> SAC
from stable_baselines3 import SAC
from stable_baselines3.sac import MlpPolicy

# stable baselines3 -> HER
from stable_baselines3.her.her_replay_buffer import HerReplayBuffer
from stable_baselines3.her.goal_selection_strategy import GoalSelectionStrategy

# Tensorboard
# Shared SummaryWriter; it is stored in the parameter dictionaries under "WRITER"
# and handed to the environment constructors.
# NOTE(review): per the PyTorch docs, `comment` is a suffix for the auto-generated
# default log_dir only; with an explicit log_dir (as here) it presumably has no
# effect -- confirm and drop or fold it into the directory name if so.
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter('tensorboard_log/',comment="-SAC_HER_buff20000")

# grpc communication
sys.path.insert(1, '/tum_nrp/grpc/python/communication')
import experiment_api_wrapper as eaw

# for auto-reloading external modules see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

from env import SimEnv, SimGoalEnv
from train_helpers import evaluate, train

# Parameters

In [None]:
# Central configuration dictionary. It is passed to the environment constructors
# (SimEnv / SimGoalEnv) and to the train/evaluate helpers, and the model cell reads
# the SAC hyperparameters (buffer size, learning starts, entropy coefficient) from it.
params = {
    "VERBOSE": 1,
    "SETTING": 'reduced4+',            # 'reduced', 'reduced2', 'reduced3', 'reduced3+', 'reduced4', 'reduced4+'
    "OBJ_SPACE_LOW": np.array([-1.5, -1, -0.55, -0.44, -0.48, 0, -np.pi/2, -np.pi/2, -np.pi, -np.pi, -np.pi, -np.pi]), # lower bounds of the observation space (ee-pos, cyl-pos, joints)
    "OBJ_SPACE_HIGH": np.array([1.5, 1.8, 2.2, 0.48, 0.44, 1.12, np.pi/2, np.pi/2, np.pi, np.pi, np.pi, np.pi]), # upper bounds, same ordering as OBJ_SPACE_LOW
    "SPACE_NORM": 1,                   #  1 -> yes, 0 -> no (normalize the action and observation space)
    "CYLINDER": 'whole_table',         # 'no', 'fix', 'semi_random', 'semi_random_sides', 'half_table', '3/4-table', '7/8-table', 'whole_table'
    "BUFFER_SIZE": 20000,
    "THRESHOLD": 0.14,                  # initial threshold
    "THRESHOLD_SCHEDULING": 1,          # 1-> yes, 0-> no
    "MIN_THRESHOLD": 0.08,
    "REWARD_TYPE": 'sparse',           # 'sparse', 'dense', 'extra_dense'
    "LEARNING_STARTS": 10,             # number of random movements before learning starts (commented-out alternative: 100)
    "TOGGLE_REWARD": 0,
    "STEPS": 20,                       # number of steps while training (=num_episodes when MAX_EPISODE_LENGTH is 1) (commented-out alternative: 1500)
    "MAX_EPISODE_LENGTH": 1,           # 'None' (no limit) or value 
    "EXPLORATION": 1,                  # just leave it on 1 and ignore it
    "WRITER": writer,
    "USE_HER": 1,                      # 1-> yes, 0-> no
    "ENTROPY_COEFFICIENT": 0.007,      # 'auto' or value between 0 and 1 // 0.007 turned out to work well
    "GLOBAL_STEPPER": 0, 
    "EVALUATION_STEPS": 5,             # number of evaluation steps per investigated threshold (x4) (commented-out alternative: 50)
    "EVALS": [0.25, 0.20, 0.15, 0.10], # here, the list MUST always contain 4 thresholds for evaluation
    "BATCH_SIZE": 64,
    "ACTION_NOISE": None,
    "RANDOM_EXPLORATION": 0.0,
    "LR": 3e-4,
    "TB_LOGGER": None}

# Model

In [None]:
# Model cell: connect to the simulation, build the environment, create the SAC
# model (with or without a HER replay buffer) and run the training.

# create an experiment (connection)
exp = eaw.ExperimentWrapper()

# test if simulation can be reached (best effort: only reports, does not abort)
server_id = exp.client.test()
if server_id:
    print("Simulation is available, id: ", server_id)
else:
    print("Simulation NOT available")


# create an environment (choice depends on usage of HER)
if params["USE_HER"] == 1:
    env = SimGoalEnv(exp, params, writer)
else:
    env = SimEnv(exp, params, writer)

# check that the environment follows the interface expected by stable baselines3
env_checker.check_env(env)

# set reinforcement learning algorithm
model_class = SAC

########################
# Model initialization #
########################

if params["USE_HER"] == 1:
    # HER goal selection strategy
    goal_selection_strategy = 'future'

    # if True the HER transitions will get sampled online
    online_sampling = True

    # time limit for the episodes: read from the parameter cell instead of a
    # hard-coded 1, so the HER buffer cannot silently disagree with
    # params["MAX_EPISODE_LENGTH"] when that setting is changed
    # NOTE(review): the parameter cell also allows None ("no limit"); the HER
    # buffer presumably then has to infer the limit from the env -- confirm
    max_episode_length = params["MAX_EPISODE_LENGTH"]

    # model
    model = model_class(
        "MultiInputPolicy",
        env,
        buffer_size=params["BUFFER_SIZE"],
        learning_starts=params["LEARNING_STARTS"],
        ent_coef=params["ENTROPY_COEFFICIENT"],
        replay_buffer_class=HerReplayBuffer,
        # Parameters for HER
        replay_buffer_kwargs=dict(
            n_sampled_goal=4,
            goal_selection_strategy=goal_selection_strategy,
            online_sampling=online_sampling,
            max_episode_length=max_episode_length,
        ),
        verbose=params["VERBOSE"],
    )

    # start training via the project helper
    train(model, env, params)

else:
    model = model_class(
        "MlpPolicy",
        env,
        buffer_size=params["BUFFER_SIZE"],
        learning_starts=params["LEARNING_STARTS"],
        ent_coef=params["ENTROPY_COEFFICIENT"],
        verbose=params["VERBOSE"]
    )

    # start training (without train method)
    model.learn(params["STEPS"])
    

In [None]:
##############
# save model #
##############

# persist the trained model under the name "r4+_whole_table"
# (presumably relative to the notebook's working directory -- confirm)
model.save("r4+_whole_table")

In [None]:
####################
# model evaluation #
####################

# switch the environment into evaluation mode; this disables plotting during
# evaluation, which otherwise runs into issues with inconsistent lengths
env.set_eval(ev=True)

# run the evaluation helper from train_helpers on the trained model,
# using the current environment and parameter dictionary
evaluate(model, env, params)


# Continue Training

This part lets us continue training an existing model
(e.g. when the results are likely to improve with some more episodes).

In [None]:
#####################
# continue training #
#####################

# shut down the previous environment (and its processes) before replacing it
env.close()

# build a fresh environment; the class depends on whether HER is used
env = (SimGoalEnv if params["USE_HER"] == 1 else SimEnv)(exp, params)

# validate the new environment
env_checker.check_env(env)

# attach the new environment to the already trained model
model.set_env(env)

# adjust the run settings for the additional training
# NOTE(review): the original remark here was "actually 700, because 100 are random
# movements"; the parameter cell currently sets LEARNING_STARTS to 10 -- confirm
params.update(STEPS=800, EVALS=[0.25, 0.20, 0.15, 0.10])

# resume training with the updated settings
train(model, env, params)

# Experimental Section

Everything below here is just experimental and is not guaranteed to work.

## Multiple trainings in a row

This should make it possible to run multiple (different) experiments one after another.

In [None]:
# parameters
# One configuration dictionary per experiment, keyed by a running experiment number.
# Each entry is used by the loop below in place of the single `params` dictionary.
pars = {
    # experiment 1: SAC with HER
    1: {
        "VERBOSE": 1,
        "SETTING": 'reduced2',
        "CYLINDER": 'semi_random_sides',  # 'fix', 'semi_random', 'semi_random_sides'
        "BUFFER_SIZE": 20000,
        "THRESHOLD": 0.30,
        "REWARD_TYPE": 'sparse',
        "LEARNING_STARTS": 80,
        "TOGGLE_REWARD": 0,
        "STEPS": 500,
        "EXPLORATION": 1,
        "WRITER": writer,
        "USE_HER": 1,
        "ENTROPY_COEFFICIENT": 0.01,  # 'auto' or value between 0 and 1
        "GLOBAL_STEPPER": 0,
        "EVALUATION_STEPS": 30,
        "BATCH_SIZE": 64,
        "ACTION_NOISE": None,
        "RANDOM_EXPLORATION": 0.0,
        "LR": 3e-4,
        "TB_LOGGER": None,
    },
    # experiment 2: plain SAC (no HER)
    2: {
        "VERBOSE": 1,
        "SETTING": 'reduced2',  # 'reduced', 'reduced2', 'reduced3'
        "CYLINDER": 'semi_random_sides',  # 'fix', 'semi_random', 'semi_random_sides'
        "BUFFER_SIZE": 20000,
        # key was misspelled "TRESHOLD", so the value never reached code that
        # reads "THRESHOLD" (the spelling used by experiment 1 and by `params`)
        "THRESHOLD": 0.2,
        "REWARD_TYPE": 'sparse',
        "LEARNING_STARTS": 80,
        "TOGGLE_REWARD": 0,
        "STEPS": 500,
        "EXPLORATION": 1,
        "WRITER": writer,
        "USE_HER": 0,
        "ENTROPY_COEFFICIENT": 0.01,  # 'auto' or value between 0 and 1
        "GLOBAL_STEPPER": 0,
        "EVALUATION_STEPS": 30,
        "BATCH_SIZE": 64,
        "ACTION_NOISE": None,
        "RANDOM_EXPLORATION": 0.0,
        "LR": 3e-4,
        "TB_LOGGER": None,
    },
}

# Run every configured experiment in sequence: build the environment and model
# from the experiment's own configuration, train, then report a success rate.
for key, config in pars.items():
    # create new log and make this run actually use it (previously the fresh
    # writer was created but the config kept pointing at the old one)
    # NOTE(review): with an explicit log_dir the `comment` suffix presumably has
    # no effect, so all runs log into the same directory -- confirm
    writer = SummaryWriter('tensorboard_log/', comment="-SAC_HER_buff20000")  # set up tensorboard storage
    config["WRITER"] = writer

    # create an experiment (connection)
    exp = eaw.ExperimentWrapper()

    # test if simulation can be reached (best effort: only reports, does not abort)
    server_id = exp.client.test()
    if server_id:
        print("Simulation is available, id: ", server_id)
    else:
        print("Simulation NOT available")

    # create an environment (choice depends on usage of HER)
    use_her = config["USE_HER"] == 1
    if use_her:
        env = SimGoalEnv(exp, config)
    else:
        env = SimEnv(exp, config)

    # check env
    env_checker.check_env(env)

    # set reinforcement learning algorithm
    model_class = SAC

    ######################
    # Model -> SAC + HER #
    ######################

    if use_her:
        # HER goal selection strategy
        goal_selection_strategy = 'future'

        # If True the HER transitions will get sampled online
        online_sampling = True

        # Time limit for the episodes (1 unless the experiment config overrides it)
        max_episode_length = config.get("MAX_EPISODE_LENGTH", 1)

        # model
        model = model_class(
            "MultiInputPolicy",
            env,
            buffer_size=config["BUFFER_SIZE"],
            learning_starts=config["LEARNING_STARTS"],
            ent_coef=config["ENTROPY_COEFFICIENT"],
            replay_buffer_class=HerReplayBuffer,
            # Parameters for HER
            replay_buffer_kwargs=dict(
                n_sampled_goal=4,
                goal_selection_strategy=goal_selection_strategy,
                online_sampling=online_sampling,
                max_episode_length=max_episode_length,
            ),
            verbose=config["VERBOSE"],
        )

        # train with THIS experiment's configuration (the global `params`
        # dictionary was passed here before, ignoring the per-run settings)
        train(model, env, config)

    else:
        model = model_class(
            "MlpPolicy",
            env,
            buffer_size=config["BUFFER_SIZE"],
            learning_starts=config["LEARNING_STARTS"],
            ent_coef=config["ENTROPY_COEFFICIENT"],
            verbose=config["VERBOSE"])

        model.learn(config["STEPS"])

    print("Evaluation: ")

    ##############
    # Evaluation #
    ##############
    n_eval = config["EVALUATION_STEPS"]

    successful = 0
    failed = 0

    obs = env.reset()
    for _ in range(n_eval):
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        # a step that ends the episode is counted as a success
        # NOTE(review): if the env also sets `done` on a time limit, failures
        # would be counted as successes -- confirm against the env
        if done:
            successful += 1
        else:
            failed += 1
        # every evaluation step starts from a freshly reset environment
        obs = env.reset()

    # success rate (guarded against a configured 0 evaluation steps)
    print("success_rate: ", successful / n_eval if n_eval else 0.0)

    # release this run's resources before the next experiment starts
    env.close()
    writer.close()


## Parallelization

In [None]:
from collections import deque 

class ResultBuffer():
    def __init__(self):
        # store from reset
        self.r_pos_cyl
        self.r_r_js
        self.r_pos_ee
        self.r_obs
        
        # store from step
        self.s_r_js
        self.s_pos_ee
        self.s_obs
        self.s_dist
        self.s_reward
        self.s_done
        self.s_info
        
    def append_reset(self, r_pos_cyl, r_r_js, r_pos_ee, r_obs):
        self.r_pos_cyl = r_pos_cyl
        self.r_r_js = r_r_js
        self.r_pos_ee = r_pos_ee
        self.r_obs = r_obs