# Imports

In [None]:
# general imports
import sys                       
import numpy as np
import time

# gym for spaces and environment definitions
import gym
# env checker
from stable_baselines3.common import env_checker

# find nan and inf values propagating quietly in the network
from stable_baselines3.common.vec_env import DummyVecEnv, VecCheckNan

# stable baselines3 -> TD3
from stable_baselines3 import TD3

# stable baselines3 -> HER
from stable_baselines3.her.her_replay_buffer import HerReplayBuffer
from stable_baselines3.her.goal_selection_strategy import GoalSelectionStrategy

# Tensorboard
from torch.utils.tensorboard import SummaryWriter

# grpc communication
sys.path.insert(1, '/tum_nrp/grpc/python/communication')
import experiment_api_wrapper as eaw

# for auto-reloading external modules see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

from env import SimEnv, SimGoalEnv
from train_helpers import evaluate3, train

# Parameters

In [None]:
# Central experiment configuration. The same dict is handed to the
# environments and the train/evaluate helpers further down in this notebook.
params = {
    "VERBOSE": 1,

    # Setting name; one of: 'reduced', 'reduced2', 'reduced3', 'reduced3+',
    # 'reduced4', 'reduced4+', 'full'
    "SETTING": 'reduced4+',

    # Lower bounds of the observation space (ee-pos, cyl-pos, joints).
    "OBJ_SPACE_LOW": np.array([
        -0.92, -0.51, 0.58,
        -0.44, -0.48, 0,
        -np.pi/2, -np.pi/2, -0.001, -np.pi/2, -0.001, -np.pi,
    ]),
    # Upper bounds of the observation space, same ordering as the lower bounds.
    "OBJ_SPACE_HIGH": np.array([
        0.92, 1.32, 2.07,
        0.48, 0.44, 1.12,
        np.pi/2, 0.001, np.pi, np.pi/2, np.pi, np.pi,
    ]),

    # Normalize the action and observation space: 1 -> yes, 0 -> no
    "SPACE_NORM": 1,

    # Cylinder placement; one of: 'no' (no augmentation), 'fix', 'semi_random',
    # 'semi_random_sides', 'half_table', '3/4-table', '7/8-table', 'whole_table'
    "CYLINDER": 'whole_table',

    "BUFFER_SIZE": 2000,

    # Initial threshold
    "THRESHOLD": 0.2,
    # Threshold scheduling: 1 -> yes, 0 -> no
    "THRESHOLD_SCHEDULING": 1,
    "MIN_THRESHOLD": 0.02,

    # Reward type; one of: 'sparse', 'dense', 'extra_dense'
    "REWARD_TYPE": 'dense',

    # Number of random movements before learning starts
    "LEARNING_STARTS": 100,
    "TOGGLE_REWARD": 0,

    # Number of training steps (= number of episodes when MAX_EPISODE_LENGTH is 1)
    "STEPS": 10000,
    # None (no limit) or a step count
    "MAX_EPISODE_LENGTH": 1,

    # Leave this at 1 and ignore it
    "EXPLORATION": 1,

    # Use Hindsight Experience Replay: 1 -> yes, 0 -> no
    "USE_HER": 1,
    "GLOBAL_STEPPER": 0,

    # Number of evaluation steps per investigated threshold (x4)
    "EVALUATION_STEPS": 500,
    # The list MUST always contain exactly 4 thresholds for evaluation
    "EVALS": [0.1, 0.07, 0.05, 0.03],

    "BATCH_SIZE": 100,
    "ACTION_NOISE": None,
    "LR": 1e-3,

    # TD3-specific settings
    "TARGET_POLICY_NOISE": 0.2,
    "TARGET_NOISE_CLIP": 0.5,
}

# Model

The next cell trains a model from scratch. If you want to use a pre-trained model, skip this.

In [None]:
# Train a TD3 model from scratch (optionally with HER), logging to tensorboard.
# Defines `file_name`, `writer`, `exp`, `env` and `model` for the cells below.

# Unique run name built from the key parameters plus a timestamp.
timestr = time.strftime("%Y%m%d-%H%M%S")
file_name = f"TD3_{params['THRESHOLD']}_{params['REWARD_TYPE']}_{params['SETTING']}_{params['STEPS']}_{timestr}"

# One tensorboard directory shared by the SummaryWriter and the model.
tensorboard_dir = f"tensorboard_log/{file_name}/"
writer = SummaryWriter(log_dir=tensorboard_dir)

# create an experiment (connection)
exp = eaw.ExperimentWrapper()

# test if simulation can be reached (only reported, execution continues either way)
server_id = exp.client.test()
if server_id:
    print("Simulation is available, id: ", server_id)
else:
    print("Simulation NOT available")

# create an environment (choice depends on usage of HER)
env = (
    SimGoalEnv(exp, params, writer)
    if params["USE_HER"] == 1
    else SimEnv(exp, params, writer)
)

# Optional debugging aids, enable when needed:
# check env
# env_checker.check_env(env)

# find nan and inf values propagating quietly in the network
# env = DummyVecEnv([lambda: env])
# env = VecCheckNan(env, raise_exception=True)

# set reinforcement learning algorithm
model_class = TD3

########################
# Model initialization #
########################

# Constructor arguments that are identical with and without HER.
td3_kwargs = dict(
    learning_rate=params["LR"],
    buffer_size=params["BUFFER_SIZE"],
    learning_starts=params["LEARNING_STARTS"],
    batch_size=params["BATCH_SIZE"],
    action_noise=params["ACTION_NOISE"],
    target_policy_noise=params["TARGET_POLICY_NOISE"],
    target_noise_clip=params["TARGET_NOISE_CLIP"],
    tensorboard_log=tensorboard_dir,
    verbose=params["VERBOSE"],
)

if params["USE_HER"] == 1:
    # HER goal selection strategy
    goal_selection_strategy = 'future'

    # if True the HER transitions will get sampled online
    online_sampling = True

    # Time limit for the episodes, taken from the shared configuration so the
    # replay buffer cannot drift from the value the environment receives.
    # NOTE(review): with a None limit the HER buffer presumably has to infer
    # the length from the env (e.g. a TimeLimit wrapper) - confirm before
    # setting MAX_EPISODE_LENGTH to None.
    max_episode_length = params["MAX_EPISODE_LENGTH"]

    # Dict observations of the goal env require the multi-input policy.
    model = model_class(
        "MultiInputPolicy",
        env,
        replay_buffer_class=HerReplayBuffer,
        # Parameters for HER
        replay_buffer_kwargs=dict(
            n_sampled_goal=4,
            goal_selection_strategy=goal_selection_strategy,
            online_sampling=online_sampling,
            max_episode_length=max_episode_length,
        ),
        **td3_kwargs,
    )

    # start training
    train(model, env, params)

else:
    model = model_class("MlpPolicy", env, **td3_kwargs)

    # start training (without train method)
    model.learn(params["STEPS"])

In [None]:
##############
# save model #
##############
# Store the trained model under the run name created in the training cell.
model.save(f"./saved_models/{file_name}")

# Last expression of the cell, so the notebook displays the model parameters.
model.get_parameters()

In [None]:
##############
# load model #
##############
# Restore a previously saved model and attach it to a fresh environment.

# new connection to the experiment
exp = eaw.ExperimentWrapper()

# name of the stored run; also used as the tensorboard sub-directory
load_file_name = "TD3_0.2_dense_reduced4+_10000_20210729-123802"
writer = SummaryWriter(log_dir=f"tensorboard_log/{load_file_name}/")

# report whether the simulation can be reached
server_id = exp.client.test()
if not server_id:
    print("Simulation NOT available")
else:
    print("Simulation is available, id: ", server_id)

# make a new environment (goal-based variant when HER is used)
env = (
    SimGoalEnv(exp, params, writer)
    if params["USE_HER"] == 1
    else SimEnv(exp, params, writer)
)

# switch the environment into evaluation mode
env.set_eval(ev=True)

# optional sanity check of the environment
# env_checker.check_env(env)

# load the stored model and bind it to the new environment
model = TD3.load(f"./saved_models/{load_file_name}", env=env)

In [None]:
####################
# model evaluation #
####################

# Put the environment into evaluation mode; according to the original note
# this disables plotting, which had issues with inconsistent lengths.
env.set_eval(ev=True)

# Evaluate the model, passing the largest configured threshold as upper bound.
evaluate3(
    model,
    env,
    params,
    writer,
    max_threshold=max(params["EVALS"]),
)

# Continue Training

This part enables us to continue the training.
(e.g. when results are likely to improve with some more episodes).

In [None]:
#####################
# continue training #
#####################
# Resume training of the current `model` on a fresh environment and store the
# result under a new run name.

# make adjustments: number of additional training steps
params["STEPS"] = 10000

# set up logging under a new, timestamped run name
timestr = time.strftime("%Y%m%d-%H%M%S")
file_name = f"TD3_{params['THRESHOLD']}_{params['REWARD_TYPE']}_{params['SETTING']}_{params['STEPS']}_{timestr}"
tensorboard_dir = f"tensorboard_log/{file_name}/"
writer = SummaryWriter(log_dir=tensorboard_dir)

# close old processes before the environment is replaced
env.close()

# make a new environment (goal-based variant when HER is used)
env = (
    SimGoalEnv(exp, params, writer)
    if params["USE_HER"] == 1
    else SimEnv(exp, params, writer)
)

# optional sanity check of the environment
# env_checker.check_env(env)

# point the model at the new environment and the new log directory
model.set_env(env)
model.tensorboard_log = tensorboard_dir

if params["USE_HER"] == 1:
    # HER runs go through the project's train helper
    train(model, env, params)
else:
    # plain TD3 uses the built-in learning loop
    model.learn(params["STEPS"])

# store the updated model; the last expression displays its parameters
model.save(f"./saved_models/{file_name}")
model.get_parameters()