In [1]:
import torch
import json
import pickle
import yaml
import random
import torch.nn as nn
import numpy as np
from tqdm import trange
import data_panda as rbt
from torch.utils.tensorboard import SummaryWriter
import glob
import joblib
import os
from optuna.trial import TrialState
import logging
import sys
import optuna





# Select the compute device; training in this notebook is meant to run on a GPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# A torch.device never compares equal to a plain string, so the check has to
# look at the device's `type` attribute for the guard to actually fire.
if device.type == 'cpu':
    print('gpu is not available')
    print(device)
    # sys.exit raises SystemExit, which stops the run in both a script and a notebook.
    sys.exit(1)

# Load the experiment configuration (path is relative to the working directory).
config = rbt.load_config("config_dueling.yaml")

def objective(trial):
    """Optuna objective: train a dueling DQN agent and return its best evaluation reward.

    Three hyperparameters are sampled from ``trial`` (replay-buffer length,
    learning-rate exponent and target-network refresh rate); every other
    setting is read from the module-level ``config``. Checkpoints, the replay
    buffer, the hyperparameters and the TensorBoard logs are written to a
    fresh result directory obtained from ``rbt.set_res_dir``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The highest mean reward per episode seen at any evaluation step.
    """
    # Seed every RNG first so that network initialisation is reproducible too.
    seed = config['seed']
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    # --- environment -------------------------------------------------------
    state_shape = config['state_shape']
    env = rbt.Panda_RL()
    env.renderize = config['renderize']  # stop robot viewing
    env.delta = config['delta']
    env.step_penalty = config['step_penalty']
    env.collision_penalty = config['collision_penalty']
    env.sig_p = config['sig_p']
    env.sig_R = config['sig_R']

    # --- agent and target network -------------------------------------------
    agent = rbt.DuelingDQNAgent(
        state_shape, device, config["dueling_layers"], epsilon=1).to(device)
    target_network = rbt.DuelingDQNAgent(
        state_shape, device, config["dueling_layers"], epsilon=1).to(device)

    NEW_BUFFER = config['NEW_BUFFER']  # collect a new buffer instead of restoring one
    LOAD_MODEL = config['LOAD_MODEL']

    # --- replay buffer and result directory ---------------------------------
    buffer_len = trial.suggest_int('Buffer len', 1000, 20000)
    exp_replay = rbt.PrioritizedReplayBuffer(buffer_len)
    RES_DIR = rbt.set_res_dir(comment=config['comment'])

    # Default epsilon-decay horizon; a resumed run overrides it below.
    percentage_of_total_steps = config['percentage_of_total_steps']
    if LOAD_MODEL:
        folder = config['resume_folder']
        agent.load_weights(folder, model=config['model_resume'])
        percentage_of_total_steps = config['percentage_of_total_steps_resume']
        print(f"Loaded {folder} {config['model_resume']} ")

    # Sync the target network only after any resumed weights are in place.
    target_network.load_state_dict(agent.state_dict())

    if NEW_BUFFER:
        # Fill the buffer with samples collected from the environment:
        # up to 500 resets of 50 steps each, stopping once the buffer is full.
        for _ in trange(500, desc="Buffering", ncols=70):
            state = env.reset()
            rbt.play_and_record(state, agent, env, exp_replay, n_steps=50)
            if len(exp_replay) >= buffer_len:
                break
        print(f"New buffer with {len(exp_replay)} samples")
    else:
        exp_replay.load_buffer(config['resume_folder'])

    tmax = config['tmax']
    # These two are applied after buffering, as in the original run order.
    env.reset_j1 = config['reset_j1']
    env.mag = config['mag']

    # --- training parameters -------------------------------------------------
    timesteps_per_epoch = config['timesteps_per_epoch']
    batch_size = config['batch_size']
    total_steps = config['total_steps']

    # The learning rate is searched on a log10 grid: lr = 10 ** lr_exp.
    lr_exp = trial.suggest_int('lr', -4, -2)
    lr = 10 ** lr_exp
    opt = torch.optim.Adam(agent.parameters(), lr=lr)

    # Exploration schedule.
    start_epsilon = config['start_epsilon']
    end_epsilon = config['end_epsilon']
    eps_decay_final_step = percentage_of_total_steps * total_steps

    # Frequencies for target-network refresh and evaluation.
    refresh_target_network_freq = trial.suggest_int('Refresh rate', 100, 300)
    eval_freq = config['eval_freq']

    # Gradient clipping threshold.
    max_grad_norm = config['max_grad_norm']

    hyperparameters_train = {"start_epsilon": start_epsilon,
                             "end_epsilon": end_epsilon,
                             "lr": lr_exp,
                             "batch_size": batch_size,
                             "total_steps": total_steps,
                             "percentage_of_total_steps": percentage_of_total_steps,
                             "refresh_target_network_freq": refresh_target_network_freq,
                             "buffer_len": buffer_len,
                             "tmax": tmax
                             }

    # A single writer for the whole trial; closed in `finally` below.
    tb = SummaryWriter(log_dir=RES_DIR)
    rw_max = -np.inf
    try:
        with open(os.path.join(RES_DIR, "hyperparameters.json"), "w") as outfile:
            json.dump(hyperparameters_train, outfile)

        # Start training
        state = env.reset()
        loss_min = np.inf
        print(f"buffer size = {len(exp_replay)} ")
        print(f"Frequency evaluation = {eval_freq}")
        print(f"Device: {device}")

        for step in trange(total_steps + 1, desc="Training", ncols=70):
            # Reduce exploration as training progresses.
            agent.epsilon = rbt.epsilon_schedule(
                start_epsilon, end_epsilon, step, eps_decay_final_step)

            # Take timesteps_per_epoch steps and add them to the replay buffer.
            _, state = rbt.play_and_record(
                state, agent, env, exp_replay, timesteps_per_epoch)

            # Train on a prioritized batch sampled from the replay buffer.
            states, actions, rewards, next_states, done_flags, weights, idxs = \
                exp_replay.sample(batch_size)
            actions = [agent.get_action_index(a) for a in actions]

            opt.zero_grad()
            loss = rbt.compute_td_loss_priority_replay(
                agent, target_network, exp_replay,
                states, actions, rewards, next_states, done_flags, weights, idxs,
                gamma=0.99,
                device=device)
            loss.backward()
            nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm)
            opt.step()

            # Work with a plain float so the autograd graph is not kept alive
            # by the running minimum.
            loss_value = loss.item()
            if loss_value < loss_min:
                torch.save(agent.state_dict(),
                           os.path.join(RES_DIR, 'best-model-loss.pt'))
                loss_min = loss_value
            tb.add_scalar("1/Epsilon", agent.epsilon, step)
            tb.add_scalar("1/TD Loss", loss_value, step)

            if step % refresh_target_network_freq == 0:
                # Load agent weights into target_network
                target_network.load_state_dict(agent.state_dict())

            if step % eval_freq == 0:
                # Evaluate the greedy policy.
                assert not np.isnan(loss_value), "TD loss became NaN"
                m_reward, m_steps, m_collisions, m_successes, fit, _ = rbt.evaluate(
                    env, agent, n_games=10, greedy=True, t_max=tmax)
                tb.add_scalar("1/Mean reward per episode", m_reward, step)
                tb.add_scalar("1/Mean of steps", m_steps, step)
                tb.add_scalar("2/Mean fitness reached", fit, step)
                tb.add_scalar("2/Mean of collisions", m_collisions, step)
                tb.add_scalar("2/Mean of successes", m_successes, step)

                # The reward only changes at evaluation steps, so the best-reward
                # checkpoint is checked here rather than on every step.
                if m_reward > rw_max:
                    torch.save(agent.state_dict(),
                               os.path.join(RES_DIR, 'best-model-rw.pt'))
                    rw_max = m_reward
    finally:
        # Flush and release the TensorBoard writer even if training fails.
        tb.close()

    exp_replay.save_buffer(RES_DIR)
    torch.save(agent.state_dict(), os.path.join(RES_DIR, 'last-model.pt'))
    rbt.eval_trained_models(env, agent, RES_DIR, device)

    return rw_max

In [2]:
def report_optuna_experiment(study):
    """Print a summary of an Optuna study: trial counts and the best trial.

    Args:
        study: The Optuna study to report on.

    If no trial has completed yet (for example every trial was pruned or
    failed), the trial counts are still printed and the best-trial section is
    replaced by a short notice, because ``study.best_trial`` raises in that case.
    """
    pruned_trials = study.get_trials(deepcopy=False, states=[TrialState.PRUNED])
    complete_trials = study.get_trials(deepcopy=False, states=[TrialState.COMPLETE])

    print("Study statistics: ")
    print("  Number of finished trials: ", len(study.trials))
    print("  Number of pruned trials: ", len(pruned_trials))
    print("  Number of complete trials: ", len(complete_trials))

    # Without a completed trial there is no best trial to look up.
    if not complete_trials:
        print("Best trial: none (no trial has completed yet)")
        return

    print("Best trial:")
    trial = study.best_trial

    print("  Value: ", trial.value)

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

In [3]:

# Mirror Optuna's log messages on stdout so they show up in the cell output.
# Guarded so that re-running this cell does not stack duplicate handlers.
optuna_logger = optuna.logging.get_logger("optuna")
if not any(
    isinstance(handler, logging.StreamHandler)
    and getattr(handler, "stream", None) is sys.stdout
    for handler in optuna_logger.handlers
):
    optuna_logger.addHandler(logging.StreamHandler(sys.stdout))

study_name = "study_teste"  # Unique identifier of the study.
storage_name = "sqlite:///{}.db".format(study_name)
# The study is persisted in a SQLite file, so a second run would otherwise fail
# because the name already exists; load_if_exists resumes the stored study instead.
study = optuna.create_study(
    study_name=study_name,
    storage=storage_name,
    direction="maximize",
    load_if_exists=True,
)


study.optimize(objective, n_trials=3)
# study = joblib.load('study_new_env1.pkl')
# print(study.trials_dataframe())





[32m[I 2022-10-27 17:35:07,835][0m A new study created in RDB with name: study_teste[0m


A new study created in RDB with name: study_teste
Current number of result directories: 5
runs/teste_6


pybullet build time: Jul 21 2022 19:48:53
Buffering:  19%|███▉                 | 94/500 [00:05<00:24, 16.80it/s]


New buffer with 4702 samples
buffer size = 4702 
Frequency evaluation = 2000
Device: cuda


Training: 100%|██████████████| 100001/100001 [10:18<00:00, 161.57it/s]


Replay Buffer and priorities Saved
runs/teste_6/last-model.pt
11976.304011692726 453.4 0.1 0.0 3.2291898809190953
runs/teste_6/best-model-loss.pt
27541.145886890903 477.5 0.05 0.0 2.5384263642128824
runs/teste_6/best-model-rw.pt


[32m[I 2022-10-27 17:46:01,049][0m Trial 0 finished with value: 51305.291787838665 and parameters: {'Buffer len': 4702, 'lr': -3, 'Refresh rate': 223}. Best is trial 0 with value: 51305.291787838665.[0m


46420.27837721947 474.8 0.05 0.0 2.570781643868136
m_reward,m_steps,m_collisions,m_successes,fit
Trial 0 finished with value: 51305.291787838665 and parameters: {'Buffer len': 4702, 'lr': -3, 'Refresh rate': 223}. Best is trial 0 with value: 51305.291787838665.
Current number of result directories: 6
runs/teste_7


Buffering:  10%|██                   | 48/500 [00:02<00:24, 18.34it/s]


New buffer with 2446 samples
buffer size = 2446 
Frequency evaluation = 2000
Device: cuda


Training: 100%|██████████████| 100001/100001 [08:20<00:00, 199.88it/s]


Replay Buffer and priorities Saved
runs/teste_7/last-model.pt
1261.1411313627386 65.6 1.0 0.0 2.9126755777787596
runs/teste_7/best-model-loss.pt
-6489.02303891498 75.8 1.0 0.0 3.79049956821493
runs/teste_7/best-model-rw.pt


[32m[I 2022-10-27 17:54:35,109][0m Trial 1 finished with value: 46209.13466826778 and parameters: {'Buffer len': 2446, 'lr': -2, 'Refresh rate': 249}. Best is trial 1 with value: 46209.13466826778.[0m


36043.20051427425 426.2 0.15 0.0 2.8288687423138947
m_reward,m_steps,m_collisions,m_successes,fit
Trial 1 finished with value: 46209.13466826778 and parameters: {'Buffer len': 2446, 'lr': -2, 'Refresh rate': 249}. Best is trial 1 with value: 46209.13466826778.
Current number of result directories: 7
runs/teste_8


Buffering:  72%|██████████████▍     | 362/500 [00:18<00:06, 19.91it/s]


New buffer with 18149 samples
buffer size = 18149 
Frequency evaluation = 2000
Device: cuda


Training: 100%|██████████████| 100001/100001 [10:40<00:00, 156.12it/s]


Replay Buffer and priorities Saved
runs/teste_8/last-model.pt
36899.17723838846 432.4 0.15 0.0 2.6335973290679915
runs/teste_8/best-model-loss.pt
36717.381212163964 450.35 0.1 0.0 2.719322654055339
runs/teste_8/best-model-rw.pt


[32m[I 2022-10-27 18:05:58,992][0m Trial 2 finished with value: 47790.620073709775 and parameters: {'Buffer len': 18149, 'lr': -3, 'Refresh rate': 123}. Best is trial 1 with value: 46209.13466826778.[0m


35218.16564118344 378.4 0.25 0.0 2.7277807860797383
m_reward,m_steps,m_collisions,m_successes,fit
Trial 2 finished with value: 47790.620073709775 and parameters: {'Buffer len': 18149, 'lr': -3, 'Refresh rate': 123}. Best is trial 1 with value: 46209.13466826778.


In [10]:
# with open('runs/env1_after_opt_27/buffer.pickle', 'rb') as handle:
#     buffer=pickle.load(handle)


# with open('runs/env1_after_opt_27/priorities.pickle', 'rb') as handle:
#     prio=pickle.load(handle)
# print("Replay Buffer Loaded")

# with open('runs/env1_after_opt_27/buffer_3000.pickle', 'wb') as handle:
#     pickle.dump(buffer[0:5000], handle)
    
# with open('runs/env1_after_opt_27/prio_3000.pickle', 'wb') as handle:
#     pickle.dump(prio[0:5000], handle)
# print("Replay Buffer Saved")

Replay Buffer Loaded
Replay Buffer Saved


In [None]:
# study = optuna.create_study(direction="maximize")
# study.optimize(objective, n_trials=4)
# joblib.dump(study, "study_new_env1.pkl")

