In [1]:
# Execute the companion notebook in this kernel's namespace to load the agent
# and environment code. Later cells use names that are not defined in this
# notebook (e.g. Config, Model, Agent, ActionNoise, torch, np, random);
# presumably they all come from this %run -- TODO confirm.
%run Agent_Env_concat_changed_TD3.ipynb

In [2]:
# TensorBoard logging: event files are written under the runs/ directory
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(log_dir='runs/')

In [3]:
# Build the configuration object; its attributes (SEED_NUMBER, ACTION_DIM,
# EPISODE_LEN, TOT_T, BATCH_SIZE, ...) are read by the cells below.
# Config is not defined in this notebook -- presumably provided by the %run above.
arg = Config()

In [4]:
# Display every configuration attribute as a dict
vars(arg)

{'SEED_NUMBER': 0,
 'ACTION_DIM': 2,
 'STATE_DIM': 22,
 'TERMINAL_ACTION': tensor(0.1000),
 'DELTA_T': tensor(1),
 'EPISODE_TIME': tensor(40),
 'EPISODE_LEN': tensor(40),
 'TOT_T': 2000000000,
 'BATCH_SIZE': 512,
 'REWARD': tensor(10),
 'DISCOUNT_FACTOR': 0.99,
 'SOFT_UPDATE_TAU': 0.005,
 'STD_STEP_SIZE': 0.0001,
 'MEMORY_SIZE': 500000.0,
 'filename': '20200605-110526',
 'data_path': './',
 'goal_radius_range': tensor([0.1625, 0.1625]),
 'initial_radius_range': [0.25, 1.0],
 'relative_angle_range': tensor([-0.6981,  0.6981]),
 'process_gain_range': tensor([0.0500, 0.0500, 0.1571, 0.1571]),
 'noise_covariance_range': tensor([2.5000e-05, 2.5000e-03, 2.4674e-04, 2.4674e-02]),
 'perturbation_velocity_range': [-0.05,
  0.05,
  -0.15707963267948966,
  0.15707963267948966],
 'perturbation_duration': 10,
 'perturbation_std': 2,
 'perturbation_delay_T_range': [1, 10]}

In [5]:
# Fix every random number generator so that results can be repeated.
random.seed(arg.SEED_NUMBER)        # Python built-in RNG
np.random.seed(arg.SEED_NUMBER)     # NumPy global RNG
torch.manual_seed(arg.SEED_NUMBER)  # PyTorch CPU RNG
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(arg.SEED_NUMBER)  # PyTorch RNG on every visible GPU

# Only use deterministic cuDNN algorithms ...
torch.backends.cudnn.deterministic = True
# ... and do NOT let cuDNN auto-tune (benchmark) for the fastest algorithm:
# the auto-tuner may pick a different algorithm from run to run, which
# breaks reproducibility. The flag must therefore be False, not True.
torch.backends.cudnn.benchmark = False

In [6]:
# Select the compute device: first GPU when CUDA is available, otherwise CPU.
CUDA = torch.cuda.is_available()
device = torch.device("cuda:0" if CUDA else "cpu")

# Report which device is in use. Querying the GPU name without CUDA raises,
# so only ask for it when a GPU is actually present.
if CUDA:
    print(torch.cuda.get_device_name(device))
else:
    print("CUDA is not available; running on CPU")

GeForce RTX 2070


In [7]:
# Build the environment with perturbations and the radius staircase switched off
env = Model(arg, enable_perturbation=False, enable_radius_staircase=False)

# Reset once to obtain the initial conditions of the first trial
(x, target_position, isPerturbation,
 pro_gains, pro_noise_covariance, goal_radius) = env.reset()

In [8]:
# Save the configuration used for this run under ./trained_agent/
import os

filename = arg.filename
#filename = '20200602-183506'  # uncomment to use a fixed file name instead

argument = arg.__dict__
# torch.save does not create missing directories, so make sure the target exists
os.makedirs('./trained_agent', exist_ok=True)
torch.save(argument, './trained_agent/'+filename+'_arg_test.pkl')

In [9]:
# Create the agent, then reset its belief step for the first trial using the
# trial parameters returned by env.reset()
agent = Agent(arg)
#agent.load(filename)
(b_v, b_x, state) = agent.Bstep.reset(
    pro_gains, pro_noise_covariance, goal_radius, target_position, isPerturbation
)

In [10]:
# Exploration noise added in action space: zero mean, initial std of 0.20.
# The training loop below lowers action_std over time and passes the new
# value to noise.reset().
action_std = torch.tensor(0.20)
noise = ActionNoise(arg.ACTION_DIM, mean=torch.tensor(0), std=action_std)

In [11]:
# Main training loop: roll out trials, store every transition in the replay
# memory, train the agent, and report statistics every 500 episodes.
#
# Relies on names created by earlier cells: arg, env, agent, noise, action_std,
# writer, device, and the current trial variables x, state, b_v, b_x,
# target_position and isPerturbation.

tot_t = 0.  # total number of time steps taken so far
episode = 0.  # number of finished trials (episodes)

# Running sums for the current logging window, kept separately for
# perturbation trials and normal trials; all are reset after each report.
episode_number_perturbation = 0
reward_log_perturbation = 0
rewarded_number_log_perturbation = 0
step_log_perturbation = 0
episode_number_normal = 0
reward_log_normal = 0
rewarded_number_log_normal = 0
step_log_normal = 0
policy_loss_log = 0
value_loss_log = 0

while tot_t < arg.TOT_T:
    t = torch.zeros(1)  # time step within the current trial

    cross_start_threshold = False
    done = torch.tensor([0])  # done flag stored with each transition
    while t <= arg.EPISODE_LEN:  # a trial lasts at most arg.EPISODE_LEN steps
        action, action_orgin = agent.select_action(state, action_noise=noise)

        # Start threshold: a stop only ends the trial once the agent has
        # produced at least one non-terminal action.
        if not cross_start_threshold and not is_terminal_action(action_orgin, arg.TERMINAL_ACTION):
            cross_start_threshold = True

        next_x, reached_target, perturbation_v_t, perturbation_w_t = env(x, action, t)  # track true next_x
        next_b_v, next_b_x, info = agent.Bstep(b_v, b_x, next_x, action, action_orgin,
                                               perturbation_v_t, perturbation_w_t)
        t += 1
        next_state = agent.Bstep.Breshape(next_b_v, next_b_x, t)  # reshaped updated belief

        reward = return_reward(target_position, info, reached_target, next_b_x, env.goal_radius,
                               arg.REWARD, finetuning=0)  # gaussian reward based on belief

        # If the target is not caught within EPISODE_LEN steps, the trial is reset.
        TimeEnd = (t == arg.EPISODE_LEN)

        if (info['stop'] and cross_start_threshold) or TimeEnd:
            # Remember the type of the trial that just ended before env.reset()
            # overwrites isPerturbation with the next trial's value.
            isPerturbation_log = isPerturbation
            done = torch.tensor([1])
            next_x, target_position, isPerturbation, pro_gains, pro_noise_covariance, goal_radius = env.reset()

            next_b_v, next_b_x, next_state = agent.Bstep.reset(pro_gains, pro_noise_covariance, goal_radius,
                                                               target_position, isPerturbation)

        # Store the transition on the device selected earlier in the notebook
        # (instead of unconditionally calling .cuda(), which fails without a GPU).
        agent.memory.push(state.to(device), action, next_state.to(device),
                          reward.to(device), done.to(device))  # episodic
        # Continuous-task alternative: always push a done flag of 0, i.e.
        # replace done.to(device) with torch.tensor([0]).to(device).

        if tot_t > 1000:  # start learning only after 1000 steps have been collected
            policy_loss, value_loss = agent.learn(arg.BATCH_SIZE, tot_t)  # sample from buffer, back-propagate
            policy_loss_log += policy_loss
            value_loss_log += value_loss

        # update variables
        x = next_x
        state = next_state
        b_v = next_b_v
        b_x = next_b_x
        tot_t += 1

        if episode > 1e4 and tot_t % 100 == 0:
            # Decay the action-space exploration noise, with a floor of 0.001
            action_std -= arg.STD_STEP_SIZE
            action_std = torch.max(torch.tensor(0.001), action_std)
            noise.reset(torch.tensor(0), action_std)

        if (info['stop'] and cross_start_threshold) or TimeEnd:
            break

    # end of one episode, do some checking
    episode += 1  # trial or episode number
    if isPerturbation_log:
        episode_number_perturbation += 1
        reward_log_perturbation += reward.item()
        rewarded_number_log_perturbation += int(reached_target)
        step_log_perturbation += t.item()
    else:
        episode_number_normal += 1
        reward_log_normal += reward.item()
        rewarded_number_log_normal += int(reached_target)
        step_log_normal += t.item()

    if episode % 500 == 499:
        agent.save()

        print("t: {}, Ep: {}, action std: {:0.2f}, target radius upper bound: {:0.3f}".format(
            tot_t, episode, noise.scale, env.initial_radius_upper_bound))

        # Each trial type is reported only when the window contains at least one
        # such trial; otherwise the averages below would divide by zero.
        if episode_number_normal > 0:
            print("Normal trial: mean steps: {:0.3f}, mean reward: {:0.3f}, mean reward per step: {:0.3f}, "
                  "rewarded fraction: {:0.3f}".format(
                      step_log_normal/episode_number_normal,
                      reward_log_normal/episode_number_normal,
                      reward_log_normal/step_log_normal,
                      rewarded_number_log_normal/episode_number_normal))
        if episode_number_perturbation > 0:
            print("Perturbation trial: mean steps: {:0.3f}, mean reward: {:0.3f}, mean reward per step: {:0.3f}, "
                  "rewarded fraction: {:0.3f}\n".format(
                      step_log_perturbation/episode_number_perturbation,
                      reward_log_perturbation/episode_number_perturbation,
                      reward_log_perturbation/step_log_perturbation,
                      rewarded_number_log_perturbation/episode_number_perturbation))

        # NOTE(review): the policy loss is scaled by 2 -- presumably because the
        # policy is updated on every second learn step; confirm against agent.learn.
        writer.add_scalar('average policy loss', policy_loss_log*2/(step_log_perturbation+step_log_normal), episode)
        writer.add_scalar('average value loss', value_loss_log/(step_log_perturbation+step_log_normal), episode)

        if episode_number_normal > 0:
            writer.add_scalar('average steps per normal trial', step_log_normal/episode_number_normal, episode)
            writer.add_scalar('average reward per normal trial', reward_log_normal/episode_number_normal, episode)
            writer.add_scalar('average reward per normal step', reward_log_normal/step_log_normal, episode)
            writer.add_scalar('average rewarded ratio per normal trial',
                              rewarded_number_log_normal/episode_number_normal, episode)

        if episode_number_perturbation > 0:
            writer.add_scalar('average steps per perturbation trial',
                              step_log_perturbation/episode_number_perturbation, episode)
            writer.add_scalar('average reward per perturbation trial',
                              reward_log_perturbation/episode_number_perturbation, episode)
            writer.add_scalar('average reward per perturbation step',
                              reward_log_perturbation/step_log_perturbation, episode)
            writer.add_scalar('average rewarded ratio per perturbation trial',
                              rewarded_number_log_perturbation/episode_number_perturbation, episode)

        # Start a fresh logging window
        episode_number_perturbation = 0
        reward_log_perturbation = 0
        rewarded_number_log_perturbation = 0
        step_log_perturbation = 0
        episode_number_normal = 0
        reward_log_normal = 0
        rewarded_number_log_normal = 0
        step_log_normal = 0
        policy_loss_log = 0
        value_loss_log = 0

t: 9803.0, Ep: 499.0, action std: 0.20, target radius upper bound: 0.250
Normal trial: mean steps: 19.645, mean reward: 0.074, mean reward per step: 0.004, rewarded fraction: 0.044
t: 20563.0, Ep: 999.0, action std: 0.20, target radius upper bound: 0.250
Normal trial: mean steps: 21.520, mean reward: 5.128, mean reward per step: 0.238, rewarded fraction: 0.492
t: 31631.0, Ep: 1499.0, action std: 0.20, target radius upper bound: 0.250
Normal trial: mean steps: 22.136, mean reward: 8.635, mean reward per step: 0.390, rewarded fraction: 0.618
t: 42878.0, Ep: 1999.0, action std: 0.20, target radius upper bound: 0.250
Normal trial: mean steps: 22.494, mean reward: 8.713, mean reward per step: 0.387, rewarded fraction: 0.630
t: 53900.0, Ep: 2499.0, action std: 0.20, target radius upper bound: 0.250
Normal trial: mean steps: 22.044, mean reward: 8.502, mean reward per step: 0.386, rewarded fraction: 0.646
t: 64148.0, Ep: 2999.0, action std: 0.20, target radius upper bound: 0.250
Normal trial:

Normal trial: mean steps: 15.386, mean reward: 9.300, mean reward per step: 0.604, rewarded fraction: 0.712
t: 787421.0, Ep: 44999.0, action std: 0.00, target radius upper bound: 0.250
Normal trial: mean steps: 16.106, mean reward: 9.258, mean reward per step: 0.575, rewarded fraction: 0.690
t: 795585.0, Ep: 45499.0, action std: 0.00, target radius upper bound: 0.250
Normal trial: mean steps: 16.328, mean reward: 9.298, mean reward per step: 0.569, rewarded fraction: 0.690
t: 803574.0, Ep: 45999.0, action std: 0.00, target radius upper bound: 0.250
Normal trial: mean steps: 15.978, mean reward: 9.291, mean reward per step: 0.581, rewarded fraction: 0.734
t: 811484.0, Ep: 46499.0, action std: 0.00, target radius upper bound: 0.250
Normal trial: mean steps: 15.820, mean reward: 9.333, mean reward per step: 0.590, rewarded fraction: 0.710
t: 819513.0, Ep: 46999.0, action std: 0.00, target radius upper bound: 0.250
Normal trial: mean steps: 16.058, mean reward: 9.314, mean reward per step:

Normal trial: mean steps: 16.158, mean reward: 9.282, mean reward per step: 0.574, rewarded fraction: 0.686
t: 1487729.0, Ep: 88999.0, action std: 0.00, target radius upper bound: 0.250
Normal trial: mean steps: 15.766, mean reward: 9.263, mean reward per step: 0.588, rewarded fraction: 0.700
t: 1495657.0, Ep: 89499.0, action std: 0.00, target radius upper bound: 0.250
Normal trial: mean steps: 15.856, mean reward: 9.282, mean reward per step: 0.585, rewarded fraction: 0.704
t: 1503507.0, Ep: 89999.0, action std: 0.00, target radius upper bound: 0.250
Normal trial: mean steps: 15.700, mean reward: 9.230, mean reward per step: 0.588, rewarded fraction: 0.718
t: 1511440.0, Ep: 90499.0, action std: 0.00, target radius upper bound: 0.250
Normal trial: mean steps: 15.866, mean reward: 9.342, mean reward per step: 0.589, rewarded fraction: 0.732
t: 1519356.0, Ep: 90999.0, action std: 0.00, target radius upper bound: 0.250
Normal trial: mean steps: 15.832, mean reward: 9.310, mean reward per 

KeyboardInterrupt: 