In [2]:
import random
from collections import namedtuple, deque
from itertools import count

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import wandb

from env import Env

# Weights & Biases run setup: the run name and id both carry the run number.
number = 1
NAME = f"DeepSARSA_HER{number}"
ID = f"Deep SARSA_HER{number}"
run = wandb.init(
    project='DeepSARSA_HER_MachineReplacement',
    name=NAME,
    id=ID,
)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# One SARSA experience tuple: (s, a, r, s', a').
Transition = namedtuple(
    'Transition',
    ['state', 'action', 'reward', 'next_state', 'next_action'],
)


class ReplayMemory:
    """Bounded FIFO buffer of ``Transition`` records.

    Once ``capacity`` entries are stored, appending a new transition discards
    the oldest one.
    """

    def __init__(self, capacity):
        # A deque with ``maxlen`` handles the eviction of old entries itself.
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        """Build a ``Transition`` from the positional arguments and store it."""
        record = Transition(*args)
        self.memory.append(record)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored transitions chosen at random."""
        return random.sample(self.memory, batch_size)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

In [None]:
class DeepSARSA(nn.Module):
    """Small fully connected Q-network.

    Maps an observation vector of size ``n_observations`` to one value per
    action (``n_actions`` outputs) through two ReLU hidden layers of 16 units.
    """

    def __init__(self, n_observations, n_actions):
        super().__init__()
        hidden_units = 16
        # Attribute names are kept as layer1..layer3 so state_dict keys are stable.
        self.layer1 = nn.Linear(n_observations, hidden_units)
        self.layer2 = nn.Linear(hidden_units, hidden_units)
        self.layer3 = nn.Linear(hidden_units, n_actions)

    def forward(self, x):
        """Return the per-action outputs for ``x``.

        Used both for a single observation (action selection) and for a batch
        (optimisation); the last dimension of ``x`` must be ``n_observations``.
        """
        hidden = x
        for layer in (self.layer1, self.layer2):
            hidden = F.relu(layer(hidden))
        # The output layer is linear: no activation on the action values.
        return self.layer3(hidden)

In [None]:
# Ask PyTorch's CUDA caching allocator to release unused cached memory before
# anything new is allocated by the cells below.
torch.cuda.empty_cache()
# Build the environment used by the rest of the notebook.
# NOTE(review): the meaning of R=5 is defined in env.Env, which is not visible here - confirm.
env = Env(R=5)

In [None]:
# ---- Hyperparameters -------------------------------------------------------
BATCH_SIZE = 512            # transitions sampled from the replay buffer per update
GAMMA = 0.99                # discount factor
EPS_START = 0.9             # starting value of epsilon
EPS_END = 0.05              # final value of epsilon
EPS_DECAY = 1000            # exponential decay rate of epsilon; higher means slower decay
TAU = 0.005                 # update rate of the target network
LR = 1e-4                   # learning rate of the AdamW optimizer
REPLAY_MEMORY_SIZE = 10000  # capacity of the replay buffer
max_timesteps = 100000

# ---- Environment-derived sizes ---------------------------------------------
n_actions = env.n_actions
state = env.reset()
n_observations = 1

# ---- Networks, optimizer and replay buffer ----------------------------------
# The target network starts as an exact copy of the policy network.
policy_net = DeepSARSA(n_observations, n_actions).to(device)
target_net = DeepSARSA(n_observations, n_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())

optimizer = optim.AdamW(policy_net.parameters(), lr=LR, amsgrad=True)
memory = ReplayMemory(REPLAY_MEMORY_SIZE)

# Number of actions selected so far; drives the epsilon decay in select_action.
steps_done = 0


def select_action(state):
    """Choose an action for ``state`` with an epsilon-greedy rule.

    Epsilon decays exponentially from ``EPS_START`` towards ``EPS_END`` as the
    module-level counter ``steps_done`` grows; the counter is incremented on
    every call.

    Returns a ``(1, 1)`` long tensor on ``device`` holding either the index of
    the largest ``policy_net`` output (greedy) or a value drawn from
    ``env.action_space.sample()`` (exploration).
    """
    global steps_done
    draw = random.random()
    decay = np.exp(-1. * steps_done / EPS_DECAY)
    eps_threshold = EPS_END + (EPS_START - EPS_END) * decay
    steps_done += 1

    # Explore when the draw falls at or below the current epsilon.
    if draw <= eps_threshold:
        return torch.tensor([[env.action_space.sample()]], device=device, dtype=torch.long)

    # Otherwise exploit: pick the column with the largest network output.
    with torch.no_grad():
        return policy_net(state).max(1).indices.view(1, 1)

In [None]:
def optimize_model(timestep=0, batch_num=0, avg_reward=0, reward=None):
    """Run one SARSA optimisation step on ``policy_net`` from a replay batch.

    Samples ``BATCH_SIZE`` transitions from ``memory``, regresses
    ``Q(s, a)`` towards ``r + GAMMA * Q_target(s', a')`` with a Huber loss,
    logs the loss to wandb and applies one optimizer step.

    Args:
        timestep: global step index, logged to wandb.
        batch_num: accepted for call compatibility; not used.
        avg_reward: value logged under ``'avg_reward'``.
        reward: alias for ``avg_reward``; when given it takes precedence.
            The training loops in this notebook call with ``reward=...``.

    Returns:
        The scalar loss tensor, or ``None`` when the buffer holds fewer than
        ``BATCH_SIZE`` transitions and no update was made.
    """
    if len(memory) < BATCH_SIZE:
        return None
    if reward is not None:
        avg_reward = reward

    transitions = memory.sample(BATCH_SIZE)
    # Transpose the batch: list of Transitions -> Transition of tuples.
    batch = Transition(*zip(*transitions))

    # A transition bootstraps only when it has both a successor state and the
    # action taken there; terminal transitions store None for these.
    has_successor = [
        s is not None and a is not None
        for s, a in zip(batch.next_state, batch.next_action)
    ]
    non_final_mask = torch.tensor(has_successor, device=device, dtype=torch.bool)

    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    # Flatten so rewards stored as (1,) or (1, 1) both give a (BATCH_SIZE,) vector.
    reward_batch = torch.cat(batch.reward).reshape(-1)

    # Q(s_t, a_t): pick, per row, the output column of the action actually taken.
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # Q_target(s_{t+1}, a_{t+1}) for non-terminal rows, 0 for terminal ones.
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    if any(has_successor):
        non_final_next_states = torch.cat(
            [s for s, keep in zip(batch.next_state, has_successor) if keep])
        non_final_next_actions = torch.cat(
            [a for a, keep in zip(batch.next_action, has_successor) if keep])
        with torch.no_grad():
            next_q = target_net(non_final_next_states).gather(1, non_final_next_actions)
        # gather returns (N, 1); the masked slice expects (N,).
        next_state_values[non_final_mask] = next_q.squeeze(1)

    # SARSA target
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Huber loss
    criterion = nn.SmoothL1Loss()
    loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))

    # Log a plain float rather than a tensor that still carries the autograd graph.
    wandb.log({'loss': loss.item(), 'avg_reward': avg_reward, 'timestep': timestep})

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    # In-place gradient clipping
    torch.nn.utils.clip_grad_value_(policy_net.parameters(), 100)
    optimizer.step()

    return loss


In [None]:
# Record the hyperparameters this run actually uses. The optimizer name is read
# from the optimizer object and the learning rate from LR, so the logged config
# cannot drift from the code.
wandb.config.update({
    'max_timesteps': max_timesteps,
    'batch_size': BATCH_SIZE,
    'optimizer': type(optimizer).__name__,
    'learning_rate': LR,
    'gamma': GAMMA,
    'tau': TAU,
    'eps_start': EPS_START,
    'eps_end': EPS_END,
    'eps_decay': EPS_DECAY,
    'replay_memory': REPLAY_MEMORY_SIZE, # 10000
    'n_actions': n_actions,
    'n_observations': n_observations
})


In [None]:
num_episodes = 10000
num_time_per_episode = 150


def _as_row_tensor(value):
    """Return ``value`` as a float32 tensor of shape (1, -1) on ``device``.

    NOTE(review): the types Env.reset()/Env.step() return are not visible in
    this notebook. This assumes a scalar, array or tensor observation; if Env
    already returns (1, n) float32 tensors the conversion leaves them unchanged.
    """
    return torch.as_tensor(value, dtype=torch.float32, device=device).reshape(1, -1)


# SARSA needs (s, a, r, s', a'), so a step can only be stored once the action
# for the following state has been chosen. Each iteration therefore pushes the
# previous step, completed with the current state and action.
for i in range(num_episodes):
    # Reset once per episode; within the episode the state advances via env.step.
    cur_state = _as_row_tensor(env.reset())
    prev_state = prev_action = prev_reward = None

    for j in range(num_time_per_episode):
        cur_action = select_action(cur_state)
        next_state, reward = env.step(cur_action)

        if prev_state is not None:
            memory.push(prev_state, prev_action, prev_reward, cur_state, cur_action)

        prev_state = cur_state
        prev_action = cur_action
        # Stored as a (1,) tensor so that a batch of rewards can be concatenated.
        # NOTE(review): assumes Env.step returns a scalar reward - confirm.
        prev_reward = torch.as_tensor(reward, dtype=torch.float32, device=device).reshape(1)
        cur_state = _as_row_tensor(next_state)

        optimize_model(timestep=(i*num_time_per_episode) + j, avg_reward=prev_reward.item())

        # Soft update of the target network's weights
        # θ′ ← τ θ + (1 −τ )θ′
        target_net_state_dict = target_net.state_dict()
        policy_net_state_dict = policy_net.state_dict()
        for key in policy_net_state_dict:
            target_net_state_dict[key] = policy_net_state_dict[key]*TAU + target_net_state_dict[key]*(1-TAU)
        target_net.load_state_dict(target_net_state_dict)

    # Release cached CUDA memory once per episode rather than on every step.
    torch.cuda.empty_cache()


In [None]:
# Train
# NOTE(review): this cell uses names that no visible cell of this notebook
# defines: bar, env_utils, cv2, p, body_ids, push_directions, get_push_start,
# get_reward4, cur_target_goal, testcase1, TARGET_LOWER, TARGET_UPPER,
# TARGET_UPDATE, TARGET_SAVE_CHECKPOINTS, plus env.push(). It looks like it was
# carried over from a different experiment - confirm whether it should stay.
episodal_rewards = []
episodal_time = []
timestep = 0

while timestep < max_timesteps:
    # Initialize the environment and the per-episode bookkeeping
    # NOTE(review): the value returned by env.reset() is discarded, so `state`
    # is whatever an earlier cell or the previous episode left behind (possibly
    # {'cur_state': None} after a terminal step) - confirm how it should be reset.
    env.reset()
    done = False
    ep_reward = 0
    ep_start_time = timestep
    # (state, action, reward) of the previous step, waiting for the action
    # chosen in its successor state so a full SARSA transition can be stored.
    pending = None

    for t in count():
        # Select and perform an action
        timestep += 1
        bar.update(timestep)
        action = select_action(state['cur_state']) # select_action(state['rgb'], state['height_map'])
        if pending is not None:
            # Complete the previous step with its successor state and action.
            memory.push(*pending, state['cur_state'], action)
            pending = None
        color_image, depth_image, _ = env_utils.get_true_heightmap(env)
        if action.item() in range(0, 16): # push action
            temp = cv2.cvtColor(color_image, cv2.COLOR_RGB2HSV)
            target_mask = cv2.inRange(temp, TARGET_LOWER, TARGET_UPPER)
            push_dir = push_directions[action.item()] # Sample push directions
            push_start, push_end = get_push_start(push_dir, target_mask, body_ids[1])
            env.push(push_start, push_end) # Action performed

            target_pos, target_orn = p.getBasePositionAndOrientation(body_ids[1])
            euler_orn = p.getEulerFromQuaternion(target_orn)

            # The builtin float replaces the np.float alias, which NumPy 1.24 removed.
            new_target_st = np.array([target_pos[0], target_pos[1], euler_orn[2]], dtype=float)
            new_state = np.hstack((new_target_st, cur_target_goal))
            reward = get_reward4(current_state=new_state, prev_state=state['cur_state'].squeeze().cpu().numpy())
        elif action.item()==16:
            # Raise instead of exit(): exit() would shut down the notebook kernel.
            raise ValueError("Invalid Action!!!!!")

        targetPos, _ = p.getBasePositionAndOrientation(body_ids[1])
        bottomPos, _ = p.getBasePositionAndOrientation(body_ids[0])

        if targetPos[2] < bottomPos[2] + testcase1.current_bottom_size[2]/2 + testcase1.current_target_size[2]/2 - 0.01:
            done = True
        # _, reward, done, _, _ = env.step(action.item())
        reward_np = reward
        ep_reward += reward_np
        reward = torch.tensor([reward], dtype=torch.float, device=device)

        if not done:
            target_pos, target_orn = p.getBasePositionAndOrientation(body_ids[1])
            euler_orn = p.getEulerFromQuaternion(target_orn)

            new_target_st = np.array([target_pos[0], target_pos[1], euler_orn[2]], dtype=float)
            new_state = np.hstack((new_target_st, cur_target_goal))
            next_state = {
                'cur_state': torch.tensor(new_state, dtype=torch.float, device=device).unsqueeze(0),
            }
        else:
            next_state = {
                'cur_state': None,
            }

        # Store the transition in memory, in Transition field order
        # (state, action, reward, next_state, next_action).
        if done:
            # Terminal step: there is no successor state or action.
            memory.push(state['cur_state'], action, reward, None, None)
        else:
            # The successor action is only known on the next iteration.
            pending = (state['cur_state'], action, reward)

        # Move to the next state
        state = next_state

        # Perform one step of the optimization (on the policy network)
        optimize_model(timestep=timestep, batch_num=t, avg_reward=reward_np)

        # Update the target network, copying all weights and biases in DQN
        if timestep % TARGET_UPDATE == 0:
            target_net.load_state_dict(policy_net.state_dict())
            # print("Target updated")


        # if i_episode % TARGET_SAVE == 0 or i_episode==10:
        if timestep in TARGET_SAVE_CHECKPOINTS:
            print("Saved")
            SAVE_PATH = './V2_next_best_action/models/model_checkpoints/{}.pt'.format(timestep)
            target_net.load_state_dict(policy_net.state_dict())
            torch.save(policy_net.state_dict(), SAVE_PATH)
            SAVE_PATH = './V2_next_best_action/models/model_checkpoints/episodal_rewards_{}.npy'.format(timestep)
            with open(SAVE_PATH, 'wb') as f:
                np.save(f, np.array(episodal_rewards))

            SAVE_PATH = './V2_next_best_action/models/model_checkpoints/episodal_times_{}.npy'.format(timestep)
            with open(SAVE_PATH, 'wb') as f:
                np.save(f, np.array(episodal_time))

        torch.cuda.empty_cache()



        # Cap the episode length: a step that reaches t == 100 ends the episode.
        # Its pending (state, action, reward) is dropped because no successor
        # action is ever chosen for it.
        if t>=100:
            done = True

        if done:
            # episode_durations.append(t + 1)
            # plot_durations()
            break

    ep_end_time = timestep
    ep_time = ep_end_time - ep_start_time
    episodal_time.append(ep_time)
    episodal_rewards.append(ep_reward)

