In [1]:
import glob
import imp
import math
import gc
import os
from sre_constants import SUCCESS
import time
import datetime
import pybullet as p
import cv2
import numpy as np
from graphviz import Digraph
import argparse
import random
import torch
import matplotlib.pyplot as plt
from time import sleep
import copy

import torch
import torch.nn as nn
import torchvision
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T


from Config.constants import (
    GRIPPER_PUSH_RADIUS,
    PIXEL_SIZE,
    PUSH_DISTANCE,
    WORKSPACE_LIMITS,
    TARGET_LOWER,
    TARGET_UPPER,
    orange_lower,
    orange_upper,
    BG_THRESHOLD,
    MIN_GRASP_THRESHOLDS
)

from Environments.environment_sim import Environment
import Environments.utils as env_utils
from V1_destination_prediction.Test_cases.tc1 import TestCase1

from create_env import get_push_start, get_max_extent_of_target_from_bottom


from collections import namedtuple, deque

pybullet build time: Oct 14 2022 01:09:34


In [2]:
Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))


class ReplayMemory(object):
    """Bounded FIFO store of ``Transition`` records used for experience replay."""

    def __init__(self, capacity) -> None:
        # A deque with ``maxlen`` drops its oldest entry once ``capacity`` is exceeded.
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        """Save a transition built from (state, action, next_state, reward)."""
        record = Transition(*args)
        self.memory.append(record)

    def sample(self, batch_size):
        """Return ``batch_size`` stored transitions drawn at random without replacement."""
        return random.sample(self.memory, batch_size)

    def __len__(self):
        """Number of transitions currently held."""
        return len(self.memory)

In [3]:
from V2_next_best_action.models.dqn_v2 import pushDQN2

# Release cached GPU memory that PyTorch's allocator is holding but not using,
# before any models are built in the following cells.
torch.cuda.empty_cache()

In [4]:
from torchsummary import summary

# Release cached, unused GPU memory before instantiating the inspection model.
torch.cuda.empty_cache()

# Throwaway model instance used only to inspect the architecture; the next cell deletes it.
model = pushDQN2(n_observations=6, n_actions=16, use_cuda=True)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    model.cuda()
# model(np.ones(shape=(1, 224, 224, 3)), np.ones(shape=(1, 224, 224, 3)))
# print(model(torch.rand((3, 6)).to(device=device)))
# summary() prints the layer table itself and also returns an object. As the last
# expression of the cell that return value was rendered too, so the table appeared
# twice in the output. The trailing ';' suppresses the second rendering.
summary(model, (3, 6));
# del model

Layer (type:depth-idx)                   Output Shape              Param #
├─Linear: 1-1                            [-1, 3, 128]              896
├─Linear: 1-2                            [-1, 3, 128]              16,512
├─Linear: 1-3                            [-1, 3, 16]               2,064
Total params: 19,472
Trainable params: 19,472
Non-trainable params: 0
Total mult-adds (M): 0.02
Input size (MB): 0.00
Forward/backward pass size (MB): 0.01
Params size (MB): 0.07
Estimated Total Size (MB): 0.08


Layer (type:depth-idx)                   Output Shape              Param #
├─Linear: 1-1                            [-1, 3, 128]              896
├─Linear: 1-2                            [-1, 3, 128]              16,512
├─Linear: 1-3                            [-1, 3, 16]               2,064
Total params: 19,472
Trainable params: 19,472
Non-trainable params: 0
Total mult-adds (M): 0.02
Input size (MB): 0.00
Forward/backward pass size (MB): 0.01
Params size (MB): 0.07
Estimated Total Size (MB): 0.08

In [5]:
# Smoke test: push one random 6-dimensional observation through the inspection model and
# show the raw Q-values, the largest Q-value and the index of the action that achieves it.
with torch.no_grad():
    probe = torch.rand((1, 6)).to(device=device)
    qs = model(probe)
    print(qs)

    # max over dim 1 yields (values, indices); unpack once instead of calling it twice.
    best_q, best_action = qs.max(1)
    print(best_q.view(1, 1))
    print(best_action.view(1, 1))
# The inspection model is no longer needed; drop the reference.
del model

tensor([[ 0.0417,  0.0830, -0.1164, -0.1587,  0.0085, -0.1066, -0.0746,  0.0431,
         -0.0303,  0.0228,  0.0194,  0.1637,  0.1193,  0.0165,  0.0143, -0.0143]],
       device='cuda:0')
tensor([[0.1637]], device='cuda:0')
tensor([[11]], device='cuda:0')


In [6]:
# Hyperparameters for DQN training
BATCH_SIZE = 128 # Transitions sampled from replay memory per optimisation step; optimisation is skipped until memory holds this many
GAMMA = 1 # Discount factor applied to the bootstrapped next-state value (1 = undiscounted; 0.999 kept here as an alternative)
EPS_START = 0.9 # Initial probability of choosing a random action
EPS_END = 0.05 # Floor that the random-action probability decays towards
EPS_DECAY = 200 # Time constant of the exponential epsilon decay, counted in select_action calls (steps_done)
TARGET_UPDATE = 10 # Interval, in timesteps, between copies of policy_net weights into target_net
TARGET_SAVE_CHECKPOINTS = [200, 500, 1000, 2000, 3000, 4000, 5000] # Timesteps at which policy_net weights are written to disk

# Use the first CUDA device when available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [7]:
n_observations = 6 # 3 for the target's current pose (x, y, third Euler angle), 3 for its goal pose
n_actions = 16 # One action per push direction; there is no grasp action (the training loop treats index 16 as invalid)

# policy_net is the network being trained; target_net starts as an identical copy and is
# only used (in eval mode) to compute bootstrap targets inside optimize_model.
policy_net = pushDQN2(n_observations, n_actions, use_cuda=True).to(device)
target_net = pushDQN2(n_observations, n_actions, use_cuda=True).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.RMSprop(policy_net.parameters()) # RMSprop with the library's default settings
memory = ReplayMemory(10000) # 10000 -- replay buffer capacity; older transitions are evicted first

steps_done = 0 # Count of select_action calls so far; drives the epsilon decay

In [8]:
def select_action(state):
    '''Choose an action index with an epsilon-greedy policy.

    The exploration probability starts at EPS_START and decays exponentially
    towards EPS_END as the global ``steps_done`` counter grows; the counter is
    incremented on every call.

    state: 2-D tensor of observations (the training loop passes shape (1, 6))
    returns: long tensor of shape (1, 1) holding the chosen action index
    '''
    global steps_done
    draw = random.uniform(0.0, 1.0) # random.randint(a=0, b=16)
    decay = math.exp(-1.0*steps_done / EPS_DECAY)
    eps_threshold = EPS_END + (EPS_START - EPS_END) * decay
    steps_done += 1

    # Explore: uniformly random action index.
    if draw <= eps_threshold:
        return torch.tensor([[random.randrange(n_actions)]], device=device, dtype=torch.long)

    # Exploit: index of the largest Q-value predicted by the policy network.
    with torch.no_grad():
        q_values = policy_net(state)
        return q_values.max(1)[1].view(1, 1)

In [9]:
"""REWARD SPECIFICATION

1. If action=grasp:
        if prev_max_extents>THRESHOLD: # Max extents before grasping
            reward = 1
        else:
            reward = -1 
2. If action=push:
        if cur_max_extents>THRESHOLD: # Max extents after pushing
            reward = 1
        else:
            reward = -1
Bellman equation:
    Q(s, a) = r + gamma*max(Q(s', a'))
"""

def get_reward(prev_state, current_state):
    '''Euclidean distance between the pose just reached and the previous goal pose.

    prev_state: (x1, y1, theta1, x2, y2, theta2) -- only the goal part (x2, y2, theta2) is read
    current_state: (x3, y3, theta3, _, _, _) -- only the pose part (x3, y3, theta3) is read

    NOTE(review): the returned value grows as the target moves away from the
    goal, yet it is used as a reward to be maximised -- confirm this is intended.
    '''
    reached_pose = current_state[0:3]
    goal_pose = prev_state[3:6]
    return np.linalg.norm(reached_pose - goal_pose)

In [10]:
def optimize_model(timestep=0, batch_num=0):
    """Run one DQN optimisation step on ``policy_net`` using a replay-memory batch.

    Returns immediately, without touching the network, while ``memory`` holds
    fewer than ``BATCH_SIZE`` transitions. Otherwise it samples a batch, builds
    the one-step targets ``reward + GAMMA * max_a target_net(next_state)``
    (the bootstrap term is 0 for terminal transitions), and takes one optimizer
    step on the smooth-L1 (Huber) loss between those targets and
    ``policy_net``'s Q-values for the actions that were taken.

    Args:
        timestep: Unused; accepted so existing callers keep working.
        batch_num: Unused; accepted so existing callers keep working.

    Returns:
        None.
    """
    if len(memory) < BATCH_SIZE:
        return
    print("Optimization!")
    transitions = memory.sample(BATCH_SIZE)
    # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
    # detailed explanation). This converts batch-array of Transitions
    # to Transition of batch-arrays.
    batch = Transition(*zip(*transitions))

    # Compute a mask of non-final states and collect the corresponding tensors
    # (a final state is stored as None: the one after which the episode ended)
    non_final_mask = torch.tensor([s is not None for s in batch.next_state],
                                  device=device, dtype=torch.bool)
    non_final_next_states = [s for s in batch.next_state if s is not None]
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of the actions that were actually taken.
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # Compute V(s_{t+1}) for all next states using the "older" target_net,
    # taking the best action value with max(1)[0]. Entries for final states
    # stay at 0.
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    # torch.cat raises on an empty list, so skip the target-network pass when
    # every sampled transition is terminal; the zeros are already correct then.
    if non_final_next_states:
        with torch.no_grad():
            next_state_values[non_final_mask] = target_net(
                torch.cat(non_final_next_states)).max(1)[0]
    # Compute the expected Q values
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Compute Huber loss
    criterion = nn.SmoothL1Loss()
    loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    # In-place gradient clipping: every gradient element is clamped to [-100, 100]
    torch.nn.utils.clip_grad_value_(policy_net.parameters(), 100)
    optimizer.step()

In [11]:
from itertools import count

from Config.constants import MIN_GRASP_THRESHOLDS

is_viz = False

# env = Environment()
# Simulation environment, created with its GUI window enabled.
env = Environment(gui=True)
# NOTE(review): num_of_envs, max_num_of_actions, is_viz and max_extent_threshold are not
# read by the training loop in this cell -- confirm whether they are still needed.
num_of_envs = 10
max_num_of_actions = 15
is_viz = False
max_extent_threshold = 1 # Max extent threshold of the target object in pixel units
# Push angle (radians) for each action index: 16 evenly spaced directions, pi/8 apart.
push_directions = [0, np.pi/8, np.pi/4, 3*np.pi/8, 
                    np.pi/2, 5*np.pi/8, 3*np.pi/4, 7*np.pi/8, 
                    np.pi, 9*np.pi/8, 5*np.pi/4, 11*np.pi/8,  
                    3*np.pi/2, 13*np.pi/8, 7*np.pi/4, 15*np.pi/8] # 16 standard directions
# num_episodes = 50 # 10
# Training budget in environment steps; the loop only checks it between episodes,
# so the final episode may run past this count.
max_timesteps = 1000
timestep = 0 # Global step counter across all episodes

# wandb.config.update({
#     'epochs': num_episodes,
#     'batch_size': BATCH_SIZE,
#     'optimizer': 'Adam',
#     'learning_rate': 'default',
#     'replay_memory': REPLAY_MEMORY_SIZE, # 10000
#     'n_actions': n_actions,
#     'action_types': 'Only push in 16 different directions'
# })

# Main training loop: one outer iteration per episode, one inner iteration per push.
# The step budget is checked only between episodes.
while timestep < max_timesteps:
    # Initialize the environment and state
    env.reset()
    testcase1 = TestCase1(env)
    # NOTE(review): `success` is never checked -- confirm a failed sample still yields usable body_ids.
    # body_ids[0] is used below as the bottom object and body_ids[1] as the target object.
    body_ids, success = testcase1.sample_test_case(bottom_obj='random') #'random') # testcase1.create_standard()
    color_image, depth_image, _ = env_utils.get_true_heightmap(env)
    depth_image = np.stack((depth_image, )*3, axis=-1)
    # print("Returned body ids: {}, success: {}".format(body_ids, success))
    # last_screen = get_screen()
    # current_screen = get_screen()
    # state = current_screen - last_screen
    target_pos, target_orn = p.getBasePositionAndOrientation(body_ids[1])
    euler_orn = p.getEulerFromQuaternion(target_orn)
    # Observation = current target pose (x, y, third Euler angle) followed by a goal pose
    # drawn as a random offset from the current pose.
    cur_target_st = np.array([target_pos[0], target_pos[1], euler_orn[2]], dtype=np.float64)
    cur_target_goal = cur_target_st + np.random.uniform(low=[-5, -5, -2*np.pi], high=[5, 5, 2*np.pi], size=(3,))
    cur_state = np.hstack((cur_target_st, cur_target_goal))
    state = {
        'cur_state': torch.tensor(cur_state, dtype=torch.float, device=device).unsqueeze(0),
        # 'rgb': torch.tensor(np.array([np.transpose(color_image, (2, 0, 1))]), dtype=torch.float, device=device), # transpose used in order to convert (224, 224, 3) to (3, 224, 224)
        # 'height_map': torch.tensor(np.array([np.transpose(depth_image, (2, 0, 1))]), dtype=torch.float, device=device) # torch.tensor([np.transpose(depth_image, (2, 0, 1))], device=device) # transpose used in order to convert (224, 224, 3) to (3, 224, 224)
    }
    done = False

    for t in count():
        # Select and perform an action
        timestep += 1
        action = select_action(state['cur_state']) # select_action(state['rgb'], state['height_map'])
        color_image, depth_image, _ = env_utils.get_true_heightmap(env)
        if action.item() in range(0, 16): # push action
            temp = cv2.cvtColor(color_image, cv2.COLOR_RGB2HSV)
            target_mask = cv2.inRange(temp, TARGET_LOWER, TARGET_UPPER)
            push_dir = push_directions[action.item()] # Sample push directions
            push_start, push_end = get_push_start(push_dir, target_mask, body_ids[1])
            env.push(push_start, push_end) # Action performed 

            # color_image, depth_image, _ = env_utils.get_true_heightmap(env) # Evaluating the new state for calculating the reward
            # temp = cv2.cvtColor(color_image, cv2.COLOR_RGB2HSV)
            # target_mask = cv2.inRange(temp, TARGET_LOWER, TARGET_UPPER)
            # bottom_mask = cv2.inRange(temp, orange_lower, orange_upper)
            # depth_image = np.stack((depth_image, )*3, axis=-1)
            # target_mask = cv2.inRange(temp, TARGET_LOWER, TARGET_UPPER)
            # max_extents = get_max_extent_of_target_from_bottom(target_mask=target_mask, bottom_mask=bottom_mask, 
            #                             bottom_obj_body_id=body_ids[0], 
            #                             current_bottom_obj_size=testcase1.current_bottom_size, 
            #                             is_viz=False)
            
            target_pos, target_orn = p.getBasePositionAndOrientation(body_ids[1])
            euler_orn = p.getEulerFromQuaternion(target_orn)

            # np.float was only an alias of the builtin float and is removed in NumPy >= 1.24;
            # np.float64 is the same dtype and matches the initial-state construction above.
            new_target_st = np.array([target_pos[0], target_pos[1], euler_orn[2]], dtype=np.float64)
            # NOTE(review): this goal is only a placeholder for the second half of new_state;
            # get_reward reads just the pose half of new_state and the goal half of the
            # previous state.
            new_target_goal = new_target_st + np.random.uniform(low=[-5, -5, -2*np.pi], high=[5, 5, 2*np.pi], size=(3,))
            new_state = np.hstack((new_target_st, new_target_goal))
            reward = get_reward(current_state=new_state, prev_state=state['cur_state'].squeeze().cpu().numpy())
            # print(f"Current state: {state['cur_state'].squeeze().cpu().numpy()}\t Action: {action.item()}\nNew State: {new_state}\tReward: {reward}")
            print(f"Timestep: {timestep}\tReward: {reward}")
            # reward = get_reward(action='push', max_extents=max_extents, MIN_GRASP_EXTENT_THRESH=MIN_GRASP_THRESHOLDS) # get_reward(action, max_extents, MAX_EXTENT_THRESH, MIN_GRASP_EXTENT_THRESH)
            # belman_update_val = get_belman_update_value()
        elif action.item()==16:
            # Not reachable while n_actions == 16 (valid indices are 0..15); kept as a guard.
            print("Invalid Action!!!!!")
            exit()
            # Check if the state is graspable and reward the agent
            # temp = cv2.cvtColor(color_image, cv2.COLOR_RGB2HSV)
            # target_mask = cv2.inRange(temp, TARGET_LOWER, TARGET_UPPER)
            # bottom_mask = cv2.inRange(temp, orange_lower, orange_upper)
            # max_extents = get_max_extent_of_target_from_bottom(target_mask=target_mask, bottom_mask=bottom_mask, 
            #                             bottom_obj_body_id=body_ids[0], 
            #                             current_bottom_obj_size=testcase1.current_bottom_size, 
            #                             is_viz=False)
            
            # reward = get_reward(action='grasp', max_extents=max_extents, MIN_GRASP_EXTENT_THRESH=MIN_GRASP_THRESHOLDS)
            # if reward==1:
            #     done = True
            # done = True
        targetPos, _ = p.getBasePositionAndOrientation(body_ids[1])
        bottomPos, _ = p.getBasePositionAndOrientation(body_ids[0])
        # End the episode when the target's height drops below where it would sit when
        # stacked on the bottom object (half-heights summed), with a 0.01 tolerance.
        if targetPos[2] < bottomPos[2] + testcase1.current_bottom_size[2]/2 + testcase1.current_target_size[2]/2 - 0.01:
            # reward = -0.75
            done = True
        # _, reward, done, _, _ = env.step(action.item())
        reward = torch.tensor([reward], dtype=torch.float, device=device)
        if reward == 1:
            done=True

        # Observe new state
        # last_screen = current_screen
        # current_screen = get_screen()
        if not done:
            target_pos, target_orn = p.getBasePositionAndOrientation(body_ids[1])
            euler_orn = p.getEulerFromQuaternion(target_orn)
            
            # NOTE(review): the goal half of the observation is re-drawn at random here on
            # every step, so the goal is not persistent within an episode -- confirm intended.
            new_target_st = np.array([target_pos[0], target_pos[1], euler_orn[2]], dtype=float)
            new_target_goal = new_target_st + np.random.uniform(low=[-5, -5, -2*np.pi], high=[5, 5, 2*np.pi], size=(3,))
            new_state = np.hstack((new_target_st, new_target_goal))
            next_state = {
                'cur_state': torch.tensor(new_state, dtype=torch.float, device=device).unsqueeze(0),
                # 'rgb': torch.tensor(np.array([np.transpose(color_image, (2, 0, 1))]), dtype=torch.float, device=device), # transpose used in order to convert (224, 224, 3) to (3, 224, 224)
                # 'height_map': torch.tensor(np.array([np.transpose(depth_image, (2, 0, 1))]), dtype=torch.float, device=device) # transpose used in order to convert (224, 224, 3) to (3, 224, 224)
            }
        else:
            # A terminal transition is stored with next_state None.
            next_state = {
                'cur_state': None,
                # 'rgb': None,
                # 'height_map': None
            }

        # Store the transition in memory
        memory.push(state['cur_state'], action, next_state['cur_state'], reward)

        # Move to the next state
        state = next_state

        # Perform one step of the optimization (on the policy network)
        optimize_model(timestep=timestep, batch_num=t)

        # Update the target network, copying all weights and biases in DQN.
        # The sync and the checkpoint save run BEFORE the termination check so that they
        # are not skipped when an episode happens to end on a sync/checkpoint timestep.
        if timestep % TARGET_UPDATE == 0:
            target_net.load_state_dict(policy_net.state_dict())
            print("Target updated")

        # if i_episode % TARGET_SAVE == 0 or i_episode==10:
        if timestep in TARGET_SAVE_CHECKPOINTS:
            print("Saved")
            SAVE_PATH = './V2_next_best_action/models/model_checkpoints/{}.pt'.format(timestep)
            # Make sure the checkpoint directory exists so torch.save cannot fail mid-training.
            os.makedirs(os.path.dirname(SAVE_PATH), exist_ok=True)
            target_net.load_state_dict(policy_net.state_dict())
            torch.save(policy_net.state_dict(), SAVE_PATH)

        if done:
            # episode_durations.append(t + 1)
            # plot_durations()
            break

        # Episode length cap: flag the episode as done so that the NEXT step is stored as
        # terminal and then breaks out of the loop.
        if t>=100:
            done = True

        torch.cuda.empty_cache()

print('Complete')
# env.render()
# env.close()
# plt.ioff()
# plt.savefig('durations_count.png')
# plt.show()
# run.finish()


startThreads creating 1 threads.
starting thread 0
started thread 0 
argc=2
argv[0] = --unused
argv[1] = --start_demo_name=Physics Server
ExampleBrowserThreadFunc started
X11 functions dynamically loaded using dlopen/dlsym OK!
X11 functions dynamically loaded using dlopen/dlsym OK!
Creating context
Created GL 3.3 context
Direct GLX rendering context obtained
Making context current
GL_VENDOR=NVIDIA Corporation
GL_RENDERER=NVIDIA GeForce GTX 1050 Ti/PCIe/SSE2
GL_VERSION=3.3.0 NVIDIA 510.85.02
GL_SHADING_LANGUAGE_VERSION=3.30 NVIDIA via Cg compiler
pthread_getconcurrency()=0
Version = 3.3.0 NVIDIA 510.85.02
Vendor = NVIDIA Corporation
Renderer = NVIDIA GeForce GTX 1050 Ti/PCIe/SSE2
b3Printf: Selected demo: Physics Server
startThreads creating 1 threads.
starting thread 0
started thread 0 
MotionThreadFunc thread started
Loading a new scene! ---------------------------------------- : True
Timestep: 1	Reward: 4.570579464246407
Timestep: 2	Reward: 2.9358213690629333
Timestep: 3	Reward: 3.841

: 