# DQN 
- To learn to push using a manipulator 
- Author: Vishal Reddy Mandadi

In [5]:
import glob
import imp
import math
import gc
import os
from sre_constants import SUCCESS
import time
import datetime
import pybullet as p
import cv2
import numpy as np
from graphviz import Digraph
import argparse
import random
import torch
import matplotlib.pyplot as plt
from time import sleep
import copy

import torch
import torch.nn as nn
import torchvision
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

from Config.constants import (
    GRIPPER_PUSH_RADIUS,
    PIXEL_SIZE,
    PUSH_DISTANCE,
    WORKSPACE_LIMITS,
    TARGET_LOWER,
    TARGET_UPPER,
    orange_lower,
    orange_upper,
    BG_THRESHOLD,
    MIN_GRASP_THRESHOLDS
)

from Environments.environment_sim import Environment
import Environments.utils as env_utils
from V1_destination_prediction.Test_cases.tc1 import TestCase1

from create_env import get_push_start, get_max_extent_of_target_from_bottom


from collections import namedtuple, deque

## Replay Memory Class

In [6]:
Transition = namedtuple('Transition', ('state_rgb', 'state_height', 'action', 'next_state_rgb', 'next_state_height', 'reward'))

class ReplayMemory(object):

    def __init__(self, capacity) -> None:
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        
        '''Save a transition'''
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

## Deep Model 
- Inspired from vpg's model

In [7]:
from typing import OrderedDict


class pushDQN(nn.Module):
    def __init__(self, use_cuda) -> None:
        super(pushDQN, self).__init__()
        self.use_cuda = use_cuda

        # Initialize push network trunk with DenseNet pre-trained on ImageNet
        self.push_color_trunk = torchvision.models.densenet.densenet121(pretrained=True) # (weights='DenseNet121_Weights.DEFAULT') # (pretrained=True) # 7*7*1024 # These pre-trained models will also be trained
        self.push_height_trunk = torchvision.models.densenet.densenet121(pretrained=True) #(weights='DenseNet121_Weights.DEFAULT') # (pretrained=True) # 7*7*1024 (given input==3*224*224)These pre-trained models will also be trained


        # Additional Layers for the model
        self.pushnet = nn.Sequential(OrderedDict([
            ('push-norm0', nn.BatchNorm2d(2048)), # here 2048 is the number of channels for the image
            ('push-relu0', nn.ReLU(inplace=True)),
            # ('push-pool0', nn.MaxPool2d(kernel_size=3, stride=2, padding=0)),
            ('push-conv1', nn.Conv2d(2048, 64, kernel_size=5, stride=1, padding='same')),
            ('push-norm1', nn.BatchNorm2d(64)), # here 2048 is the number of channels for the image
            ('push-relu1', nn.ReLU(inplace=True)),
            # ('push-pool1', nn.MaxPool2d(kernel_size=3, stride=2, padding=0)),
            ('push-conv2', nn.Conv2d(64, 32, kernel_size=5, stride=1, padding='same')),
            ('push-norm2', nn.BatchNorm2d(32)), # here 2048 is the number of channels for the image
            ('push-relu2', nn.ReLU(inplace=True)),
            # ('push-pool2', nn.MaxPool2d(kernel_size=3, stride=2, padding=0)) 
        ])) # Output 7*7*32
        
        self.linear_layers = nn.Sequential(OrderedDict([
            ('push-linear0', nn.Linear(7*7*32, 1024)),
            ('push-relu3', nn.ReLU(inplace=True)),
            ('push-linear1', nn.Linear(1024, 17)),
            # ('push-tanh', nn.Tanh()) # ('push-relu4', nn.ReLU(inplace=True))
        ]))

        # Weights initialization for the newly added layers

        for m in self.named_modules():
            if 'push-' in m[0]:
                if isinstance(m[1], nn.Conv2d):
                    nn.init.kaiming_normal_(m[1].weight.data)
                elif isinstance(m[1], nn.BatchNorm2d):
                    m[1].weight.data.fill_(1)
                    m[1].bias.data.zero_()
                elif isinstance(m[1], nn.Linear):
                    nn.init.xavier_uniform_(m[1].weight.data)
                    m[1].bias.data.fill_(0.01)
        
        # Initialize output variable (for backprop)
        self.interim_feat = []
        self.output_prob = []
        
    def forward(self, input_color_data, input_height_data, is_volatile=False):
        # if is_volatile:
        #     with torch.no_grad():
        #         # Fill up the code later
        #         pass
        # else:
        self.interim_feat = []
        self.output_prob = []

        interim_color_feat = self.push_color_trunk.features(input_color_data)
        interim_height_feat = self.push_height_trunk.features(input_height_data)
        interim_push_feat = torch.cat((interim_color_feat, interim_height_feat), dim=1)

        appended_activated_feat = self.pushnet(interim_push_feat)
        appended_activated_feat = appended_activated_feat.view(appended_activated_feat.size(0), -1)

        output_probs = self.linear_layers(appended_activated_feat)
    
        return output_probs
            


In [8]:
torch.cuda.empty_cache()
# del model
# del policy_net
# del target_net
# torch.cuda.memory_summary(device=device, abbreviated=False)

In [5]:
from torchsummary import summary

torch.cuda.empty_cache()

model = pushDQN(use_cuda=True)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    model.cuda()
# model(np.ones(shape=(1, 224, 224, 3)), np.ones(shape=(1, 224, 224, 3)))
print(model(torch.rand((3, 3, 224, 224)).to(device=device), torch.rand((3, 3, 224, 224)).to(device=device)))
# model.eval()
# del model
# summary(model, ((3, 224, 224), (3, 224, 224)))



tensor([[ 1.4062, -0.6015, -1.1330,  0.4928,  0.1588, -0.0075, -0.3264, -1.1091,
         -0.6993, -0.2761,  1.0824,  0.4413, -0.1230, -0.1436,  0.7756, -0.1645,
         -0.1676],
        [ 1.5104, -0.7261,  0.0078, -1.3493, -0.4104, -0.5523,  0.2192,  0.0642,
          0.2218, -0.2024,  0.1039, -0.0814, -0.6820, -0.2294,  0.7852,  1.4242,
         -1.0454],
        [ 1.1283,  0.3069,  0.0361,  0.0068,  0.1971, -0.6518,  0.0199, -1.1494,
         -1.1064,  0.3678,  0.2370,  0.3216, -0.3027,  0.2368,  0.4308,  0.1677,
         -0.3347]], device='cuda:0', grad_fn=<AddmmBackward0>)


In [6]:
with torch.no_grad():
    qs = model(torch.rand((1, 3, 224, 224)).to(device=device), torch.rand((1, 3, 224, 224)).to(device=device))
    print(qs)

    print(qs.max(1)[0].view(1, 1))
    print(qs.max(1)[1].view(1, 1))
    # del model
del model

tensor([[ 1.6812, -0.1787, -1.2036, -0.6929,  0.0150, -0.5955,  0.6491, -0.3198,
         -0.4854, -0.2856,  0.7225, -0.2740, -0.4843, -0.5139,  0.3464,  0.5617,
         -0.1480]], device='cuda:0')
tensor([[1.6812]], device='cuda:0')
tensor([[0]], device='cuda:0')


## Training setup 

### Hyperparameters setting
- We start with EPS_START probability of choosing a random action as our initial policy. 
- This decays at EPS_DECAY rate 
- This will reach EPS_END towards the end of training

In [9]:
# Hyperparameters
BATCH_SIZE = 4
GAMMA = 0.999 # Discount factor
EPS_START = 0.9 # Random action choosing probability starts with this value and decays until EPS_END
EPS_END = 0.05 # Random action choosing probability starts at EPS_START and decays until EPS_END
EPS_DECAY = 200 # Decay rate of random action choosing probability, with the passage of episodes and time
TARGET_UPDATE = 10

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [10]:
# Number of actions
n_actions = 17 # 16 push + 1 grasp

policy_net = pushDQN(use_cuda=True).to(device)
target_net = pushDQN(use_cuda=True).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.RMSprop(policy_net.parameters())
memory = ReplayMemory(100) # 10000

steps_done = 0



In [11]:
def select_action(state_rgb, state_height):
    '''Select the next best action 
    state: {
        'rgb': tensor(shape(3*224*224)),
        'height_map': tensor(shape(3*224*224))
    }
    '''
    global steps_done
    sample = random.uniform(0.0, 1.0) # random.randint(a=0, b=16) 
    eps_threshold = EPS_END + (EPS_START - EPS_END) * math.exp(-1.0*steps_done / EPS_DECAY)
    steps_done += 1

    if sample>eps_threshold:
        with torch.no_grad():
            return policy_net(state_rgb, state_height).max(1)[1].view(1, 1)
    else:
        return torch.tensor([[random.randrange(n_actions)]], device=device, dtype=torch.long)


In [12]:
"""REWARD SPECIFICATION

1. If action=grasp:
        if prev_max_extents>THRESHOLD: # Max extents before grasping
            reward = 1
        else:
            reward = -1 
2. If action=push:
        if cur_max_entents>THRESHOLD: # Max extents after pushing
            reward = 1
        else:
            reward = -1
Bellman equation:
    Q(s, a) = r + gamma*max(Q(s', a'))
"""

def get_reward(action, max_extents, MIN_GRASP_EXTENT_THRESH):
    '''
    '''
    if action=='push':
        # if (max_extents[0] > MAX_EXTENT_THRESH) or (max_extents[1] > MAX_EXTENT_THRESH[1]): # check if object fell on the ground
        #     return -1.0
        if (max_extents[0] > MIN_GRASP_EXTENT_THRESH[0]) or (max_extents[1] > MIN_GRASP_EXTENT_THRESH[1]):
            return 1.0
        else:
            return -0.05 # for fast achievement of goal
    elif action=='grasp':
        # if (max_extents[0] > MAX_EXTENT_THRESH) or (max_extents[1] > MAX_EXTENT_THRESH[1]): # check if object fell on the ground
        #     return -1.0
        if (max_extents[0] > MIN_GRASP_EXTENT_THRESH[0]) or (max_extents[1] > MIN_GRASP_EXTENT_THRESH[1]):
            return 1.0
        else:
            return -1.0 # end the episode if it tries to grasp here
    else:
        return 0.0

def get_belman_update_value(state_rgb, state_height, reward):
    '''
    '''
    next_q = policy_net(state_rgb, state_height).max(1)[0].view(1, 1)
    update = reward + next_q.max(1)[0].view[1, 1]
    return update
    

## Training Loop

In [9]:
def optimize_model():
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
    # detailed explanation). This converts batch-array of Transitions
    # to Transition of batch-arrays.
    batch = Transition(*zip(*transitions))

    # Compute a mask of non-final states and concatenate the batch elements
    # (a final state would've been the one after which simulation ended)
    non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                          batch.next_state_rgb)), device=device, dtype=torch.bool)

    non_final_next_states_rgb = torch.cat([s for s in batch.next_state_rgb
                                                if s is not None])

    non_final_next_states_height = torch.cat([s for s in batch.next_state_height
                                                if s is not None])

    state_rgb_batch = torch.cat(batch.state_rgb)
    state_height_batch = torch.cat(batch.state_height)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken. These are the actions which would've been taken
    # for each batch state according to policy_net
    state_action_values = policy_net(state_rgb_batch, state_height_batch).gather(1, action_batch)

    # Compute V(s_{t+1}) for all next states.
    # Expected values of actions for non_final_next_states are computed based
    # on the "older" target_net; selecting their best reward with max(1)[0].
    # This is merged based on the mask, such that we'll have either the expected
    # state value or 0 in case the state was final.
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    next_state_values[non_final_mask] = target_net(non_final_next_states_rgb, non_final_next_states_height).max(1)[0].detach()
    # Compute the expected Q values
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Compute Huber loss
    criterion = nn.SmoothL1Loss()
    loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    for param in policy_net.parameters():
        if param.grad == None:
            continue
        param.grad.data.clamp_(-1, 1)
    optimizer.step()

In [12]:

import matplotlib

# set up matplotlib
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

plt.ion()

episode_durations = []

def plot_durations():
    plt.figure(2)
    plt.clf()
    durations_t = torch.tensor(episode_durations, dtype=torch.float)
    plt.title('Training...')
    plt.xlabel('Episode')
    plt.ylabel('Duration')
    plt.plot(durations_t.numpy())
    # Take 100 episode averages and plot them too
    if len(durations_t) >= 100:
        means = durations_t.unfold(0, 100, 1).mean(1).view(-1)
        means = torch.cat((torch.zeros(99), means))
        plt.plot(means.numpy())

    plt.pause(0.001)  # pause a bit so that plots are updated
    if is_ipython:
        display.clear_output(wait=True)
        display.display(plt.gcf())

## Training Loop Main

In [13]:
from itertools import count

from Config.constants import MIN_GRASP_THRESHOLDS

is_viz = False

# env = Environment()
env = Environment(gui=True)
num_of_envs = 10
max_num_of_actions = 10
is_viz = False
max_extent_threshold = 1 # Max extent threshold of the target object in pixel units
push_directions = [0, np.pi/8, np.pi/4, 3*np.pi/8, 
                    np.pi/2, 5*np.pi/8, 3*np.pi/4, 7*np.pi/8, 
                    np.pi, 9*np.pi/8, 5*np.pi/4, 11*np.pi/8,  
                    3*np.pi/2, 13*np.pi/8, 7*np.pi/4, 15*np.pi/8] # 16 standard directions


num_episodes = 50

for i_episode in range(num_episodes):
    # Initialize the environment and state
    env.reset()
    testcase1 = TestCase1(env)
    body_ids, success = testcase1.sample_test_case(bottom_obj='random') #'random') # testcase1.create_standard()
    color_image, depth_image, _ = env_utils.get_true_heightmap(env)
    depth_image = np.stack((depth_image, )*3, axis=-1)
    # print("Returned body ids: {}, success: {}".format(body_ids, success))
    # last_screen = get_screen()
    # current_screen = get_screen()
    # state = current_screen - last_screen
    state = {
        'rgb': torch.tensor([np.transpose(color_image, (2, 0, 1))], dtype=torch.float, device=device), # transpose used in order to convert (224, 224, 3) to (3, 224, 224)
        'height_map': torch.tensor([np.transpose(depth_image, (2, 0, 1))], dtype=torch.float, device=device) # torch.tensor([np.transpose(depth_image, (2, 0, 1))], device=device) # transpose used in order to convert (224, 224, 3) to (3, 224, 224)
    }
    done = False

    for t in count():
        # Select and perform an action
        action = select_action(state['rgb'], state['height_map'])
        if action.item() in range(0, 16): # push action
            temp = cv2.cvtColor(color_image, cv2.COLOR_RGB2HSV)
            target_mask = cv2.inRange(temp, TARGET_LOWER, TARGET_UPPER)
            push_dir = push_directions[action.item()] # Sample push directions
            push_start, push_end = get_push_start(push_dir, target_mask, body_ids[1])
            env.push(push_start, push_end) # Action performed 

            color_image, depth_image, _ = env_utils.get_true_heightmap(env) # Evaluating the new state for calculating the reward
            depth_image = np.stack((depth_image, )*3, axis=-1)
            target_mask = cv2.inRange(temp, TARGET_LOWER, TARGET_UPPER)
            max_extents = get_max_extent_of_target_from_bottom(target_mask=target_mask, bottom_mask=target_mask, 
                                        bottom_obj_body_id=body_ids[0], 
                                        current_bottom_obj_size=testcase1.current_bottom_size, 
                                        is_viz=False)
            
            reward = get_reward(action='push', max_extents=max_extents, MIN_GRASP_EXTENT_THRESH=MIN_GRASP_THRESHOLDS) # get_reward(action, max_extents, MAX_EXTENT_THRESH, MIN_GRASP_EXTENT_THRESH)
            # belman_update_val = get_belman_update_value()
        elif action.item()==16:
            # Check if the state is graspable and reward the agent
            temp = cv2.cvtColor(color_image, cv2.COLOR_RGB2HSV)
            target_mask = cv2.inRange(temp, TARGET_LOWER, TARGET_UPPER)
            max_extents = get_max_extent_of_target_from_bottom(target_mask=target_mask, bottom_mask=target_mask, 
                                        bottom_obj_body_id=body_ids[0], 
                                        current_bottom_obj_size=testcase1.current_bottom_size, 
                                        is_viz=False)
            
            reward = get_reward(action='grasp', max_extents=max_extents, MIN_GRASP_EXTENT_THRESH=MIN_GRASP_THRESHOLDS)
            if reward==1:
                done = True
            # done = True
        targetPos, _ = p.getBasePositionAndOrientation(body_ids[1])
        bottomPos, _ = p.getBasePositionAndOrientation(body_ids[0])
        if targetPos[2] < bottomPos[2] + testcase1.current_bottom_size[2]/2 + testcase1.current_target_size[2]/2 - 0.005:
            reward = -1
            done = True
        # _, reward, done, _, _ = env.step(action.item())
        reward = torch.tensor([reward], dtype=torch.float, device=device)
        if reward == -1:
            done=True

        # Observe new state
        # last_screen = current_screen
        # current_screen = get_screen()
        if not done:
            next_state = {
                'rgb': torch.tensor([np.transpose(color_image, (2, 0, 1))], dtype=torch.float, device=device), # transpose used in order to convert (224, 224, 3) to (3, 224, 224)
                'height_map': torch.tensor([np.transpose(depth_image, (2, 0, 1))], dtype=torch.float, device=device) # transpose used in order to convert (224, 224, 3) to (3, 224, 224)
            }
        else:
            next_state = {
                'rgb': None,
                'height_map': None
            }

        # Store the transition in memory
        memory.push(state['rgb'], state['height_map'], action, next_state['rgb'], next_state['height_map'], reward)

        # Move to the next state
        state = next_state

        # Perform one step of the optimization (on the policy network)
        optimize_model()
        if done:
            episode_durations.append(t + 1)
            plot_durations()
            break

        if t>=10:
            done = True
    # Update the target network, copying all weights and biases in DQN
    if i_episode % TARGET_UPDATE == 0:
        target_net.load_state_dict(policy_net.state_dict())

print('Complete')
# env.render()
# env.close()
plt.ioff()
plt.show()

<Figure size 640x480 with 0 Axes>

Complete


<Figure size 640x480 with 0 Axes>

X connection to :1 broken (explicit kill or server shutdown).


: 

# Evaluation

In [13]:
from itertools import count

from Config.constants import MIN_GRASP_THRESHOLDS

is_viz = False

# env = Environment()
env = Environment(gui=True)
num_of_envs = 10
max_num_of_actions = 10
is_viz = False
max_extent_threshold = 1 # Max extent threshold of the target object in pixel units
push_directions = [0, np.pi/8, np.pi/4, 3*np.pi/8, 
                    np.pi/2, 5*np.pi/8, 3*np.pi/4, 7*np.pi/8, 
                    np.pi, 9*np.pi/8, 5*np.pi/4, 11*np.pi/8,  
                    3*np.pi/2, 13*np.pi/8, 7*np.pi/4, 15*np.pi/8] # 16 standard directions


num_episodes = 50

checkpoint = torch.load('/home/vishal/Volume_E/Active/Undergrad_research/ICRA22/codebases/Mid-Level-Planner/V2_next_best_action/models/model_checkpoints/100.pt')
policy_net.load_state_dict(checkpoint)
policy_net.eval()

with torch.no_grad():
    for i_episode in range(num_episodes):
        # Initialize the environment and state
        env.reset()
        testcase1 = TestCase1(env)
        body_ids, success = testcase1.sample_test_case(bottom_obj='random') #'random') # testcase1.create_standard()
        color_image, depth_image, _ = env_utils.get_true_heightmap(env)
        depth_image = np.stack((depth_image, )*3, axis=-1)
        # print("Returned body ids: {}, success: {}".format(body_ids, success))
        # last_screen = get_screen()
        # current_screen = get_screen()
        # state = current_screen - last_screen
        state = {
            'rgb': torch.tensor([np.transpose(color_image, (2, 0, 1))], dtype=torch.float, device=device), # transpose used in order to convert (224, 224, 3) to (3, 224, 224)
            'height_map': torch.tensor([np.transpose(depth_image, (2, 0, 1))], dtype=torch.float, device=device) # torch.tensor([np.transpose(depth_image, (2, 0, 1))], device=device) # transpose used in order to convert (224, 224, 3) to (3, 224, 224)
        }
        done = False

        for t in count():
            # Select and perform an action
            action = select_action(state['rgb'], state['height_map'])
            if action.item() in range(0, 16): # push action
                temp = cv2.cvtColor(color_image, cv2.COLOR_RGB2HSV)
                target_mask = cv2.inRange(temp, TARGET_LOWER, TARGET_UPPER)
                push_dir = push_directions[action.item()] # Sample push directions
                push_start, push_end = get_push_start(push_dir, target_mask, body_ids[1])
                env.push(push_start, push_end) # Action performed 

                color_image, depth_image, _ = env_utils.get_true_heightmap(env) # Evaluating the new state for calculating the reward
                depth_image = np.stack((depth_image, )*3, axis=-1)
                target_mask = cv2.inRange(temp, TARGET_LOWER, TARGET_UPPER)
                max_extents = get_max_extent_of_target_from_bottom(target_mask=target_mask, bottom_mask=target_mask, 
                                            bottom_obj_body_id=body_ids[0], 
                                            current_bottom_obj_size=testcase1.current_bottom_size, 
                                            is_viz=False)
                
                reward = get_reward(action='push', max_extents=max_extents, MIN_GRASP_EXTENT_THRESH=MIN_GRASP_THRESHOLDS) # get_reward(action, max_extents, MAX_EXTENT_THRESH, MIN_GRASP_EXTENT_THRESH)
                # belman_update_val = get_belman_update_value()
            elif action.item()==16:
                # Check if the state is graspable and reward the agent
                temp = cv2.cvtColor(color_image, cv2.COLOR_RGB2HSV)
                target_mask = cv2.inRange(temp, TARGET_LOWER, TARGET_UPPER)
                max_extents = get_max_extent_of_target_from_bottom(target_mask=target_mask, bottom_mask=target_mask, 
                                            bottom_obj_body_id=body_ids[0], 
                                            current_bottom_obj_size=testcase1.current_bottom_size, 
                                            is_viz=False)
                
                reward = get_reward(action='grasp', max_extents=max_extents, MIN_GRASP_EXTENT_THRESH=MIN_GRASP_THRESHOLDS)
                if reward==1:
                    done = True
                # done = True
            targetPos, _ = p.getBasePositionAndOrientation(body_ids[1])
            bottomPos, _ = p.getBasePositionAndOrientation(body_ids[0])
            if targetPos[2] < bottomPos[2] + testcase1.current_bottom_size[2]/2 + testcase1.current_target_size[2]/2 - 0.005:
                reward = -1
                done = True

            print("Episode: {}, State: {}, Reward: {}".format(i_episode, t, reward))
            # _, reward, done, _, _ = env.step(action.item())
            reward = torch.tensor([reward], dtype=torch.float, device=device)

            if reward == -1:
                done=True

            # Observe new state
            # last_screen = current_screen
            # current_screen = get_screen()
            if not done:
                next_state = {
                    'rgb': torch.tensor([np.transpose(color_image, (2, 0, 1))], dtype=torch.float, device=device), # transpose used in order to convert (224, 224, 3) to (3, 224, 224)
                    'height_map': torch.tensor([np.transpose(depth_image, (2, 0, 1))], dtype=torch.float, device=device) # transpose used in order to convert (224, 224, 3) to (3, 224, 224)
                }
            else:
                next_state = {
                    'rgb': None,
                    'height_map': None
                }

            # Store the transition in memory
            # memory.push(state['rgb'], state['height_map'], action, next_state['rgb'], next_state['height_map'], reward)

            # Move to the next state
            state = next_state

            # Perform one step of the optimization (on the policy network)
            # optimize_model()
            if done:
                # episode_durations.append(t + 1)
                # plot_durations()
                break

            if t>=10:
                done = True
    # Update the target network, copying all weights and biases in DQN
    # if i_episode % TARGET_UPDATE == 0:
        # target_net.load_state_dict(policy_net.state_dict())

print('Complete')
# env.render()
# env.close()
# plt.ioff()
# plt.show()

startThreads creating 1 threads.
starting thread 0
started thread 0 
argc=2
argv[0] = --unused
argv[1] = --start_demo_name=Physics Server
ExampleBrowserThreadFunc started
X11 functions dynamically loaded using dlopen/dlsym OK!
X11 functions dynamically loaded using dlopen/dlsym OK!
Creating context
Created GL 3.3 context
Direct GLX rendering context obtained
Making context current
GL_VENDOR=NVIDIA Corporation
GL_RENDERER=NVIDIA GeForce GTX 1050 Ti/PCIe/SSE2
GL_VERSION=3.3.0 NVIDIA 510.85.02
GL_SHADING_LANGUAGE_VERSION=3.30 NVIDIA via Cg compiler
pthread_getconcurrency()=0
Version = 3.3.0 NVIDIA 510.85.02
Vendor = NVIDIA Corporation
Renderer = NVIDIA GeForce GTX 1050 Ti/PCIe/SSE2
b3Printf: Selected demo: Physics Server
startThreads creating 1 threads.
starting thread 0
started thread 0 
MotionThreadFunc thread started
Loading a new scene! ---------------------------------------- : True


  'rgb': torch.tensor([np.transpose(color_image, (2, 0, 1))], dtype=torch.float, device=device), # transpose used in order to convert (224, 224, 3) to (3, 224, 224)


Push from [ 0.47087442 -0.14805217  0.09026264] to [ 0.44255585 -0.07968508  0.09026264], True
Episode: 0, State: 0, Reward: 1.0
Push from [0.48805217 0.00287442 0.09031931] to [ 0.41968508 -0.02544415  0.09031931], True
Episode: 0, State: 1, Reward: 1.0
Push from [0.316      0.109      0.09031863] to [0.316      0.035      0.09031863], True
Episode: 0, State: 2, Reward: -1
Loading a new scene! ---------------------------------------- : True
Push from [ 0.46594783 -0.04687442  0.08002078] to [ 0.53431492 -0.01855585  0.08002078], True
Episode: 1, State: 0, Reward: -0.05
Push from [0.66196194 0.16196194 0.08008015] to [0.60963604 0.10963604 0.08008015], True
Episode: 1, State: 1, Reward: 1.0
Push from [0.705      0.016      0.08008119] to [0.631      0.016      0.08008119], True
Episode: 1, State: 2, Reward: 1.0
Push from [0.387      0.01       0.08008301] to [0.461      0.01       0.08008301], True
Episode: 1, State: 3, Reward: -0.05
Push from [0.58087442 0.13805217 0.08008124] to [0.5

: 

In [1]:
del policy_net
del target_net

NameError: name 'policy_net' is not defined

# Residual code 
Ignore the parts of code that follow this block

In [None]:
push_color_trunk = torchvision.models.densenet.densenet121(pretrained=True)

In [None]:
from torchsummary import summary

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    push_color_trunk.cuda()

summary(push_color_trunk, (3, 224, 224))

Layer (type:depth-idx)                   Output Shape              Param #
├─Sequential: 1-1                        [-1, 1024, 7, 7]          --
|    └─Conv2d: 2-1                       [-1, 64, 112, 112]        9,408
|    └─BatchNorm2d: 2-2                  [-1, 64, 112, 112]        128
|    └─ReLU: 2-3                         [-1, 64, 112, 112]        --
|    └─MaxPool2d: 2-4                    [-1, 64, 56, 56]          --
|    └─_DenseBlock: 2-5                  [-1, 256, 56, 56]         --
|    |    └─_DenseLayer: 3-1             [-1, 32, 56, 56]          45,440
|    |    └─_DenseLayer: 3-2             [-1, 32, 56, 56]          49,600
|    |    └─_DenseLayer: 3-3             [-1, 32, 56, 56]          53,760
|    |    └─_DenseLayer: 3-4             [-1, 32, 56, 56]          57,920
|    |    └─_DenseLayer: 3-5             [-1, 32, 56, 56]          62,080
|    |    └─_DenseLayer: 3-6             [-1, 32, 56, 56]          66,240
|    └─_Transition: 2-6                  [-1, 128, 28, 28

Layer (type:depth-idx)                   Output Shape              Param #
├─Sequential: 1-1                        [-1, 1024, 7, 7]          --
|    └─Conv2d: 2-1                       [-1, 64, 112, 112]        9,408
|    └─BatchNorm2d: 2-2                  [-1, 64, 112, 112]        128
|    └─ReLU: 2-3                         [-1, 64, 112, 112]        --
|    └─MaxPool2d: 2-4                    [-1, 64, 56, 56]          --
|    └─_DenseBlock: 2-5                  [-1, 256, 56, 56]         --
|    |    └─_DenseLayer: 3-1             [-1, 32, 56, 56]          45,440
|    |    └─_DenseLayer: 3-2             [-1, 32, 56, 56]          49,600
|    |    └─_DenseLayer: 3-3             [-1, 32, 56, 56]          53,760
|    |    └─_DenseLayer: 3-4             [-1, 32, 56, 56]          57,920
|    |    └─_DenseLayer: 3-5             [-1, 32, 56, 56]          62,080
|    |    └─_DenseLayer: 3-6             [-1, 32, 56, 56]          66,240
|    └─_Transition: 2-6                  [-1, 128, 28, 28

In [None]:
print(push_color_trunk)

DenseNet(
  (features): Sequential(
    (conv0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (norm0): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu0): ReLU(inplace=True)
    (pool0): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (denseblock1): _DenseBlock(
      (denselayer1): _DenseLayer(
        (norm1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu1): ReLU(inplace=True)
        (conv1): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu2): ReLU(inplace=True)
        (conv2): Conv2d(128, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      )
      (denselayer2): _DenseLayer(
        (norm1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu

In [None]:
import torch
torch.__version__

'1.7.1'