In [1]:
import argparse
import gym
import os
import sys
import pickle
import time
import numpy as np

from robotoddler.utils import *
# from models.cnn_ac import BinaryCnnAC
# from models.cnn_ac_value import BinaryCnnValue
# from core.a2c import a2c_step
# from core.common import estimate_advantages
# from core.agent import Agent
import assembly_gymenv
from matplotlib import pyplot as plt

In [2]:
# Build the assembly environment through gym's registry (the id is presumably
# registered by `import assembly_gymenv` — confirm) and show the observation
# returned by reset().
env = gym.make('assembly_gymenv/AssemblyGymEnv-v0')
state = env.reset()
# The recorded output of this cell is a dict holding 'blocks', 'obstacles'
# and 'targets' tensors.
print(state)

pybullet build time: Sep 11 2023 10:08:20


{'blocks': tensor([], size=(0, 4)), 'obstacles': tensor([[0.5250, 0.5000, 0.0217],
        [0.5250, 0.5000, 0.0650]]), 'targets': tensor([[0.5250, 0.5000, 0.1083]])}


  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")
  logger.warn(
  logger.warn(
  logger.warn(


In [3]:
# Take a single environment step with a hand-picked action.
# NOTE(review): the recorded output of this cell is
# "IndexError: list index out of range"; presumably [0, 7] is not a valid
# action right after reset() — confirm the action format the environment expects.
action = [0,7]
next_state, reward, done, info = env.step(action)
print(next_state)

IndexError: list index out of range

In [None]:
# Fetch the current image from the wrapped assembly environment and display it.
# NOTE(review): the transpose followed by a flip along axis 0 presumably converts
# the environment's image layout into matplotlib's row/column convention — confirm.
img = env.assembly_env.get_image()
plt.imshow(np.flip(img.transpose(), 0))
plt.show()

# Training with Deep Q-Learning

In [4]:
import networkx as nx
import math
import random
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple, deque
from itertools import count

import torch
import torch.nn as nn
import torch.optim as optim
import torch_geometric.nn as geom_nn
from torch_geometric.nn import GCNConv, GraphConv
import torch.nn.functional as F
import pickle

# Compute device: CUDA when PyTorch reports it as available, CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Environment instance used by the DQN cells that follow.
env = gym.make("assembly_gymenv/AssemblyGymEnv-v0")

# Matplotlib: remember whether an inline backend is active (and, if so, bring
# IPython's display helper into scope), then switch interactive mode on.
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display
plt.ion()

  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


In [5]:
class GraphCore(nn.Module):
    """Sequential stack of ``num_layers`` GraphConv layers with mean aggregation.

    Every layer maps ``emb_dim`` features to ``emb_dim`` features, so the node
    embedding width is the same before and after the stack.
    """

    def __init__(self, emb_dim, num_layers):
        super().__init__()
        conv_stack = nn.ModuleList()
        for _ in range(num_layers):
            conv_stack.append(GraphConv(emb_dim, emb_dim, aggr='mean'))
        self.layers = conv_stack

    def forward(self, x, edge_index):
        """Run the node features through each layer in order.

        The same ``edge_index`` is handed to every layer and no activation is
        applied between layers.
        """
        hidden = x
        for conv in self.layers:
            hidden = conv(hidden, edge_index)
        return hidden

class EPD_GNN(nn.Module):
    """Graph network that encodes nodes, runs message passing, then decodes.

    Pipeline implemented by ``forward``:
      1. ``encode_layer``: two-layer MLP applied to every node's features.
      2. ``core``: a ``GraphCore`` run on a fully connected graph over the nodes
         (self-loops included).
      3. ``decode_layer``: two-layer MLP producing ``num_actions`` values per
         node, applied to every node except the last one.

    Args:
        num_node_features: size of each input node feature vector.
        num_actions: number of output values per node.
        emb_dim: width of the node embeddings.
        num_layers: number of graph layers inside the core.
    """

    def __init__(self, num_node_features, num_actions, emb_dim, num_layers):
        super(EPD_GNN, self).__init__()
        self.num_actions = num_actions

        self.encode_layer = nn.Sequential(
            nn.Linear(num_node_features, emb_dim),
            nn.ReLU(),
            nn.Linear(emb_dim, emb_dim)
        )
        self.core = GraphCore(emb_dim, num_layers)
        self.decode_layer = nn.Sequential(
            nn.Linear(emb_dim, emb_dim),
            nn.ReLU(),
            nn.Linear(emb_dim, num_actions)
        )

    @staticmethod
    def _fully_connected_edge_index(num_objects, device=None):
        """Return the ``(2, num_objects**2)`` edge index of a complete graph.

        Row 0 is ``[0, 0, ..., 1, 1, ...]`` and row 1 is ``[0, 1, ..., 0, 1, ...]``,
        i.e. every ordered pair (i, j) including i == j. The tensor is created
        directly on ``device`` so it always matches the node features, and the
        construction is also valid for ``num_objects == 0``.
        """
        node_ids = torch.arange(num_objects, dtype=torch.long, device=device)
        sources = node_ids.repeat_interleave(num_objects)
        targets = node_ids.repeat(num_objects)
        return torch.stack([sources, targets])

    def forward(self, x):
        """Compute per-node action values for a single graph.

        Args:
            x: node feature matrix with one row per node.

        Returns:
            Tensor with one row per node except the last, ``num_actions`` columns.
        """
        # Encode
        x = self.encode_layer(x)

        # Full graph core: build the edges on the same device as the features
        # (a CPU-side edge index would not match a model living on the GPU).
        edge_index = self._fully_connected_edge_index(x.shape[0], x.device)
        x = self.core(x, edge_index)

        # Decode: the last row is dropped (the original comment calls it the target node).
        x = self.decode_layer(x[:-1])
        return x

In [6]:
# One replay-buffer record: (state, action, next_state, reward).
Transition = namedtuple('Transition', ['state', 'action', 'next_state', 'reward'])


class ReplayMemory(object):
    """Bounded FIFO store of ``Transition`` records with uniform sampling."""

    def __init__(self, capacity):
        # A deque with ``maxlen`` drops its oldest element when a new one is
        # appended at full capacity.
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        """Pack the positional arguments into a ``Transition`` and store it."""
        record = Transition(*args)
        self.memory.append(record)

    def sample(self, batch_size):
        """Return ``batch_size`` stored records drawn without replacement."""
        drawn = random.sample(self.memory, batch_size)
        return drawn

    def __len__(self):
        """Number of records currently held."""
        return len(self.memory)

    
class PrioritizedReplayBuffer(object):
    """Ring buffer of ``Transition`` records sampled proportionally to priority.

    Args:
        max_size: maximum number of stored transitions; once full, the oldest
            slot is overwritten.
        alpha: exponent applied to the priorities before normalisation
            (0 makes sampling uniform).
    """

    def __init__(self, max_size, alpha=0.6):
        self.max_size = max_size
        self.alpha = alpha
        self.memory = []
        self.priorities = np.zeros(max_size, dtype=np.float32)
        self.idx = 0  # next slot to write

    def __len__(self):
        """Number of transitions currently stored (mirrors ``ReplayMemory``)."""
        return len(self.memory)

    def push(self, state, action, next_state, reward, priority):
        """Store a transition with the given priority, overwriting the oldest slot when full."""
        if len(self.memory) < self.max_size:
            self.memory.append(Transition(state, action, next_state, reward))
        else:
            self.memory[self.idx] = Transition(state, action, next_state, reward)
        self.priorities[self.idx] = priority
        self.idx = (self.idx + 1) % self.max_size

    def sample(self, batch_size, beta=0.4):
        """Draw ``batch_size`` transitions (with replacement) by priority.

        Args:
            batch_size: number of transitions to draw.
            beta: exponent of the importance-sampling correction.

        Returns:
            ``(indices, experiences, weights)`` where ``weights`` are the
            importance-sampling weights normalised by their maximum.

        Raises:
            ValueError: if the buffer is empty.
        """
        num_stored = len(self.memory)
        if num_stored == 0:
            raise ValueError("cannot sample from an empty PrioritizedReplayBuffer")

        scaled = self.priorities[:num_stored] ** self.alpha
        total = scaled.sum()
        if np.isfinite(total) and total > 0:
            prob = scaled / total
        else:
            # Every priority is zero (or the sum is not finite): dividing would
            # yield NaN probabilities and make np.random.choice raise, so fall
            # back to uniform sampling.
            prob = np.full(num_stored, 1.0 / num_stored)

        indices = np.random.choice(num_stored, batch_size, p=prob)
        weights = (num_stored * prob[indices]) ** (-beta)
        weights /= weights.max()

        experiences = [self.memory[idx] for idx in indices]
        return indices, experiences, weights

    def update_priorities(self, indices, priorities):
        """Overwrite the stored priority of each listed slot."""
        for i, priority in zip(indices, priorities):
            self.priorities[i] = priority

In [7]:
# --- Replay / optimisation -------------------------------------------------
BATCH_SIZE = 32            # transitions sampled from the replay buffer per update (128 was tried earlier)
num_updates = 25           # calls to optimize_model per training episode
LR = 1e-4                  # learning rate of the AdamW optimizer
GAMMA = 0.99               # discount factor

# --- Epsilon-greedy exploration ---------------------------------------------
EPS_START = 0.9            # starting value of epsilon
EPS_END = 0.05             # final value of epsilon
EPS_DECAY = 1000           # rate of the exponential decay of epsilon; higher means a slower decay

# --- Target network ---------------------------------------------------------
# The target network is synchronised every `target_update_freq` episodes;
# the soft-update rate TAU (0.005) is not used.
target_update_freq = 200

# Get number of actions per object
# NOTE(review): the recorded output of this cell is
# "AttributeError: 'AssemblyGymEnv' object has no attribute 'num_actions'",
# so this lookup fails against the environment version that produced that output.
n_actions = env.num_actions
# Get the number of state observations
state = env.reset()
# NOTE(review): an earlier cell's recorded output shows reset() returning a dict
# keyed by 'blocks'/'obstacles'/'targets'; indexing it with [0] presumably fails
# on such an observation — confirm the observation format expected here.
n_observations = len(state[0]) # num of features per object (should be 6)

# Two networks with identical architecture (embedding size 64, 2 graph layers):
# policy_net is the one trained, target_net provides the bootstrap targets.
policy_net = EPD_GNN(n_observations, n_actions, 64, 2).to(device)
target_net = EPD_GNN(n_observations, n_actions, 64, 2).to(device)

# Start the target network from the same weights as the policy network.
target_net.load_state_dict(policy_net.state_dict())

# Only the policy network's parameters are optimised.
optimizer = optim.AdamW(policy_net.parameters(), lr=LR, amsgrad=False)

# Replay buffer choice: prioritized sampling or the plain uniform buffer.
# `alpha` and `beta` only exist when prioritized_memory is True.
prioritized_memory = False
if prioritized_memory:
    alpha = 0.
    beta = 0.
    memory = PrioritizedReplayBuffer(30000, alpha) #ReplayMemory(30000)
else:
    memory = ReplayMemory(30000)



# Global counter of action selections; drives the epsilon decay in select_action.
steps_done = 0


def select_action(state, with_randomness=True):
    """Pick a (block index, delta index) action for ``state``.

    With ``with_randomness=True`` the choice is epsilon-greedy: epsilon decays
    exponentially from EPS_START to EPS_END as the global ``steps_done``
    counter grows, and the counter is advanced by one.

    With ``with_randomness=False`` the choice is always greedy. Evaluation
    calls neither consume a random number nor advance ``steps_done``, so
    running test episodes does not alter the training exploration schedule.

    Args:
        state: node feature tensor passed to ``policy_net``; the random branch
            samples a row index from all rows except the last.
        with_randomness: enable epsilon-greedy exploration.

    Returns:
        Long tensor ``[block_index, delta_index]`` on ``device``.
    """
    global steps_done
    if with_randomness:
        sample = random.random()
        eps_threshold = EPS_END + (EPS_START - EPS_END) * \
            math.exp(-1. * steps_done / EPS_DECAY)
        steps_done += 1
        explore = sample <= eps_threshold
    else:
        explore = False

    if not explore:
        with torch.no_grad():
            # Greedy choice: the row with the largest best Q-value selects the
            # block, the arg-max within that row selects the delta index.
            q_values = policy_net(state)
            target_block = q_values.max(dim=1).values.argmax()
            delta_index = q_values[target_block].argmax()
            return torch.tensor([target_block, delta_index], device=device, dtype=torch.long)

    print("Random action")
    # Uniformly random block (last row excluded) and delta index.
    rnd_obj = np.random.randint(state.shape[0]-1)
    rnd_delta = np.random.randint(policy_net.num_actions)
    return torch.tensor([rnd_obj, rnd_delta], device=device, dtype=torch.long)


# Number of steps taken in each training episode.
episode_durations = []

AttributeError: 'AssemblyGymEnv' object has no attribute 'num_actions'

In [None]:
def optimize_model(batch_size):
    """Run one optimisation step of ``policy_net`` on a sampled batch.

    Transitions are processed one at a time (each state is a separate graph)
    and their Huber losses are averaged over ``batch_size``.

    Args:
        batch_size: number of transitions to sample from ``memory``.

    Returns:
        ``(loss, transitions)`` for the step, or ``(None, None)`` when the
        replay buffer holds fewer than ``batch_size`` transitions.
    """
    if len(memory.memory) < batch_size:
        return None, None
    if prioritized_memory:
        batch_indices, transitions, batch_weights = memory.sample(batch_size, beta)
    else:
        transitions = memory.sample(batch_size)

    # Huber loss; the module is stateless, so one instance serves the whole batch.
    criterion = nn.SmoothL1Loss()

    loss = 0
    for i, t in enumerate(transitions):
        s = t.state
        a = t.action
        r = t.reward
        next_s = t.next_state

        # Q(s, a) of the action that was actually taken.
        state_action_value = policy_net(s)[a[0], a[1]]

        # V(s') from the target network: the largest Q-value over all blocks
        # and deltas, or 0 when the transition ended the episode.
        next_state_value = 0
        with torch.no_grad():
            if next_s is not None:
                next_state_value = target_net(next_s).max()

        # Align the TD target with the prediction's dtype and device: rewards
        # are stored as tensors whose dtype follows the environment's reward type.
        expected_state_action_value = torch.as_tensor(
            (next_state_value * GAMMA) + r,
            dtype=state_action_value.dtype,
            device=state_action_value.device,
        )

        td_error = criterion(state_action_value, expected_state_action_value)
        if prioritized_memory:
            # float() keeps the numpy scalar weight out of the autograd expression.
            loss += float(batch_weights[i]) * td_error
            memory.update_priorities([batch_indices[i]], [td_error.item()])
        else:
            loss += td_error

    loss /= batch_size

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    # In-place gradient clipping
    torch.nn.utils.clip_grad_value_(policy_net.parameters(), 100)
    optimizer.step()

    return loss, transitions

    
def run_episode(policy_net, target=None):
    """Play one episode greedily and report its return and outcome.

    Args:
        policy_net: accepted for symmetry with ``test``.
            NOTE(review): actions come from ``select_action``, which is not
            passed this argument — confirm whether it should be used.
        target: optional target; when given it is written to ``env.target``
            (wrapped in a list) and the observation is re-read.

    Returns:
        ``(total_reward, success)`` where ``success`` is 1 if the reward of the
        final step was positive and 0 otherwise.
    """
    observation = env.reset()
    if target is not None:
        env.target = [target]
        observation = env.get_observation()
    state = torch.tensor(observation, dtype=torch.float32, device=device)
    print("Target at {}".format(state[1,0]))

    episode_return = 0
    finished = False
    while not finished:
        chosen = select_action(state, with_randomness=False)
        block_idx, delta_idx = chosen[0].item(), chosen[1].item()
        observation, reward, finished, _ = env.step([block_idx, delta_idx])
        state = torch.tensor(observation, dtype=torch.float32, device=device)
        episode_return += reward

    # A positive reward on the last step marks the episode as a success.
    success = 1 if reward > 0 else 0
    return episode_return, success
        
        
def test(policy_net, num_episodes=5):
    """Run ``num_episodes`` episodes via ``run_episode`` and average the results.

    Returns:
        ``(mean total reward, mean success indicator)`` over the episodes.
    """
    print("TEST")
    outcomes = [run_episode(policy_net) for _ in range(num_episodes)]
    reward_sum = sum(episode_reward for episode_reward, _ in outcomes)
    success_count = sum(episode_success for _, episode_success in outcomes)
    return reward_sum / num_episodes, success_count / num_episodes

In [None]:
def _clone_state_dict(module):
    """Return a detached copy of ``module``'s parameters and buffers.

    ``state_dict()`` hands out tensors that share storage with the live
    parameters, so keeping that dict would silently track later training
    instead of preserving the weights at the time of the call.
    """
    return {name: tensor.detach().clone() for name, tensor in module.state_dict().items()}


# Episode budget depends on whether a GPU is available.
if torch.cuda.is_available():
    num_episodes = 601
else:
    num_episodes = 4001

test_freq = 500  # run an evaluation every `test_freq` episodes

# Create the bookkeeping only when it does not exist yet: a fresh kernel
# (Restart & Run All) starts cleanly, while re-running this cell keeps
# accumulating into the existing histories.
if 'Q_losses' not in globals():
    test_rewards = []
    test_success = []
    tot_reward = 0
    episode_rewards = []
    Q_losses = []

    best_policy_param = _clone_state_dict(policy_net)
    best_test_acc = 0

for i_episode in range(num_episodes):
    print("Episode {}".format(i_episode))

    # Gradient updates start after a few warm-up episodes.
    if i_episode > 5:
        for j in range(num_updates):
            loss, transitions = optimize_model(BATCH_SIZE)
            if loss is not None:
                Q_losses.append(loss.item())

    # Hard update of the target network.
    if i_episode % target_update_freq == 0 and i_episode > 1:
        target_net.load_state_dict(policy_net.state_dict())

    # Periodic greedy evaluation; remember the weights with the best success rate.
    if i_episode % test_freq == 0 and i_episode > 1:
        r, s = test(policy_net, num_episodes=20)
        test_rewards.append(r)
        test_success.append(s)
        if s > best_test_acc:
            best_test_acc = s
            best_policy_param = _clone_state_dict(policy_net)

    # Initialize the environment and get its state
    state = env.reset()
    print("Target location: {}".format(env.target[0]))
    state = torch.tensor(state, dtype=torch.float32, device=device)
    # Start every episode from a zero return, even if a previous run of this
    # cell was interrupted mid-episode.
    tot_reward = 0
    for t in count():
        action = select_action(state)
        target_block = action[0].item()
        delta_index = action[1].item()
        observation, reward, terminated, _ = env.step([target_block, delta_index])
        tot_reward += reward
        # Fix the dtype so the stored reward matches the float32 network outputs.
        reward = torch.tensor(reward, dtype=torch.float32, device=device)
        done = terminated

        if terminated:
            next_state = None
            episode_rewards.append(tot_reward)
            tot_reward = 0
        else:
            next_state = torch.tensor(observation, dtype=torch.float32, device=device)

        # Store the transition in memory
        if prioritized_memory:
            memory.push(state, action, next_state, reward, 10) # push first with high priority
        else:
            memory.push(state, action, next_state, reward)

        # Move to the next state
        state = next_state

        if done:
            episode_durations.append(t + 1)
            break


print('Complete')

In [None]:
def _plot_with_moving_average(values, title, xlabel, ylabel, fmt='-', window=None):
    """Plot ``values`` and, optionally, their trailing moving average in red.

    The average is drawn only when at least ``window`` points exist (a shorter
    or empty series would make ``np.convolve`` return a misleading curve or
    raise), and it is placed at the x position of the last point of each window.
    """
    fig, ax = plt.subplots()
    ax.plot(values, fmt)
    if window is not None and len(values) >= window:
        smoothed = np.convolve(values, np.ones(window), 'valid') / window
        ax.plot(np.arange(window - 1, len(values)), smoothed, 'r')
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    plt.show()


_plot_with_moving_average(test_success, "Test success rate", "evaluation round", "success rate")
_plot_with_moving_average(episode_rewards, "Training episode reward", "episode", "total reward", window=200)
_plot_with_moving_average(Q_losses, "Q-network loss", "optimisation step", "Huber loss", fmt='+', window=200)