In [1]:
import gym
from gym import spaces
import robo_gym
from robo_gym.wrappers.exception_handling import ExceptionHandling
import numpy as np
import pfrl
import torch
from torch import distributions, nn
import cv2
import matplotlib.pyplot as plt

In [2]:
class WrapPyTorch(gym.ObservationWrapper):
    def __init__(self, env=None):
        super(WrapPyTorch, self).__init__(env)
        obs_shape = self.observation_space
        agent_pose = env.observation_space['agent_pose']
        occupancy_grid = env.observation_space['occupancy_grid']
        self.map_size = int(occupancy_grid.shape[0] ** (1/2))
        
        agent_pose_space = spaces.Box(
            low=np.expand_dims(agent_pose.low, axis=0),
            high=np.expand_dims(agent_pose.high, axis=0),
            shape=(1, agent_pose.shape[0]),
            dtype=np.float32
        )
        occupancy_grid_space = spaces.Box(
            low=occupancy_grid.low[0],
            high=occupancy_grid.high[0],
            shape=(1, self.map_size, self.map_size),
            dtype=np.float32
        )
        
        self.observation_space = spaces.Tuple((occupancy_grid_space, agent_pose_space))
    
    def observation(self, observation):
        map_img = observation['occupancy_grid'].reshape((self.map_size, self.map_size)).T
        occupancy_grid = np.expand_dims(map_img, axis=0)
        agent_pose = observation['agent_pose']
        return (occupancy_grid, agent_pose)
    
    def reset(self, **kwargs):
        return self.observation(self.env.reset(**kwargs))

In [3]:
target_machine_ip = 'localhost' # or other machine 'xxx.xxx.xxx.xxx'

# initialize environment
env = gym.make('CubeRoomOnNavigationStack-v0', ip=target_machine_ip, gui=True)

env = ExceptionHandling(env)
env = WrapPyTorch(env)
state = env.reset(**{'new_room': True, 'new_agent_pose': True})
state

Starting new Robot Server | Tentative 1
<class 'server_manager_pb2.RobotServer'>
True 
Successfully started Robot Server at localhost:56335
Resetting env... [room: True, pose: True]




(array([[[-1., -1., -1., ..., -1., -1., -1.],
         [-1., -1., -1., ..., -1., -1., -1.],
         [-1., -1., -1., ..., -1., -1., -1.],
         ...,
         [-1., -1., -1., ..., -1., -1., -1.],
         [-1., -1., -1., ..., -1., -1., -1.],
         [-1., -1., -1., ..., -1., -1., -1.]]], dtype=float32),
 array([ 4.4754521e-16, -9.9227786e-01,  1.2403473e-01,  0.0000000e+00,
         1.0000000e+00], dtype=float32))

In [4]:
timestep_limit = env.spec.max_episode_steps
obs_space = env.observation_space
action_space = env.action_space
obs_map_size = obs_space[0].low.size
obs_pose_size = obs_space[1].low.size
action_size = action_space.low.size

print(f'timelimit: \t{timestep_limit}')
print(f'obs_space: \t{obs_space} \naction_space: \t{action_space}')
print(f'obs_map_size: \t{obs_map_size} \nobs_pose_size: \t{obs_pose_size}')
print(f'action_size: \t{action_size}')

timelimit: 	500
obs_space: 	Tuple(Box(-1.0, 100.0, (1, 128, 128), float32), Box(-1.0, inf, (1, 5), float32)) 
action_space: 	Box(-1.0, 1.0, (3,), float32)
obs_map_size: 	16384 
obs_pose_size: 	5
action_size: 	3


In [5]:
# env.reset(new_room=False, new_agent_pose=True)
state, _, _, _ = env.step([0,0,0.5])
state

(array([[[-1., -1., -1., ..., -1., -1., -1.],
         [-1., -1., -1., ..., -1., -1., -1.],
         [-1., -1., -1., ..., -1., -1., -1.],
         ...,
         [-1., -1., -1., ..., -1., -1., -1.],
         [-1., -1., -1., ..., -1., -1., -1.],
         [-1., -1., -1., ..., -1., -1., -1.]]], dtype=float32),
 array([0.00929746, 0.6262265 , 0.7796412 , 0.99850106, 0.05473227],
       dtype=float32))

In [6]:
def conv2d_size_out(size, kernel_size=5, stride=2):
    return (size - (kernel_size - 1) - 1) // stride + 1
        
def make_conv2d_layer(width, height):
    convW = conv2d_size_out(width, 4, 4) # 128 -> 32
    convW = conv2d_size_out(convW, 4, 4) # 32 -> 8
    convW = conv2d_size_out(convW, 3, 1) # 8 -> 6

    convH = conv2d_size_out(height, 4, 4)
    convH = conv2d_size_out(convH, 4, 4)
    convH = conv2d_size_out(convH, 3, 1)

    linear_input_size = convW * convH * 64
    print('size:', linear_input_size)

    # RGB Image tensor as input
    return nn.Sequential(
        nn.Conv2d(1, 32, kernel_size=4,stride=4),
        nn.ReLU(),
        nn.Conv2d(32, 64, kernel_size=4, stride=4),
        nn.ReLU(),
        nn.Conv2d(64, 64, kernel_size=3,stride=1),
        nn.ReLU(),
        nn.Flatten(),
    ), linear_input_size

def make_linear_layer(linear_input_size, out_size):
    return nn.Sequential(
        nn.Linear(linear_input_size, 256),
        nn.ReLU(),
        nn.Linear(256, 256),
        nn.ReLU(),
        nn.Linear(256, out_size),
    )

In [7]:
def squashed_diagonal_gaussian_head(x):
    assert x.shape[-1] == action_size * 2
    mean, log_scale = torch.chunk(x, 2, dim=1)
    log_scale = torch.clamp(log_scale, -20.0, 2.0)
    var = torch.exp(log_scale * 2)
    base_distribution = distributions.Independent(
        distributions.Normal(loc=mean, scale=torch.sqrt(var)), 1
    )
    # cache_size=1 is required for numerical stability
    return distributions.transformed_distribution.TransformedDistribution(
        base_distribution, [distributions.transforms.TanhTransform(cache_size=1)]
    )

# def extract_obs(obs):
#     occupancy_grid_batch = torch.tensor([s['occupancy_grid'] for s in obs])
#     agent_pose_batch = torch.tensor([s['agent_pose'] for s in state])
#     return occupancy_grid_batch, agent_pose_batch

In [8]:
class PolicyFunction(nn.Module):
    def __init__(self, width, height, pose_size, action_size):
        super().__init__()

        # RGB Image tensor as input
        self.selectTrackFeatures, self.linear_input_size = make_conv2d_layer(width, height)
        self.fc1 = make_linear_layer(self.linear_input_size + pose_size, action_size*2)
    
    def forward(self, state):
        x = self.selectTrackFeatures(state[0])
#         print(x.shape, state[1].shape)
        x = torch.cat((x, state[1]), axis=-1)
        x = self.fc1(x)
        return squashed_diagonal_gaussian_head(x)

obs_map_shape = obs_space[0].low.shape
print(obs_map_shape)
policy = PolicyFunction(obs_map_shape[1], obs_map_shape[2], obs_pose_size, action_size)
policy_optimizer = torch.optim.Adam(policy.parameters(), lr=3e-4)

(1, 128, 128)
size: 2304


In [9]:
class QFunction(nn.Module):
    def __init__(self, width, height,pose_size, action_size):
        super().__init__()

        # RGB Image tensor as input
        self.selectTrackFeatures, self.linear_input_size = make_conv2d_layer(width, height)
        self.fc1 = make_linear_layer(self.linear_input_size + pose_size + action_size, 1)
    
    def forward(self, state_and_action):
        state = state_and_action[0]
        occupancy_vector = self.selectTrackFeatures(state[0])
        x = torch.cat((occupancy_vector, state[1], state_and_action[1]), axis=-1)
        return self.fc1(x)

q_func1 = QFunction(obs_map_shape[1], obs_map_shape[2], obs_pose_size, action_size)
q_func2 = QFunction(obs_map_shape[1], obs_map_shape[2], obs_pose_size, action_size)
q_func1_optimizer = torch.optim.Adam(q_func1.parameters(), lr=3e-4)
q_func2_optimizer = torch.optim.Adam(q_func2.parameters(), lr=3e-4)

size: 2304
size: 2304


In [10]:
rbuf = pfrl.replay_buffers.ReplayBuffer(10 ** 6)

In [11]:
def burnin_action_func():
    """Select random actions until model is updated one or more times."""
    return np.random.uniform(action_space.low, action_space.high).astype(np.float32)

In [12]:
gamma = 0.99
replay_start_size = 10000
gpu = -1
batch_size = 256
entropy_target = -action_size
temperature_optimizer_lr = 3e-4

agent = pfrl.agents.SoftActorCritic(
    policy,
    q_func1,
    q_func2,
    policy_optimizer,
    q_func1_optimizer,
    q_func2_optimizer,
    rbuf,
    gamma=gamma,
    replay_start_size=replay_start_size,
    gpu=gpu,
    minibatch_size=batch_size,
    burnin_action_func=burnin_action_func,
    entropy_target=entropy_target,
    temperature_optimizer_lr=temperature_optimizer_lr,
)

In [None]:
n_episodes = 10
max_episode_len = 500

for i in range(1, n_episodes + 1):
    obs = env.reset(new_room=False, new_agent_pose=True)
    R = 0  # return (sum of rewards)
    t = 0  # time step
    while True:
        # Uncomment to watch the behavior in a GUI window
        # env.render()
        action = agent.act(obs)
        obs, reward, done, _ = env.step(action)
        R += reward
        t += 1
        reset = t == max_episode_len
        agent.observe(obs, reward, done, reset)
        # print(f"action: {action}, reward: {reward}")
        if done or reset:
            break
    if i % 10 == 0:
        print('episode:', i, 'R:', R, '\nstatistics:', agent.get_statistics())

print('Finished.')

In [None]:
rbuf.sample(2)