In [1]:
#@title Run to install MuJoCo and `dm_control`
import distutils.util
import os
import subprocess
if subprocess.run('nvidia-smi').returncode:
  raise RuntimeError(
      'Cannot communicate with GPU. '
      'Make sure you are using a GPU Colab runtime. '
      'Go to the Runtime menu and select Choose runtime type.')

# # Add an ICD config so that glvnd can pick up the Nvidia EGL driver.
# # This is usually installed as part of an Nvidia driver package, but the Colab
# # kernel doesn't install its driver via APT, and as a result the ICD is missing.
# # (https://github.com/NVIDIA/libglvnd/blob/master/src/EGL/icd_enumeration.md)
# NVIDIA_ICD_CONFIG_PATH = '/usr/share/glvnd/egl_vendor.d/10_nvidia.json'
# if not os.path.exists(NVIDIA_ICD_CONFIG_PATH):
#   with open(NVIDIA_ICD_CONFIG_PATH, 'w') as f:
#     f.write("""{
#     "file_format_version" : "1.0.0",
#     "ICD" : {
#         "library_path" : "libEGL_nvidia.so.0"
#     }
# }
# """)

print('Installing dm_control...')
!pip install -q dm_control>=1.0.18

# Configure dm_control to use the EGL rendering backend (requires GPU)
%env MUJOCO_GL=egl

print('Checking that the dm_control installation succeeded...')
try:
  from dm_control import suite
  env = suite.load('cartpole', 'swingup')
  pixels = env.physics.render()
except Exception as e:
  raise e from RuntimeError(
      'Something went wrong during installation. Check the shell output above '
      'for more information.\n'
      'If using a hosted Colab runtime, make sure you enable GPU acceleration '
      'by going to the Runtime menu and selecting "Choose runtime type".')
else:
  del pixels, suite

!echo Installed dm_control $(pip show dm_control | grep -Po "(?<=Version: ).+")

Thu Apr 18 23:40:45 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 545.23.08              Driver Version: 545.23.08    CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A10G                    On  | 00000000:00:1E.0 Off |                    0 |
|  0%   25C    P8              17W / 300W |      2MiB / 23028MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
#@title All `dm_control` imports required for this tutorial

# The basic mujoco wrapper.
from dm_control import mujoco

# Access to enums and MuJoCo library functions.
from dm_control.mujoco.wrapper.mjbindings import enums
from dm_control.mujoco.wrapper.mjbindings import mjlib

# PyMJCF
from dm_control import mjcf

# Composer high level imports
from dm_control import composer
from dm_control.composer.observation import observable
from dm_control.composer import variation

# Imports for Composer tutorial example
from dm_control.composer.variation import distributions
from dm_control.composer.variation import noises
from dm_control.locomotion.arenas import floors

# Control Suite
from dm_control import suite

# Run through corridor example
from dm_control.locomotion.walkers import cmu_humanoid
from dm_control.locomotion.arenas import corridors as corridor_arenas
from dm_control.locomotion.tasks import corridors as corridor_tasks

# Soccer
from dm_control.locomotion import soccer

# Manipulation
from dm_control import manipulation



In [3]:
# import gym
import random
import numpy as np
import torch
import torch.nn as nn
from torch.distributions import Categorical
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
import sys
#from pyvirtualdisplay import Display
#from IPython import display as disp
import copy
from typing import Tuple
%matplotlib inline

In [4]:
# Replay buffer
class ReplayBuffer(object):
	def __init__(self, state_dim, action_dim, max_size=int(1e6)):
		self.max_size = max_size
		self.ptr = 0
		self.size = 0

		self.state = np.zeros((max_size, state_dim))
		self.action = np.zeros((max_size, action_dim))
		self.next_state = np.zeros((max_size, state_dim))
		self.reward = np.zeros((max_size, 1))
		self.not_done = np.zeros((max_size, 1))

		self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


	def add(self, state, action, next_state, reward, done):
		self.state[self.ptr] = state
		self.action[self.ptr] = action
		self.next_state[self.ptr] = next_state
		self.reward[self.ptr] = reward
		self.not_done[self.ptr] = 1. - done

		self.ptr = (self.ptr + 1) % self.max_size
		self.size = min(self.size + 1, self.max_size)


	def sample(self, batch_size):
		ind = np.random.randint(0, self.size, size=batch_size)

		return (
			torch.FloatTensor(self.state[ind]).to(self.device),
			torch.FloatTensor(self.action[ind]).to(self.device),
			torch.FloatTensor(self.next_state[ind]).to(self.device),
			torch.FloatTensor(self.reward[ind]).to(self.device),
			torch.FloatTensor(self.not_done[ind]).to(self.device)
		)

In [18]:
def init_flags():

    flags = {
        "env": "hopper",
        "task": "hop",
        "seed":0, # random seed
        "start_timesteps": 25e3, #total steps of free exploration phase
        "max_timesteps": 8e4, # maximum length of time steps in training
        "expl_noise": 0.1, # noise strength in exploration
        "batch_size": 512,
        "discount":0.99,
        "tau": 0.005, # rate of target update
        #"policy_noise": 0.2, # policy noise when sampling action
        #"noise_clip":0.5, # noise clip rate
        "policy_freq": 2, # delayed policy update frequency in TD3,
        "N": 1, # number of agents,
        "RR": 4, # replay ratio,
        "T": np.inf, # time steps between agent resets ,
        "beta": 50, # action selection coefficient
    }

    return flags

def collect_actions(theta, state):
    actions = []
    for theta_i in theta:
      action, entropy, mean, vari = (theta_i.select_action(np.array(state)))
      actions.append(torch.from_numpy(action))
    return actions

def main(policy_name = 'DDPG') -> list:
    """
    Input:
    policy_name: str, the method to implement
    Output:
    evaluations: list, the reward in every episodes
    Call DDPG/TD3 trainer and
    """
    args = init_flags()
    random_state = np.random.RandomState(args["seed"])
    env = suite.load(args["env"], args["task"], task_kwargs={'random': random_state})
    
    action_spec = env.action_spec()
    action_dim = action_spec.shape[0]
    max_action = action_spec.maximum # be careful that max_action is an array!

    ob_spec = env.observation_spec()
    state_dim = 0
    for item in ob_spec:
        state_dim += ob_spec[item].shape[0]
    
    # env = gym.make(args["env"])
    # env.seed(args["seed"]+100)
    # env.action_spec.seed(args["seed"])
    torch.manual_seed(args["seed"])
    np.random.seed(args["seed"])

    # state_dim = env.observation_space.shape[0]
    # action_dim = env.action_space.shape[0]
    # max_action = float(env.action_space.high[0])
    kwargs = {
        "state_dim": state_dim,
        "action_dim": action_dim,
        "max_action": max_action,
        "discount": args["discount"],
        "tau": args["tau"],}
    if policy_name == "TD3":
        # Target policy smoothing is scaled wrt the action scale
        #kwargs["policy_noise"] = args["policy_noise"] * max_action
        #kwargs["noise_clip"] = args["noise_clip"] * max_action
        kwargs["policy_freq"] = args["policy_freq"]
        theta = [TD3(**kwargs) for _ in range(args["N"])]
        # policy = theta[0]
    elif policy_name == "DDPG":
        policy = DDPG(**kwargs)

    replay_buffer = ReplayBuffer(state_dim, action_dim)
    evaluations = []
    actions_l = []
    entropies = []
    means = []
    varis = []
    # state, done = env.reset(), False
    time_step = env.reset()
    state = []
    for item in time_step.observation:
        for ob in time_step.observation[item]:
            state.append(ob)
    done = False
    episode_reward = 0
    episode_timesteps = 0
    episode_num = 0
    k = 0

    for t in range(int(args["max_timesteps"])):

      episode_timesteps += 1

      # Select action randomly or according to policy
      entropy = 0
      mean = 0
      vari = 0
      if t < args["start_timesteps"]:
        # action = env.action_space.sample()
          action = np.random.uniform(action_spec.minimum,
                             action_spec.maximum,
                             size=action_spec.shape)
      else:
        with torch.no_grad():
        
          actions = collect_actions(theta, state)
          #print(state)
          #print(type(state[0]))
          #print(actions)
          #print(type(actions))
          # compute Q's, then apply softmax
          q_sa = torch.hstack(
              [theta[k].critic.Q1(torch.FloatTensor(state), action.to(torch.float32)) for action in actions]
          )
          # dim
          max_q_sa, _ = torch.max(q_sa, dim=0)
          alpha = args["beta"] / max_q_sa
          p_select = F.softmax(q_sa / alpha)
        
          if (t == args["start_timesteps"]):
            print(p_select)

          # print(actions)
          # print(torch.hstack(actions).numpy())
          # action = np.random.choice(a=torch.hstack(actions).numpy(), p=p_select.numpy())
          idx = np.arange(0, len(actions))
          action_idx = np.random.choice(a=idx, p=p_select.numpy())
          #action_idx = np.atleast_1d(action)
          action = actions[action_idx].numpy()

      # Perform action
      time_step = env.step(action)
      next_state = []
      for item in time_step.observation:
          for ob in time_step.observation[item]:
              next_state.append(ob)
      reward = time_step.reward
      done = time_step.last()
      
      # done_bool = float(done) if episode_timesteps < env._max_episode_steps else 0
      done_bool = float(done) if episode_timesteps < 1000 else 0

      #print("state: ", state)
      #print("action: ", action)
      #print("next_state: ", next_state)
      #print("reward: ", reward)
      #print("done_bool: ", done_bool)
      #print(episode_timesteps)

      # Store data in replay buffer
      replay_buffer.add(state, action, next_state, reward, done_bool)

      state = next_state
      actions_l.append(action)
      entropies.append(entropy)
      means.append(mean)
      varis.append(vari)
      episode_reward += reward

      # Train agent after collecting sufficient data
      if t >= args["start_timesteps"]:
        for j in range(args["RR"]):
          for theta_i in theta:
            theta_i.train(replay_buffer, args["batch_size"])

        #if (t % (args["T"] / args["N"])) == 0:
        #    print(k)
            # reset just actor or both?
        #    theta[k].actor.reset()
        #    theta[k].critic.reset()
        #    k = (k + 1) % args["N"]

      if done:
        # +1 to account for 0 indexing. +0 on ep_timesteps since it will increment +1 even if done=True
        print(f"Total T: {t+1} Episode Num: {episode_num+1} Episode T: {episode_timesteps} Reward: {episode_reward:.3f}")
        evaluations.append(episode_reward)
        # print('a', actions_l)
        # print('e', entropies)
        # print('mu', means)
        # print('var', varis)
        entropies = []
        actions_l = []
        means = []
        varis = []
        # Reset environment
        # state, done = env.reset(), False
        time_step = env.reset()
        state = []
        for item in time_step.observation:
            for ob in time_step.observation[item]:
                state.append(ob)
        done = False
        episode_reward = 0
        episode_timesteps = 0
        episode_num += 1

    return evaluations


In [19]:
# Reference Implementation of Twin Delayed Deep Deterministic Policy Gradients (TD3)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Construct the actor/critic network for TD3
class Actor_TD3(nn.Module):
	def __init__(self, state_dim: int, action_dim: int, max_action: float):
		super(Actor_TD3, self).__init__()
		############################
		# YOUR IMPLEMENTATION HERE #
		self.l1 = nn.Linear(state_dim, 1024)
		self.l2 = nn.Linear(1024, 1024)
		self.l3 = nn.Linear(1024, 2 * action_dim)
		############################
		self.max_action = torch.from_numpy(max_action).to(torch.float32)
		self.action_dim = action_dim

	def forward(self, state: torch.Tensor) -> torch.Tensor:
		############################
		# YOUR IMPLEMENTATION HERE #
		a = F.relu(self.l1(state))
		a = F.relu(self.l2(a))

		# Hint: Use torch.distributions.Normal
		a = self.l3(a)

		mean = self.max_action * torch.tanh(a[:,:self.action_dim])
		cov = nn.functional.softplus(a[:,self.action_dim:]) + 1e-9
        
		return torch.distributions.MultivariateNormal(mean, scale_tril=torch.diag_embed(cov))
    ############################

	def reset(self):
		for layer in self.children():
			if hasattr(layer, "reset_parameters"):
				layer.reset_parameters()


class Critic_TD3(nn.Module):
	def __init__(self, state_dim : int, action_dim: int):
		super(Critic_TD3, self).__init__()

		# Q1 architecture
		############################
		# YOUR IMPLEMENTATION HERE #
		self.l1 = nn.Linear(state_dim + action_dim, 256)
		self.l2 = nn.Linear(256, 256)
		self.l3 = nn.Linear(256, 1)

		# Please implement Q2 below
		############################
		# YOUR IMPLEMENTATION HERE #
		self.l4 = nn.Linear(state_dim + action_dim, 256)
		self.l5 = nn.Linear(256, 256)
		self.l6 = nn.Linear(256, 1)
		############################

	def forward(self, state: torch.Tensor, action: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
		sa = torch.cat([state, action], 1)
		############################
		# YOUR IMPLEMENTATION HERE #
		q1 = F.relu(self.l1(sa))
		q1 = F.relu(self.l2(q1))
		q1 = self.l3(q1)

		q2 = F.relu(self.l4(sa))
		q2 = F.relu(self.l5(q2))
		q2 = self.l6(q2)
		############################
		return q1, q2


	def Q1(self, state: torch.Tensor, action: torch.Tensor) -> torch.Tensor:
		# print(self.l1.weight.dtype)
		# [HINT] only returns q1 for actor update
		############################
		# YOUR IMPLEMENTATION HERE #
		sa = torch.cat([state, action], -1)

		q1 = F.relu(self.l1(sa))
		q1 = F.relu(self.l2(q1))
		q1 = self.l3(q1)
	  ############################
		return q1

	def reset(self):
		for layer in self.children():
			if hasattr(layer, "reset_parameters"):
				layer.reset_parameters()


In [20]:
class TD3(object):
	def __init__(
		self,
		state_dim: int,
		action_dim: int,
		max_action: float,
		discount=0.99,
		tau=0.005,
		#policy_noise=0.2,
		#noise_clip=0.5,
		policy_freq=2,
		temperature=0.01
	):

		self.actor = Actor_TD3(state_dim, action_dim, max_action).to(device)
		self.actor_target = copy.deepcopy(self.actor)
		self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=3e-4)

		self.critic = Critic_TD3(state_dim, action_dim).to(device)
		self.critic_target = copy.deepcopy(self.critic)
		self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=3e-4)

		self.max_action = torch.from_numpy(max_action)
		self.discount = discount
		self.tau = tau
		#self.policy_noise = policy_noise
		#self.noise_clip = noise_clip
		self.policy_freq = policy_freq

		self.total_it = 0
		self.temperature = temperature


	def select_action(self, state: torch.Tensor) -> torch.Tensor:
		state = torch.FloatTensor(state.reshape(1, -1)).to(device)

		# mean, std = self.actor(state)
		# actor_dist = torch.distributions.Normal(mean, std)
        # make std into cov instead?
		actor_dist = self.actor(state)
    
		selected_action = actor_dist.rsample().clamp(-self.max_action,
		                                             self.max_action)
		# print([actor_dist.batch_shape, actor_dist.event_shape])
		entropy = actor_dist.entropy()
		# vari = torch.square(std)
		return selected_action.data.numpy().flatten(), entropy.data.numpy(), actor_dist.loc.data.numpy(), actor_dist.covariance_matrix.data.numpy()


	def train(self, replay_buffer, batch_size=256):
		self.total_it += 1

		# Sample replay buffer
		state, action, next_state, reward, not_done = replay_buffer.sample(batch_size)

		with torch.no_grad():
			# Select action according to the policy,
      ############################
      # YOUR IMPLEMENTATION HERE #
			#target_mean, target_std = self.actor_target(
			#		 next_state)
			target_actor_dist = self.actor_target(next_state)
			#print("actor_dist dimensions", [target_actor_dist.batch_shape, target_actor_dist.event_shape])
			#target_actor_dist = torch.distributions.Normal(
			#		 target_mean, target_std)
			next_action = target_actor_dist.rsample().clamp(
					 -self.max_action, self.max_action)
			target_entropy = target_actor_dist.entropy().unsqueeze(-1)
			#print("entropy dimensions", target_entropy.size())
      ############################
			# Compute the target Q value
			target_Q1, target_Q2 = self.critic_target(
					 next_state, next_action.to(torch.float32))

      ############################
      # YOUR IMPLEMENTATION HERE #
			# 1.Calculate the min of two target Q-functions
			# 2. Calculate the TD target
			target_Q = torch.min(target_Q1, target_Q2)#.squeeze(-1)
			#print("reward dim: ", reward.size())
			target_Q = not_done * self.discount * (target_Q + (self.temperature * target_entropy))

            #target_Q = reward + not_done * self.discount * (
			#		 target_Q + (self.temperature * target_entropy))
			#print("second target Q dim: ", target_Q.size())
			target_Q = target_Q.detach()
      ############################
		# Get current Q estimates
		current_Q1, current_Q2 = self.critic(state, action.to(torch.float32))

		# Compute critic loss
		critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

		# Optimize the critic
		self.critic_optimizer.zero_grad()
		critic_loss.backward()
		self.critic_optimizer.step()

		# Delayed policy updates
		if self.total_it % self.policy_freq == 0:

			# Compute actor loss
      ############################
      # YOUR IMPLEMENTATION HERE #
			#mean, std = self.actor(state)
			#actor_dist = torch.distributions.Normal(
			#		 mean, std)
			actor_dist = self.actor(state)
			selected_action = actor_dist.rsample().clamp(
					 -self.max_action, self.max_action)

			actor_loss = -(self.critic.Q1(state, selected_action.to(torch.float32)) -
			 (self.temperature * actor_dist.entropy())).mean()
      ############################

			# Optimize the actor
			self.actor_optimizer.zero_grad()
			actor_loss.backward()
			self.actor_optimizer.step()

			# Update the frozen target models
      ############################
      # YOUR IMPLEMENTATION HERE #
			for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
				new_target_params = self.tau * param.data + (1 - self.tau) * target_param.data
				target_param.data.copy_(new_target_params)

			for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
				new_target_params = self.tau * param.data + (1 - self.tau) * target_param.data
				target_param.data.copy_(new_target_params)
      ############################


In [None]:
evaluation_td3 = main(policy_name = 'TD3')

Total T: 1000 Episode Num: 1 Episode T: 1000 Reward: 1.049
Total T: 2000 Episode Num: 2 Episode T: 1000 Reward: 0.000
Total T: 3000 Episode Num: 3 Episode T: 1000 Reward: 0.000
Total T: 4000 Episode Num: 4 Episode T: 1000 Reward: 0.000
Total T: 5000 Episode Num: 5 Episode T: 1000 Reward: 0.000
Total T: 6000 Episode Num: 6 Episode T: 1000 Reward: 0.000
Total T: 7000 Episode Num: 7 Episode T: 1000 Reward: 0.000
Total T: 8000 Episode Num: 8 Episode T: 1000 Reward: 0.000
Total T: 9000 Episode Num: 9 Episode T: 1000 Reward: 0.000
Total T: 10000 Episode Num: 10 Episode T: 1000 Reward: 0.000
Total T: 11000 Episode Num: 11 Episode T: 1000 Reward: 0.333
Total T: 12000 Episode Num: 12 Episode T: 1000 Reward: 0.000
Total T: 13000 Episode Num: 13 Episode T: 1000 Reward: 0.000
Total T: 14000 Episode Num: 14 Episode T: 1000 Reward: 0.000
Total T: 15000 Episode Num: 15 Episode T: 1000 Reward: 0.000
Total T: 16000 Episode Num: 16 Episode T: 1000 Reward: 0.000
Total T: 17000 Episode Num: 17 Episode T: 

  p_select = F.softmax(q_sa / alpha)


tensor([1.])
Total T: 26000 Episode Num: 26 Episode T: 1000 Reward: 0.000
Total T: 27000 Episode Num: 27 Episode T: 1000 Reward: 0.000
Total T: 28000 Episode Num: 28 Episode T: 1000 Reward: 0.013
Total T: 29000 Episode Num: 29 Episode T: 1000 Reward: 0.000
Total T: 30000 Episode Num: 30 Episode T: 1000 Reward: 0.000
Total T: 31000 Episode Num: 31 Episode T: 1000 Reward: 0.000
Total T: 32000 Episode Num: 32 Episode T: 1000 Reward: 0.000
Total T: 33000 Episode Num: 33 Episode T: 1000 Reward: 0.000
Total T: 34000 Episode Num: 34 Episode T: 1000 Reward: 0.000
Total T: 35000 Episode Num: 35 Episode T: 1000 Reward: 0.000
Total T: 36000 Episode Num: 36 Episode T: 1000 Reward: 0.000
Total T: 37000 Episode Num: 37 Episode T: 1000 Reward: 0.000
Total T: 39000 Episode Num: 39 Episode T: 1000 Reward: 0.000


In [261]:
tensor_array = [torch.tensor([-0.0369, -0.4338,  0.3530, -0.5791], dtype=torch.float64), torch.tensor([-0.1975, -0.5541, -0.7902, -0.8466], dtype=torch.float64)]
np.array([np.array(tensor) for tensor in tensor_array])

array([[-0.0369, -0.4338,  0.353 , -0.5791],
       [-0.1975, -0.5541, -0.7902, -0.8466]])

In [268]:
tensor_array[0].numpy()

array([-0.0369, -0.4338,  0.353 , -0.5791])

In [266]:
np.arange(0, len(tensor_array))

array([0, 1])

In [76]:
# Load the environment
random_state = np.random.RandomState(42)
env = suite.load('hopper', 'run', task_kwargs={'random': random_state})

action_spec = env.action_spec()

action_spec.shape[0]

6

In [78]:
action_spec.minimum

array([-1., -1., -1., -1., -1., -1.])

In [94]:
action_spec.maximum

array([1., 1., 1., 1., 1., 1.])

In [73]:
env.action_spec()

BoundedArray(shape=(1,), dtype=dtype('float64'), name=None, minimum=[-1.], maximum=[1.])

In [87]:
ob_spec = env.observation_spec()
ob_spec

OrderedDict([('position',
              Array(shape=(8,), dtype=dtype('float64'), name='position')),
             ('velocity',
              Array(shape=(9,), dtype=dtype('float64'), name='velocity'))])

In [92]:
state_dim = 0
for item in ob_spec:
    state_dim += ob_spec[item].shape[0]

state_dim

17

In [103]:
env.step(action_spec.minimum)

TimeStep(step_type=<StepType.FIRST: 0>, reward=None, discount=None, observation=OrderedDict({'position': array([-0.08817184,  0.02524552, -0.08670794, -0.03770115, -0.11239976,
        0.00788536,  0.00912473,  0.02224182]), 'velocity': array([ 0.06690488,  0.0184274 , -0.02983502,  0.09082664,  0.04598512,
        0.12044083,  0.00073289,  0.114146  ,  0.16024414])}))

In [309]:
action = np.random.uniform(action_spec.minimum,
                             action_spec.maximum,
                             size=action_spec.shape)

In [310]:
action

array([-0.97913267, -0.26563407,  0.15866419, -0.7340004 , -0.41211752,
        0.30334101])

In [118]:
time_step = env.reset()

state = []
for item in time_step.observation:
    for ob in time_step.observation[item]:
        state.append(ob)

In [124]:
store = {'A':np.array([1,2,3]), 'B':np.array([3,4,5])}
#np.concatenate(store.values(),1)
#array([1, 2, 3, 3, 4, 5])
store

{'A': array([1, 2, 3]), 'B': array([3, 4, 5])}

In [123]:
time_step.observation

OrderedDict([('position',
              array([-0.0922507 ,  0.0304178 , -0.07834967, -0.02811268, -0.10947232,
                      0.00772878, -0.01447208, -0.00515856])),
             ('velocity',
              array([ 0.10420806,  0.03522159, -0.05837741,  0.13675513,  0.06064862,
                      0.19672703,  0.00130005,  0.21453296,  0.28563518]))])

In [122]:
time_step.observation.values()

odict_values([array([-0.0922507 ,  0.0304178 , -0.07834967, -0.02811268, -0.10947232,
        0.00772878, -0.01447208, -0.00515856]), array([ 0.10420806,  0.03522159, -0.05837741,  0.13675513,  0.06064862,
        0.19672703,  0.00130005,  0.21453296,  0.28563518])])

In [121]:
np.concatenate(time_step.observation.values(),1)

TypeError: The first input argument needs to be a sequence

In [119]:
state

[-0.09225069689751514,
 0.0304178012980982,
 -0.07834967408281786,
 -0.02811267527586962,
 -0.1094723188683084,
 0.007728775876733688,
 -0.014472077073197916,
 -0.0051585564419206215,
 0.10420805524039606,
 0.03522158744236225,
 -0.05837741312698571,
 0.1367551312020601,
 0.060648616548222206,
 0.19672703251895046,
 0.001300048676622605,
 0.21453295703069902,
 0.2856351792783388]

In [141]:
int(time_step.last())

0

In [128]:
time_step.reward

0.011893157750922256

In [129]:
time_step.observation

OrderedDict([('position',
              array([-0.09247594,  0.02837665, -0.08260535, -0.01423998, -0.10560009,
                      0.00785399, -0.01488494,  0.00677807])),
             ('velocity',
              array([ 0.09841373, -0.0225239 , -0.20411492, -0.42556769,  1.38726904,
                      0.38722287,  0.0125212 , -0.04128608,  1.19366227]))])

In [125]:
time_step = env.step(action)

In [126]:
time_step

TimeStep(step_type=<StepType.MID: 1>, reward=0.011893157750922256, discount=1.0, observation=OrderedDict({'position': array([-0.09247594,  0.02837665, -0.08260535, -0.01423998, -0.10560009,
        0.00785399, -0.01488494,  0.00677807]), 'velocity': array([ 0.09841373, -0.0225239 , -0.20411492, -0.42556769,  1.38726904,
        0.38722287,  0.0125212 , -0.04128608,  1.19366227])}))

In [None]:
next_state, reward, done, _ = env.step(action)