### `Soft Actor-Critic` using `Snapbot`

In [1]:
import sys,mujoco
import numpy as np
import matplotlib.pyplot as plt
sys.path.append('../package/helper/')
sys.path.append('../package/mujoco_usage/')
sys.path.append('../package/gym/')
sys.path.append('../package/rl/')
from mujoco_parser import *
from slider import *
from utility import *
from snapbot_env import *
from sac import *
np.set_printoptions(precision=2,suppress=True,linewidth=100)
plt.rc('xtick',labelsize=6); plt.rc('ytick',labelsize=6)
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
print ("Ready.")

Ready.


#### Parse `Snapbot` gym

In [2]:
xml_path = '../asset/snapbot/scene_snapbot.xml'
env = MuJoCoParserClass(name='Snapbot',rel_xml_path=xml_path,verbose=True)
gym = SnapbotGymClass(
    env = env,
    HZ  = 50,
    history_total_sec = 0.2,
    history_intv_sec  = 0.1,
    VERBOSE =True,
)
print(env.sensor_names)
print("Ready.")

name:[Snapbot] dt:[0.002] HZ:[500]
n_qpos:[25] n_qvel:[24] n_qacc:[24] n_ctrl:[8]

n_body:[24]
 [0/24] [world] mass:[0.00]kg
 [1/24] [torso] mass:[0.24]kg
 [2/24] [Camera_module_1] mass:[0.06]kg
 [3/24] [Camera_module_2] mass:[0.06]kg
 [4/24] [Leg_module_1_1] mass:[0.06]kg
 [5/24] [Leg_module_1_2] mass:[0.08]kg
 [6/24] [Leg_module_1_3] mass:[0.02]kg
 [7/24] [Leg_module_1_4] mass:[0.01]kg
 [8/24] [Leg_module_1_4bar] mass:[0.01]kg
 [9/24] [Leg_module_2_1] mass:[0.06]kg
 [10/24] [Leg_module_2_2] mass:[0.08]kg
 [11/24] [Leg_module_2_3] mass:[0.02]kg
 [12/24] [Leg_module_2_4] mass:[0.01]kg
 [13/24] [Leg_module_2_4bar] mass:[0.01]kg
 [14/24] [Leg_module_4_1] mass:[0.06]kg
 [15/24] [Leg_module_4_2] mass:[0.08]kg
 [16/24] [Leg_module_4_3] mass:[0.02]kg
 [17/24] [Leg_module_4_4] mass:[0.01]kg
 [18/24] [Leg_module_4_4bar] mass:[0.01]kg
 [19/24] [Leg_module_5_1] mass:[0.06]kg
 [20/24] [Leg_module_5_2] mass:[0.08]kg
 [21/24] [Leg_module_5_3] mass:[0.02]kg
 [22/24] [Leg_module_5_4] mass:[0.01]kg
 [

In [3]:
import pandas as pd

# 1) the array of qpos addresses for each joint
qposadr     = env.model.jnt_qposadr        # shape (n_jnt,)
joint_names = env.joint_names              # shape (n_jnt,)
ctrl_qpos   = env.ctrl_qpos_idxs           # e.g. [3,4,7,8,11,12,15,16]

rows = []
for slot, q_i in enumerate(ctrl_qpos):
    # find the joint whose qpos address is exactly q_i
    # (for hinge joints it's a 1-to-1 match)
    jidx = next(j for j, adr in enumerate(qposadr) if adr == q_i)
    rows.append({
      'slot':       slot,
      'qpos_idx':   q_i,
      'joint_idx':  jidx,
      'joint_name': joint_names[jidx]
    })

df = pd.DataFrame(rows, columns=['slot','qpos_idx','joint_idx','joint_name'])
print(df)
# hip = leg_*_2 → endswith '_2'
hip_slots  = [r['slot'] for r in rows if r['joint_name'].endswith('_2')]
# knee = leg_*_3 → endswith '_3'
knee_slots = [r['slot'] for r in rows if r['joint_name'].endswith('_3')]

print("hip_slots:", hip_slots)     # → [0,2,4,6]
print("knee_slots:", knee_slots)   # → [1,3,5,7]

   slot  qpos_idx  joint_idx joint_name
0     0         9          3    leg_1_2
1     1        10          4    leg_1_3
2     2        13          7    leg_2_2
3     3        14          8    leg_2_3
4     4        17         11    leg_4_2
5     5        18         12    leg_4_3
6     6        21         15    leg_5_2
7     7        22         16    leg_5_3
hip_slots: [0, 2, 4, 6]
knee_slots: [1, 3, 5, 7]


In [4]:
import pandas as pd

# 1) qpos address → joint index → joint name 은 이미 뽑아둔 rows
# rows = [
#   {'slot':0, 'qpos_idx':9,  'joint_idx':3,  'joint_name':'leg_1_2'},
#   ...
# ]

# 2) mujoco model 에서 joint 범위 꺼내기
#    env.model.jnt_range.shape == (n_jnt, 2)
jnt_range = env.model.jnt_range  # 각 joint_idx 마다 [lower, upper]

# 3) DataFrame 생성 및 range 컬럼 추가
df = pd.DataFrame(rows, columns=['slot','qpos_idx','joint_idx','joint_name'])
df['range_low'], df['range_high'] = zip(*[ jnt_range[r['joint_idx']] for r in rows ])

print(df)

   slot  qpos_idx  joint_idx joint_name  range_low  range_high
0     0         9          3    leg_1_2       -0.9         0.9
1     1        10          4    leg_1_3       -0.7         0.7
2     2        13          7    leg_2_2       -0.9         0.9
3     3        14          8    leg_2_3       -0.7         0.7
4     4        17         11    leg_4_2       -0.9         0.9
5     5        18         12    leg_4_3       -0.7         0.7
6     6        21         15    leg_5_2       -0.9         0.9
7     7        22         16    leg_5_3       -0.7         0.7


#### `SAC` configuration

In [None]:
n_episode         = 1000 # number of total episodes (rollouts)
max_epi_sec       = 5.0 # maximum episode length in second (IMPORTANT)
max_epi_tick      = int(max_epi_sec*gym.HZ) # maximum episode length in tick
n_warmup_epi      = 40 # number of warm-up episodes
buffer_limit      = 50000 # 50000
buffer_warmup     = buffer_limit // 5
init_alpha        = 1
max_torque        = 4
# Update
lr_actor          = 0.0001 # 0.0002
lr_alpha          = 0.00001 # 0.0003
lr_critic         = 0.0003
n_update_per_tick = 1 # number of updates per tick
batch_size        = 256
gamma             = 0.99
tau               = 0.01
# Debug
print_every       = 50
eval_every        = 50
save_every        = 50
RENDER_EVAL       = False # False
print ("n_episode:[%d], max_epi_sec:[%.2f], max_epi_tick:[%d]"%
       (n_episode,max_epi_sec,max_epi_tick))
print ("n_warmup_epi:[%d], buffer_limit:[%.d], buffer_warmup:[%d]"%
       (n_warmup_epi,buffer_limit,buffer_warmup))

n_episode:[1000], max_epi_sec:[5.00], max_epi_tick:[250]
n_warmup_epi:[40], buffer_limit:[50000], buffer_warmup:[10000]


#### Initialize networks

In [6]:
device = 'cpu' # cpu / mps / cuda
replay_buffer = ReplayBufferClass(buffer_limit, device=device)
actor_arg = {'obs_dim':gym.o_dim,'h_dims':[256,256],'out_dim':gym.a_dim,
             'max_out':max_torque,'init_alpha':init_alpha,'lr_actor':lr_actor,
             'lr_alpha':lr_alpha,'device':device}
critic_arg = {'obs_dim':gym.o_dim,'a_dim':gym.a_dim,'h_dims':[256,256],'out_dim':1,
              'lr_critic':lr_critic,'device':device}
actor           = ActorClass(**actor_arg).to(device)
critic_one      = CriticClass(**critic_arg).to(device)
critic_two      = CriticClass(**critic_arg).to(device)
critic_one_trgt = CriticClass(**critic_arg).to(device)
critic_two_trgt = CriticClass(**critic_arg).to(device)
print ("Ready.")

Ready.


In [7]:
# Modify floor friction priority
env.model.geom('floor').priority = 1 # 0=>1
print ("Floor priority:%s"%(env.model.geom('floor').priority))
gym.env.ctrl_ranges[:,0] = -max_torque
gym.env.ctrl_ranges[:,1] = +max_torque
print ("gym.env.ctrl_ranges:\n",gym.env.ctrl_ranges)

Floor priority:[1]
gym.env.ctrl_ranges:
 [[-4.  4.]
 [-4.  4.]
 [-4.  4.]
 [-4.  4.]
 [-4.  4.]
 [-4.  4.]
 [-4.  4.]
 [-4.  4.]]


#### Train using `SAC`

In [None]:
REMOVE_PREV_FILES = False # remove previous files

# Loop
np.random.seed(seed=0) # fix seed
print ("Start training.")
for epi_idx in range(n_episode+1): # for each episode
    zero_to_one = epi_idx/n_episode
    one_to_zero = 1-zero_to_one
    # Reset gym
    s = gym.reset()
    # Loop
    USE_RANDOM_POLICY = (np.random.rand()<(0.1*one_to_zero)) or (epi_idx < n_warmup_epi)
    reward_total,r_height, r_forward = 0.0,0.0, 0.0
    h_peak, dist = 0.0, 0.0 
    air_tick = 0
    takeoff_n = 0
    for tick in range(max_epi_tick): # for each tick in an episode
        if USE_RANDOM_POLICY:
            a_np = gym.sample_action()
        else:
            a,log_prob = actor(np2torch(s,device=device))
            a_np = torch2np(a)
        # Step
        s_prime,reward,done,info = gym.step(a_np,max_time=max_epi_sec)
        h_peak = max(h_peak, info['h_cur'])
        x_diff = info['x_diff']
        replay_buffer.put((s,a_np,reward,s_prime,done))
        reward_total += reward 
        r_height += info['r_height']
        r_forward += info['r_forward']

        if done is True: break # terminate condition
        
        # Replay buffer
        if replay_buffer.size() > buffer_warmup:
             for _ in range(n_update_per_tick): 
                mini_batch = replay_buffer.sample(batch_size)
                # Update critics
                td_target = get_target(
                    actor,
                    critic_one_trgt,
                    critic_two_trgt,
                    gamma      = gamma,
                    mini_batch = mini_batch,
                    device     = device,
                )
                critic_one.train(td_target,mini_batch)
                critic_two.train(td_target,mini_batch)
                # Update actor
                actor.train(
                    critic_one,
                    critic_two,
                    target_entropy = -gym.a_dim,
                    mini_batch     = mini_batch,
                )
                # Soft update of critics
                critic_one.soft_update(tau=tau,net_target=critic_one_trgt)
                critic_two.soft_update(tau=tau,net_target=critic_two_trgt)
    
    # Print
    if (epi_idx%print_every)==0:
        print ("[%d/%d][%.1f%%]"%(epi_idx,n_episode,100.0*(epi_idx/n_episode)))
        print ("  reward:[%.3f] r_h:[%.3f] r_f:[%.3f] max_h:[%.3f] x_diff:[%.3f] epi_len:[%d/%d] buffer_size:[%d] alpha:[%.2f]"%
               (reward_total, r_height, r_forward, h_peak, x_diff,tick,max_epi_tick,
                replay_buffer.size(),actor.log_alpha.exp()))
    
    # Evaluation
    if (epi_idx%eval_every)==0:
        if RENDER_EVAL: gym.init_viewer()
        s = gym.reset()
        reward_total = 0.0
        r_height_eval, r_forward_eval = 0.0, 0.0
        h_max_eval = 0.0
        dist = 0.0
        x_max_eval = 0.0
        air_tick = 0
        for tick in range(max_epi_tick):
            a,_ = actor(np2torch(s,device=device),SAMPLE_ACTION=False)
            s_prime,reward,done,info = gym.step(torch2np(a),max_time=max_epi_sec)
            x_diff = info['x_diff']
            h_max_eval = max(h_max_eval, info['h_cur'])
            reward_total += reward
            r_height_eval += info['r_height']
            r_forward_eval += info['r_forward']

            if RENDER_EVAL and ((tick%5) == 0):
                gym.render(
                    TRACK_TORSO      = True,
                    PLOT_WORLD_COORD = True,
                    PLOT_TORSO_COORD = True,
                    PLOT_SENSOR      = True,
                    PLOT_CONTACT     = True,
                    PLOT_TIME        = True,
                )
            s = s_prime
            if RENDER_EVAL:
                if not gym.is_viewer_alive(): break
        if RENDER_EVAL: gym.close_viewer()
        print ("  [Eval] reward:[%.3f] r_h:[%.3f] r_f:[%.3f] h_max:[%.3f] x_diff:[%.3f] epi_len:[%d/%d]"%
               (reward_total, r_height_eval, r_forward_eval, h_max_eval, x_diff, tick,max_epi_tick))

    # Save network
    if (epi_idx%save_every)==0:
        pth_path = './result/weights/sac_%s/episode_%d.pth'%(gym.name.lower(),epi_idx)
        dir_path = os.path.dirname(pth_path)
        if not os.path.exists(dir_path): os.makedirs(dir_path)
        if (epi_idx == 0) and REMOVE_PREV_FILES: # remove all existing files
            files = os.listdir(path=dir_path)
            print ("  [Save] Remove existing [%d] pth files."%(len(files)))
            for file in files: os.remove(os.path.join(dir_path,file))
        torch.save(actor.state_dict(),pth_path)
        print ("  [Save] [%s] saved."%(pth_path))

print ("Done.")

Start training.
[0/1000][0.0%]
  reward:[-117.445] r_h:[135.070] r_f:[0.000] max_h:[0.153] x_diff:[0.000] epi_len:[198/250] buffer_size:[398] alpha:[1.00]
  [Eval] reward:[-95.512] r_h:[0.000] r_f:[0.000] h_max:[0.098] x_diff:[-0.000] epi_len:[249/250]
  [Save] [./result/weights/sac_snapbot/episode_0.pth] saved.
[50/1000][5.0%]
  reward:[-130.212] r_h:[50.889] r_f:[4.221] max_h:[0.131] x_diff:[-0.000] epi_len:[249/250] buffer_size:[11155] alpha:[0.39]
  [Eval] reward:[-151.385] r_h:[0.000] r_f:[0.000] h_max:[0.098] x_diff:[-0.000] epi_len:[249/250]
  [Save] [./result/weights/sac_snapbot/episode_50.pth] saved.
[100/1000][10.0%]
  reward:[-141.211] r_h:[0.000] r_f:[0.000] max_h:[0.098] x_diff:[0.004] epi_len:[249/250] buffer_size:[23457] alpha:[0.00]
  [Eval] reward:[-171.649] r_h:[0.000] r_f:[0.000] h_max:[0.098] x_diff:[0.000] epi_len:[249/250]
  [Save] [./result/weights/sac_snapbot/episode_100.pth] saved.
