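## !export VK_ICD_FILENAMES=/usr/share/vulkan/icd.d/nvidia_icd.json
## NOTE: `!export` only changes a throwaway subshell, not the notebook kernel; run the line below in the shell that launches Jupyter instead.
## export LD_LIBRARY_PATH=/home/t1tl/miniconda3/envs/rlgpu/lib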

In [1]:
# Turn off jedi-based completion in the IPython completer for this kernel.
%config Completer.use_jedi = False

In [1]:
import gym
from isaacgym import gymapi
from isaacgym import gymtorch
import numpy as np
import time
import random
class BasicEnv(gym.Env):
    """Gym-style wrapper around one Isaac Gym actor driven through two DOFs.

    Observation: the two DOF positions, rescaled to [0, 1] with entries 1 and
    2 of each DOF property record and rounded to two decimals.
    Action: one of four discrete moves, each shifting one of the two position
    targets by 0.2 (in normalised units).
    Reward: the change in the y coordinate of rigid body 0 over one step
    (zeroed when below 0.02 in magnitude, otherwise scaled by 10), minus a
    0.1 step cost, plus 30 once y exceeds 2.
    Episodes end when y exceeds 2 or after more than 100 steps.
    """

    def __init__(self, normalize,
                 asset_root="/home/t1tl/Documents/UniGalway/courses/thesis/",
                 asset_file="URDFs/krawler3.urdf"):
        """Create the simulation, ground plane, actor and viewer.

        Args:
            normalize: accepted for backward compatibility; currently unused.
            asset_root: directory that ``asset_file`` is resolved against.
            asset_file: URDF of the actor, relative to ``asset_root``.
        """
        self.action_space = gym.spaces.Discrete(4)
        # get_state() returns both components clipped to [0, 1], so the bounds
        # must be low=(0, 0) and high=(1, 1); the previous low=[0,1]/high=[0,1]
        # declared the degenerate ranges [0,0] and [1,1].
        self.observation_space = gym.spaces.Box(
            low=np.zeros(2, dtype=np.float32),
            high=np.ones(2, dtype=np.float32),
            shape=(2,))

        # Number of bins per DOF used by val_to_bin / bin_to_val.
        self.obs_spc = [3,3]
        gymI = gymapi.acquire_gym()

        sim_params = gymapi.SimParams()

        # set common parameters
        sim_params.dt = 1 / 60
        sim_params.substeps = 2
        sim_params.up_axis = gymapi.UP_AXIS_Z
        sim_params.gravity = gymapi.Vec3(0.0, 0.0, -9.8)

        # set PhysX-specific parameters
        sim_params.physx.use_gpu = True
        sim_params.physx.solver_type = 1
        sim_params.physx.num_position_iterations = 3
        sim_params.physx.num_velocity_iterations = 1
        sim_params.physx.contact_offset = 0.01
        sim_params.physx.rest_offset = 0.0

        # set Flex-specific parameters (the sim below is created with SIM_PHYSX)
        sim_params.flex.solver_type = 5
        sim_params.flex.num_outer_iterations = 4
        sim_params.flex.num_inner_iterations = 20
        sim_params.flex.relaxation = 0.8
        sim_params.flex.warm_start = 0.5

        # create sim with these parameters
        sim = gymI.create_sim(0, 0, gymapi.SIM_PHYSX, sim_params)

        plane_params = gymapi.PlaneParams()
        plane_params.normal = gymapi.Vec3(0, 0, 1) # z-up!
        plane_params.distance = 0
        plane_params.static_friction = 1
        plane_params.dynamic_friction = 1
        plane_params.restitution = 0

        # create the ground plane
        gymI.add_ground(sim, plane_params)

        envs_per_row = 8
        env_spacing = 4.0
        env_lower = gymapi.Vec3(-env_spacing, 0.0, -env_spacing)
        env_upper = gymapi.Vec3(env_spacing, env_spacing, env_spacing)

        self.vel_app = 1
        self.effort_app=30

        env = gymI.create_env(sim, env_lower, env_upper, envs_per_row)

        asset = gymI.load_asset(sim, asset_root, asset_file)

        pose = gymapi.Transform()
        pose.p = gymapi.Vec3(0.0, 0, 0.6)

        actor_handle = gymI.create_actor(env, asset, pose, "MyActor", 0, 1)

        # Zero friction on the first two rigid shapes, high friction on the last.
        rh = gymI.get_actor_rigid_shape_properties(env,actor_handle)
        rh[0].friction=0
        rh[1].friction=0
        rh[-1].friction=10
        gymI.set_actor_rigid_shape_properties(env,actor_handle,rh)

        cam_props = gymapi.CameraProperties()
        self.viewer = gymI.create_viewer(sim, cam_props)

        # Position-controlled DOFs.  Damping ends up at 0.0: an earlier
        # fill(200.0) was overwritten before being applied and is dropped here.
        props = gymI.get_actor_dof_properties(env, actor_handle)
        props["driveMode"].fill(gymapi.DOF_MODE_POS)
        props["effort"].fill(2000.0)
        props["stiffness"].fill(3000.0)
        props["friction"].fill(100.0)
        props["damping"].fill(0.0)
        gymI.set_actor_dof_properties(env, actor_handle, props)

        # cache handles for later use
        self.gymI = gymI
        self.env = env
        self.actor_handle = actor_handle
        self.sim=sim
        # NOTE(review): entries [1] and [2] of each DOF record are used as the
        # (min, max) of the joint range throughout this class - presumably the
        # lower/upper limits; confirm against the Isaac Gym DOF property layout.
        self.props = gymI.get_actor_dof_properties(env,actor_handle)

        self.targets = np.array([self.val_to_bin(0,0),self.val_to_bin(0,1)])
        self.state = self.get_state()
        t = gymI.get_rigid_transform(env,0)
        self.last_pos = t.p.y
        self.step_count = 0
        self.tmr = time.time()
        self.draw = False
        self.eprew = 0

        # Snapshot of the initial rigid-body states; reset() restores self.bs.
        body_states = self.gymI.get_env_rigid_body_states(self.env, gymapi.STATE_ALL)
        self.rot = body_states["pose"]["r"].copy()
        self.pos = body_states["pose"]["p"].copy()
        self.bs = body_states.copy()
        self.verbose=0

    def step(self, action):
        """Apply ``action``, advance the simulation and score the move.

        Returns:
            ``(observation, reward, done, info)`` where ``info`` is always an
            empty dict.
        """
        info = {}
        done=False
        self.old_state=self.get_state()
        self.targets=self.state
        self.apply_action(action)

        self.simulate()
        t = self.gymI.get_rigid_transform(self.env,0)
        curr_pos = t.p.y
        # Progress along y since the previous step.
        reward = curr_pos - self.last_pos

        # Dead-band small displacements, amplify the rest.
        if abs(reward)<0.02:
            reward=0
        else:
            reward*=10
        self.step_count+=1
        if self.step_count>100:
            done=True
        self.last_pos=curr_pos
        if self.last_pos >2:
            print("done")
            reward +=30
            done=True
        # Constant per-step cost.
        reward -=0.1
        self.state=self.get_state()
        s = self.state
        self.eprew+=reward
        # Only report steps whose reward is more than the bare step cost.
        if reward!=-0.1 and self.verbose>0:
            print(round(reward,2),"|",end="")
        return s, reward , done , info

    def reset(self):
        """Restore the initial DOF and rigid-body states and return the observation.

        Also prints ``|<seconds since last reset>|<accumulated episode reward>``
        and clears both counters.
        """
        self.gymI.set_actor_dof_states(self.env,self.actor_handle,[0,0],gymapi.STATE_POS)
        self.gymI.set_actor_dof_position_targets(self.env, self.actor_handle, [0,0])
        self.targets=[0,1]

        self.step_count=0
        print("|%.2f|%.4f"%(time.time()-self.tmr,self.eprew))
        self.tmr=time.time()
        self.eprew=0
        self.gymI.set_env_rigid_body_states(self.env, self.bs, gymapi.STATE_ALL)
        t = self.gymI.get_rigid_transform(self.env,0)
        self.last_pos = t.p.y

        return self.get_state()

    def close (self):
        """Destroy the viewer and then the simulation."""
        self.gymI.destroy_viewer(self.viewer)
        self.gymI.destroy_sim(self.sim)

    def get_state(self):
        """Return the two DOF positions normalised to [0, 1], rounded to 2 decimals."""
        positions = [self.gymI.get_dof_position(self.env,0),
                     self.gymI.get_dof_position(self.env,1)]
        normalised = np.array(
            [(positions[i]-self.props[i][1])/(self.props[i][2]-self.props[i][1])
             for i in range(2)])
        return np.round(np.clip(normalised,0,1),decimals=2)

    def apply_action(self,a):
        """Shift one DOF position target by 0.2 (normalised) according to ``a``.

        Actions 0 and 1 lower the target of DOF 0 and DOF 1 respectively,
        actions 2 and 3 raise them; any other value re-targets the current
        state.  ``==`` comparisons are kept (instead of a dict lookup) so that
        numpy scalars/0-d arrays are accepted as actions.
        """
        add_to = np.array([0,0])
        if a ==0:
            add_to = np.array([1,0])
        elif a ==1:
            add_to = np.array([0,1])
        elif a ==2:
            add_to = np.array([-1,0])
        elif a==3:
            add_to = np.array([0,-1])
        targets = np.clip(self.get_state() - 0.2*add_to,0,1)
        self.gymI.set_actor_dof_position_targets(self.env,self.actor_handle, self.toTargets(targets))

    def toTargets(self, targets):
        """Map two normalised targets in [0, 1] back onto each DOF's range."""
        targs = []
        for s in range(2):
            vmin=self.props[s][1]
            vmax=self.props[s][2]
            targs.append(self.translate(targets[s],0,1,vmin,vmax))
        return targs

    def translate(self,value, leftMin, leftMax, rightMin, rightMax):
        """Linearly map ``value`` from [leftMin, leftMax] to [rightMin, rightMax]."""
        # Figure out how 'wide' each range is
        leftSpan = leftMax - leftMin
        rightSpan = rightMax - rightMin

        # Convert the left range into a 0-1 range (float)
        valueScaled = float(value - leftMin) / float(leftSpan)

        # Convert the 0-1 range into a value in the right range.
        return rightMin + (valueScaled * rightSpan)

    def bin_to_val(self,s):
        """Convert the stored bin index ``self.targets[s]`` to a DOF value."""
        vbin = self.targets[s]
        vmin=self.props[s][1]
        vmax=self.props[s][2]
        return (vbin/self.obs_spc[s])*(vmax-vmin)+vmin

    def val_to_bin(self,val,s):
        """Convert DOF value ``val`` of DOF ``s`` to the nearest bin index."""
        vmin=self.props[s][1]
        vmax=self.props[s][2]
        return round((max((val-vmin),0)/(vmax-vmin))*self.obs_spc[s])

    def render(self, mode="human"):
        """Enable drawing of the viewer during subsequent simulate() calls.

        ``mode`` is accepted for compatibility with the Gym ``render`` API and
        is otherwise ignored.
        """
        self.draw=True

    def simulate(self):
        """Advance physics by a random 30-40 frames, drawing them if enabled."""
        sims = random.randint(30,40)  # add jitter to encourage generalization
        for _ in range(sims):
            # step the physics
            self.gymI.simulate(self.sim)
            self.gymI.fetch_results(self.sim, True)

            # update the viewer
            if self.draw:
                self.gymI.step_graphics(self.sim)
                self.gymI.draw_viewer(self.viewer, self.sim, True)

Importing module 'gym_37' (/home/t1tl/Downloads/IsaacGym_Preview_2_Package/isaacgym/python/isaacgym/_bindings/linux-x86_64/gym_37.so)
Setting GYM_USD_PLUG_INFO_PATH to /home/t1tl/Downloads/IsaacGym_Preview_2_Package/isaacgym/python/isaacgym/_bindings/linux-x86_64/usd/plugInfo.json
PyTorch version 1.8.1
Device count 1
/home/t1tl/Downloads/IsaacGym_Preview_2_Package/isaacgym/python/isaacgym/_bindings/src/gymtorch
Using /home/t1tl/.cache/torch_extensions as PyTorch extensions root...
Emitting ninja build file /home/t1tl/.cache/torch_extensions/gymtorch/build.ninja...
Building extension module gymtorch...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
Loading extension module gymtorch...


In [2]:
import mxnet as mx
import matplotlib.pyplot as plt
import math

In [3]:
# Re-running this cell must first release the simulator and viewer held by
# the previous environment before a fresh one is built.
if "env" in vars():
    env.close()
env = BasicEnv(True)



# DEEP Q LEARNING

In [5]:
from stable_baselines3 import PPO
from stable_baselines3 import DQN

## train and evaluate models

In [6]:
def evalModel(model,env,draw=False):
    """Run one episode with the model's deterministic policy and return its total reward.

    ``env.draw`` is set to ``draw`` while the episode runs and is set to
    ``False`` once it has finished.  Each transition is printed as
    ``action -> observation reward``.
    """
    print("Evaluating...")
    env.draw = draw
    observation = env.reset()
    episode_return = 0
    while True:
        chosen, _ = model.predict(observation, deterministic=True)
        observation, step_reward, finished, _info = env.step(chosen)
        print(chosen,"->",observation,round(step_reward,2))
        episode_return += step_reward
        if finished:
            break
    print("Evaluated",episode_return)
    env.draw=False
    return episode_return

In [10]:
## import gym

import torch as th
from stable_baselines3 import DQN
from stable_baselines3 import PPO


#gym.make("CartPole-v1")
# model = PPO("MlpPolicy", env, verbose=1)
# model.learn(total_timesteps=10000)
def trainDQN(time_steps,learning_rate, h1,h2,batch_size,gradient_steps,iteration,env):
    """Train a DQN agent on ``env``, save it, and return its evaluation return.

    Args:
        time_steps: total number of environment steps passed to ``learn``.
        learning_rate: optimiser learning rate.
        h1: width of the first hidden layer of the Q-network.
        h2: width of the second hidden layer; ``0`` (or less) means a single
            hidden layer.
        batch_size: minibatch size.
        gradient_steps: gradient steps per training phase.
        iteration: repeat index, only used to make the run id unique.
        env: environment to train on; its ``draw`` flag is switched off.

    Returns:
        The total reward of one deterministic episode, as returned by
        ``evalModel``.
    """
    env.draw=False
    # h2 used to be accepted (and written into the run id) but ignored when
    # building the network; use it whenever a second layer is requested.
    net_arch = [h1, h2] if h2 > 0 else [h1]
    policy_kwargs = dict(activation_fn=th.nn.ReLU,
                         net_arch=net_arch)
    idstr = "DQN_LR%.3f_H1_%d_H2_%d_BS_%d_GS_%d_%d"%(learning_rate,h1,h2,batch_size,gradient_steps,iteration)
    model=DQN("MlpPolicy", env, verbose=2,batch_size=batch_size,
              learning_starts=3000,learning_rate=learning_rate,
              gradient_steps=gradient_steps,train_freq=(2,"episode"),
              target_update_interval=4000,
              exploration_fraction=0.4, tensorboard_log="tlog/dqn_evals_6",
              policy_kwargs=policy_kwargs)
    # NOTE(review): the training env instance is also passed as eval_env, so
    # periodic evaluation resets the same simulation that training is using.
    model.learn(total_timesteps=time_steps, log_interval=40,eval_env=env, eval_freq=5000,
                n_eval_episodes=3,eval_log_path="./logs/",tb_log_name=idstr)
    model.save("models/"+idstr)
    return evalModel(model,env)
    


In [8]:
# [0.01,64,32,1024]
# vals = [[0.01,32,16,1024],[0.01,64,32,512],[0.001,64,32,1024],[0.1,64,32,1024],[0.001,32,16,512]]

In [9]:
# vals = [[0.01,32,16,1024],[0.01,32,16,4048],[0.001,32,16,4048]]

In [9]:
# Hyperparameter sets for the sweep.  Each row is unpacked into trainDQN as
# [learning_rate, h1, h2, batch_size, gradient_steps].
# Rows tried in earlier sweeps:
#   [0.01,64,64,5000,10], [0.01,32,16,5000,10], [0.01,32,16,2048],
#   [0.01,32,32,5000,10], [0.01,64,64,1000,10], [0.001,64,64,5000,10]
vals = [
    [0.01, 64, 0, 5000, 10],
]

In [11]:
import pickle

# Train every hyperparameter row twice and keep the evaluation return of each
# run, keyed by "<row><repeat index>".
results = {}
for v in vals:
    for i in range(2):
        print("Training",v,i)
        res = trainDQN(65000,*v,i,env)
        results[str(v)+str(i)]=res
        # Back up after every run.  The file is rewritten each time, so dump
        # the whole accumulated dict (dumping only `res` left just the last
        # scalar on disk); `with` closes the file even if dumping fails.
        with open("backup_results_new_7_appended.obj","wb") as f:
            pickle.dump(results,f)

Training [0.01, 64, 0, 5000, 10] 0
Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
|111.22|0.0000
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to tlog/dqn_evals_6/DQN_LR0.010_H1_64_H2_0_BS_5000_GS_10_0_1
|5.84|-7.8814
|4.53|-10.2239
|4.38|-4.3075
|4.45|-10.2595
|4.48|-13.7189
|4.54|-7.9418
|5.16|-8.2057
|4.81|-7.3976
|4.85|-4.0126
|4.72|-14.1929
|4.44|-20.6889
|4.42|-9.3343
|4.46|-8.5007
|4.46|-8.2047
|4.49|-7.0604
|4.39|-10.4954
|4.39|-7.9410
|4.91|-10.4126
|5.09|-8.9296
|4.57|-10.9259
|4.64|-7.7604
|4.97|-6.6801
|4.62|-11.7162
|4.49|-7.6742
|4.57|-9.6554
|4.61|-14.8467
|4.86|-3.6698
|4.57|-9.3914
|4.65|-11.2865
|5.86|-7.6178
|4.62|-13.7338
|4.54|-7.6925
|4.56|-8.9245
|4.47|-7.5896
|4.53|-6.4923
|4.51|-11.7995
|4.50|-7.4027
|4.67|-0.3374
|4.80|-4.9914
|4.75|-13.4712
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 101      |
|    ep_rew_mean      | -9.08 

|4.64|-3.6703
|4.65|-2.0810
|4.65|-7.6351
|4.66|-1.4175
|4.89|-1.1445
|4.67|-3.3963
|4.54|-2.0535
|4.58|-1.9164
|4.59|-6.0221
|4.63|-1.4433
|4.56|-4.5776
|4.58|-6.2142
|4.56|-1.7373
|4.67|-1.7478
|4.69|-0.6272
|4.62|-3.1629
|4.69|-1.6738
|4.68|-1.7434
|4.54|-1.9912
|4.62|-3.4996
|4.66|-3.3561
|4.69|-0.5893
|4.77|-3.0408
|4.66|-1.6421
|4.74|-3.1483
|4.86|-2.0719
|5.38|-0.9586
|5.90|-0.5514
|4.68|-3.0373
|4.89|-4.5574
|5.07|-0.9376
|4.79|0.5154
|4.81|-0.3409
|4.89|-0.3266
|4.75|-2.9079
|4.69|-0.7575
|4.66|-1.2618
|4.90|-1.4874
|4.73|-1.4988
|4.58|-0.7468
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 102      |
|    ep_rew_mean      | -3.98    |
|    exploration rate | 0.107    |
| time/               |          |
|    episodes         | 240      |
|    fps              | 20       |
|    time_elapsed     | 1196     |
|    total timesteps  | 24444    |
| train/              |          |
|    learning_rate    | 0.01     |
|    loss            

|4.58|0.0409
|4.61|1.6719
|4.60|-1.2052
|4.64|0.6902
|4.57|-0.0212
|4.56|-7.1379
|4.55|-0.8017
|4.66|4.0313
|4.66|3.0200
|4.54|-2.1012
|4.53|5.9365
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 102      |
|    ep_rew_mean      | -0.0877  |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 440      |
|    fps              | 20       |
|    time_elapsed     | 2193     |
|    total timesteps  | 44848    |
| train/              |          |
|    learning_rate    | 0.01     |
|    loss             | 0.0432   |
|    n_updates        | 2050     |
----------------------------------
|4.57|0.4288
|2.37|6.1746
|4.59|2.1915
|4.58|0.3553
|4.57|5.5728
Eval num_timesteps=45000, episode_reward=2.71 +/- 2.16
Episode length: 101.00 +/- 0.00
----------------------------------
| eval/               |          |
|    mean_ep_length   | 101      |
|    mean_reward      | 2.71     |
| rollout/            |          |


done
|3.28|43.5265
done
|4.13|42.0221
done
|3.49|43.2596
done
|3.64|42.7280
done
|3.80|42.2695
done
|4.14|41.1319
done
|3.65|42.4250
done
|4.27|41.1870
done
|3.29|43.8907
done
|3.73|43.8314
done
|3.43|43.1081
done
|3.60|42.2739
done
|3.84|42.8212
done
|3.63|42.6690
done
|3.76|42.8131
done
|3.34|43.6683
done
|3.33|43.0792
done
|3.53|42.2815
done
|3.88|42.4238
done
|3.91|41.8064
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 78.7     |
|    ep_rew_mean      | 40.7     |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 640      |
|    fps              | 20       |
|    time_elapsed     | 3065     |
|    total timesteps  | 62382    |
| train/              |          |
|    learning_rate    | 0.01     |
|    loss             | 0.077    |
|    n_updates        | 3050     |
----------------------------------
done
|3.66|42.9578
done
|4.43|41.1653
done
|3.63|42.8009
done
|3.81|42.3453
done
|3.58|43.7041


|4.65|-4.1343
|4.59|-10.4578
|4.54|-6.6494
|4.59|-7.3950
|4.82|-7.4351
|4.72|-6.7792
|4.64|-7.0424
|4.65|-9.2504
|4.56|-7.2919
|4.55|-7.2983
|4.58|-0.8660
|4.57|-9.4816
|4.72|-8.5030
|4.60|-5.7572
|4.73|-3.0590
|4.66|0.3039
|4.59|-6.2074
|4.58|-2.7984
|4.73|-13.5056
|4.53|-5.3350
|4.53|-8.1924
|4.55|-2.8075
|4.63|-6.2614
|4.59|-7.6779
|4.67|-4.7437
|4.59|-11.0409
|4.54|-8.9649
|2.37|0.5020
|4.70|-0.2094
|4.64|-0.2077
|4.64|-0.2094
Eval num_timesteps=15000, episode_reward=-0.21 +/- 0.00
Episode length: 101.00 +/- 0.00
----------------------------------
| eval/               |          |
|    mean_ep_length   | 101      |
|    mean_reward      | -0.209   |
| rollout/            |          |
|    exploration rate | 0.452    |
| time/               |          |
|    total timesteps  | 15000    |
| train/              |          |
|    learning_rate    | 0.01     |
|    loss             | 0.0426   |
|    n_updates        | 590      |
----------------------------------
New best mean reward!


New best mean reward!
|4.56|3.0140
|4.63|5.4670
|4.57|0.7522
|4.56|0.5845
|4.63|2.5776
|4.67|3.2469
|4.62|1.7780
|4.57|-2.7319
|4.66|-0.2981
|4.56|-1.2182
|4.63|-2.6640
|4.62|-9.1518
|4.46|-9.3203
|4.61|4.4080
|4.56|5.5328
|4.63|1.1607
|4.69|2.0051
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 102      |
|    ep_rew_mean      | -0.604   |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 360      |
|    fps              | 20       |
|    time_elapsed     | 1790     |
|    total timesteps  | 36717    |
| train/              |          |
|    learning_rate    | 0.01     |
|    loss             | 0.0528   |
|    n_updates        | 1650     |
----------------------------------
|4.58|2.5119
|4.51|2.1422
|4.65|-5.4708
|4.60|6.1180
|4.59|-1.9026
|4.51|3.1362
|4.64|1.3596
|4.55|2.0217
|4.58|1.8651
|4.63|1.5648
done
|4.34|41.3020
|4.66|0.3820
|4.62|1.4894
|4.59|-0.3080
|4.57|2.1033
|4.63|0.3038
|4.67|0.6

done
|3.98|42.6706
|4.55|7.7908
done
|3.74|41.6343
done
|3.72|42.0236
done
|3.43|43.0779
done
|3.70|42.8663
done
|4.17|41.5268
done
|4.04|41.8921
done
|3.91|41.8309
done
|3.97|41.6845
done
|4.16|40.7630
done
|4.06|41.9301
done
|4.25|41.2990
done
|4.03|41.3233
done
|3.61|42.3994
done
|3.71|42.2545
done
|4.42|41.0928
done
|3.94|41.3975
done
|3.89|41.1052
|4.56|8.9776
done
|4.42|40.2379
done
|4.18|41.3460
done
|3.59|42.2041
done
|3.61|41.9611
done
|3.56|42.2468
done
|3.88|42.8008
done
|3.85|42.2467
done
|4.43|40.8775
done
|3.81|41.6194
done
|3.65|42.0108
done
|3.82|42.5505
done
|3.87|41.9362
done
|3.97|41.4785
done
|3.90|42.0456
|1.80|5.6242
done
|3.62|41.8569
done
|3.73|42.7382
done
|3.73|42.1974
Eval num_timesteps=60000, episode_reward=42.26 +/- 0.36
Episode length: 82.00 +/- 0.00
----------------------------------
| eval/               |          |
|    mean_ep_length   | 82       |
|    mean_reward      | 42.3     |
| rollout/            |          |
|    exploration rate | 0.05     |

## Try retraining a saved model

In [6]:
# BasicEnv.step prints each reward that differs from the bare -0.1 step cost whenever verbose > 0.
env.verbose=2

In [19]:
# Reload a previously saved DQN checkpoint from the models/ directory.
# No environment is passed here; it is attached afterwards with model.set_env(env).
# model = DQN.load("models/dqn_13_more_free")
# modelO = DQN.load("models/dqn_13_more_free")
model = DQN.load("models/DQN_LR0.010_H1_64_H2_64_BS_1000_GS_10_0")

In [8]:

# Override hyperparameters on the reloaded model before training continues.
model.learning_rate=0.001
# load() has already built model.lr_schedule from the saved learning rate, so
# the assignment above is ignored unless the schedule is rebuilt.
model._setup_lr_schedule()
model.learning_starts=10
model.target_update_interval=1000
# NOTE(review): the exploration schedule is likewise built at load time, so
# these two values only matter if model.exploration_schedule is rebuilt or
# replaced afterwards.
model.exploration_initial_eps=0.5
model.exploration_final_eps=0.01
model.set_env(env)
# The drawing flag is read by the environment (BasicEnv.simulate), not by the
# model; setting `model.draw` had no effect.
env.draw=True

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [10]:
def _retrain_exploration_rate(x):
    """Exploration rate used while retraining, as a function of ``x``.

    ``x`` is whatever the model passes to its exploration schedule -
    presumably the remaining training progress running from 1 down to 0;
    confirm against the stable-baselines3 version in use.
    Returns 0.01 for ``x >= 0.96`` and a value linear in ``x`` otherwise.
    NOTE(review): the linear branch drops below zero once ``x < 0.05``.
    """
    if not x < 0.96:
        return 0.01
    return 0.2 - 0.2*(1-(x-0.05))


model.exploration_schedule = _retrain_exploration_rate

In [11]:
# BasicEnv.simulate steps graphics and draws the viewer only while env.draw is true.
env.draw=True