# Implementação DQN no ambiente Swift com manipulador panda.

Replay buffer <br>
DNN para representar o agente atual e uma DNN alvo com uma taxa de atualização menor.

In [1]:
import base64
import glob
import io
import os
import pickle
import random
import time

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
from IPython.display import clear_output
from IPython.display import HTML
from scipy.signal import convolve, gaussian
from tqdm import trange

from DQN_panda_utils import DQNAgent, evaluate, ReplayBuffer,\
        compute_td_loss,play_and_record, smoothen, epsilon_schedule,\
                generate_animation, display_animation
from panda_env import Panda_RL
                
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
%matplotlib inline

# Joint ranges noted by the author: j3 from -0.08 to 3.75; j2 from -0.07 to -3.0; j1 from -1.8 to 1.76

  return torch._C._cuda_getDeviceCount() > 0


In [2]:
# Show whether PyTorch can see a CUDA device (the same condition used to choose `device`).
torch.cuda.is_available()

False

### Sistema observável e com medidas dos ângulos disponíveis


    Observation:
        Type: Box(3)
        Num     Observation               Min                     Max
        0       Joint1                   -4.8                    4.8
        1       Joint2                    -Inf                    Inf
        2       Joint3                -0.418 rad (-24 deg)    0.418 rad (24 deg)
        
    Actions:
        Type: Discrete(9)
        Num   Three actions for each joint
        0     decrement joint j
        1     increment joint j
        2     decrement joint j

        #j3 range 0.0 a 3.7
        #j2 range 0.0 a -3.
        #j1 range -1.7 a 1.7


        



In [3]:
# Size of the observation passed to DQNAgent (the environment returns three joint values).
state_shape = 3
env=Panda_RL()
# Online Q-network, moved to the selected device. It starts with epsilon=0;
# the training cell overwrites agent.epsilon on every step.
agent=DQNAgent(state_shape, epsilon=0).to(device)
RESTORE_AGENT=True
if RESTORE_AGENT:
    # map_location remaps the stored tensors onto `device`, so a checkpoint that was
    # saved on a CUDA machine can still be restored on a CPU-only host.
    agent.load_state_dict(torch.load('model_panda_trained.pth', map_location=device))


In [4]:
# Display the robot model object held by the environment.
env.panda

ERobot: panda (by Franka Emika), 7 joints (RRRRRRR), 1 gripper, geometry, collision
┌─────┬──────────────┬───────┬─────────────┬────────────────────────────────────────────────┐
│link │     link     │ joint │   parent    │              ETS: parent to link               │
├─────┼──────────────┼───────┼─────────────┼────────────────────────────────────────────────┤
│   0 │ panda_link0  │       │ BASE        │                                                │
│   1 │ panda_link1  │     0 │ panda_link0 │ SE3(0, 0, 0.333) ⊕ Rz(q0)                      │
│   2 │ panda_link2  │     1 │ panda_link1 │ SE3(-90°, -0°, 0°) ⊕ Rz(q1)                    │
│   3 │ panda_link3  │     2 │ panda_link2 │ SE3(0, -0.316, 0; 90°, -0°, 0°) ⊕ Rz(q2)       │
│   4 │ panda_link4  │     3 │ panda_link3 │ SE3(0.0825, 0, 0; 90°, -0°, 0°) ⊕ Rz(q3)       │
│   5 │ panda_link5  │     4 │ panda_link4 │ SE3(-0.0825, 0.384, 0; -90°, -0°, 0°) ⊕ Rz(q4) │
│   6 │ panda_link6  │     5 │ panda_link5 │ SE3(90°, -0°, 0°) ⊕ Rz(q5

In [5]:
# Second DQNAgent with the same state shape, used as the target network.
# NOTE(review): epsilon=0.5 is set here although this network is only passed to
# compute_td_loss in this notebook — presumably unused; confirm.
target_network = DQNAgent(agent.state_shape, epsilon=0.5).to(device)
# Start the target network from an exact copy of the online agent's weights.
target_network.load_state_dict(agent.state_dict())

<All keys matched successfully>

In [6]:
# Put the arm in the robot's `qz` configuration before anything else runs.
env.panda.q=env.panda.qz

# Seed Python, NumPy and PyTorch (in that order) with the same value.
seed = 13
for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    seed_rng(seed)

# Workflow switches read by the later cells:
#   NEW_BUFFER - collect a new replay buffer instead of loading the pickled one
#   TRAIN      - run the training loop
NEW_BUFFER=False
TRAIN=False

In [7]:
# Seven-joint configuration used as a starting pose
# (the name suggests it is far from the target — TODO confirm).
q_far=np.array([ 0., -0.8 ,  0. , -0.0698,  0.,  3.3825,  0.    ])
env.panda.q=q_far
# Step the scene after overwriting the joints (presumably so the new pose takes effect — confirm).
env.scene.step()
# Print the value env.distance() reports for this pose.
print(env.distance())


1.8716433359090923


In [8]:
# Run the environment's collision check for the current pose and display what it returns.
env.detect_collision()

pybullet build time: Jul 21 2022 19:48:53


(False,
 [[False, False, False, False, False, False, False, False, False, False],
  [False, False, False, False, False, False, False]])

### Main loop



In [9]:
# When NEW_BUFFER is set, create the replay buffer and seed it with a
# hand-scripted ("expert") trajectory starting from the q_far pose.
buffer_len=1000
if NEW_BUFFER:
    exp_replay = ReplayBuffer(buffer_len)
    # Place the arm in the starting pose of the demonstration.
    q_far=np.array([ 0., -0.8 ,  0. , -0.0698,  0.,  3.3825,  0.    ])
    env.panda.q=q_far
    env.scene.step()
    # (repetitions, action) pairs, executed in this order.
    expert_plan = (
        (14, (0, -1, 0)),
        (16, (1, 0, 0)),
        (22, (0, 0, -1)),
    )
    for n_repeats, expert_action in expert_plan:
        for i in range(n_repeats):
            # Build a fresh list for every transition so each stored action is its own object.
            a=list(expert_action)
            s=env.get_q()
            pos,r,done,info=env.step(a)
            exp_replay.add(s, a, r, pos, done)
            print(pos,r,env.fitness())

In [10]:
# Populate the replay buffer: either collect fresh experience and cache it on disk
# (NEW_BUFFER=True), or restore the buffer cached by an earlier run.
buffer_len=1000

if NEW_BUFFER:
    # exp_replay already exists here: the previous cell creates it when NEW_BUFFER is set.
    for i in range(50):

        state=env.reset()
        # At most 50 runs of 60 steps each; stop as soon as the buffer holds buffer_len samples.
        play_and_record(state, agent, env, exp_replay, n_steps=60)

        # >= rather than == so the loop still stops if the size ever overshoots the target.
        if len(exp_replay) >= buffer_len:
            break
    print(len(exp_replay))

    # Cache the collected transitions so later sessions can skip the collection step.
    with open('buffer.pickle', 'wb') as handle:
        pickle.dump(exp_replay.buffer, handle, protocol=pickle.HIGHEST_PROTOCOL)

else:
    exp_replay = ReplayBuffer(buffer_len)
    # SECURITY: unpickling can execute arbitrary code. Only load a buffer.pickle
    # that was produced by this notebook on a machine you trust.
    with open('buffer.pickle', 'rb') as handle:
        exp_replay.buffer=pickle.load(handle)



In [11]:
# Training hyperparameters.
timesteps_per_epoch = 1  # environment steps recorded per training iteration
batch_size = 64  # transitions sampled from the replay buffer per update
total_steps = 2 * 10**3  # the training loop runs total_steps + 1 iterations
#total_steps = 10

# Optimizer over the online agent's parameters.
opt = torch.optim.Adam(agent.parameters(), lr=1e-4)

# Exploration schedule inputs: these are passed to epsilon_schedule together with the
# current step; the final-step value is 90% of total_steps.
start_epsilon = 1
end_epsilon = 0.05
eps_decay_final_step = 0.9*total_steps

# Periods (in training iterations) for logging the loss, copying the agent's
# weights into the target network, and evaluating the agent.
loss_freq = 40
refresh_target_network_freq = 100
eval_freq = 500

# Maximum gradient norm passed to clip_grad_norm_.
max_grad_norm = 5000

In [12]:
# Training logs, both start empty:
#   mean_rw_history - evaluation rewards appended every eval_freq steps
#   td_loss_history - TD-loss values appended every loss_freq steps
mean_rw_history, td_loss_history = [], []


In [13]:
# DQN training loop (only runs when TRAIN is set).
if TRAIN:
    state = env.reset()
    for step in trange(total_steps + 1):

        # Reduce exploration as training progresses.
        agent.epsilon = epsilon_schedule(start_epsilon, end_epsilon, step, eps_decay_final_step)

        # Play timesteps_per_epoch steps and add the transitions to the replay buffer.
        _, state = play_and_record(state, agent, env, exp_replay, timesteps_per_epoch)

        # Sample a training batch from the replay buffer.
        states, actions, rewards, next_states, done_flags = exp_replay.sample(batch_size)
        # Convert every stored action into the index the agent assigns to it.
        actions = [agent.get_action_index(action) for action in actions]

        # TD loss of the online agent against the target network.
        loss = compute_td_loss(agent, target_network,
                            states, actions, rewards, next_states, done_flags,
                            gamma=0.99,
                            device=device)

        loss.backward()
        # Clip the gradient norm before the optimizer step; the returned norm is kept for inspection.
        grad_norm = nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm)
        opt.step()
        opt.zero_grad()

        if step % loss_freq == 0:
            # .item() already returns a Python float, whatever device the loss lives on.
            td_loss_history.append(loss.item())

        if step % refresh_target_network_freq == 0:
            # Load agent weights into target_network
            target_network.load_state_dict(agent.state_dict())

        if step % eval_freq == 0:
            # Evaluate the greedy policy.
            mean_rw_history.append(evaluate(env, agent, n_games=40, greedy=True, t_max=60)[0] )
            # evaluate() played its games on this same `env`, so the `state` held by the
            # training loop no longer matches the environment. Start a fresh episode to
            # keep the recorded transitions consistent.
            state = env.reset()

            clear_output(True)
            print("buffer size = %i, epsilon = %.5f" %
                (len(exp_replay), agent.epsilon))
            print(f"Frequency evaluation = {eval_freq}")

            plt.figure(figsize=[16, 5])
            plt.subplot(1, 2, 1)
            plt.title("Mean reward per episode")
            plt.plot(mean_rw_history)
            plt.grid()

            # Abort early if training has diverged.
            assert not np.isnan(td_loss_history[-1])
            plt.subplot(1, 2, 2)
            plt.title("TD loss history (smoothened)")
            plt.plot(smoothen(td_loss_history))
            plt.grid()
            plt.show()

In [14]:
# Run a single greedy evaluation game of at most 100 steps and report the result.
final_score,m_steps,infos = evaluate(env,agent, n_games=1, greedy=True, t_max=100)
print(f'final score:{final_score} in {m_steps} steps')
print(infos)
# NOTE(review): "Well done" is printed unconditionally, even when the game ended in a collision.
print(f'Well done , Distance: {env.distance()}')
# First element of detect_collision()'s result is the overall collision flag.
print(f"collision {env.detect_collision()[0]}")

final score:-13.973294859656132 in 27.0 steps
[['Done', 'Collided']]
Well done , Distance: 0.13411198556296502
collision True


In [15]:
# Set to True to write the current agent weights to disk.
SAVE_AGENT=False

if SAVE_AGENT:
    # Written under a different file name than the checkpoint restored earlier
    # ('model_panda_trained.pth'), so that checkpoint is not overwritten.
    torch.save(agent.state_dict(), 'model_panda_trained2.pth')

In [16]:
# Start a new episode and take one greedy step: pick the action with the largest Q-value.
state = env.reset()
qvalues = agent.get_qvalues([state])
best_action_index = qvalues.argmax(axis=-1)[0]
action = agent.actions_space[best_action_index]
state, r, done, info = env.step(action)
print(state, r, done, info)

[-1.1  -0.78  2.62] -6.0864463213705156 False ['', '']


In [17]:
# Take one more greedy step from the current state (no reset).
qvalues = agent.get_qvalues([state])
# Index of the largest Q-value for the single state in the batch selects the action.
action = agent.actions_space[qvalues.argmax(axis=-1)[0]]
state, r, done, _ = env.step(action)

In [26]:
# Greedy rollout from a fresh episode: keep stepping until the environment reports
# `done` or the accumulated reward falls below -60.
state = env.reset()
reward = 0
done = False
while not (done or reward < -60):
    qvalues = agent.get_qvalues([state])
    best_action_index = qvalues.argmax(axis=-1)[0]
    action = agent.actions_space[best_action_index]
    state, r, done, info = env.step(action)
    reward += r
print('Got reward: {}'.format(reward))
print(f'Done , Distance: {env.distance()}')
print(f"collision {env.detect_collision()[0]}")
print(info)

Got reward: -5.741395572278462
Done , Distance: 0.2332662234912365
collision True
['Done', 'Collided']


**Let us record a video of the trained agent**

In [None]:
# # Animate learned policy
# save_dir='./videos/'
# #env = make_env(env_name)
# generate_animation(env, agent, save_dir=save_dir)
# [filepath] = glob.glob(os.path.join(save_dir, '*.mp4'))

# display_animation(filepath)

In [None]:
#TODO

# Run this on another environment in OpenAI Gym
# Create a robotic environment with more actions
#