In [None]:
import sys
sys.path.insert(0, './py_torch_trpo')
from baselines.common import set_global_seeds, tf_util as U
import gym
import roboschool
import numpy as np
import random
from expert import *
import matplotlib.pyplot as plt
import time
import pandas as pd
import seaborn as sns
from gym import spaces
from base_line_model.TRPO_agent import TRPO_agent_new
from base_line_model.mlp import MlpPolicy_new
from baselines import logger

# Global plotting look: matplotlib style sheet plus seaborn paper context with a larger monospace font.
plt.style.use('seaborn-white')
sns.set(context = "paper", font = "monospace", font_scale=2)
# Single seed shared by everything seeded through set_global_seeds below.
seed = 1
logger.configure()
# The session context is entered by hand and never exited, so the session
# presumably stays the default for every later cell (TODO confirm against tf_util).
U.make_session(num_cpu=16).__enter__()
set_global_seeds(seed)

# Dummy observer and adversary agent

In [None]:
# Dummy classes, used as placeholders when constructing the adversary and observer environments
class dummy_adversary_env(object):
    """Placeholder adversary exposing gym-style spaces and an ``action_ev`` method.

    It carries ``action_space`` / ``observation_space`` attributes and answers
    ``action_ev`` with random actions, so it can stand in for an adversary
    until a real agent is assigned in its place.
    """

    def __init__(self):
        self.env = gym.make("RoboschoolInvertedPendulum-v1")
        # One action component per observation dimension, each bounded to [-1, 1].
        self.action_space = spaces.Box(low=-1, high=1, shape=(self.env.observation_space.shape[0],))
        self.observation_space = self.env.observation_space

    def action_ev(self, s):
        """Return a random sample from ``self.action_space``; the state ``s`` is ignored."""
        # The lookup must go through ``self``: a bare ``action_space`` is not
        # defined in this scope and raised NameError on every call.
        return self.action_space.sample()
    
class dummy_observer_env(object):
    """Placeholder observer: exposes gym-style spaces and echoes states unchanged."""

    def __init__(self):
        pendulum = gym.make("RoboschoolInvertedPendulum-v1")
        state_dim = pendulum.observation_space.shape[0]

        self.env = pendulum
        self.observation_space = pendulum.observation_space
        # The "action" has one component per observation dimension, bounded to [-1, 1].
        self.action_space = spaces.Box(low=-1, high=1, shape=(state_dim,))

    def action_ev(self, s):
        """Identity mapping: hand back exactly the state that was passed in."""
        return s

# Adversary environment

In [None]:
class adversary_env(object):
    """Gym-style environment in which the adversary perturbs the controller's observations.

    Each action is clipped to [-1, 1], scaled by ``threshold * ratio`` and
    added to the latest true observation. The perturbed observation is passed
    through ``self.observer.action_ev`` and then to the pre-trained controller,
    whose action drives the wrapped pendulum environment.

    Bookkeeping: once at least one episode has finished, every ``reset()``
    appends the cumulative reward and step count of the run that just ended
    to ``score_list`` and ``epi_list``. Each episode is recorded exactly once,
    so the two lists always have the same length.
    """

    def __init__(self):
        self.env = gym.make("RoboschoolInvertedPendulum-v1")
        # Per-dimension perturbation scale (labelled "standard deviation" by the author).
        self.threshold = np.array([ 0.14244403,  0.07706523,  0.00016789,  0.00789366,  0.02395424])
        # pre-trained controller
        self.agent = SmallReactivePolicy(self.env.observation_space, self.env.action_space)
        self.env.seed(0)

        # observer agent (placeholder until a trained observer is assigned)
        self.observer = dummy_observer_env()

        self.ratio = 0.7       # fraction of `threshold` the adversary may use
        self.max_turn = 1000   # hard cap on steps per episode
        self.action_space = spaces.Box(low=-1, high=1, shape=(self.env.observation_space.shape[0],))
        self.observation_space = self.env.observation_space

        self.obsr = 0          # latest true (unperturbed) observation
        self.epi_num = 0       # steps taken in the current episode
        self.total_score = 0   # cumulative adversary reward in the current episode
        self.first = True      # True until the first episode has finished
        self.score_list = []
        self.epi_list = []

    # -----------------------   define reward for adversary agent ---------------------------------
    def reward(self, st):
        """Per-step adversary reward: ``|st[3]| - 0.08``.

        NOTE(review): the original comment calls ``st[3]`` "sin theta" and
        quotes an offset of 0.8, but the code uses 0.08 -- confirm which is intended.
        """
        return np.abs(st[3]) - 0.08

    def step(self, a):
        """Apply perturbation ``a`` and advance the wrapped environment one step.

        Returns ``(true_observation, adversary_reward, done, 0)``.
        """
        self.epi_num = self.epi_num + 1
        obs = np.clip(a,-1,1)*self.threshold*self.ratio + self.obsr

        # observer take the state input
        obs = self.observer.action_ev(obs)

        ac = self.agent.act(obs)
        # The wrapped environment's own reward is discarded; the adversary has its own.
        self.obsr, _, done, _ = self.env.step(ac)

        if self.epi_num >= self.max_turn:
            done = True

        if done:
            # Only unlock recording here. Previously the first episode was also
            # written to the lists at this point -- before its last reward was
            # added -- and then appended again by reset(), leaving a bogus
            # duplicate first entry in both lists.
            self.first = False

        final_r = self.reward(self.obsr)
        if done and self.epi_num < self.max_turn:
            final_r = 15 # terminal cost: the episode ended before the step cap

        self.total_score += final_r
        return self.obsr, final_r, done, 0

    def seed(self, a):
        """No-op: the wrapped environment is seeded once in ``__init__``."""
        pass

    def reset(self):
        """Reset the wrapped environment, recording the run that just ended if one has finished."""
        self.obsr = self.env.reset()
        if not self.first:
            self.score_list.append(self.total_score)
            self.epi_list.append(self.epi_num)

        self.epi_num = 0
        self.total_score = 0
        return self.obsr

    def env_reset(self):
        """Clear all counters and recorded statistics (the wrapped env is left untouched)."""
        self.obsr = 0
        self.epi_num = 0
        self.total_score = 0
        self.first = True
        self.score_list = []
        self.epi_list = []

    def result_plot(self):
        """Scatter-plot cumulative reward and episode length against episode index."""
        fon_size = 19
        plt.figure(figsize=(18, 4), dpi= 80, facecolor='w', edgecolor='k')

        plt.subplot(1,2,1)
        # No leading entry to skip any more: both lists hold one value per episode.
        x = list(range(0, len(self.score_list)))
        plt.scatter(x,self.score_list, s=5)
        plt.xlabel('episodes',fontsize=fon_size)
        plt.ylabel('cumulative reward',fontsize=fon_size)

        plt.subplot(1,2,2)
        x = list(range(0, len(self.epi_list)))
        plt.scatter(x,self.epi_list, s=5)
        plt.xlabel('episodes',fontsize=fon_size)
        plt.ylabel('time steps',fontsize=fon_size)
        plt.ylim([0,1200])
        plt.show()

# Observer environment

In [None]:
class observer_env(object):
    """Gym-style environment in which the observer cleans up adversarially perturbed observations.

    The observer's action ``a`` is handed directly to the pre-trained
    controller as its observation. After the wrapped pendulum environment
    advances, ``self.adv_agn.action_ev`` produces a perturbation that is
    clipped to [-1, 1], scaled by ``threshold * ratio`` and added to the true
    observation; that perturbed observation is what ``step()`` returns.

    Bookkeeping: once at least one episode has finished, every ``reset()``
    appends the cumulative reward and step count of the run that just ended
    to ``score_list`` and ``epi_list``. Each episode is recorded exactly once.
    """
    agn_index = 1

    def __init__(self):
        # parameter
        self.env = gym.make("RoboschoolInvertedPendulum-v1")
        self.env.seed(0)
        # ------------------ pre-trained agent -----------------------------
        self.agent = SmallReactivePolicy(self.env.observation_space, self.env.action_space)
        self.ratio = 0.7       # fraction of `threshold` the adversary may use
        # Per-dimension perturbation scale (same values as in adversary_env).
        self.threshold = np.array([ 0.14244403,  0.07706523,  0.00016789,  0.00789366,  0.02395424])

        # ------ dummy adversarial agent (placeholder until a trained one is assigned) ------
        self.adv_agn = dummy_adversary_env()

        self.max_turn = 1000   # hard cap on steps per episode
        self.action_space = spaces.Box(low=-1, high=1, shape=(self.env.observation_space.shape[0],))
        self.observation_space = self.env.observation_space
        self.obsr = 0          # latest true (unperturbed) observation
        self.epi_num = 0       # steps taken in the current episode
        self.total_score = 0   # cumulative observer reward in the current episode
        self.first = True      # True until the first episode has finished
        self.score_list = []
        self.epi_list = []

    # define reward function
    def reward(self, st):
        """Per-step observer reward: a constant +1 (the state ``st`` is currently unused)."""
        return 1

    def step(self, a):
        """Feed the observer's estimate ``a`` to the controller and advance one step.

        Returns ``(perturbed_observation, reward, done, 0)``.
        """
        self.epi_num = self.epi_num + 1

        ac = self.agent.act(a)
        # The wrapped environment's own reward is discarded; the observer has its own.
        self.obsr, _, done, _ = self.env.step(ac)

        if self.epi_num >= self.max_turn:
            done = True

        if done:
            # Only unlock recording here. Previously the first episode was also
            # written to the lists at this point -- before its last reward was
            # added -- and then appended again by reset(), leaving a bogus
            # duplicate first entry in both lists.
            self.first = False

        final_r = self.reward(self.obsr)

        self.total_score += final_r

        # return noise output (adversarial)
        action = self.adv_agn.action_ev(self.obsr)
        obs = np.clip(action,-1,1)*self.threshold*self.ratio + self.obsr
        return obs, final_r, done, 0

    def seed(self, a):
        """No-op: the wrapped environment is seeded once in ``__init__``."""
        pass

    def reset(self):
        """Reset the wrapped environment, recording the run that just ended if one has finished.

        The returned observation is the true one; no perturbation is applied here.
        """
        self.obsr = self.env.reset()
        if not self.first:
            self.score_list.append(self.total_score)
            self.epi_list.append(self.epi_num)

        self.epi_num = 0
        self.total_score = 0
        return self.obsr

    def env_reset(self):
        """Clear all counters and recorded statistics (the wrapped env is left untouched)."""
        self.obsr = 0
        self.epi_num = 0
        self.total_score = 0
        self.first = True
        self.score_list = []
        self.epi_list = []

    def result_plot(self):
        """Scatter-plot cumulative reward and episode length against episode index."""
        fon_size = 19
        plt.figure(figsize=(18, 4), dpi= 80, facecolor='w', edgecolor='k')

        plt.subplot(1,2,1)
        x = list(range(0, len(self.score_list)))
        plt.scatter(x,self.score_list, s=5)
        plt.xlabel('episodes',fontsize=fon_size)
        plt.ylabel('cumulative reward',fontsize=fon_size)

        plt.subplot(1,2,2)
        x = list(range(0, len(self.epi_list)))
        plt.scatter(x,self.epi_list, s=5)
        plt.xlabel('episodes',fontsize=fon_size)
        plt.ylabel('time steps',fontsize=fon_size)
        plt.ylim([0,1200])
        plt.show()

# Adversary agent

In [None]:
# Placeholder environment handed to the adversary agent at construction time;
# presumably only its observation/action spaces matter there (TODO confirm in TRPO_agent_new).
dummy_env1 = dummy_adversary_env()
class pargm1(object):
    """Plain attribute bag of training settings passed to ``TRPO_agent_new`` for the adversary.

    NOTE(review): most names match the keyword arguments of OpenAI Baselines'
    TRPO ``learn``; how ``TRPO_agent_new`` consumes them is not visible here.
    """

    def __init__(self):
        # --- batch size and stopping limits ---
        self.timesteps_per_batch = 5000  # what to train on
        self.max_timesteps = 5000
        self.max_episodes = 0
        self.max_iters = 0  # time constraint
        self.max_epi_avg = 1000

        # --- policy update ---
        self.max_kl = 0.01
        self.cg_iters = 10
        self.cg_damping = 0.1
        self.entcoeff = 0.0

        # --- advantage estimation ---
        self.gamma = 0.995
        self.lam = 0.97

        # --- value function fit ---
        self.vf_stepsize = 1e-3
        self.vf_iters = 5

        # --- hooks ---
        self.callback = None

def policy_fn1(name, ob_space, ac_space):
    """Build the adversary's policy: an ``MlpPolicy_new`` with ``hid_size=128`` and ``num_hid_layers=2``."""
    network_shape = dict(hid_size=128, num_hid_layers=2)
    return MlpPolicy_new(name=name, ob_space=ob_space, ac_space=ac_space, **network_shape)
    
# Build the adversary agent under the name 'pi1' from the placeholder
# environment, the policy factory and the settings object defined above.
# NOTE: the name `parg` is rebound to the observer's settings in a later cell.
parg = pargm1()
adversary = TRPO_agent_new('pi1', dummy_env1, policy_fn1, parg)

In [None]:
#adversary.restore("adversary")

# Observer agent

In [None]:
# Placeholder environment handed to the observer agent at construction time;
# presumably only its observation/action spaces matter there (TODO confirm in TRPO_agent_new).
dummy_env2 = dummy_observer_env()
class pargm2(object):
    """Plain attribute bag of training settings passed to ``TRPO_agent_new`` for the observer.

    NOTE(review): most names match the keyword arguments of OpenAI Baselines'
    TRPO ``learn``; how ``TRPO_agent_new`` consumes them is not visible here.
    """

    def __init__(self):
        # --- batch size and stopping limits ---
        self.timesteps_per_batch = 5000  # what to train on
        self.max_timesteps = 5000
        self.max_episodes = 0
        self.max_iters = 0  # time constraint
        self.max_epi_avg = 1001

        # --- policy update ---
        self.max_kl = 0.01
        self.cg_iters = 10
        self.cg_damping = 0.1
        self.entcoeff = 0.0

        # --- advantage estimation ---
        self.gamma = 0.995
        self.lam = 0.97

        # --- value function fit ---
        self.vf_stepsize = 1e-3
        self.vf_iters = 5

        # --- hooks ---
        self.callback = None

def policy_fn2(name, ob_space, ac_space):
    """Build the observer's policy: an ``MlpPolicy_new`` with ``hid_size=128`` and ``num_hid_layers=2``."""
    network_shape = dict(hid_size=128, num_hid_layers=2)
    return MlpPolicy_new(name=name, ob_space=ob_space, ac_space=ac_space, **network_shape)

    
# Build the observer agent under the name 'pi2' from the placeholder
# environment, the policy factory and the settings object defined above.
# NOTE: this rebinds `parg`, which held the adversary's settings in an earlier cell.
parg = pargm2()
observer = TRPO_agent_new('pi2', dummy_env2, policy_fn2, parg)

In [None]:
#observer.restore("observer")

In [None]:
# Create the two real training environments; each starts with a dummy opponent inside.
obs_env = observer_env()
adv_env = adversary_env()
# Replace every dummy instance with its real counterpart:
# each environment gets the opposing agent ...
obs_env.adv_agn = adversary
adv_env.observer = observer
# ... and each agent gets its real environment in place of the placeholder it was built with.
adversary.env = adv_env
observer.env = obs_env

In [None]:
# Alternate the two learners for 1000 rounds: the adversary updates first,
# then the observer, each announced by its banner.
update_schedule = (
    ("\n -------------- adversary update -------------- \n", adversary),
    ("\n -------------- observer update -------------- \n", observer),
)
for i in range(1000):
    for update_banner, learner in update_schedule:
        print(update_banner)
        learner.learn()