In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam

import os, sys
import numpy as np
import random
from collections import deque, namedtuple
from numpy import pi
import datetime
import matplotlib.pyplot as plt

In [3]:
# define actor and critic networks
class Actor(nn.Module):
    """Deterministic policy network mapping a state vector to an action in [-1, 1].

    Architecture: Linear -> ReLU -> Linear -> ReLU -> Linear -> tanh.

    :param input_dim: size of the state vector
    :param action_dim: size of the action vector
    :param hidden_dim: width of both hidden layers
    """

    def __init__(self, input_dim, action_dim, hidden_dim=128):
        super().__init__()
        self.input_dim, self.action_dim, self.hidden_dim = input_dim, action_dim, hidden_dim

        # Layers are created in the same order as before so that seeded
        # initialisation and state_dict keys stay unchanged.
        self.linear1 = nn.Linear(input_dim, hidden_dim)
        self.relu1 = nn.ReLU(inplace=True)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim)
        self.relu2 = nn.ReLU(inplace=True)
        self.mu = nn.Linear(hidden_dim, action_dim)

    def forward(self, inputs):
        """
        :param inputs: batch of policy states, shape (batch, input_dim); per the
            original note the layout starts with [r, rel_ang, vel, ang_vel, time, vecL],
            i.e. inputs[:,1] is rel_ang and inputs[:,3] is ang_vel
        :return: tanh-squashed action means, shape (batch, action_dim)
        """
        hidden = self.relu1(self.linear1(inputs))
        hidden = self.relu2(self.linear2(hidden))
        return torch.tanh(self.mu(hidden))
    
class Critic(nn.Module):
    """Action-value network Q(state, action) used by the DDPG agent.

    The state and the action are embedded by separate Linear+ReLU branches, the two
    embeddings are concatenated, passed through one more Linear+ReLU layer and
    finally projected to a single scalar value.

    :param input_dim: size of the state vector
    :param action_dim: size of the action vector
    :param hidden_dim: width of every hidden layer
    """

    def __init__(self, input_dim, action_dim, hidden_dim=128):
        # super(self.__class__, self) recurses forever as soon as the class is
        # subclassed; the zero-argument form (already used by Actor) is safe.
        super().__init__()
        self.input_dim = input_dim
        self.action_dim = action_dim
        self.hidden_dim = hidden_dim
        self.linear1 = nn.Linear(input_dim, hidden_dim)
        self.relu1 = nn.ReLU(inplace=True)
        self.linear_action = nn.Linear(action_dim, hidden_dim)
        self.relu1_action = nn.ReLU(inplace=True)
        self.linear2 = nn.Linear(hidden_dim + hidden_dim, hidden_dim)
        self.relu2 = nn.ReLU(inplace=True)
        self.V = nn.Linear(hidden_dim, 1)

    def forward(self, inputs, actions):
        """
        :param inputs: batch of states, shape (batch, input_dim)
        :param actions: batch of actions, shape (batch, action_dim)
        :return: estimated values, shape (batch, 1)
        """
        x_input = self.linear1(inputs)
        x_input = self.relu1(x_input)
        x_action = self.linear_action(actions)
        x_action = self.relu1_action(x_action)
        # join state and action embeddings along the feature axis
        x = torch.cat((x_input, x_action), dim=1)
        x = self.linear2(x)
        x = self.relu2(x)

        V = self.V(x)
        return V

In [4]:
# agent utils

def soft_update(target, source, tau):
    """Blend `source` parameters into `target` in place: target <- (1 - tau) * target + tau * source.

    Parameters are paired in the order returned by `.parameters()`.
    """
    keep = 1.0 - tau
    for tgt, src in zip(target.parameters(), source.parameters()):
        blended = tgt.data * keep + src.data * tau
        tgt.data.copy_(blended)

def hard_update(target, source):
    """Copy every parameter of `source` into `target` in place (paired by `.parameters()` order)."""
    for tgt, src in zip(target.parameters(), source.parameters()):
        tgt.data.copy_(src.data)

def next_path(path_pattern):
    """
    Return the first unused path of the form `path_pattern % n`, e.g.
    path_pattern = 'file-%s.txt'.

    The search first doubles n until a free name is found, then bisects between the
    last existing and the first free candidate. It assumes the existing files are
    numbered without gaps starting at 1.
    """
    def taken(n):
        return os.path.exists(path_pattern % n)

    # exponential search for an upper bound that is not taken
    upper = 1
    while taken(upper):
        upper *= 2

    # bisection: `lower` is taken (or 0), `upper` is free
    lower = upper // 2
    while upper - lower > 1:
        middle = (lower + upper) // 2
        if taken(middle):
            lower = middle
        else:
            upper = middle

    return path_pattern % upper

In [5]:
# SumTree and PER
# a binary tree data structure where the parent’s value is the sum of its children
class SumTree(object):
    """Array-backed binary sum tree for proportional sampling.

    `tree` holds 2 * capacity - 1 nodes: the last `capacity` entries are leaves
    (priorities), every internal node stores the sum of its two children and
    `tree[0]` is the total priority. `data` stores the payload of each leaf and is
    overwritten in ring-buffer order once `capacity` items have been added.
    """
    write = 0  # class-level default kept for backward compatibility

    def __init__(self, capacity):
        self.capacity = capacity
        self.tree = np.zeros(2 * capacity - 1)
        self.data = np.zeros(capacity, dtype=object)
        self.n_entries = 0
        self.write = 0  # next data slot to (over)write

    # update to the root node
    def _propagate(self, idx, change):
        """Add `change` to every ancestor of node `idx`.

        Implemented as a loop so that the root itself (idx == 0, which happens when
        capacity == 1) is a no-op instead of indexing parent -1 and never terminating.
        """
        while idx != 0:
            idx = (idx - 1) // 2
            self.tree[idx] += change

    # find sample on leaf node
    def _retrieve(self, idx, s):
        """Descend from node `idx` to the leaf whose cumulative-priority interval contains `s`."""
        tree_size = len(self.tree)
        while True:
            left = 2 * idx + 1
            if left >= tree_size:
                # no children: idx is a leaf
                return idx
            if s <= self.tree[left]:
                idx = left
            else:
                # skip the mass of the left subtree and continue on the right
                s = s - self.tree[left]
                idx = left + 1

    def total(self):
        """Return the sum of all leaf priorities."""
        return self.tree[0]

    # store priority and sample
    def add(self, p, data):
        """Store `data` with priority `p`, overwriting the oldest entry when full."""
        idx = self.write + self.capacity - 1

        self.data[self.write] = data
        self.update(idx, p)

        self.write += 1
        if self.write >= self.capacity:
            self.write = 0

        if self.n_entries < self.capacity:
            self.n_entries += 1

    # update priority
    def update(self, idx, p):
        """Set the priority of tree node `idx` (a leaf index) to `p` and refresh the sums above it."""
        change = p - self.tree[idx]

        self.tree[idx] = p
        self._propagate(idx, change)

    # get priority and sample
    def get(self, s):
        """Return (tree index, priority, data) of the leaf selected by cumulative value `s`."""
        idx = self._retrieve(0, s)
        dataIdx = idx - self.capacity + 1

        return (idx, self.tree[idx], self.data[dataIdx])

    def __len__(self):
        return self.n_entries


# PER
class PER(object):  # stored as ( s, a, r, s_ ) in SumTree
    """Prioritized experience replay buffer backed by a SumTree.

    Class attributes:
        e: small constant added to every error so no sample has zero priority
        beta: importance-sampling exponent, annealed towards 1 on every `sample` call
        beta_increment_per_sampling: annealing step for `beta`
    """
    e = 0.0001
    #a = 0.6
    beta = 0.4
    beta_increment_per_sampling = 0.001

    def __init__(self, capacity, a=1.):
        """
        :param capacity: maximum number of stored samples
        :param a: priority exponent (0 = uniform sampling, 1 = fully proportional)
        """
        self.tree = SumTree(capacity)
        self.capacity = capacity
        self.a = a

    def _get_priority(self, error):
        # Priorities must be non-negative for the sum tree to be a valid
        # distribution, so the magnitude of the error is used.
        return (abs(error) + self.e) ** self.a

    def add(self, error, sample):
        """Store `sample` with a priority derived from `error`."""
        p = self._get_priority(error)
        self.tree.add(p, sample)

    def sample(self, n):
        """Draw `n` samples, one from each of `n` equal slices of the total priority mass.

        :return: (batch, tree indices, importance-sampling weights normalised by their max)
        :raises ValueError: if nothing has been stored yet
        """
        total = self.tree.total()
        if self.tree.n_entries == 0 or total <= 0:
            raise ValueError("cannot sample from an empty prioritized replay memory")

        batch = []
        idxs = []
        priorities = []
        segment = total / n

        self.beta = np.min([1., self.beta + self.beta_increment_per_sampling])

        for i in range(n):
            # stratified sampling: one uniform draw inside the i-th segment
            s = random.uniform(segment * i, segment * (i + 1))
            (idx, p, data) = self.tree.get(s)
            priorities.append(p)
            batch.append(data)
            idxs.append(idx)

        # explicit array conversion instead of relying on list / numpy-scalar coercion
        sampling_probabilities = np.asarray(priorities, dtype=float) / total
        is_weight = np.power(self.tree.n_entries * sampling_probabilities, -self.beta)
        is_weight /= is_weight.max()

        return batch, idxs, is_weight

    def update(self, idx, error):
        """Re-prioritise the sample stored at tree index `idx`."""
        p = self._get_priority(error)
        self.tree.update(idx, p)

    def __len__(self):
        return len(self.tree)

In [6]:
# Record type for one stored environment step; ReplayMemory.push builds it from
# positional arguments in exactly this field order.
Transition = namedtuple(
    'Transition', ('state', 'action', 'done', 'next_state', 'reward'))
    # earlier field layout, kept for reference:
    #'Transition', ('state', 'action', 'mask', 'next_state', 'reward'))
    
class ReplayMemory(object):
    """Replay buffer of Transition records.

    With priority=False the buffer is a bounded deque sampled uniformly; with
    priority=True it is a PER instance and every push needs an error value.
    """

    def __init__(self, capacity, priority=False):
        self.capacity = capacity
        self.priority = priority
        self.memory = PER(capacity=capacity) if priority else deque(maxlen=capacity)

    def push(self, *args, err=None):
        """Saves a transition."""
        if not self.priority:
            self.memory.append(Transition(*args))
            return
        assert err is not None, "Need to pass float error to add to priority memory"
        self.memory.add(err, Transition(*args))

    def sample(self, batch_size):
        """Return (batch, idx): a Transition of per-field tuples and the PER tree
        indices (None for the uniform buffer). Importance weights are discarded."""
        if self.priority:
            transitions, idx, _ = self.memory.sample(batch_size)
        else:
            transitions, idx = random.sample(self.memory, batch_size), None
        # transpose a list of Transitions into one Transition of tuples
        return Transition(*zip(*transitions)), idx

    def update(self, idx, err):
        """Update the priority of one stored sample (priority memory only)."""
        assert self.priority, "Cannot call this function if not priority memory"
        self.memory.update(idx, err)

    def batch_update(self, ids, errs):
        """Update the priorities of several stored samples."""
        for pair in zip(ids, errs):
            self.update(*pair)
        return

    def __len__(self):
        return len(self.memory)

In [7]:
# rewards
"""
reward.py
This file defines the reward function: the expected reward of a Gaussian belief under a Gaussian reward function.
rew_std: standard deviation of the Gaussian reward function (the code applies the same value to x and y)

b(s) = 1/sqrt(det(2*pi*P)) * exp(-0.5 * (s-mu)^T * P^-1 * (s-mu)): Gaussian belief with mean mu, covariance P
r(s) = scale * exp(-0.5 * s^T * R^-1 * s): Gaussian reward function with zero mean, covariance R
invS = invR + invP
R(b) = integral of b(s)*r(s) ds = c * sqrt(det(S)/det(P)) * exp(-0.5 * mu^T * (invP - invP*S*invP) * mu)

Closed form when r(s) is a normalized Gaussian density (this is the form rewardFunc evaluates before rescaling):
R(b) = integral of b(s)*r(s) ds = 1/sqrt(det(2*pi*(P+R))) * exp(-0.5 * mu^T * (R+P)^-1 * mu)
"""

def return_reward(episode, info, reached_target, b, goal_radius, REWARD, finetuning = 0):
    """Reward for the current step; non-zero only when the agent has stopped.

    :param episode: unused, kept for interface compatibility
    :param info: dict with a boolean 'stop' entry
    :param reached_target: 1 if the true position is inside the goal (used when finetuning != 0)
    :param b: belief (mean, covariance), used for the belief-based Gaussian reward
    :param goal_radius: radius of the goal region
    :param REWARD: maximum reward
    :param finetuning: 0 -> Gaussian reward based on the belief,
                       otherwise -> 0/REWARD reward based on the real position
    :return: 1-element tensor
    """
    # Use CUDA when present (unchanged behaviour on GPU hosts) but do not crash on CPU-only hosts.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    if not info['stop']:  # no reward while the monkey keeps moving; position does not matter
        return torch.zeros(1, device=device)

    if finetuning == 0:  # Gaussian reward based on belief
        return get_reward(b, goal_radius, REWARD)

    # 0/1 reward based on real position
    if reached_target == 1:
        return REWARD * torch.ones(1, device=device)
    return torch.zeros(1, device=device)


def get_reward(b, goal_radius, REWARD):
    """Belief-based reward: evaluate rewardFunc on the flattened belief mean and its covariance.

    The std of the reward Gaussian is half the goal radius, so that
    2 * std (= goal radius) spans the reward distribution.
    The reward currently depends only on the belief, not on the action.
    """
    belief_mean, belief_cov = b
    return rewardFunc(goal_radius / 2, belief_mean.view(-1), belief_cov, REWARD)

"""
def rewardFunc(rew_std, x, P, scale):
    R = torch.eye(2) * rew_std**2 # reward function is gaussian
    P = P[:2, :2] # cov
    invP = torch.inverse(P)
    invS = torch.inverse(R) + invP
    S = torch.inverse(invS)
    mu = x[:2] # pos
    alpha = -0.5 * mu.matmul(invP - invP.mm(S).mm(invP)).matmul(mu)
    reward = torch.exp(alpha) * torch.sqrt(torch.det(S)/torch.det(P))
    reward = scale * reward # adjustment for reward per timestep
    return reward.view(-1)
"""

def rewardFunc(rew_std, x, P, scale):
    """Expected reward of a Gaussian belief under an isotropic Gaussian reward centred at the origin.

    :param rew_std: std of the reward Gaussian (covariance R = rew_std**2 * I)
    :param x: belief mean; only the first two entries (position) are used
    :param P: belief covariance; only the top-left 2x2 (position) block is used
    :param scale: maximum reward, obtained for a zero-mean belief with zero covariance
    :return: 1-element tensor
    """
    mu = x[:2]  # pos
    # Build helper tensors on the belief's device instead of hard-coding CUDA.
    device = mu.device
    R = torch.eye(2, device=device) * rew_std**2 # reward function is gaussian
    P = P[:2, :2] # cov
    S = R+P
    if not is_pos_def(S):
        print('R+P is not positive definite!')
    alpha = -0.5 * mu @ S.inverse() @ mu.t()
    reward = torch.exp(alpha) /2 / np.pi /torch.sqrt(S.det())

    # normalization -> divide by the density at mu = 0, P = 0 so that the max reward is 1
    mu_zero = torch.zeros(1,2, device=device)
    alpha_zero = -0.5 * mu_zero @ R.inverse() @ mu_zero.t()
    reward_zero = torch.exp(alpha_zero) /2 / np.pi /torch.sqrt(R.det())
    reward = reward/reward_zero

    reward = scale * reward  # adjustment for reward per timestep
    if reward > scale:
        # should be impossible for a positive semi-definite P; dump the inputs for debugging
        print('reward is wrong!', reward)
        print('mu', mu)
        print('P', P)
        print('R', R)
    return reward.view(-1)

In [8]:
#terminal
def is_terminal_action(a, terminal_vel):
    """
    Report whether the action signals a stop.

    The step is terminal when the norm of the action (which determines the
    velocity) is lower than terminal_vel, i.e. the monkey stops. Only the action
    is considered; the position plays no role.

    :return: 1-element ByteTensor, 1 for stop and 0 otherwise
    """
    stopped = bool(torch.norm(a) < terminal_vel)
    return torch.ByteTensor([stopped])


In [9]:
# env utils
def is_pos_def(x):
    """
    Return whether every eigenvalue reported by np.linalg.eigvalsh for the
    (detached, CPU) matrix is strictly positive, i.e. the matrix is positive definite.
    """
    eigenvalues = np.linalg.eigvalsh(x.detach().cpu().numpy())
    return np.all(eigenvalues > 0)

def tril_mask(size, device=None):
    """
    Returns a lower triangular mask
    (Used to select lower triangular elements)

    :param size: number of rows/columns of the square mask
    :param device: device for the mask; defaults to CUDA when available, else CPU
        (the previous hard-coded .cuda() failed on CPU-only hosts)
    :return: (size, size) uint8 tensor with ones on and below the diagonal
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    mask = torch.tril(torch.ones(size, size, dtype=torch.uint8)).to(device) # ByteTensor
    return mask

def vectorLowerCholesky(P):
    """
    Performs the lower cholesky decomposition and returns vectorized output
    P = L L.t()

    :param P: square positive-definite matrix
    :return: 1-D tensor with the lower-triangular entries of L in row-major order
    """
    # torch.cholesky is deprecated; use torch.linalg.cholesky when this PyTorch has it.
    linalg = getattr(torch, "linalg", None)
    if linalg is not None and hasattr(linalg, "cholesky"):
        L = linalg.cholesky(P)
    else:
        L = torch.cholesky(P, upper=False)
    # Build the mask from L itself so it always lives on L's device.
    mask = torch.tril(torch.ones_like(L)) > 0
    return torch.masked_select(L, mask)

def sample_exp(min, max, scale = np.e):
    """Draw one value from an exponential distribution, restricted to [min, max].

    Values are drawn with np.random.exponential(scale) and rejected until one falls
    inside the range. Both bounds should be >= 0.
    """
    def rejected(value):
        return value < min or value > max

    draw = min - 100  # starts below the lower bound so that at least one draw is made
    while rejected(draw):
        draw = np.random.exponential(scale=scale)
    return draw

def range_angle(ang):
    """
    Adjusts the range of angle from -pi to pi

    Works element-wise on tensors of any shape; the previous Python conditional
    only accepted single-element tensors.

    :param ang: tensor of angles in radians
    :return: tensor of the same shape with values in [-pi, pi)
    """
    wrapped = torch.remainder(ang, 2*pi)  # now in [0, 2*pi)
    # shift the upper half [pi, 2*pi) down to [-pi, 0)
    return torch.where(wrapped < pi, wrapped, wrapped - 2*pi)


def dynamics(x, a, dt, box, pro_gains, pro_noise_ln_vars):
    """Advance the state one time step under action `a` with additive process noise.

    :param x: state (px, py, ang, vel, ang_vel), any shape with 5 elements
    :param a: action; a[0] drives the linear velocity, a[1] the angular velocity
    :param dt: duration of one step
    :param box: half-width of the arena; positions are clamped to [-box, box]
    :param pro_gains: process gains [vel gain, ang_vel gain]
    :param pro_noise_ln_vars: log-variances of the process noise on [vel, ang_vel]
    :return: next state, shape (1, 5)
    """
    px, py, ang, vel, ang_vel = torch.split(x.view(-1), 1)

    a_v = a[0]  # action for velocity
    a_w = a[1]  # action for angular velocity

    # std * randn: random process noise for [vel, ang_vel]. The sample is still drawn on
    # the CPU generator and then moved, so seeded runs keep their random stream; the
    # target device now follows x instead of being hard-coded to CUDA.
    w = torch.sqrt(torch.exp(pro_noise_ln_vars)) * torch.randn(2).to(x.device)

    # the previous velocities enter with weight 0: the new velocities depend only on the action
    vel = 0.0 * vel + pro_gains[0] * a_v + w[0]
    ang_vel = 0.0 * ang_vel + pro_gains[1] * a_w + w[1]
    ang = ang + ang_vel * dt
    ang = range_angle(ang) # adjusts the range of angle from -pi to pi

    px = px + vel * torch.cos(ang) * dt
    py = py + vel * torch.sin(ang) * dt
    px = torch.clamp(px, -box, box)
    py = torch.clamp(py, -box, box)
    next_x = torch.stack((px, py, ang, vel, ang_vel))

    return next_x.view(1,-1)

In [10]:
"""
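belief_step.py
The belief mean is kept in Cartesian form (px, py, ang, vel, ang_vel); Breshape converts it to polar form for the policy:
state = torch.cat([r, rel_ang, vel, ang_vel, time, vecL, pro_gains, pro_noise_ln_vars, obs_gains, obs_noise_ln_vars, goal_radius])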
"""

class BeliefStep(nn.Module):
    """Extended-Kalman-filter belief update over the state (px, py, ang, vel, ang_vel).

    A belief is the pair (mean, covariance P). Only vel and ang_vel are observed,
    through the gains `obs_gains` and with noise log-variances `obs_noise_ln_vars`.
    """

    def __init__(self, arg):
        # zero-argument super(): super(self.__class__, self) recurses forever when subclassed
        super().__init__()

        self.dt = arg.DELTA_T
        # Use CUDA when present (unchanged on GPU hosts) without crashing on CPU-only hosts.
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.P = torch.eye(5, device=device) * 1e-8
        self.terminal_vel = arg.TERMINAL_VEL
        return

    def reset(self, x, time, pro_gains, pro_noise_ln_vars, goal_radius, gains_range, noise_range, obs_gains = None, 
              obs_noise_ln_vars = None):
        """Start a new episode: store the model parameters and initialise the belief at `x`.

        :param x: initial state, used as the initial belief mean
        :param time: time entry placed into the policy state
        :param pro_gains, pro_noise_ln_vars: process gains and noise log-variances
        :param goal_radius: radius of the goal region
        :param gains_range: [vel_min, vel_max, ang_min, ang_max] for sampling observation gains
        :param noise_range: [vel_min, vel_max, ang_min, ang_max] for sampling observation noise log-variances
        :param obs_gains, obs_noise_ln_vars: fixed observation parameters; sampled when None
        :return: (belief, policy state, obs_gains, obs_noise_ln_vars)
        """
        device = x.device  # every tensor created here follows the initial state

        self.pro_gains = pro_gains
        self.pro_noise_ln_vars = pro_noise_ln_vars
        self.goal_radius = goal_radius

        if obs_gains is None:
            self.obs_gains = torch.zeros(2, device=device)
            self.obs_gains[0] = torch.zeros(1).uniform_(gains_range[0], gains_range[1])  # [obs_gain_vel]
            self.obs_gains[1] = torch.zeros(1).uniform_(gains_range[2], gains_range[3])  # [obs_gain_ang]
        else:
            self.obs_gains = obs_gains

        if obs_noise_ln_vars is None:
            self.obs_noise_ln_vars = torch.zeros(2, device=device)
            self.obs_noise_ln_vars[0] = -1 * sample_exp(-noise_range[1], -noise_range[0]) # [obs_vel_noise]
            self.obs_noise_ln_vars[1] = -1 * sample_exp(-noise_range[3], -noise_range[2]) # [obs_ang_noise]
        else:
            self.obs_noise_ln_vars = obs_noise_ln_vars

        self.theta = (self.pro_gains, self.pro_noise_ln_vars, self.obs_gains, self.obs_noise_ln_vars, self.goal_radius)

        self.P = torch.eye(5, device=device) * 1e-8  # near-zero initial uncertainty
        self.b = x, self.P  # belief
        self.state = self.Breshape(self.b, time, self.theta)

        return self.b, self.state, self.obs_gains, self.obs_noise_ln_vars

    def forward(self,  b, ox, a, box):
        """One EKF predict/update step.

        :param b: previous belief (mean, covariance)
        :param ox: observation of [vel, ang_vel]
        :param a: action applied during this step
        :param box: half-width of the arena, forwarded to dynamics
        :return: (new belief, {'stop': bool})
        """
        pre_bx_, P = b
        device = P.device
        I = torch.eye(5, device=device)

        # Q matrix
        Q = torch.zeros(5, 5, device=device)
        Q[-2:, -2:] = torch.diag(torch.exp(self.pro_noise_ln_vars)) # variance of vel, ang_vel

        # R matrix
        R = torch.diag(torch.exp(self.obs_noise_ln_vars))

        # H matrix
        H = torch.zeros(2, 5, device=device)
        H[:, -2:] = torch.diag(self.obs_gains)

        # Extended Kalman Filter
        # NOTE(review): dynamics() adds sampled process noise to the predicted mean as well;
        # left unchanged here, confirm whether that is intended.
        bx_ = dynamics(pre_bx_, a.view(-1), self.dt, box, self.pro_gains, self.pro_noise_ln_vars)
        bx_ = bx_.t() # make a column vector
        A = self.A(bx_) # after dynamics
        P_ = A.mm(P).mm(A.t())+Q # P_ = APA^T+Q
        if not is_pos_def(P_):
            print("P_:", P_)
            print("P:", P)
            print("A:", A)
            APA = A.mm(P).mm(A.t())
            print("APA:", APA)
            print("APA +:", is_pos_def(APA))
        # Innovation = actual observation minus the *expected* observation. The noise-free
        # prediction is used; sampling fresh observation noise here would add noise to the
        # innovation that R does not account for.
        error = ox - self.observations_mean(bx_)
        S = H.mm(P_).mm(H.t()) + R # S = HPH^T+R
        K = P_.mm(H.t()).mm(torch.inverse(S)) # K = PHS^-1
        bx = bx_ + K.matmul(error)
        I_KH = I - K.mm(H)
        P = I_KH.mm(P_)

        if not is_pos_def(P):
            print("here")
            print("P:", P)
            P = (P + P.t()) / 2 + 1e-6 * I  # make symmetric to avoid computational overflows

        bx = bx.t() #return to a row vector
        b = bx.view(-1), P  # belief

        # terminal check
        terminal = self._isTerminal(bx, a) # check the monkey stops or not
        return b, {'stop': terminal}

    def observations(self, x): # observation noise on
        """Noisy observation of [vel, ang_vel] for state `x`."""
        # drawn on the CPU generator (as before), then moved to the state's device
        on = torch.sqrt(torch.exp(self.obs_noise_ln_vars)) * torch.randn(2).to(x.device) # random generation of observation noise
        vel, ang_vel = torch.split(x.view(-1),1)[-2:]

        ovel = self.obs_gains[0] * vel + on[0]
        oang_vel = self.obs_gains[1] * ang_vel + on[1]
        ox = torch.stack((ovel, oang_vel))
        return ox

    def observations_mean(self, x): # observation without noise
        """Expected (noise-free) observation of [vel, ang_vel] for state `x`."""
        vel, ang_vel = torch.split(x.view(-1),1)[-2:]

        ovel = self.obs_gains[0] * vel
        oang_vel = self.obs_gains[1] * ang_vel
        ox = torch.stack((ovel, oang_vel))
        return ox

    def A(self, x_): # F in wiki
        """Jacobian of the dynamics with respect to the state, evaluated at `x_`.

        The rows for vel and ang_vel stay zero because dynamics() gives the previous
        velocities weight 0.
        """
        dt = self.dt
        px, py, ang, vel, ang_vel = torch.split(x_.view(-1),1)

        A_ = torch.zeros(5, 5, device=x_.device)
        A_[:3, :3] = torch.eye(3, device=x_.device)
        A_[0, 2] = - vel * torch.sin(ang) * dt
        A_[1, 2] = vel * torch.cos(ang) * dt
        return A_

    def Breshape(self, b, time, theta): # reshape belief for policy
        """Flatten belief, time and model parameters into the policy input, shape (1, -1).

        Layout: [r, rel_ang, vel, ang_vel, time, vecL, pro_gains, pro_noise_ln_vars,
        obs_gains, obs_noise_ln_vars, goal_radius], where r is the distance to the
        origin, rel_ang the heading relative to the direction of the origin and vecL
        the vectorised lower Cholesky factor of P.
        """
        pro_gains, pro_noise_ln_vars, obs_gains, obs_noise_ln_vars, goal_radius = theta
        x, P = b
        px, py, ang, vel, ang_vel = torch.split(x.view(-1), 1)
        r = torch.norm(torch.cat([px, py])).view(-1)
        rel_ang = ang - torch.atan2(-py, -px).view(-1)
        rel_ang = range_angle(rel_ang)
        vecL = vectorLowerCholesky(P)
        state = torch.cat([r, rel_ang, vel, ang_vel, time, vecL, pro_gains.view(-1), 
                           pro_noise_ln_vars.view(-1), obs_gains.view(-1), obs_noise_ln_vars.view(-1), 
                           torch.ones(1, device=x.device)*goal_radius]) # original

        return state.view(1, -1)

    def _isTerminal(self, x, a, log=True):
        """Return True when the action signals a stop; `x` and `log` are unused."""
        terminal_vel = self.terminal_vel
        terminal = is_terminal_action(a, terminal_vel)
        return terminal.item() == 1


In [49]:
# config
class Config():
    """Hyper-parameters and sampling ranges for the firefly task and the DDPG agent.

    :param device: device on which the tensor-valued settings are created; defaults to
        CUDA when available, else CPU (the previous hard-coded .cuda() failed on
        CPU-only hosts)
    """
    def __init__(self, device=None):
        if device is None:
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.device = device

        self.SEED_NUMBER = 0

        self.WORLD_SIZE = 1.0
        self.ACTION_DIM = 2
        self.STATE_DIM = 29

        self.TERMINAL_VEL = torch.tensor(0.1).to(device)  # norm(action) that you believe as a signal to stop 0.1
        # all times are in second
        self.DELTA_T = torch.tensor(1).to(device)  # time to perform one action
        self.EPISODE_TIME = torch.tensor(70).to(device)
        self.EPISODE_LEN = self.EPISODE_TIME / self.DELTA_T  # number of time steps(actions) for one episode

        self.TOT_T = 2000000000  # total number of time steps for this code

        self.BATCH_SIZE = 512  # for replay memory
        self.REWARD = torch.tensor(10).to(device)  # for max reward
        self.DISCOUNT_FACTOR = 0.99

        self.STD_STEP_SIZE = 2e-5  # 1e-4 action space noise (default: 2e-3)

        # timestamp of construction, used to name output files
        self.filename = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        self.data_path = './'

        self.goal_radius_range = torch.tensor([0.10* self.WORLD_SIZE, 0.2* self.WORLD_SIZE]).to(device) #0.175: best radius
        self.GOAL_RADIUS_STEP_SIZE = torch.tensor(1e-5).to(device)

        self.initial_radius_range = [0.25, self.WORLD_SIZE] # [0.25,1] = [100,400] cm
        self.relative_angle_range = torch.tensor([-40/180*pi,40/180*pi]).to(device)  # in real experiment, the relative angle range is [-40,40]
        # [vel_min, vel_max, ang_min, ang_max]
        self.gains_range = torch.tensor([0.025, 0.1, pi/40, pi/10]).to(device) # [100cm/s,400cm/s,45deg/s,180deg/s]
        # log-variance bounds in the same [vel_min, vel_max, ang_min, ang_max] layout
        self.noise_range = torch.tensor([np.log(0.025/100), np.log(0.025/5), np.log((pi/40)/100), np.log((pi/40)/5)]).to(device) #SNR 100

In [12]:
class ActionNoise(object): # Gaussian
    """Gaussian exploration noise: noise() returns mu + scale * N(0, I) of size action_dim.

    :param action_dim: number of action components
    :param mean: noise mean; None means 0
    :param std: noise std; None means 1
    """
    def __init__(self, action_dim, mean=None, std=None):
        # The defaults used to be CUDA tensors built at class-definition time, which made
        # the definition itself fail on CPU-only hosts; they are now created lazily on
        # CUDA when available, else on the CPU.
        self._device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        if mean is None:
            mean = torch.tensor(0).to(self._device)
        if std is None:
            std = torch.tensor(1).to(self._device)
        self.mu = torch.ones(action_dim).to(self._device) * mean
        self.scale = std
        self.action_dim = action_dim

    def reset(self, mean, std):
        """Replace the noise mean and std."""
        self.mu = torch.ones(self.action_dim).to(self._device) * mean
        self.scale = std

    def noise(self):
        """Draw one noise vector of length action_dim (was hard-coded to length 2)."""
        n = torch.randn(self.action_dim).to(self._device)
        return self.mu + self.scale*n

In [13]:
# Agent
class Agent():
    """DDPG agent: actor/critic networks with target copies, replay memory and a belief model.

    :param input_dim: size of the policy state
    :param action_dim: size of the action
    :param arg: configuration object; `data_path` is read here and the object is passed to BeliefStep
    :param filename: base name of the checkpoint file; auto-numbered when None
    :param hidden_dim: hidden width of actor and critic
    :param gamma: discount factor
    :param tau: soft-update rate of the target networks
    :param memory_size: replay memory capacity
    :param device: device for the networks
    """
    def __init__(self, input_dim, action_dim, arg, filename=None, hidden_dim=128, gamma=0.99, tau=0.001, memory_size=1e6,
                 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")):

        self.device = device
        self.gamma = gamma
        self.tau = tau
        self.data_path = arg.data_path

        print("Running DDPG Agent: using ", self.device)

        self._build_networks(input_dim, action_dim, hidden_dim)

        self.priority = False
        self.memory = ReplayMemory(int(memory_size), priority=self.priority)

        self.create_save_file(filename)

        # for belief step
        self.Bstep = BeliefStep(arg).to(self.device)

    def _build_networks(self, input_dim, action_dim, hidden_dim):
        """Create actor/critic, their target copies and optimizers for the given sizes."""
        self.input_dim = input_dim
        self.action_dim = action_dim
        self.hidden_dim = hidden_dim

        self.actor = Actor(input_dim, action_dim, hidden_dim).to(self.device)
        self.target_actor = Actor(input_dim, action_dim, hidden_dim).to(self.device)  # target NW
        self.critic = Critic(input_dim, action_dim, hidden_dim).to(self.device)
        self.target_critic = Critic(input_dim, action_dim, hidden_dim).to(self.device)# target NW

        self.actor_optim = Adam(self.actor.parameters(), lr=1e-4)
        self.critic_optim = Adam(self.critic.parameters(), lr=1e-3)

        self.args = (input_dim, action_dim, hidden_dim)
        hard_update(self.target_actor, self.actor)  # Make sure target is with the same weight
        hard_update(self.target_critic, self.critic)

    def select_action(self,  state, action_noise=None, param = None):
        """Return the actor's action for `state`, optionally perturbed, clamped to [-1, 1].

        :param action_noise: object with a noise() method whose result is added to the action
        :param param: when not None the action comes from `self.actor_perturbed`
        """
        if param is not None:
            # NOTE(review): `actor_perturbed` is never created in this class, so this branch
            # raises AttributeError unless the attribute is attached externally.
            mu = self.actor_perturbed(state).detach()
        else: # no parameter space noise
            mu = self.actor(state).detach()

        if action_noise is not None:
            mu += action_noise.noise()
        return mu.clamp(-1, 1)

    def update_parameters(self, batch):
        """One DDPG gradient step on critic and actor.

        :param batch: Transition whose fields are sequences of tensors
        :return: (policy_loss, value_loss)
        """
        states = torch.cat(batch.state)
        next_states = torch.cat(batch.next_state)
        actions = torch.cat(batch.action)
        rewards = torch.cat(batch.reward).unsqueeze(1)
        # NOTE(review): `dones` is not used below, so the target bootstraps through terminal
        # transitions; the masked variant is left commented out as in the original.
        dones = torch.cat(batch.done)

        with torch.no_grad():
            next_actions = self.target_actor(next_states) # use target
            next_qvalues = self.target_critic(next_states, next_actions) # use target network
            #next_qvalues = self.target_critic(next_states, next_actions) * (1 - dones)
            target_qvalues = rewards + self.gamma * next_qvalues

        # critic: regress Q(s, a) onto the bootstrapped target
        self.critic_optim.zero_grad()
        pred_qvalues = self.critic(states, actions)
        value_loss = torch.mean((pred_qvalues - target_qvalues)**2)
        value_loss.backward()
        self.critic_optim.step()

        # actor: ascend the critic's value of the actor's own actions
        self.actor_optim.zero_grad()
        policy_loss = -self.critic(states, self.actor(states))
        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()
        return policy_loss, value_loss

    def learn(self, batch_size=512):
        """Sample a batch, update actor and critic, then soft-update the target networks."""
        # sample new batch here
        batch, _ = self.memory.sample(batch_size)
        losses = self.update_parameters(batch)
        soft_update(self.target_actor, self.actor, self.tau)
        soft_update(self.target_critic, self.critic, self.tau)

        return losses

    def save(self, filename, episode):
        """Write the network sizes and actor/critic weights to `self.file`.

        `filename` and `episode` are accepted for interface compatibility but not used.
        """
        state = {
            'args': self.args,
            'actor_dict': self.actor.state_dict(),
            'critic_dict': self.critic.state_dict(),
        }

        torch.save(state, self.file)

    def load(self, filename):
        """Load actor/critic weights from `<data_path>trained_agent/<filename>.pth.tar`.

        When the checkpoint was saved with different network sizes, the networks and
        optimizers are rebuilt with the stored sizes first. The previous code called
        `self.__init__(*args)` with (input_dim, action_dim, hidden_dim), which put
        hidden_dim into the `arg` slot and crashed.
        """
        file = self.data_path +'trained_agent/'+filename+'.pth.tar'
        # NOTE: torch.load unpickles the file; only load checkpoints from trusted sources.
        state = torch.load(file, map_location=lambda storage, loc: storage)
        if self.args != state['args']:
            print('Agent parameters from file are different from call')
            print('Overwriting agent to load file ... ')
            self._build_networks(*state['args'])

        self.actor.load_state_dict(state['actor_dict'])
        self.critic.load_state_dict(state['critic_dict'])
        hard_update(self.target_actor, self.actor)  # Make sure target is with the same weight
        hard_update(self.target_critic, self.critic)
        return

    def create_save_file(self, filename):
        """Create `<data_path>trained_agent/` if needed and set `self.file` to the checkpoint path."""
        path = self.data_path+'trained_agent'
        os.makedirs(path, exist_ok=True)
        if filename is None:
            self.file = next_path(path + '/' + 'ddpgmodel_%s.pth.tar')
        else:
            self.file = path + '/' + filename + '.pth.tar'


In [15]:
# firefly Env Model
"""
This is the main description for the firefly task model.
This code is for the environment.

The environment state is Cartesian; the policy input built by BeliefStep.Breshape uses polar coordinates.
next_x = torch.stack((px, py, ang, vel, ang_vel))
state = torch.cat([r, rel_ang, vel, ang_vel, time, vecL, ...]) # for policy network
"""



class Model(nn.Module):
    """Firefly task environment: samples an episode's parameters and start state and
    advances the true state (px, py, ang, vel, ang_vel) with `dynamics`."""

    def __init__(self, arg):
        # zero-argument super(): super(self.__class__, self) recurses forever when subclassed
        super().__init__()
        # constants
        self.dt = arg. DELTA_T
        self.action_dim = arg.ACTION_DIM
        self.state_dim = arg.STATE_DIM
        self.box = arg.WORLD_SIZE #initial value
        self.max_goal_radius = arg.goal_radius_range[0]
        self.GOAL_RADIUS_STEP = arg.GOAL_RADIUS_STEP_SIZE
        # Use CUDA when present (unchanged on GPU hosts) without crashing on CPU-only hosts.
        self._device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def reset(self, gains_range, noise_range, goal_radius_range,relative_angle_range,initial_radius_range=None, 
              goal_radius=None, pro_gains=None, pro_noise_ln_vars=None):
        """Start a new episode.

        :param gains_range: [vel_min, vel_max, ang_min, ang_max] for sampling process gains
        :param noise_range: [vel_min, vel_max, ang_min, ang_max] for sampling process noise log-variances
        :param goal_radius_range: [min, max] for sampling the goal radius
        :param relative_angle_range: [min, max] initial heading relative to the direction of the origin
        :param initial_radius_range: [min, max] initial distance from the origin; when None the
            range is [goal_radius, box]
        :param goal_radius, pro_gains, pro_noise_ln_vars: fixed values; sampled when None
        :return: (initial state x, pro_gains, pro_noise_ln_vars, goal_radius)
        """
        device = self._device

        if pro_gains is None:
            self.pro_gains = torch.zeros(2, device=device)
            self.pro_gains[0] = torch.zeros(1).uniform_(gains_range[0], gains_range[1])  #[proc_gain_vel]
            self.pro_gains[1] = torch.zeros(1).uniform_(gains_range[2],
                                                        gains_range[3])  # [proc_gain_ang]
        else:
            self.pro_gains = pro_gains

        if pro_noise_ln_vars is None:
            self.pro_noise_ln_vars = torch.zeros(2, device=device)
            self.pro_noise_ln_vars[0] = -1 * sample_exp(-noise_range[1], -noise_range[0]) #[proc_vel_noise]
            self.pro_noise_ln_vars[1] = -1 * sample_exp(-noise_range[3], -noise_range[2]) #[proc_ang_noise]
        else:
            self.pro_noise_ln_vars = pro_noise_ln_vars

        if goal_radius is None:
            self.goal_radius = torch.zeros(1).uniform_(goal_radius_range[0], goal_radius_range[1]).to(device)
        else:
            self.goal_radius = goal_radius

        self.time = torch.zeros(1)
        # sample r**2 uniformly so that start positions are uniform over the 2D area
        if initial_radius_range is None:
            min_r = self.goal_radius.item()
            r = torch.sqrt(torch.zeros(1).uniform_(min_r**2, self.box**2))
        else:
            r = torch.sqrt(torch.zeros(1).uniform_(initial_radius_range[0]**2, initial_radius_range[1]**2))
        self.r = r

        loc_ang = torch.zeros(1).uniform_(-pi, pi) # location angle: to determine initial location
        px = r * torch.cos(loc_ang)
        py = r * torch.sin(loc_ang)
        rel_ang = torch.zeros(1).uniform_(relative_angle_range[0], relative_angle_range[1])
        ang = rel_ang + loc_ang + pi # heading angle of monkey, pi is added in order to make the monkey toward firefly
        ang = range_angle(ang)

        # the episode starts at rest
        vel = torch.zeros(1)
        ang_vel = torch.zeros(1)

        x = torch.cat([px, py, ang, vel, ang_vel]).to(device)
        return x, self.pro_gains, self.pro_noise_ln_vars, self.goal_radius

    def forward(self, x, a):
        """Advance the true state by one step.

        :return: (next state of shape (1, 5), tensor flag that is true when the new
            position lies within goal_radius of the origin)
        """
        # get a real next state of monkey
        next_x = dynamics(x, a, self.dt, self.box, self.pro_gains, self.pro_noise_ln_vars)
        pos = next_x.view(-1)[:2] # x and y position
        reached_target = (torch.norm(pos) <= self.goal_radius)

        return next_x, reached_target