# Debugging Policy gradient

In [61]:
import numpy as np
from scipy.linalg import lstsq
import  numpy.random as random
from operator import itemgetter
from scipy.optimize import nnls
import math
import control

In [85]:
# Defining required classes

class ReplayMemory:
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        self.position = 0
    def flush_all(self):
        self.buffer = []
        self.position = 0
        return

    def push(self, state, action, reward, next_state, done,policy):
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = (state, action, reward, next_state, done,policy)
        self.position = (self.position + 1) % self.capacity

    def push_batch(self, batch):
        if len(self.buffer) < self.capacity:
            append_len = min(self.capacity - len(self.buffer), len(batch))
            self.buffer.extend([None] * append_len)

        if self.position + len(batch) < self.capacity:
            self.buffer[self.position : self.position + len(batch)] = batch
            self.position += len(batch)
        else:
            self.buffer[self.position : len(self.buffer)] = batch[:len(self.buffer) - self.position]
            self.buffer[:len(batch) - len(self.buffer) + self.position] = batch[len(self.buffer) - self.position:]
            self.position = len(batch) - len(self.buffer) + self.position

    def sample(self, batch_size):
        if batch_size > len(self.buffer):
            batch_size = len(self.buffer)
        batch = random.sample(self.buffer, int(batch_size))
        state, action, reward, next_state, done,policy = map(np.stack, zip(*batch))
        return state, action, reward, next_state, done,policy

    def sample_all_batch(self, batch_size):
        idxes = np.random.randint(0, len(self.buffer), batch_size)
        batch = list(itemgetter(*idxes)(self.buffer))
        state, action, reward, next_state, done,policy = map(np.stack, zip(*batch))
        return state, action, reward, next_state, done,policy

    def return_all(self):
        return self.buffer

    def __len__(self):
        return len(self.buffer)


class ENV:
    def __init__(self,A,B,Q,R,target_state=np.zeros(3)):
        
        self.A=A
        self.B=B
        self.Q=Q
        self.R=R
        self.target_state=target_state
        self.current_action=None
        self.current_state=None
        
    def reset(self):
        # self.current_state=0.1*np.random.rand(3)
        self.current_state=0.1*np.zeros(3)
        return self.current_state

    def step(self,action):
        mean = self.A@np.array([self.current_state]).T + self.B@action
        next_state=np.random.multivariate_normal(mean.T[0],np.eye(len(mean)))
        part_1=np.array([next_state])@self.Q@np.array([next_state]).T
        part_2=action.T@self.R@action
        
        self.current_action=action
        if np.linalg.norm(next_state-self.target_state)<0.1:
            return part_1+part_2,next_state,True
        return part_1+part_2,next_state,False

In [63]:
sigma=0.8
gamma=0.1

In [86]:
def update_D_real(K_t,env,D_real,L,start_state=None):
    D_real.flush_all()
    i=0
    Is_done=False
    env.reset()
    while i<L and Is_done!=True:
        u_T=K_t@np.array([env.current_state]).T
  
        u_T=u_T+np.array([np.random.multivariate_normal(mean=np.zeros(3),cov=math.pow(sigma,2)*np.eye(3))]).T

        R_t,S_t,Is_done=env.step(u_T)
        New_policy=K_t+math.pow(sigma,2)*np.zeros(3) # Modified
        D_real.push(env.current_state,u_T,R_t[0][0],S_t,Is_done,New_policy)
        env.current_state=S_t
        if Is_done:
            break
        i+=1  
    return
def get_episodes(D_fake,L):
    i=0
    globl=[]
    locl=[]
    j=0
    for i in range(D_fake.position):
       locl.append(D_fake.buffer[i])
       flag=D_fake.buffer[i][4]
       i+=1
       j+=1
       if flag or j==L:
           globl.append(locl)
           locl=[]
           j=0
    return globl
def Normal_distrb(z, μ, Σ):
    
    z = np.atleast_2d(z)
    μ = np.atleast_2d(μ)
    Σ = np.atleast_2d(Σ)

    N = z.size

    temp1 = np.linalg.det(Σ) ** (-1/2)
    temp2 = np.exp(-.5 * (z - μ).T @ np.linalg.inv(Σ) @ (z - μ))

    return (2 * np.pi) ** (-N/2) * temp1 * temp2

def Get_importance_term(episode,K):
    # SARSA
    prod=1.0
    for i in range(len(episode)):
        S_t=episode[i][0]
        mean=K@np.array([S_t]).T
        var=np.eye(3)*math.pow(sigma,2)
        num=Normal_distrb(episode[i][1],mean,var)
        mean=episode[i][5]@np.array([S_t]).T
        var=np.eye(3)*math.pow(sigma,2)
        den=Normal_distrb(episode[i][1],mean,var)
        prod=prod*num/den
        i+=1
    return prod

# Term 3
def Get_Reward(episode,t):
    r=0.0
    for i in range(t,len(episode)):
        r+=episode[i][2]*math.pow(gamma,i)
        i+=1
    return r

# Term 2

def Get_gradient_for_episode(episode,K):
    i=0
    grad_sum=np.zeros_like(K)
    while i<len(episode):
        a_t=episode[i][1]
        s_t=episode[i][0]
        grad_sum+=(Get_gradient_for_tup(a_t,s_t,K)*Get_Reward(episode,i))
        i+=1
    return grad_sum

def Get_gradient_for_tup(a_t,s_t,K):
    un_normlised=a_t@np.array([s_t])-K@np.array([s_t]).T@np.array([s_t])
    return un_normlised

def get_gradient(list_of_episodes,K):
    i=0
    grad_sum=np.zeros_like(K)
    while i<len(list_of_episodes):
        term_1=Get_importance_term(list_of_episodes[i],K) # Tested
        term_3=Get_gradient_for_episode(list_of_episodes[i],K) # 
        grad_sum+=(term_1*term_3)
        i+=1
    rslt=grad_sum/(len(list_of_episodes)*math.pow(sigma,2))
    return rslt
    


 

In [87]:
# True parameters of the env
# np.random.seed(0)
A=0.1*np.random.rand(3,3)

# np.random.seed(4)
B=0.1*np.random.rand(3,3)

# np.random.seed(1)
Q=np.diag([1,0.1,0.01])

# np.random.seed(3)
R=np.diag([1,0.1,0.01])
env=ENV(A,B,Q,R)
env.reset() # Reset

K_star,_,_=control.lqr(A,B,Q,R)

A_hat=0.1*np.random.rand(3,3)   # Initial theta
B_hat=0.1*np.random.rand(3,3)   # Initial theta
Q_hat=0.1*np.diag(np.ones(3))   # Initial theta
R_hat=0.1*np.diag(np.ones(3))   # Initial theta

# init policy

K=np.random.rand(3,3)   # Initial phi
K_t=K

D_real = ReplayMemory(10000)   # Real dataA
D_fake = ReplayMemory(10000)  # Fake data



In [88]:
# Print K_Star
print(K_star)

[[ 1.94237918e-02  2.33502230e-01 -1.69306423e-03]
 [-8.45803493e-01  2.76048742e+00 -4.52276415e-02]
 [ 1.08134582e+01  9.60079300e-01  1.18210524e+00]]


In [67]:
def Diplay_the_param(A,B,Q,R,step_size,L):
    
    print("\n****************************************************************\n")
    print("A matrix is \n")
    print(A)
    
    print("\n****************************************************************\n")
    print("B matrix is \n")
    print(B)
    
    print("\n****************************************************************\n")
    print("Q matrix is \n")
    print(Q)
    
    print("\n****************************************************************\n")
    print("R matrix is \n")
    print(R)
    
    print("\n****************************************************************\n")
    print(" step_size is  \n")
    print(step_size)
    
    print("\n****************************************************************\n")
    print("L value is \n")
    print(L)
    
    return 

# Policy gradient- Grid search over hyperparameters

In [60]:
K=np.diag(np.random.rand(3))   # Initial phi
# K_t=K_star
K_t=K
L=60
L_list=[20,40,60,80]
step_size_list=[math.pow(0.1,2),math.pow(0.1,3),math.pow(0.1,4)]
step_size=math.pow(0.1,2)
T=10000





# init policy
count=0

while count<10:
    print("Counter is ",count)
    # True parameters of the env
    # np.random.seed(0)
    A=0.1*np.random.rand(3,3)

    # np.random.seed(4)
    B=0.1*np.random.rand(3,3)

    # np.random.seed(1)
    Q=np.diag([1,0.1,0.01])

    # np.random.seed(3)
    R=np.diag([1,0.1,0.01])
    env=ENV(A,B,Q,R)
    env.reset() # Reset

    K_star,_,_=control.lqr(A,B,Q,R)
    t=0.1*np.random.rand(3,3)   # Initial theta
    B_hat=0.1*np.random.rand(3,3)   # Initial theta
    Q_hat=0.1*np.diag(np.ones(3))   # Initial theta
    R_hat=0.1*np.diag(np.ones(3))   # Initial theta
    
    for l in L_list:
        for step in step_size_list:
            L=l
            step_size=step
        
            K=np.random.rand(3,3)   # Initial phi
            K_t=K
            D_real.flush_all()
            D_fake.flush_all()
            env.reset()
            error_list=[]

            for t in range(T):
                D_real.flush_all()
                update_D_real(K_t,env,D_real,L,None)
                list_of_episodes=get_episodes(D_real,L)    
                K_t=K_t-(step_size*get_gradient(list_of_episodes,K_t))
                if t%10==0:
                    m=np.linalg.norm(K_t-K_star)
                    error_list.append(m)
            if min(np.array(error_list))<1:
                print("Found a good set") # a function to display 
                Diplay_the_param(A,B,Q,R,step_size,L)
    count+=1

Counter is  0
Counter is  1
Counter is  2
Counter is  3
Counter is  4
Counter is  5
Counter is  6
Counter is  7
Counter is  8
Counter is  9


# Simple policy_gradient

In [94]:

L=10
step_size=math.pow(0.1,2)
T=100000

K=np.random.rand(3,3)   # Initial phi
K_t=K
D_real.flush_all()
D_fake.flush_all()
env.reset()
error_list=[]

for t in range(T):
    D_real.flush_all()
    update_D_real(K_t,env,D_real,L,None)
    list_of_episodes=get_episodes(D_real,L)
    # for i in range(10):    
    #     K_t=K_t-(step_size*get_gradient(list_of_episodes,K_t))
    K_t=K_t-(step_size*get_gradient(list_of_episodes,K_t))
    if t%1000==0:
        m=np.linalg.norm(K_t-K_star)
        error_list.append(m)
        print(m)

11.229409185231964
10.785115308735614
10.803368766158815
10.85452881094977
10.842238215763823
10.73952091442029
10.81384903186435
10.72828579040138
10.827482036877061
10.683984648583179
10.816427400826598
10.869685463926597
10.995933984373107
11.220048131827662
11.098584356306215
11.07124866981851
11.069939534774567
10.98325532904048
10.883747192461955
10.830533694679081
10.883201767552086
11.260779225855694
11.203337721808962
11.353916586230959
11.503543420681178
11.51571195898706
11.557851717522242
11.627425618399236
11.597522390838712
11.631922508868646
11.60446417362614
11.65873297786414
11.472903769523022
11.541904984648818
11.464379136688764
11.50664429724758
11.562908588768607
11.548562358387576
11.68872359871662
11.729774315290337
11.579080487444875
11.475109025079036
11.434014490925016
11.57393817236187
11.457521288164154
11.436555820070184
11.387931790523055
11.328278293172898
11.489948583448781
11.485570457696644
11.57120304678667
11.730772046577133
11.819022915146348
11.669