In [None]:
import numpy as np
from scipy.linalg import lstsq
import  numpy.random as random
from operator import itemgetter


In [68]:
################################################################

num_of_epcoh=20
N=10
E=10
M=10
G=10
horiz_len=10
num_of_rollouts=10

################################################################

In [50]:
class ReplayMemory:
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        self.position = 0

    def push(self, state, action, reward, next_state, done):
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = (state, action, reward, next_state, done)
        self.position = (self.position + 1) % self.capacity

    def push_batch(self, batch):
        if len(self.buffer) < self.capacity:
            append_len = min(self.capacity - len(self.buffer), len(batch))
            self.buffer.extend([None] * append_len)

        if self.position + len(batch) < self.capacity:
            self.buffer[self.position : self.position + len(batch)] = batch
            self.position += len(batch)
        else:
            self.buffer[self.position : len(self.buffer)] = batch[:len(self.buffer) - self.position]
            self.buffer[:len(batch) - len(self.buffer) + self.position] = batch[len(self.buffer) - self.position:]
            self.position = len(batch) - len(self.buffer) + self.position

    def sample(self, batch_size):
        if batch_size > len(self.buffer):
            batch_size = len(self.buffer)
        batch = random.sample(self.buffer, int(batch_size))
        state, action, reward, next_state, done = map(np.stack, zip(*batch))
        return state, action, reward, next_state, done

    def sample_all_batch(self, batch_size):
        idxes = np.random.randint(0, len(self.buffer), batch_size)
        batch = list(itemgetter(*idxes)(self.buffer))
        state, action, reward, next_state, done = map(np.stack, zip(*batch))
        return state, action, reward, next_state, done

    def return_all(self):
        return self.buffer

    def __len__(self):
        return len(self.buffer)


In [52]:
class ENV:
    def __init__(self,A,B,Q,R,target_state=np.ones(3)):
        self.A=A
        self.B=B
        self.Q=Q
        self.R=R
        self.target_state=target_state
        self.current_action=None
        self.current_state=None

    def reset(self):
        self.current_state=np.array([0.4,-0.6,0.2])
        return self.current_state

    def step(self,action):
        mean = self.A@np.array([self.current_state]).T + self.B@action
        print(mean)
        next_state=np.random.multivariate_normal(mean.T[0],np.eye(len(mean)))
        part_1=np.array([next_state])@self.Q@np.array([next_state]).T
        part_2=action.T@self.R@action
        
        self.current_action=action
        
        # if next_state==self.target_state:
        #     return part_1+part_2,next_state,True
        
        return part_1+part_2,next_state,False

In [73]:
def New_Estimate(D_real):
    if len(D_real)==0:
        return -1
    dim_state=D_real.buffer[0][0].shape
    dim_action=D_real.buffer[0][1].shape
    
    A_T=np.zeros(shape=(dim_state+dim_action,len(D_real)))
    b_T=np.zeros(shape=(dim_state,len(D_real)))
    for i in range(len(D_real)):
        b_T[:,i]=D_real.buffer[i][2]
        A_T[:,i]=np.concatenate(D_real.buffer[i][0],D_real.buffer[i][1])
    A=A_T.T
    b=b_T.T
    total_hat=lstsq(A,b)[0] # need to split
    A_hat=total_hat[:dim_state]
    B_hat=total_hat[dim_state:]
    
    A=np.zeros(shape=(len(D_real),dim_state+dim_action))
    b=np.zeros(shape=(len(D_real),1))
    for i in range(len(D_real.buffer)):
        A[i,:]=np.concatenate(D_real.buffer[i][0],D_real.buffer[i][1])
        b[i,:]=D_real.buffer[i][2]
    total_hat=lstsq(A,b)[0]
    total_hat=total_hat.T[0]
    Q_hat=total_hat[:dim_state]
    R_hat=total_hat[dim_state:]
    Q_hat=np.diag(Q_hat)
    R_hat=np.diag(R_hat)   
    return [A_hat,B_hat,Q_hat,R_hat]

: 

In [69]:
def Sample_state(D_real):
    ind=np.random.randint(low=0,high=D_real.position) 
    return D_real.buffer[ind][0]

def gradient_with_model(A,B):
    
    grad=4
    return grad 

def gradient_with_exp(D_fake):
    
    grad=4
    return grad

In [70]:
def update_D_fake(S_t,model,K_t,D_fake):
    
    A_hat,B_hat,Q_hat,R_hat=model[0],model[1],model[2],model[3]
    
    i=0
    holder=[]
    Is_done=False
    
    while i<horiz_len and Is_done!=True:
        prev=S_t
        u_T=K_t@np.array([S_t]).T
        S_t=A_hat@np.array([S_t]).T+B_hat@u_T
        
        next_state=np.random.multivariate_normal(S_t.T[0],np.eye(len(S_t)))
        
        part_1=np.array([next_state])@Q_hat@np.array([next_state]).T
        part_2=u_T.T@R_hat@u_T
        S_t=next_state
        
        Is_done=False
        
        D_fake.push(prev,u_T,part_1[0][0]+part_2[0][0],next_state,Is_done)
        i+=1
    
    return 

def update_D_real(K_t,env,D_real):
    
    i=0
    Is_done=False
    while i<horiz_len and Is_done!=True:
        u_T=K_t@np.array([env.current_state]).T
        R_t,S_t,Is_done=env.step(u_T)
        D_real.push(env.current_state,u_T,R_t[0][0],S_t,Is_done)
        env.current_state=S_t
        if Is_done:
            break
        i+=1  
    return 

In [75]:
# A ->>3*3
# B ->>3*3
# C->eye(3)
# K ->>3*3



np.random.seed(0)

# True parameters of the env

A=np.diag([-1,-2,3])
B=np.diag([1,0,3])
C=np.eye(3)
Q=np.diag([.1,0,0])
R=np.diag([.5,0,.1])



A_hat=np.random.rand(3,3)   # Initial theta
B_hat=np.random.rand(3,3)   # Initial theta
Q_hat=np.diag(np.ones(3))   # Initial theta
R_hat=np.diag(np.ones(3))   # Initial theta





K=np.random.rand(3,3)   # Initial phi
K_t=K

D_real = ReplayMemory(10000)   # Real data
D_fake = ReplayMemory(10000)  # Fake data


env=ENV(A,B,Q,R)
env.reset() # Reset
################################################################

num_of_epcoh=20
N=10
E=10
M=10
G=10
horiz_len=10
num_of_rollouts=10

################################################################

# model=[A_hat,B_hat,Q_hat,R_hat]


In [None]:
# Issue occured: diverging Entries in Matrix,state vector
# Solution : Find best set of param for the environment

In [76]:
for n in range(num_of_epcoh):
    model=New_Estimate(D_real) # Regression
    if model==-1:
        model=[A_hat,B_hat,Q_hat,R_hat]  # start 
    for e in range(E):
        S_t=np.random.rand(len(A_hat)) # Update D_real
        update_D_real(K_t,env,D_real)
        for m in range(M):
            S_t=Sample_state(D_real)    # Random sampling
            update_D_fake(S_t,model,K_t,D_fake) # update_D_fake
            # Working till here
        for g in range(G):
            K_t+=gradient_with_model(A_hat,B_hat)    # Known parameter
            K_t+=gradient_with_exp(D_fake)    # unKnown parameter (From trajectories -off policy settings)

[[-0.41502092]
 [ 1.2       ]
 [-0.32391655]]
[[ 2.20285507]
 [-0.91566996]
 [ 7.71273874]]
[[ 6.44176521]
 [ 1.73982288]
 [24.40866139]]
[[25.06132632]
 [-6.41836331]
 [93.24507107]]
[[ 77.31287582]
 [ 14.61229811]
 [308.04237289]]
[[ 298.43464609]
 [ -29.53729416]
 [1116.48501363]]
[[ 999.809639  ]
 [  59.84924195]
 [3877.44352507]]
[[ 3622.13329765]
 [ -116.85844803]
 [13760.5647427 ]]
[[12559.82209055]
 [  234.73620042]
 [48258.64862988]]
[[ 44644.08528672]
 [  -471.02738155]
 [170432.72110844]]


KeyboardInterrupt: 