In [290]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.nn.utils as utils
from torch.autograd import Variable

import math

from blackhc.mdp import dsl
from blackhc import mdp
import time

from blackhc.mdp import lp
import functools
import numpy as np

from tqdm import tqdm
from matplotlib import pyplot as plt
from numpy import random

from operator import itemgetter

from collections import defaultdict


In [291]:
class ReplayMemory:
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        self.position = 0
    def flush_all(self):
        self.buffer = []
        self.position = 0
        return

    def push(self, state, action, reward, next_state,policy):
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = (state, action, reward, next_state,policy)
        self.position = (self.position + 1) % self.capacity

    def push_batch(self, batch):
        if len(self.buffer) < self.capacity:
            append_len = min(self.capacity - len(self.buffer), len(batch))
            self.buffer.extend([None] * append_len)

        if self.position + len(batch) < self.capacity:
            self.buffer[self.position : self.position + len(batch)] = batch
            self.position += len(batch)
        else:
            self.buffer[self.position : len(self.buffer)] = batch[:len(self.buffer) - self.position]
            self.buffer[:len(batch) - len(self.buffer) + self.position] = batch[len(self.buffer) - self.position:]
            self.position = len(batch) - len(self.buffer) + self.position

    def sample(self, batch_size):
        if batch_size > len(self.buffer):
            batch_size = len(self.buffer)
        batch = random.sample(self.buffer, int(batch_size))
        state, action, reward, next_state,policy = map(np.stack, zip(*batch))
        return state, action, reward, next_state,policy

    def sample_all_batch(self, batch_size):
        idxes = np.random.randint(0, len(self.buffer), batch_size)
        batch = list(itemgetter(*idxes)(self.buffer))
        state, action, reward, next_state,policy = map(np.stack, zip(*batch))
        return state, action, reward, next_state,policy

    def return_all(self):
        return self.buffer

    def __len__(self):
        return len(self.buffer)

In [292]:
# Assumption : The reward function is known prior:
import itertools

class A_model:
    def __init__(self,num_of_states,num_of_actions,prior_vec):
        self.states = np.arange(num_of_states)
        self.actions = np.arange(num_of_actions)
        self.terminal_state = None
        self.state_action_to_state_dict=dict()
        self.state_action_to_reward_dict=dict()
        self.prior_vec = prior_vec
        
        self.curr_state = None
        self.horizon_len=20 # this specifies the horizon_len for updating D_real
        self.last_seen_len=0 # assuming D_real with infinit capacity
        self.k=10 # specifies the horizon len for updating_len for updating D_fake  
        
    def configure(self):
        # Initialisng the transition prob_matrix
        key_list=list(itertools.product(self.states, self.actions))
        for x in key_list:
            self.state_action_to_state_dict.update({x:dict()})
            for y in range(len(self.states)):
                self.state_action_to_state_dict[x].update({self.states[y]:self.prior_vec[y]})
        # init the rewrad matrix
        
        for x in key_list:
            self.state_action_to_reward_dict.update({x:dict()})
            for y in range(len(self.states)):
                if self.states[y]==self.terminal_state:
                    self.state_action_to_reward_dict[x].update({self.states[y]:10})
                else:
                    self.state_action_to_reward_dict[x].update({self.states[y]:0})
        
    def reset(self):
        self.state_action_to_state_dict=dict()
        self.state_action_to_reward_dict=dict()
        self.last_seen_len=0
    
    def update_param_given_epi(self,D_real):
        
        episodes=D_real.buffer[self.last_seen_len:len(D_real.buffer)]
        self.last_seen_len=len(D_real.buffer)

        # print(len(episodes))
        
        
        # following SARSA format
        for epi_id in range(len(episodes)):
    
            t_states, t_actions, t_rewards,t_nstates,t_log_probs = self.cvt_axis(episodes[epi_id])
            i=0 
            
            while i<len(t_states):
                
                # updating the list of states
                
                # tru_tup=self.state_action_to_state_dict[(t_states[i],t_actions[i].item())]


                self.state_action_to_state_dict[(t_states[i],t_actions[i].item())][t_nstates[i]]+=1
                self.state_action_to_reward_dict[(t_states[i],t_actions[i].item())][t_nstates[i]]=t_rewards[i]
                                    
                i+=1  
                
            if self.terminal_state is None and i<self.horizon_len:
                self.terminal_state=t_nstates[i-1]
                self.state_list.append(t_nstates[i-1])
        return

    def cvt_axis(self,traj):
        
        t_states =[]
        t_actions =[]
        t_nstates =[]
        t_rewards=[]
        t_log_probs=[]
        
        for i in range(len(traj[0])):
            t_states.append(traj[0][i])
            t_actions.append(traj[1][i])
            t_rewards.append(traj[2][i])
            t_nstates.append(traj[3][i])
            t_log_probs.append(traj[4][i])

        return (t_states, t_actions, t_rewards,t_nstates,t_log_probs) 
    
    def step(self,a_t):
        
        next_state=0
        un_norm_distr=self.state_action_to_state_dict[(self.curr_state,a_t.item())]
        norm_factor=sum(list(un_norm_distr.values()))
        choices=list(un_norm_distr.keys())
        p=[x/norm_factor for x in un_norm_distr.values()]
        
        next_state_id=np.random.choice(np.arange(len(choices)),p=p)
        next_state=choices[next_state_id]
        reward=self.state_action_to_reward_dict[(self.curr_state,a_t.item())][next_state]

        self.curr_state=next_state
        Is_done=False
        if self.terminal_state==next_state:
            Is_done=True

        return next_state,reward,Is_done,None

    def set_start_state(self):
        self.curr_state=np.random.choice(self.states)
        while self.curr_state==self.terminal_state:
            self.curr_state=np.random.choice(self.states)
        return


In [293]:
def  _multi_round_nmdp_simple():
    with dsl.new() as mdp:
        # Write down the MDP dynamics here 
        
        start = dsl.state()
        S_1=dsl.state()
        end = dsl.terminal_state()
        
        A_0=dsl.action()
        A_1=dsl.action()

        start & A_0 > dsl.reward(0) | dsl.reward(10)
        start & A_0 > start * 10 | end
        start & A_1 > dsl.reward(0) | dsl.reward(10) | dsl.reward(0)
        start & A_1 > start * 10 | end * 1 | S_1 * 1
        
        S_1 & A_0 > dsl.reward(0) | dsl.reward(0)
        S_1 & A_0 > S_1 * 1 | start
        S_1 & A_1 > dsl.reward(0) | dsl.reward(10)
        S_1 & A_1 > start * 5 | end
        
        dsl.discount(0.5)

        return mdp.validate()
    
def  _multi_round_nmdp_complex():
    with dsl.new() as mdp:
        # Write down the MDP dynamics here 
        
        start = dsl.state()
        S_1=dsl.state()
        S_2=dsl.state()
        S_3=dsl.state()
        S_4=dsl.state()
        S_5=dsl.state()
        end = dsl.terminal_state()
        
        A_0=dsl.action()
        A_1=dsl.action()

        start & A_0 > dsl.reward(10) | dsl.reward(0)
        start & A_0 > end * 1 | start
        start & A_1 > dsl.reward(0) | dsl.reward(0)
        start & A_1 > start * 1 | S_1
        
        S_1 & A_0 > dsl.reward(0) | dsl.reward(0)
        S_1 & A_0 > S_1 * 1 | start
        S_1 & A_1 > dsl.reward(0) | dsl.reward(0)
        S_1 & A_1 > S_1 * 1 | S_2
        
        S_2 & A_0 > dsl.reward(0) | dsl.reward(0)
        S_2 & A_0 > S_2 * 1 | S_1
        S_2 & A_1 > dsl.reward(0) | dsl.reward(0)
        S_2 & A_1 > S_2 * 1 | S_3
        
        S_3 & A_0 > dsl.reward(0) | dsl.reward(0)
        S_3 & A_0 > S_3 * 1 | S_2
        S_3 & A_1 > dsl.reward(0) | dsl.reward(0)
        S_3 & A_1 > S_3 * 1 | S_4
        
        S_4 & A_0 > dsl.reward(0) | dsl.reward(0)
        S_4 & A_0 > S_4 * 1 | S_3
        S_4 & A_1 > dsl.reward(0) | dsl.reward(0)
        S_4 & A_1 > S_4 * 1 | S_5
        
        S_5 & A_0 > dsl.reward(0) | dsl.reward(0)
        S_5 & A_0 > S_5 * 1 | S_4
        S_5 & A_1 > dsl.reward(10) | dsl.reward(0)
        S_5 & A_1 > end * 1 | S_1
        
        dsl.discount(0.5)

        return mdp.validate() 

MULTI_ROUND_NDMP = _multi_round_nmdp_complex()



solver = lp.LinearProgramming(MULTI_ROUND_NDMP)

In [294]:

# update D_real


def update_D_real(D_real,env,Agent,num_of_epochs):
    
    A1=Agent
    
    horizon_len=20
    
    env.reset()
    
    s_t_index=env._state.index

    trajs=[]
    # D_real.flush_all()

    result=0


    for traj_id in range(num_of_epochs):
        env.reset()
        # display_env()
        s_t_index=env._state.index
        
        states=[]
        log_probs=[]
        rewards=[]
        actions=[]
        nstates=[]
        
        for t in range(horizon_len):
            
            s_t=F.one_hot(torch.tensor(s_t_index),num_classes=env.observation_space.n).unsqueeze(dim=0)
            s_t=s_t.type(torch.FloatTensor)
            a_t, log_prob = A1.action(s_t)
            ns_t_index, r_t, done, _ = env.step(a_t.numpy()[0][0])
            
            states.append(s_t_index)
            actions.append(a_t)
            log_probs.append(log_prob)
            rewards.append(r_t)
            nstates.append(ns_t_index)
            s_t_index=ns_t_index
            if done:
                break   
        D_real.push(states, actions, rewards,nstates, log_probs)
    
    return D_real


In [295]:
env = MULTI_ROUND_NDMP.to_env()
env.reset()
print(env.observation_space.n)
print(env.action_space.n)

7
2


In [296]:
estimator_1=A_model(env.observation_space.n,env.action_space.n,np.ones(env.observation_space.n))
D_real=ReplayMemory(capacity=10000)

In [369]:

def init_weights(m):
    if isinstance(m, nn.Linear):
        torch.nn.init.xavier_uniform_(m.weight)



class Network(nn.Module):

    def __init__(self, input_layer,output_layer):
        super(Network, self).__init__()
        
        self.fc1 = nn.Linear(input_layer, output_layer,bias=False)
        self.fc2=nn.Softmax(dim=1)

    def forward(self, input_):
        x=self.fc1(input_)
        y=self.fc2(x)
        return y
    


class Agent():

    def __init__(self,observation_space,action_space,gamma=0.99,learning_rate=1e-3,horizon_len=20,k=10,fraction_of_real=0.5,batch_size=200):

        self.model = Network(observation_space.n,action_space.n)
        self.gamma = gamma
        self.optimizer = optim.Adam(self.model.parameters(), lr=1e-3)
        self.model.train()
        self.horizon_len=horizon_len # Assuming we already know the horizon length
        self.env_model =A_model(observation_space.n,action_space.n,np.ones(observation_space.n))
        self.env_model.terminal_state=observation_space.n-1
        self.env_model.configure()
        
        self.D_fake=ReplayMemory(capacity=100000)
        self.fraction_of_real=fraction_of_real


    def reset(self):
        init_weights(self.model)
        self.env_model.reset()
        self.D_fake.flush_all()

    def action(self, state):
        
        probs = self.model(Variable(state))
        action = probs.multinomial(1).data
        prob = probs[:, action[0,0]].view(1, -1)
        log_prob = prob.log()

        return(action, log_prob)

    def index_to_onehot(self,id):
        id=torch.tensor(id)
        s_t = torch.nn.functional.one_hot(id,num_classes=len(self.env_model.states))
        s_t=s_t.type(torch.FloatTensor)
        
        return s_t

    def onehot_to_index(self,x):
        id= torch.argmax(x, dim=1)
        return id
    
    def update_D_fake(self,start_state=None,num_of_epi=20):
        

        
        
        result=[]
        

        trajs=[]

        for traj_id in range(num_of_epi):
            
            # Start state initialisation
            
            if start_state is None:
                self.env_model.set_start_state()
                s_t=self.env_model.curr_state
            else:
                s_t=start_state
                self.env_model.curr_state=s_t
            
            # defining empty lists
             
            states=[]
            log_probs=[]
            rewards=[]
            actions=[]
            nstates=[]
            
            for t in range(self.env_model.k):
                s_t_rep2=self.index_to_onehot(s_t)
                s_t_rep2=torch.unsqueeze(s_t_rep2,0)
                s_t_rep2=s_t_rep2.type(torch.FloatTensor)
                a_t, log_prob = self.action(s_t_rep2)
                ns_t, r_t, done, _ = self.env_model.step(a_t)

                states.append(s_t)
                actions.append(a_t)
                log_probs.append(log_prob)
                rewards.append(r_t)
                nstates.append(ns_t)
                s_t=ns_t
                if done:
                    break
            self.D_fake.push(states, actions, rewards,nstates, log_probs)      
        return 
       
    def cvt_axis(self,trajs):
        t_states = []
        t_actions = []
        t_rewards = []
        t_nstates = []
        t_log_probs = []

        for traj in trajs:
            t_states.append(traj[0])
            t_actions.append(traj[1])
            t_rewards.append(traj[2])
            t_nstates.append(traj[3])
            t_log_probs.append(traj[4])

        return (t_states, t_actions, t_rewards,t_states,t_log_probs)
    
    def reward_to_value(self,t_rewards, gamma):

        t_Rs = []

        for rewards in t_rewards:
            Rs = []
            R = torch.zeros(1, 1)

            for i in reversed(range(len(rewards))):
                R = gamma * R + rewards[i]
                Rs.insert(0, R)
            t_Rs.append(Rs)
            
        return(t_Rs)

    def cal_log_prob(self, state, action):
        # state=torch.tensor([state])
        state=torch.unsqueeze(state,0)
        probs = self.model(Variable(state))
        prob = probs[:, action[0,0]].view(1, -1)
        log_prob = prob.log()

        return(log_prob)
    
    def MBPO_train_1(self,D_real,mult_fcator=None):
        
        # Given D_real,and a multiplicative factor,will generate fake_data 
        # ST :len(D_fake)=multipl_factor*len(D_real)
        
        self.env_model.reset()
        self.env_model.update_param_given_epi(D_real)
        self.init_env_model()
        
        multiple_factor = (1-self.fraction_of_real)/self.fraction_of_real
        if mult_fcator is not None:
            multiple_factor=mult_fcator
        self.update_D_fake(int(multiple_factor*D_real.position))
        data_list=self.D_fake.buffer
        
        
        
        t_states, t_actions, t_rewards,t_nstates,t_log_probs = self.cvt_axis(data_list)
        t_Rs = self.reward_to_value(t_rewards, self.gamma)

        Z = 0
        b = 0
        losses = []
        Z_s = []

        for (states, actions, Rs, log_probs) in zip(t_states, t_actions, t_Rs, t_log_probs):
            p_log_prob = 0
            q_log_prob = 0
            for t in range(len(Rs)):
                p_log_prob += (self.cal_log_prob(self.index_to_onehot(states[t]), actions[t])).data.numpy()
                q_log_prob += log_probs[t].data.numpy()
            Z_ = math.exp(p_log_prob) / math.exp(q_log_prob)
            Z += Z_
            Z_s.append(Z_)
            b += Z_ * sum(Rs) / len(Rs)
        b = b / Z
        
        for (states, actions, Rs, log_probs) in zip(t_states, t_actions, t_Rs, t_log_probs):
            loss = 0.

            for t in range(len(Rs)):
                loss = loss - (log_probs[t] * (Variable(Rs[t] - b).expand_as(log_probs[t]))).sum()

            Z_ = Z_s.pop(0)
            loss = loss / Z_
            losses.append(loss)
            
        loss = sum(losses) / Z

        self.optimizer.zero_grad()
        loss.backward(retain_graph=True)
        utils.clip_grad_value_(self.model.parameters(),40)
        self.optimizer.step()
        
        return
      
    def MBPO_train_2(self,D_real,fraction_of_real=None,mult_factor=1):
        
        # Given D_real,and the fraction of real to fake trajs,then train the policy on data comprising D_fake and D_real
        # ST real_ratio  follows the value given
        

        self.env_model.set_start_state()
        self.env_model.update_param_given_epi(D_real)
        self.env_model.set_start_state()
        self.D_fake.flush_all()
        # NOTE : Here I'm completely flushing out past data and refill again 
        
        self.update_D_fake(None,int(mult_factor*D_real.position))
        
        frc_of_real=self.fraction_of_real
        if fraction_of_real is not None:
            frc_of_real=fraction_of_real
        
        batch_size=D_real.position
        
        num_of_real_epi=int(batch_size*frc_of_real)
        num_of_fake_epi=batch_size-num_of_real_epi
        
        pos_list=np.random.choice(a=len(self.D_fake.buffer),size=min([num_of_fake_epi,len(self.D_fake.buffer)]),replace=False)
        fake_data_list=[self.D_fake.buffer[pos] for pos in pos_list]
        
        pos_list=np.random.choice(a=len(D_real.buffer),size=min([num_of_real_epi,len(D_real.buffer)]),replace=False)
        real_data_list=[D_real.buffer[pos] for pos in pos_list]
        
        data_list=real_data_list+fake_data_list
        
        # print(len(real_data_list))
        
        t_states, t_actions, t_rewards,t_nstates,t_log_probs = self.cvt_axis(data_list)
        t_Rs = self.reward_to_value(t_rewards, self.gamma)

        Z = 0
        b = 0
        losses = []
        Z_s = []

        for (states, actions, Rs, log_probs) in zip(t_states, t_actions, t_Rs, t_log_probs):
            p_log_prob = 0
            q_log_prob = 0
            for t in range(len(Rs)):
                p_log_prob += (self.cal_log_prob(self.index_to_onehot(states[t]), actions[t])).data.numpy()
                q_log_prob += log_probs[t].data.numpy()
            Z_ = math.exp(p_log_prob) / math.exp(q_log_prob)
            Z += Z_
            Z_s.append(Z_)
            b += Z_ * sum(Rs) / len(Rs)
        b = b / Z
        
        for (states, actions, Rs, log_probs) in zip(t_states, t_actions, t_Rs, t_log_probs):
            loss = 0.

            for t in range(len(Rs)):
                loss = loss - (log_probs[t] * (Variable(Rs[t] - b).expand_as(log_probs[t]))).sum()

            Z_ = Z_s.pop(0)
            loss = loss / Z_
            losses.append(loss)
            
        loss = sum(losses) / Z

        self.optimizer.zero_grad()
        loss.backward(retain_graph=True)
        utils.clip_grad_value_(self.model.parameters(),40)
        self.optimizer.step()
        return

    def train_(self, D_real):
        
        # Pure policy gradient
        
        data_list=D_real.buffer
        # print(len(data_list))
        
        t_states, t_actions, t_rewards,t_nstates,t_log_probs = self.cvt_axis(data_list)
        t_Rs = self.reward_to_value(t_rewards, self.gamma)

        Z = 0
        b = 0
        losses = []
        Z_s = []

        for (states, actions, Rs, log_probs) in zip(t_states, t_actions, t_Rs, t_log_probs):
            p_log_prob = 0
            q_log_prob = 0
            for t in range(len(Rs)):
                p_log_prob += (self.cal_log_prob(self.index_to_onehot(states[t]), actions[t])).data.numpy()
                q_log_prob += log_probs[t].data.numpy()
            Z_ = math.exp(p_log_prob) / math.exp(q_log_prob)
            Z += Z_
            Z_s.append(Z_)
            b += Z_ * sum(Rs) / len(Rs)
        b = b / Z


        for (states, actions, Rs, log_probs) in zip(t_states, t_actions, t_Rs, t_log_probs):
            loss = 0.

            for t in range(len(Rs)):
                loss = loss - (log_probs[t] * (Variable(Rs[t] - b).expand_as(log_probs[t]))).sum()

            Z_ = Z_s.pop(0)
            loss = loss / Z_
            losses.append(loss)
            
        loss = sum(losses) / Z

        self.optimizer.zero_grad()
        loss.backward(retain_graph=True)
        utils.clip_grad_value_(self.model.parameters(),40)
        self.optimizer.step()

    def mod_MBPO_train_2(self,D_real,fraction_of_real=None,mult_factor=1):
        
        # Given D_real,and the fraction of real to fake trajs,then train the policy on data comprising D_fake and D_real
        # ST real_ratio  follows the value given
        

        self.env_model.set_start_state()
        self.env_model.update_param_given_epi(D_real)
        self.env_model.set_start_state()
        # self.D_fake.flush_all()
        
        # NOTE : Here I'm completely flushing out past data and refill again 
        # print(self.D_fake.position)
        # print(int(mult_factor*D_real.position)-self.D_fake.position)
        
        self.update_D_fake(None,int(mult_factor*D_real.position)-self.D_fake.position)
        
        frc_of_real=self.fraction_of_real
        if fraction_of_real is not None:
            frc_of_real=fraction_of_real
        
        batch_size=D_real.position
        
        num_of_real_epi=int(batch_size*frc_of_real)
        num_of_fake_epi=batch_size-num_of_real_epi
        
        pos_list=np.random.choice(a=len(self.D_fake.buffer),size=min([num_of_fake_epi,len(self.D_fake.buffer)]),replace=False)
        fake_data_list=[self.D_fake.buffer[pos] for pos in pos_list]
        
        pos_list=np.random.choice(a=len(D_real.buffer),size=min([num_of_real_epi,len(D_real.buffer)]),replace=False)
        real_data_list=[D_real.buffer[pos] for pos in pos_list]
        # print(len(fake_data_list))
        data_list=real_data_list+fake_data_list
        
        # print(len(real_data_list))
        
        t_states, t_actions, t_rewards,t_nstates,t_log_probs = self.cvt_axis(data_list)
        t_Rs = self.reward_to_value(t_rewards, self.gamma)

        Z = 0
        b = 0
        losses = []
        Z_s = []

        for (states, actions, Rs, log_probs) in zip(t_states, t_actions, t_Rs, t_log_probs):
            p_log_prob = 0
            q_log_prob = 0
            for t in range(len(Rs)):
                p_log_prob += (self.cal_log_prob(self.index_to_onehot(states[t]), actions[t])).data.numpy()
                q_log_prob += log_probs[t].data.numpy()
            Z_ = math.exp(p_log_prob) / math.exp(q_log_prob)
            Z += Z_
            Z_s.append(Z_)
            b += Z_ * sum(Rs) / len(Rs)
        b = b / Z
        
        for (states, actions, Rs, log_probs) in zip(t_states, t_actions, t_Rs, t_log_probs):
            loss = 0.

            for t in range(len(Rs)):
                loss = loss - (log_probs[t] * (Variable(Rs[t] - b).expand_as(log_probs[t]))).sum()

            Z_ = Z_s.pop(0)
            loss = loss / Z_
            losses.append(loss)
            
        loss = sum(losses) / Z

        self.optimizer.zero_grad()
        loss.backward(retain_graph=True)
        utils.clip_grad_value_(self.model.parameters(),40)
        self.optimizer.step()
        return

        
        



In [298]:
A_1=Agent(env.observation_space,env.action_space)
A_1.env_model.set_start_state()
D_real.flush_all()

In [288]:
init_params=torch.ones(env.action_space.n,env.observation_space.n)/env.action_space.n

In [319]:
def  _multi_round_nmdp_simple():
    with dsl.new() as mdp:
        # Write down the MDP dynamics here 
        
        start = dsl.state()
        S_1=dsl.state()
        end = dsl.terminal_state()
        
        A_0=dsl.action()
        A_1=dsl.action()

        start & A_0 > dsl.reward(0) | dsl.reward(10)
        start & A_0 > start * 10 | end
        start & A_1 > dsl.reward(0) | dsl.reward(10) | dsl.reward(0)
        start & A_1 > start * 10 | end * 1 | S_1 * 1
        
        S_1 & A_0 > dsl.reward(0) | dsl.reward(0)
        S_1 & A_0 > S_1 * 1 | start
        S_1 & A_1 > dsl.reward(0) | dsl.reward(10)
        S_1 & A_1 > start * 5 | end
        
        dsl.discount(0.5)

        return mdp.validate()
    
def  _multi_round_nmdp_complex():
    with dsl.new() as mdp:
        # Write down the MDP dynamics here 
        
        start = dsl.state()
        S_1=dsl.state()
        S_2=dsl.state()
        S_3=dsl.state()
        S_4=dsl.state()
        S_5=dsl.state()
        end = dsl.terminal_state()
        
        A_0=dsl.action()
        A_1=dsl.action()

        start & A_0 > dsl.reward(10) | dsl.reward(0)
        start & A_0 > end * 1 | start * 10
        start & A_1 > dsl.reward(0) | dsl.reward(0)
        start & A_1 > start * 10 | S_1 * 1
        
        S_1 & A_0 > dsl.reward(0) | dsl.reward(0)
        S_1 & A_0 > S_1 * 10 | start * 1
        S_1 & A_1 > dsl.reward(0) | dsl.reward(0)
        S_1 & A_1 > S_1 * 1 | S_2
        
        S_2 & A_0 > dsl.reward(0) | dsl.reward(0)
        S_2 & A_0 > S_2 * 1 | S_1
        S_2 & A_1 > dsl.reward(0) | dsl.reward(0)
        S_2 & A_1 > S_2 * 1 | S_3
        
        S_3 & A_0 > dsl.reward(0) | dsl.reward(0)
        S_3 & A_0 > S_3 * 1 | S_2
        S_3 & A_1 > dsl.reward(0) | dsl.reward(0)
        S_3 & A_1 > S_3 * 1 | S_4
        
        S_4 & A_0 > dsl.reward(0) | dsl.reward(0)
        S_4 & A_0 > S_4 * 1 | S_3
        S_4 & A_1 > dsl.reward(0) | dsl.reward(0)
        S_4 & A_1 > S_4 * 1 | S_5
        
        S_5 & A_0 > dsl.reward(0) | dsl.reward(0)
        S_5 & A_0 > S_5 * 1 | S_4
        S_5 & A_1 > dsl.reward(10) | dsl.reward(0)
        S_5 & A_1 > end * 1 | S_1 * 10
        
        dsl.discount(0.5)

        return mdp.validate() 

MULTI_ROUND_NDMP = _multi_round_nmdp_complex()



solver = lp.LinearProgramming(MULTI_ROUND_NDMP)
print(solver.compute_q_table(max_iterations=10000, all_close=functools.partial(np.allclose, rtol=1e-10, atol=1e-10)))
env = MULTI_ROUND_NDMP.to_env()
env.reset()

[[9.16666667 4.20138889]
 [0.76388889 0.25462963]
 [0.25462963 0.21219136]
 [0.21219136 0.5941358 ]
 [0.5941358  1.78240741]
 [1.78240741 5.34722222]
 [0.         0.        ]]


1

In [382]:
A_1=Agent(env.observation_space,env.action_space)
A_1.env_model.set_start_state()
D_real.flush_all()
init_params=torch.ones(env.action_space.n,env.observation_space.n)/env.action_space.n
for param in A_1.model.fc1.parameters():
    param.data = nn.parameter.Parameter(init_params)
print("Before")
print(list(A_1.model.fc1.parameters()))

for epoch_id in tqdm(range(600)):
    D_real=update_D_real(D_real,env,A_1,1)
    if epoch_id%25==0:
        print(list(A_1.model.fc1.parameters()))
    for i in range(1):
        A_1.mod_MBPO_train_2(D_real,0)
        # A_1.train_(D_real)
print("After")
print(list(A_1.model.fc1.parameters()))

Before
[Parameter containing:
tensor([[0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000],
        [0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000]],
       requires_grad=True)]


  2%|▏         | 12/600 [00:00<00:04, 119.01it/s]

[Parameter containing:
tensor([[0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000],
        [0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000]],
       requires_grad=True)]


  4%|▍         | 24/600 [00:00<00:07, 74.95it/s] 

[Parameter containing:
tensor([[0.4859, 0.4812, 0.4908, 0.5173, 0.4844, 0.4950, 0.5000],
        [0.5141, 0.5188, 0.5092, 0.4827, 0.5156, 0.5050, 0.5000]],
       requires_grad=True)]


  9%|▉         | 53/600 [00:01<00:20, 26.18it/s]

[Parameter containing:
tensor([[0.4923, 0.4766, 0.4976, 0.5373, 0.4730, 0.4746, 0.5000],
        [0.5077, 0.5234, 0.5024, 0.4627, 0.5270, 0.5254, 0.5000]],
       requires_grad=True)]


 13%|█▎        | 76/600 [00:03<00:38, 13.48it/s]

[Parameter containing:
tensor([[0.5021, 0.4777, 0.5074, 0.5472, 0.4614, 0.4574, 0.5000],
        [0.4979, 0.5223, 0.4926, 0.4528, 0.5386, 0.5426, 0.5000]],
       requires_grad=True)]


 17%|█▋        | 101/600 [00:05<00:55,  8.96it/s]

[Parameter containing:
tensor([[0.5171, 0.4810, 0.5114, 0.5492, 0.4493, 0.4559, 0.5000],
        [0.4829, 0.5190, 0.4886, 0.4508, 0.5507, 0.5441, 0.5000]],
       requires_grad=True)]


 21%|██        | 126/600 [00:09<01:04,  7.33it/s]

[Parameter containing:
tensor([[0.5363, 0.4855, 0.5115, 0.5509, 0.4392, 0.4625, 0.5000],
        [0.4637, 0.5145, 0.4885, 0.4491, 0.5608, 0.5375, 0.5000]],
       requires_grad=True)]


 25%|██▌       | 151/600 [00:12<01:14,  6.02it/s]

[Parameter containing:
tensor([[0.5560, 0.4911, 0.5137, 0.5561, 0.4285, 0.4691, 0.5000],
        [0.4440, 0.5089, 0.4863, 0.4439, 0.5715, 0.5309, 0.5000]],
       requires_grad=True)]


 29%|██▉       | 176/600 [00:17<01:23,  5.09it/s]

[Parameter containing:
tensor([[0.5747, 0.4969, 0.5169, 0.5620, 0.4176, 0.4687, 0.5000],
        [0.4253, 0.5031, 0.4831, 0.4380, 0.5824, 0.5313, 0.5000]],
       requires_grad=True)]


 33%|███▎      | 200/600 [00:22<01:29,  4.49it/s]

[Parameter containing:
tensor([[0.5913, 0.5027, 0.5210, 0.5660, 0.4070, 0.4613, 0.5000],
        [0.4087, 0.4973, 0.4790, 0.4340, 0.5930, 0.5387, 0.5000]],
       requires_grad=True)]


 38%|███▊      | 225/600 [00:28<01:35,  3.91it/s]

[Parameter containing:
tensor([[0.6118, 0.5117, 0.5263, 0.5693, 0.3964, 0.4490, 0.5000],
        [0.3882, 0.4883, 0.4737, 0.4307, 0.6036, 0.5510, 0.5000]],
       requires_grad=True)]


 42%|████▏     | 250/600 [00:35<01:39,  3.51it/s]

[Parameter containing:
tensor([[0.6345, 0.5240, 0.5329, 0.5731, 0.3864, 0.4348, 0.5000],
        [0.3655, 0.4760, 0.4671, 0.4269, 0.6136, 0.5652, 0.5000]],
       requires_grad=True)]


 46%|████▌     | 275/600 [00:43<01:41,  3.21it/s]

[Parameter containing:
tensor([[0.6582, 0.5389, 0.5458, 0.5761, 0.3757, 0.4203, 0.5000],
        [0.3418, 0.4611, 0.4542, 0.4239, 0.6243, 0.5797, 0.5000]],
       requires_grad=True)]


 50%|█████     | 300/600 [00:51<01:42,  2.93it/s]

[Parameter containing:
tensor([[0.6806, 0.5544, 0.5714, 0.5770, 0.3645, 0.4020, 0.5000],
        [0.3194, 0.4456, 0.4286, 0.4230, 0.6355, 0.5980, 0.5000]],
       requires_grad=True)]


 54%|█████▍    | 325/600 [01:00<01:41,  2.70it/s]

[Parameter containing:
tensor([[0.7031, 0.5743, 0.6069, 0.5799, 0.3523, 0.3837, 0.5000],
        [0.2969, 0.4257, 0.3931, 0.4201, 0.6477, 0.6163, 0.5000]],
       requires_grad=True)]


 58%|█████▊    | 350/600 [01:10<01:39,  2.50it/s]

[Parameter containing:
tensor([[0.7252, 0.5949, 0.6471, 0.5843, 0.3392, 0.3659, 0.5000],
        [0.2748, 0.4051, 0.3529, 0.4157, 0.6608, 0.6341, 0.5000]],
       requires_grad=True)]


 62%|██████▎   | 375/600 [01:21<01:40,  2.23it/s]

[Parameter containing:
tensor([[0.7472, 0.6165, 0.6907, 0.5874, 0.3254, 0.3506, 0.5000],
        [0.2528, 0.3835, 0.3093, 0.4126, 0.6746, 0.6494, 0.5000]],
       requires_grad=True)]


 67%|██████▋   | 400/600 [01:32<01:33,  2.14it/s]

[Parameter containing:
tensor([[0.7701, 0.6391, 0.7379, 0.5913, 0.3109, 0.3363, 0.5000],
        [0.2299, 0.3609, 0.2621, 0.4087, 0.6891, 0.6637, 0.5000]],
       requires_grad=True)]


 71%|███████   | 425/600 [01:44<01:26,  2.03it/s]

[Parameter containing:
tensor([[0.7952, 0.6659, 0.7898, 0.5949, 0.2943, 0.3213, 0.5000],
        [0.2048, 0.3341, 0.2102, 0.4051, 0.7057, 0.6787, 0.5000]],
       requires_grad=True)]


 75%|███████▌  | 450/600 [01:58<01:20,  1.86it/s]

[Parameter containing:
tensor([[0.8228, 0.6981, 0.8483, 0.5968, 0.2753, 0.3066, 0.5000],
        [0.1772, 0.3019, 0.1517, 0.4032, 0.7247, 0.6934, 0.5000]],
       requires_grad=True)]


 79%|███████▉  | 475/600 [02:11<01:12,  1.74it/s]

[Parameter containing:
tensor([[0.8530, 0.7393, 0.9138, 0.5986, 0.2545, 0.2892, 0.5000],
        [0.1470, 0.2607, 0.0862, 0.4014, 0.7455, 0.7108, 0.5000]],
       requires_grad=True)]


 83%|████████▎ | 500/600 [02:26<00:58,  1.72it/s]

[Parameter containing:
tensor([[0.8874, 0.7917, 0.9845, 0.5991, 0.2305, 0.2750, 0.5000],
        [0.1126, 0.2083, 0.0155, 0.4009, 0.7695, 0.7250, 0.5000]],
       requires_grad=True)]


 88%|████████▊ | 525/600 [02:41<00:45,  1.64it/s]

[Parameter containing:
tensor([[ 0.9292,  0.8574,  1.0620,  0.5959,  0.2002,  0.2621,  0.5000],
        [ 0.0708,  0.1426, -0.0620,  0.4041,  0.7998,  0.7379,  0.5000]],
       requires_grad=True)]


 92%|█████████▏| 550/600 [02:57<00:32,  1.54it/s]

[Parameter containing:
tensor([[ 0.9832,  0.9375,  1.1466,  0.5777,  0.1621,  0.2613,  0.5000],
        [ 0.0168,  0.0625, -0.1466,  0.4223,  0.8379,  0.7387,  0.5000]],
       requires_grad=True)]


 96%|█████████▌| 575/600 [03:14<00:17,  1.40it/s]

[Parameter containing:
tensor([[ 1.0552,  1.0308,  1.2393,  0.5156,  0.1134,  0.3046,  0.5000],
        [-0.0552, -0.0308, -0.2393,  0.4844,  0.8866,  0.6954,  0.5000]],
       requires_grad=True)]


100%|██████████| 600/600 [03:32<00:00,  2.83it/s]

After
[Parameter containing:
tensor([[ 1.1483,  1.1354,  1.3413,  0.4089,  0.0471,  0.4004,  0.5000],
        [-0.1483, -0.1354, -0.3413,  0.5911,  0.9529,  0.5996,  0.5000]],
       requires_grad=True)]





In [270]:

# A_1.env_model.terminal_state=env.observation_space.n-1
print(A_1.env_model.state_action_to_state_dict)
# print(A_1.env_model.state_action_to_reward_dict)



{(0, 0): {0: 46.0, 1: 1.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 51.0}, (0, 1): {0: 30.0, 1: 19.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0}, (1, 0): {0: 66.0, 1: 61.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0}, (1, 1): {0: 1.0, 1: 44.0, 2: 58.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0}, (2, 0): {0: 1.0, 1: 73.0, 2: 84.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0}, (2, 1): {0: 1.0, 1: 1.0, 2: 54.0, 3: 61.0, 4: 1.0, 5: 1.0, 6: 1.0}, (3, 0): {0: 1.0, 1: 1.0, 2: 64.0, 3: 82.0, 4: 1.0, 5: 1.0, 6: 1.0}, (3, 1): {0: 1.0, 1: 1.0, 2: 1.0, 3: 63.0, 4: 73.0, 5: 1.0, 6: 1.0}, (4, 0): {0: 1.0, 1: 1.0, 2: 1.0, 3: 65.0, 4: 57.0, 5: 1.0, 6: 1.0}, (4, 1): {0: 1.0, 1: 1.0, 2: 1.0, 3: 1.0, 4: 49.0, 5: 53.0, 6: 1.0}, (5, 0): {0: 1.0, 1: 1.0, 2: 1.0, 3: 1.0, 4: 30.0, 5: 29.0, 6: 1.0}, (5, 1): {0: 1.0, 1: 24.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 18.0}, (6, 0): {0: 1.0, 1: 1.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0}, (6, 1): {0: 1.0, 1: 1.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0}}


In [354]:

def expect_num_steps(policy_param,num_of_epochs,Agent):
    
    A1=Agent
    for param in A1.model.fc1.parameters():
        param.data = nn.parameter.Parameter(policy_param)
    
    horizon_len=20
    
    env.reset()
    # print(list(A_1.model.fc1.parameters()))
    
    s_t_index=env._state.index

    trajs=[]
    # D_real.flush_all()

    result=0

    b_count=[]
    for traj_id in range(num_of_epochs):
        env.reset()
        # display_env()
        s_t_index=env._state.index
        
        states=[]
        log_probs=[]
        rewards=[]
        actions=[]
        nstates=[]
        
        for t in range(horizon_len):
            
            s_t=F.one_hot(torch.tensor(s_t_index),num_classes=env.observation_space.n).unsqueeze(dim=0)
            s_t=s_t.type(torch.FloatTensor)
            a_t, log_prob = A1.action(s_t)
            ns_t_index, r_t, done, _ = env.step(a_t.numpy()[0][0])
            
            states.append(s_t_index)
            actions.append(a_t)
            log_probs.append(log_prob)
            rewards.append(r_t)
            nstates.append(ns_t_index)
            s_t_index=ns_t_index
            if done:
                break
            b_count.append(len(rewards))   
        D_real.push(states, actions, rewards,nstates, log_probs)
    
    return sum(b_count)/len(b_count)



In [383]:
init_params=torch.Tensor([[ 1.1483,  1.1354,  1.3413,  0.4089,  0.0471,  0.4004,  0.5000],
        [-0.1483, -0.1354, -0.3413,  0.5911,  0.9529,  0.5996,  0.5000]])
init_para=init_params.numpy()
np.savetxt("./experiment_new/alpha_variation/dummy_exp.csv", init_para, delimiter=",")

In [384]:
my_data = np.genfromtxt("./experiment_new/alpha_variation/dummy_exp.csv", delimiter=',')
init_params=torch.from_numpy(my_data)
rew_with_good=expect_num_steps(init_params.float(),10000,A_1)
print(rew_with_good)

9.748645913203497


In [221]:
print(A_1.env_model.state_action_to_state_dict)

{(0, 0): {0: 37.0, 1: 1.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 42.0}, (0, 1): {0: 41.0, 1: 38.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0}, (1, 0): {0: 66.0, 1: 64.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0}, (1, 1): {0: 1.0, 1: 72.0, 2: 58.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0}, (2, 0): {0: 1.0, 1: 50.0, 2: 49.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0}, (2, 1): {0: 1.0, 1: 1.0, 2: 63.0, 3: 73.0, 4: 1.0, 5: 1.0, 6: 1.0}, (3, 0): {0: 1.0, 1: 1.0, 2: 60.0, 3: 51.0, 4: 1.0, 5: 1.0, 6: 1.0}, (3, 1): {0: 1.0, 1: 1.0, 2: 1.0, 3: 78.0, 4: 79.0, 5: 1.0, 6: 1.0}, (4, 0): {0: 1.0, 1: 1.0, 2: 1.0, 3: 57.0, 4: 41.0, 5: 1.0, 6: 1.0}, (4, 1): {0: 1.0, 1: 1.0, 2: 1.0, 3: 1.0, 4: 48.0, 5: 54.0, 6: 1.0}, (5, 0): {0: 1.0, 1: 1.0, 2: 1.0, 3: 1.0, 4: 24.0, 5: 21.0, 6: 1.0}, (5, 1): {0: 1.0, 1: 26.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 26.0}, (6, 0): {0: 1.0, 1: 1.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0}, (6, 1): {0: 1.0, 1: 1.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0}}


In [253]:
print("After")
# A_1.env_model.terminal_state=env.observation_space.n-1
print(A_1.env_model.state_action_to_state_dict)
# print(A_1.env_model.state_action_to_reward_dict)

After
{}


In [235]:
D_real=update_D_real(D_real,env,A_1,1)

In [156]:
A_1.env_model.update_param_given_epi(D_real)

1


In [162]:
def index_to_onehot(id):
    id=torch.tensor(id)
    s_t = torch.nn.functional.one_hot(id,num_classes=7)
    s_t=s_t.type(torch.FloatTensor)
    
    return s_t

def onehot_to_index(self,x):
    id= torch.argmax(x, dim=1)
    return id

In [271]:
A_1.MBPO_train_2(D_real,1)

0
