In [1]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.nn.utils as utils
from torch.autograd import Variable

import math

from blackhc.mdp import dsl
from blackhc import mdp
import time

from blackhc.mdp import lp
import functools
import numpy as np

from tqdm import tqdm
from matplotlib import pyplot as plt
from numpy import random

from operator import itemgetter

from collections import defaultdict


In [2]:
class ReplayMemory:
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        self.position = 0
    def flush_all(self):
        self.buffer = []
        self.position = 0
        return

    def push(self, state, action, reward, next_state,policy):
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = (state, action, reward, next_state,policy)
        self.position = (self.position + 1) % self.capacity

    def push_batch(self, batch):
        if len(self.buffer) < self.capacity:
            append_len = min(self.capacity - len(self.buffer), len(batch))
            self.buffer.extend([None] * append_len)

        if self.position + len(batch) < self.capacity:
            self.buffer[self.position : self.position + len(batch)] = batch
            self.position += len(batch)
        else:
            self.buffer[self.position : len(self.buffer)] = batch[:len(self.buffer) - self.position]
            self.buffer[:len(batch) - len(self.buffer) + self.position] = batch[len(self.buffer) - self.position:]
            self.position = len(batch) - len(self.buffer) + self.position

    def sample(self, batch_size):
        if batch_size > len(self.buffer):
            batch_size = len(self.buffer)
        batch = random.sample(self.buffer, int(batch_size))
        state, action, reward, next_state,policy = map(np.stack, zip(*batch))
        return state, action, reward, next_state,policy

    def sample_all_batch(self, batch_size):
        idxes = np.random.randint(0, len(self.buffer), batch_size)
        batch = list(itemgetter(*idxes)(self.buffer))
        state, action, reward, next_state,policy = map(np.stack, zip(*batch))
        return state, action, reward, next_state,policy

    def return_all(self):
        return self.buffer

    def __len__(self):
        return len(self.buffer)

In [3]:
class pred_env:
    # initialize
    def __init__(self,horizon_length,k):
        
        self.prev_state =None
        self.curr_state =None
           
        self.state_list=list()
        self.action_list=list()
        self.state_to_action_map=dict()

        
        self.P=defaultdict()
        self.R=defaultdict()
        
        self.horizon_len=horizon_length
        self.terminal_state=None
        self.k=k
    
    def reset(self):
        
        self.state_list=list()
        self.action_list=list()
        self.state_to_action_map=dict()

        
        self.P=defaultdict()
        self.R=defaultdict()
        
    # Parameter Estimation 
    
    def update_param_given_epi(self,D_real):
        
        episodes=D_real.buffer
        

        # following SARSA format
        for epi_id in range(len(episodes)):
    
            t_states, t_actions, t_rewards,t_nstates,t_log_probs = self.cvt_axis(episodes[epi_id])
            i=0 
            while i<len(t_states):
                
                # updating the list of states
                
                if any([torch.equal(x,t_states[i]) for x in self.state_list])!=True:
                    self.state_list.append(t_states[i])
                    self.state_to_action_map.update({t_states[i]:[]})
                
                
                
                # updating the list of actions
            
            

                if any([torch.equal(x,t_actions[i]) for x in self.action_list])!=True:
                    self.action_list.append(t_actions[i])
                    
                if any([torch.equal(x,t_actions[i]) for x in self.state_to_action_map[self.smooth_check(self.state_to_action_map,t_states[i])]])!=True:
                    self.state_to_action_map[self.smooth_check(self.state_to_action_map,t_states[i])].append(t_actions[i])
                
                # # update state,action to next state count map
                
                tru_tup,flag=self.double_smooth_check(self.P,(t_states[i],t_actions[i]))
                if flag!=True:
                    self.P[(t_states[i],t_actions[i])]={t_nstates[i]:1}
                    self.R[(t_states[i],t_actions[i])]={t_nstates[i]:t_rewards[i]}
                    
                else:
                    if any([torch.equal(x,t_nstates[i]) for x in self.P[tru_tup].keys()])!=True:
                        self.P[tru_tup].update({t_nstates[i]:1})
                        self.R[tru_tup].update({t_nstates[i]:t_rewards[i]})
                        
                    else:
                        sec_tup=self.smooth_check(self.P[tru_tup],t_nstates[i])
                        self.P[tru_tup][sec_tup]+=1
                                    
                i+=1  
                
            if self.terminal_state is None and i<self.horizon_len:
                self.terminal_state=t_nstates[i-1]
                self.state_list.append(t_nstates[i-1])
        return 
    
    # Support functions
    def double_smooth_check(self,A,a):
        
        for ele in A.keys():
            if torch.equal(a[0],ele[0]) and torch.equal(a[1],ele[1]):
                return ele,True
        return a,False
    
    def smooth_check(self,A,a):

        for ele in A.keys():
            if torch.eq(a,ele).all():
                return ele
        return a
     
    def cvt_axis(self,traj):
        
        t_states =[]
        t_actions =[]
        t_nstates =[]
        t_rewards=[]
        t_log_probs=[]
        
        for i in range(len(traj[0])):
            t_states.append(traj[0][i])
            t_actions.append(traj[1][i])
            t_rewards.append(traj[2][i])
            t_nstates.append(traj[3][i])
            t_log_probs.append(traj[4][i])

        return (t_states, t_actions, t_rewards,t_nstates,t_log_probs) 
    
    def get_parameters(self):
        print("\nState list")
        print(self.state_list)
        print("\nAction list")
        print(self.action_list)
        print("\nState to action map")
        print(self.state_to_action_map)
        print("\nstate_action to next state")
        for x in self.P:
            print(x)
            print(self.P[x])
        print("\n state_action to reward map")
        for x in self.R:
            print(x)
            print(self.R[x])
        return
    
    def Is_terminal_state(self,s_t):
        if torch.equal(self.terminal_state,s_t):
            return True
        return False
    
    def set_start_state(self):
        if len(self.state_list)>0:
            
            p=[1]*len(self.state_list)
            p=[x/len(self.state_list) for x in p]
            s_t_index=np.random.choice(np.arange(len(self.state_list)),p=p)
            s_t=self.state_list[s_t_index]
            while torch.equal(s_t,self.terminal_state):
                s_t_index=np.random.choice(np.arange(len(self.state_list)),p=p)
                s_t=self.state_list[s_t_index]
            self.curr_state=s_t
        return
    
    def list_check_up(self,A,s_t):
        for x in A:
            if torch.equal(x,s_t):
                return x
        return None
   
    # Fake Data generation functions
    
    def step_v1(self,a_t):
        
        next_state=0
        un_norm_distr=self.P[self.double_smooth_check(self.P,(self.curr_state,a_t))[0]]
        norm_factor=sum(list(un_norm_distr.values()))
        choices=list(un_norm_distr.keys())
        p=[x/norm_factor for x in un_norm_distr.values()]
        
        next_state_id=np.random.choice(np.arange(len(choices)),p=p)
        next_state=choices[next_state_id]
        rew_dict=self.R[self.double_smooth_check(self.R,(self.curr_state,a_t))[0]]
        next_state_repr=None
        for x in rew_dict.keys():
            if torch.equal(x,next_state):
                next_state_repr=x
                break
        reward=rew_dict[next_state_repr]
        
        Is_done=False
        if self.Is_terminal_state(next_state):
            Is_done=True

        return next_state,reward,Is_done,None
# sample a state from D_real



In [4]:

def init_weights(m):
    if isinstance(m, nn.Linear):
        torch.nn.init.xavier_uniform_(m.weight)



class Network(nn.Module):

    def __init__(self, input_layer,output_layer):
        super(Network, self).__init__()
        
        self.fc1 = nn.Linear(input_layer, output_layer,bias=False)
        self.fc2=nn.Softmax(dim=1)

    def forward(self, input_):
        x=self.fc1(input_)
        y=self.fc2(x)
        return y
    


class Agent():

    def __init__(self,observation_space,action_space,gamma=0.99,learning_rate=1e-3,horizon_len=20,k=10,fraction_of_real=0.5,batch_size=200):

        self.model = Network(observation_space.n,action_space.n)
        self.gamma = gamma
        self.optimizer = optim.Adam(self.model.parameters(), lr=1e-3)
        self.model.train()
        self.horizon_len=horizon_len # Assuming we already know the horizon length
        self.env_model =pred_env(self.horizon_len,k)
        self.D_fake=ReplayMemory(capacity=10000)
        self.fraction_of_real=fraction_of_real
        self.batch_size=batch_size


    def init_env_model(self):
        self.env_model.set_start_state()
        self.D_fake.flush_all()
    def reset(self):
        init_weights(self.model)
        self.env_model.reset()
        self.D_fake.flush_all()

    def action(self, state):
        
        probs = self.model(Variable(state))
        action = probs.multinomial(1).data
        prob = probs[:, action[0,0]].view(1, -1)
        log_prob = prob.log()

        return(action, log_prob)
    
    def update_D_fake(self,num_of_epi,start_state=None):
        
    
        if start_state is None:
            s_t=self.env_model.curr_state
        else:
            s_t=start_state

        self.env_model.curr_state=s_t
        
        
        result=[]
        

        trajs=[]

        for traj_id in range(num_of_epi):
            
            if self.env_model.Is_terminal_state(self.env_model.curr_state):
                self.env_model.set_start_state()
            s_t=self.env_model.curr_state
            
            states=[]
            log_probs=[]
            rewards=[]
            actions=[]
            nstates=[]
            
            for t in range(self.env_model.k):
                a_t, log_prob = self.action(s_t)
                while True:
                    rlt=self.env_model.list_check_up(self.env_model.state_to_action_map,s_t)
                    if rlt is None:
                        print(self.env_model.state_to_action_map)
                        print(s_t)
                        print(a_t)
                        print(done)
                        print("Pover")
                        return 
                    else:
                        if any([torch.equal(a_t,x) for x in self.env_model.state_to_action_map[rlt]])!=True:
                                a_t, log_prob = self.action(s_t)
                        else:
                            break
                    
                ns_t, r_t, done, _ = self.env_model.step_v1(a_t)

                states.append(s_t)
                actions.append(a_t)
                log_probs.append(log_prob)
                rewards.append(r_t)
                nstates.append(ns_t)
                
                s_t=ns_t
                self.curr_state=ns_t
                if done:
                    break
            self.D_fake.push(states, actions, rewards,nstates, log_probs)      
        return 
       
    def cvt_axis(self,trajs):
        t_states = []
        t_actions = []
        t_rewards = []
        t_nstates = []
        t_log_probs = []

        for traj in trajs:
            t_states.append(traj[0])
            t_actions.append(traj[1])
            t_rewards.append(traj[2])
            t_nstates.append(traj[3])
            t_log_probs.append(traj[4])

        return (t_states, t_actions, t_rewards,t_states,t_log_probs)
    
    def reward_to_value(self,t_rewards, gamma):

        t_Rs = []

        for rewards in t_rewards:
            Rs = []
            R = torch.zeros(1, 1)

            for i in reversed(range(len(rewards))):
                R = gamma * R + rewards[i]
                Rs.insert(0, R)
            t_Rs.append(Rs)
            
        return(t_Rs)

    def cal_log_prob(self, state, action):

        probs = self.model(Variable(state))
        prob = probs[:, action[0,0]].view(1, -1)
        log_prob = prob.log()

        return(log_prob)
    
    def MBPO_train_1(self,D_real,mult_fcator=None):
        
        # Given D_real,and a multiplicative factor,will generate fake_data 
        # ST :len(D_fake)=multipl_factor*len(D_real)
        
        self.env_model.reset()
        self.env_model.update_param_given_epi(D_real)
        self.init_env_model()
        multiple_factor = (1-self.fraction_of_real)/self.fraction_of_real
        if mult_fcator is not None:
            multiple_factor=mult_fcator
        self.update_D_fake(int(multiple_factor*D_real.position))
        data_list=self.D_fake.buffer
        
        
        
        t_states, t_actions, t_rewards,t_nstates,t_log_probs = self.cvt_axis(data_list)
        t_Rs = self.reward_to_value(t_rewards, self.gamma)

        Z = 0
        b = 0
        losses = []
        Z_s = []

        for (states, actions, Rs, log_probs) in zip(t_states, t_actions, t_Rs, t_log_probs):
            p_log_prob = 0
            q_log_prob = 0
            for t in range(len(Rs)):
                p_log_prob += (self.cal_log_prob(states[t], actions[t])).data.numpy()
                q_log_prob += log_probs[t].data.numpy()
            Z_ = math.exp(p_log_prob) / math.exp(q_log_prob)
            Z += Z_
            Z_s.append(Z_)
            b += Z_ * sum(Rs) / len(Rs)
        b = b / Z
        
        for (states, actions, Rs, log_probs) in zip(t_states, t_actions, t_Rs, t_log_probs):
            loss = 0.

            for t in range(len(Rs)):
                loss = loss - (log_probs[t] * (Variable(Rs[t] - b).expand_as(log_probs[t]))).sum()

            Z_ = Z_s.pop(0)
            loss = loss / Z_
            losses.append(loss)
            
        loss = sum(losses) / Z

        self.optimizer.zero_grad()
        loss.backward(retain_graph=True)
        utils.clip_grad_value_(self.model.parameters(),40)
        self.optimizer.step()
        
        return
      
    def MBPO_train_2(self,D_real,fraction_of_real=None,mult_factor=1):
        
        # Given D_real,and the fraction of real to fake trajs,then train the policy on data comprising D_fake and D_real
        # ST real_ratio  follows the value given
        
        self.env_model.reset()
        self.env_model.update_param_given_epi(D_real)
        self.init_env_model()
        # mult_factor = (1-self.fraction_of_real)/self.fraction_of_real
        
        self.update_D_fake(int(mult_factor*D_real.position))
        
        frc_of_real=self.fraction_of_real
        if fraction_of_real is not None:
            frc_of_real=fraction_of_real
        
        self.batch_size=D_real.position
        
        num_of_real_epi=int(self.batch_size*frc_of_real)
        num_of_fake_epi=self.batch_size-num_of_real_epi
        
        pos_list=np.random.choice(a=len(self.D_fake.buffer),size=min([num_of_fake_epi,len(self.D_fake.buffer)]),replace=False)
        fake_data_list=[self.D_fake.buffer[pos] for pos in pos_list]
        
        pos_list=np.random.choice(a=len(D_real.buffer),size=min([num_of_real_epi,len(D_real.buffer)]),replace=False)
        real_data_list=[D_real.buffer[pos] for pos in pos_list]
        
        data_list=real_data_list+fake_data_list
        
        # print(len(real_data_list))
        
        t_states, t_actions, t_rewards,t_nstates,t_log_probs = self.cvt_axis(data_list)
        t_Rs = self.reward_to_value(t_rewards, self.gamma)

        Z = 0
        b = 0
        losses = []
        Z_s = []

        for (states, actions, Rs, log_probs) in zip(t_states, t_actions, t_Rs, t_log_probs):
            p_log_prob = 0
            q_log_prob = 0
            for t in range(len(Rs)):
                p_log_prob += (self.cal_log_prob(states[t], actions[t])).data.numpy()
                q_log_prob += log_probs[t].data.numpy()
            Z_ = math.exp(p_log_prob) / math.exp(q_log_prob)
            Z += Z_
            Z_s.append(Z_)
            b += Z_ * sum(Rs) / len(Rs)
        b = b / Z
        
        for (states, actions, Rs, log_probs) in zip(t_states, t_actions, t_Rs, t_log_probs):
            loss = 0.

            for t in range(len(Rs)):
                loss = loss - (log_probs[t] * (Variable(Rs[t] - b).expand_as(log_probs[t]))).sum()

            Z_ = Z_s.pop(0)
            loss = loss / Z_
            losses.append(loss)
            
        loss = sum(losses) / Z

        self.optimizer.zero_grad()
        loss.backward(retain_graph=True)
        utils.clip_grad_value_(self.model.parameters(),40)
        self.optimizer.step()
        return

    def train_(self, D_real):
        
        # Pure policy gradient
        
        data_list=D_real.buffer
        # print(len(data_list))
        
        t_states, t_actions, t_rewards,t_nstates,t_log_probs = self.cvt_axis(data_list)
        t_Rs = self.reward_to_value(t_rewards, self.gamma)

        Z = 0
        b = 0
        losses = []
        Z_s = []

        for (states, actions, Rs, log_probs) in zip(t_states, t_actions, t_Rs, t_log_probs):
            p_log_prob = 0
            q_log_prob = 0
            for t in range(len(Rs)):
                p_log_prob += (self.cal_log_prob(states[t], actions[t])).data.numpy()
                q_log_prob += log_probs[t].data.numpy()
            Z_ = math.exp(p_log_prob) / math.exp(q_log_prob)
            Z += Z_
            Z_s.append(Z_)
            b += Z_ * sum(Rs) / len(Rs)
        b = b / Z


        for (states, actions, Rs, log_probs) in zip(t_states, t_actions, t_Rs, t_log_probs):
            loss = 0.

            for t in range(len(Rs)):
                loss = loss - (log_probs[t] * (Variable(Rs[t] - b).expand_as(log_probs[t]))).sum()

            Z_ = Z_s.pop(0)
            loss = loss / Z_
            losses.append(loss)
            
        loss = sum(losses) / Z

        self.optimizer.zero_grad()
        loss.backward(retain_graph=True)
        utils.clip_grad_value_(self.model.parameters(),40)
        self.optimizer.step()





In [5]:
def  _multi_round_nmdp_simple():
    with dsl.new() as mdp:
        # Write down the MDP dynamics here 
        
        start = dsl.state()
        S_1=dsl.state()
        end = dsl.terminal_state()
        
        A_0=dsl.action()
        A_1=dsl.action()

        start & A_0 > dsl.reward(0) | dsl.reward(10)
        start & A_0 > start * 10 | end
        start & A_1 > dsl.reward(0) | dsl.reward(10) | dsl.reward(0)
        start & A_1 > start * 10 | end * 1 | S_1 * 1
        
        S_1 & A_0 > dsl.reward(0) | dsl.reward(0)
        S_1 & A_0 > S_1 * 1 | start
        S_1 & A_1 > dsl.reward(0) | dsl.reward(10)
        S_1 & A_1 > start * 5 | end
        
        dsl.discount(0.5)

        return mdp.validate()
    
def  _multi_round_nmdp_complex():
    with dsl.new() as mdp:
        # Write down the MDP dynamics here 
        
        start = dsl.state()
        S_1=dsl.state()
        S_2=dsl.state()
        S_3=dsl.state()
        S_4=dsl.state()
        S_5=dsl.state()
        end = dsl.terminal_state()
        
        A_0=dsl.action()
        A_1=dsl.action()

        start & A_0 > dsl.reward(10) | dsl.reward(0)
        start & A_0 > end * 1 | start
        start & A_1 > dsl.reward(0) | dsl.reward(0)
        start & A_1 > start * 1 | S_1
        
        S_1 & A_0 > dsl.reward(0) | dsl.reward(0)
        S_1 & A_0 > S_1 * 1 | start
        S_1 & A_1 > dsl.reward(0) | dsl.reward(0)
        S_1 & A_1 > S_1 * 1 | S_2
        
        S_2 & A_0 > dsl.reward(0) | dsl.reward(0)
        S_2 & A_0 > S_2 * 1 | S_1
        S_2 & A_1 > dsl.reward(0) | dsl.reward(0)
        S_2 & A_1 > S_2 * 1 | S_3
        
        S_3 & A_0 > dsl.reward(0) | dsl.reward(0)
        S_3 & A_0 > S_3 * 1 | S_2
        S_3 & A_1 > dsl.reward(0) | dsl.reward(0)
        S_3 & A_1 > S_3 * 1 | S_4
        
        S_4 & A_0 > dsl.reward(0) | dsl.reward(0)
        S_4 & A_0 > S_4 * 1 | S_3
        S_4 & A_1 > dsl.reward(0) | dsl.reward(0)
        S_4 & A_1 > S_4 * 1 | S_5
        
        S_5 & A_0 > dsl.reward(0) | dsl.reward(0)
        S_5 & A_0 > S_5 * 1 | S_4
        S_5 & A_1 > dsl.reward(10) | dsl.reward(0)
        S_5 & A_1 > end * 1 | S_1
        
        dsl.discount(0.5)

        return mdp.validate() 

MULTI_ROUND_NDMP = _multi_round_nmdp_complex()



solver = lp.LinearProgramming(MULTI_ROUND_NDMP)
solver.compute_q_table(max_iterations=10000, all_close=functools.partial(np.allclose, rtol=1e-10, atol=1e-10))
# mdp.display_mdp(MULTI_ROUND_NDMP)


array([[6.66666667, 2.22222222],
       [2.22222222, 0.74074074],
       [0.74074074, 0.33950617],
       [0.33950617, 0.61728395],
       [0.61728395, 1.85185185],
       [1.85185185, 5.55555556],
       [0.        , 0.        ]])

In [6]:
D_real=ReplayMemory(capacity=10000)


# Importance Sampling-based Agent

# Defining MDP-following openai gym structure

In [7]:
# model estimation



In [8]:
env = MULTI_ROUND_NDMP.to_env()
env.reset()
A1=Agent(env.observation_space,env.action_space)
A1.env_model.reset()


In [9]:
def display_env():    
    env.render()
    env.render_widget.width=500
    time.sleep(0.200)

# Graphical-representation of the MDP 

In [10]:
# display_env()

In [11]:
# num_of_epochs=1000
# horizon_len=40

num_of_epochs=30
horizon_len=20

# Off-policy policy gradient 

In [12]:

# update D_real


def update_D_real(D_real,env,Agent,num_of_epochs):
    
    A1=Agent
    
    
    env.reset()
    
    s_t_index=env._state.index
    s_t=F.one_hot(torch.tensor(s_t_index),num_classes=env.observation_space.n).unsqueeze(dim=0)
    s_t=s_t.type(torch.FloatTensor)

    trajs=[]
    # D_real.flush_all()

    result=0


    for traj_id in range(num_of_epochs):
        env.reset()
        # display_env()
        s_t_index=env._state.index
        
        states=[]
        log_probs=[]
        rewards=[]
        actions=[]
        nstates=[]
        
        for t in range(horizon_len):
            s_t=F.one_hot(torch.tensor(s_t_index),num_classes=env.observation_space.n).unsqueeze(dim=0)
            s_t=s_t.type(torch.FloatTensor)
            a_t, log_prob = A1.action(s_t)
            ns_t, r_t, done, _ = env.step(a_t.numpy()[0][0])
            # display_env()
            if t!=0:
                nstates.append(s_t)
            states.append(s_t)
            actions.append(a_t)
            log_probs.append(log_prob)
            rewards.append(r_t)
            s_t_index=ns_t
            if done:
                break
        # time.sleep(2)
        s_t=F.one_hot(torch.tensor(s_t_index),num_classes=env.observation_space.n).unsqueeze(dim=0)
        s_t=s_t.type(torch.FloatTensor)
        nstates.append(s_t)
        if traj_id==0:
            result=sum(rewards)    
        D_real.push(states, actions, rewards,nstates, log_probs)
    
    return D_real


In [13]:
list(A1.model.fc1.parameters())

[Parameter containing:
 tensor([[ 0.1560, -0.0975, -0.3777,  0.2764, -0.0925,  0.1440,  0.3418],
         [ 0.3479,  0.1932,  0.3007,  0.2642, -0.2043,  0.3201,  0.3473]],
        requires_grad=True)]

In [14]:
class Test_bench:
    def __init__(self):
        self.horizon_len=20     
        self.num_of_real_epi=10
        self.k=10
        self.mult_factor=1.0
        self.real_ratio=0.5
        self.capacity=10000
        self.batch_size=2000
        self.learning_rate=0.001
        self.gamma=0.99
        
        self.num_of_outerloop=100 # outer loop
        self.num_of_innerloop=10 # inner loop
        
        self.D_real=ReplayMemory(self.capacity)
        self.env=None
        self.Agent=None
    def init_play_ground(self,env):
        self.env=env
        self.env.reset()
        self.A1=Agent(self.env.observation_space,self.env.action_space,gamma=self.gamma,learning_rate=self.learning_rate,horizon_len=self.horizon_len,k=self.k,fraction_of_real=self.real_ratio,batch_size=self.batch_size)
        self.A1.reset()
        
    def reset_play_ground(self):
        self.env.reset()
        # self.A1.reset()     #######      !!!!!!!!!!!!!!!      ########
        self.D_real.flush_all()
        
    def display_env(self):    
        self.env.render()
        self.env.render_widget.width=500
        time.sleep(0.200)
        
    def update_D_real(self,num_of_epochs=None):
    
        num_of_episodes=self.num_of_real_epi
        if num_of_epochs is not None:
            num_of_episodes=num_of_epochs
        
        
        self.env.reset()
        
        s_t_index=self.env._state.index
        s_t=F.one_hot(torch.tensor(s_t_index),num_classes=self.env.observation_space.n).unsqueeze(dim=0)
        s_t=s_t.type(torch.FloatTensor)

        trajs=[]
        # D_real.flush_all()

        result=[]


        for traj_id in range(num_of_episodes):
            self.env.reset()
            # display_env()
            s_t_index=self.env._state.index
            
            states=[]
            log_probs=[]
            rewards=[]
            actions=[]
            nstates=[]
        
            for t in range(self.horizon_len):
                s_t=F.one_hot(torch.tensor(s_t_index),num_classes=self.env.observation_space.n).unsqueeze(dim=0)
                s_t=s_t.type(torch.FloatTensor)
                a_t, log_prob = self.A1.action(s_t)
                ns_t, r_t, done, _ = self.env.step(a_t.numpy()[0][0])
                # display_env()
                if t!=0:
                    nstates.append(s_t)
                states.append(s_t)
                actions.append(a_t)
                log_probs.append(log_prob)
                rewards.append(r_t)
                s_t_index=ns_t
                if done:
                    break
            # time.sleep(2)
            s_t=F.one_hot(torch.tensor(s_t_index),num_classes=self.env.observation_space.n).unsqueeze(dim=0)
            s_t=s_t.type(torch.FloatTensor)
            nstates.append(s_t)

            result.append(self.reward_to_value(rewards,0.5)) 
            self.D_real.push(states, actions, rewards,nstates, log_probs)
        
        
        return result
    
    def reward_to_value(self,t_rewards, gamma):

        Rs = []
        R = torch.zeros(1, 1)
        gamma=0.5
        for i in reversed(range(len(t_rewards))):
            R = gamma * R + t_rewards[i]
            
        return(R)
    
    def perform_pure_fake(self,mul_factor,init_params):
        
        # MBPO based agent 

        self.reset_play_ground()
        result=[]
        
        for param in self.A1.model.fc1.parameters():
            param.data = nn.parameter.Parameter(init_params)

        print("\nBefore training:")

        print(list(self.A1.model.fc1.parameters()))
        
        
        
        for x in tqdm(range(self.num_of_outerloop)):
            result=result+self.update_D_real(num_of_epochs=10)
            for i in range(self.num_of_innerloop):
                self.A1.MBPO_train_1(self.D_real,mul_factor)
                # A1.MBPO_train_2(D_real)
                # A1.train_(D_real)
                # pass
        print("\nAfter training:")
        print(list(self.A1.model.fc1.parameters()))
        
        return result
    
    def perform_mixed_strategy(self,fraction,init_params,mult_factor=1):
        
        # MBPO based agent 

        self.reset_play_ground()
        result=[]
        
        for param in self.A1.model.fc1.parameters():
            param.data = nn.parameter.Parameter(init_params)
        
        print("\nBefore training:")

        print(list(self.A1.model.fc1.parameters()))
        
        for x in tqdm(range(self.num_of_outerloop)):
            result=result+self.update_D_real(num_of_epochs=10)
            if x%25==0:
                print(list(self.A1.model.fc1.parameters()))
                
            for i in range(self.num_of_innerloop):
                # self.A1.MBPO_train_1(self.D_real)
                self.A1.MBPO_train_2(self.D_real,fraction,mult_factor)
                # self.A1.train_(D_real)
                # pass
        print("\nAfter training:")
        print(list(self.A1.model.fc1.parameters()))
        
        return result
    
    def perform_pure_real(self,init_params=None,outerloop=100):
        
        # MBPO based agent 

        self.reset_play_ground()
        result=[]
        if init_params is not None:
            for param in self.A1.model.fc1.parameters():
                param.data = nn.parameter.Parameter(init_params)
        else:
            init_weights(self.A1.model.fc1)

        print("\nBefore training:")

        print(list(self.A1.model.fc1.parameters()))
        
        self.num_of_outerloop=outerloop
        
        for x in tqdm(range(self.num_of_outerloop)):
            result=result+self.update_D_real(num_of_epochs=10)
            
            if x%25==0:
                print(list(self.A1.model.fc1.parameters()))

            for i in range(self.num_of_innerloop):
                # self.A1.MBPO_train_1(self.D_real)
                # self.A1.MBPO_train_2(D_real)
                self.A1.train_(self.D_real)
                # pass
        print("\nAfter training:")
        print(list(self.A1.model.fc1.parameters()))
        
        return result
    
    def Fraction_exp(self,list_of_fraction_of_real):
        
        result=[]
        for real_fraction in list_of_fraction_of_real:
            trajs=[]
            for i in range(10):
                trajs.append(self.perform_mixed_strategy(real_fraction))
            result.append(trajs)
            
        # result=np.array(result)
        A=np.array(result)
        mean=A.mean(axis=1)
        std=np.sqrt(A.var(axis=1))
        UCB=np.add(mean,std)
        MEAN=mean
        LCB=np.subtract(mean,std)
        
        for i in range(len(list_of_fraction_of_real)):
            plt.title(list_of_fraction_of_real[i])
            plt.plot(MEAN[i],'-b', label='mean')
            plt.plot(LCB[i],'-r',label="LCB")
            plt.plot(UCB[i],'-g',label="UCB")
            # fill the area with black color, opacity 0.15
            plt.fill_between(list(range(len(MEAN[i]))), UCB[i],LCB[i], color="k", alpha=0.15)
            plt.xlabel("Time axis")
            plt.ylabel("Result")
            plt.legend()
            plt.show()
            plt.clf()
        return
                
  

In [15]:
# MBPO based agent 

# D_real.flush_all()
# init_weights(A1.model)

# print("\nBefore training:")

# print(list(A1.model.fc1.parameters()))

# for x in tqdm(range(100)):
#     D_real=update_D_real(D_real,env,A1,10)
#     for i in range(10):
#         # A1.MBPO_train_1(D_real)
#         # A1.MBPO_train_2(D_real)
#         A1.train_(D_real)
#         # pass
# print(list(A1.model.fc1.parameters()))
# print("\nAfter training:")

In [16]:
import copy

In [17]:
play_ground=Test_bench()    # create an instance of Test_bench
env = MULTI_ROUND_NDMP.to_env() # initialize the environment
play_ground.init_play_ground(env=env)   # initialize Playground
play_ground.reset_play_ground() # setting to the Default state of Playground
temp=copy.copy(list(play_ground.A1.model.fc1.parameters()))#
# init_params=copy.copy(temp[0].data).detach()

In [18]:
temp[0]

Parameter containing:
tensor([[ 0.1741,  0.1458,  0.0439,  0.0518, -0.2410,  0.0314,  0.3139],
        [-0.0708,  0.2966, -0.0441,  0.2373, -0.0035,  0.3455, -0.3437]],
       requires_grad=True)

In [25]:
# init_params=temp
# init_para=init_params[0].detach().numpy()
# np.savetxt("./experiment/init_param_big.csv", init_para, delimiter=",")

In [26]:
my_data = np.genfromtxt("./experiment/init_param_big.csv", delimiter=',')
init_params=torch.from_numpy(my_data)

NameError: name 'init_params' is not defined

In [27]:



for param in play_ground.A1.model.fc1.parameters():
    param.data = nn.parameter.Parameter(init_params)

# init_weights(play_ground.A1.model.fc1)
# for param in play_ground.A1.model.fc1.parameters():
#     param=temp

# init_weights(play_ground.A1.model.fc1)
print(list(play_ground.A1.model.fc1.parameters()))
# init_params=torch.Tensor([[-0.7012,  0.9787,  0.8131],
#         [-0.4855,  0.4138, -0.2821]])
# init_para=init_params.numpy()
# np.savetxt("./experiment/init_param_small.csv", init_para, delimiter=",")



# init_params=torch.Tensor([[ 1.9763, -1.9540,  0.8131],
#         [-3.1630,  3.3465, -0.2821]])
# init_para=init_params.numpy()
# np.savetxt("./experiment/after_param_small.csv", init_para, delimiter=",")
       


[Parameter containing:
tensor([[ 0.0174, -0.1990, -0.0372,  0.3741,  0.0769,  0.0338,  0.3562],
        [-0.1931,  0.2764, -0.0541, -0.0509,  0.2145, -0.2138,  0.1645]],
       dtype=torch.float64, requires_grad=True)]


In [4]:
# init_params=torch.Tensor([[ 1.9763, -1.9540,  0.8131],
#         [-3.1630,  3.3465, -0.2821]])
# init_para=init_params.numpy()
# np.savetxt("./experiment/after_param_small.csv", init_para, delimiter=",")

In [24]:
my_data = np.genfromtxt("./experiment/init_param_big.csv", delimiter=',')
init_params=torch.from_numpy(my_data)
play_ground.num_of_outerloop=1000
play_ground.num_of_innerloop=1
result=play_ground.perform_pure_real(init_params.float(),play_ground.num_of_outerloop)
# play_ground.num_of_outerloop=10
# result=play_ground.perform_mixed_strategy(1,init_params.float())



Before training:
[Parameter containing:
tensor([[ 0.0174, -0.1990, -0.0372,  0.3741,  0.0769,  0.0338,  0.3562],
        [-0.1931,  0.2764, -0.0541, -0.0509,  0.2145, -0.2138,  0.1645]],
       requires_grad=True)]


  0%|          | 3/1000 [00:00<00:52, 19.11it/s]

[Parameter containing:
tensor([[ 0.0174, -0.1990, -0.0372,  0.3741,  0.0769,  0.0338,  0.3562],
        [-0.1931,  0.2764, -0.0541, -0.0509,  0.2145, -0.2138,  0.1645]],
       requires_grad=True)]


  2%|▎         | 25/1000 [00:05<05:30,  2.95it/s]

[Parameter containing:
tensor([[ 0.0417, -0.1772, -0.0216,  0.3635,  0.0472, -0.0048,  0.3562],
        [-0.2174,  0.2546, -0.0697, -0.0404,  0.2443, -0.1751,  0.1645]],
       requires_grad=True)]


  5%|▌         | 50/1000 [00:19<11:22,  1.39it/s]

[Parameter containing:
tensor([[ 0.0629, -0.1577,  0.0036,  0.3715,  0.0167, -0.0354,  0.3562],
        [-0.2386,  0.2351, -0.0949, -0.0483,  0.2748, -0.1446,  0.1645]],
       requires_grad=True)]


  8%|▊         | 75/1000 [00:43<16:35,  1.08s/it]

[Parameter containing:
tensor([[ 0.0849, -0.1363,  0.0316,  0.3768, -0.0114, -0.0618,  0.3562],
        [-0.2607,  0.2137, -0.1229, -0.0536,  0.3028, -0.1181,  0.1645]],
       requires_grad=True)]


 10%|█         | 100/1000 [01:16<21:03,  1.40s/it]

[Parameter containing:
tensor([[ 0.1081, -0.1138,  0.0634,  0.3826, -0.0397, -0.0887,  0.3562],
        [-0.2838,  0.1912, -0.1547, -0.0595,  0.3311, -0.0913,  0.1645]],
       requires_grad=True)]


 12%|█▎        | 125/1000 [01:57<25:29,  1.75s/it]

[Parameter containing:
tensor([[ 0.1330, -0.0902,  0.0999,  0.3905, -0.0694, -0.1166,  0.3562],
        [-0.3087,  0.1676, -0.1912, -0.0673,  0.3608, -0.0633,  0.1645]],
       requires_grad=True)]


 15%|█▌        | 150/1000 [02:47<30:31,  2.16s/it]

[Parameter containing:
tensor([[ 0.1597, -0.0651,  0.1431,  0.4072, -0.1023, -0.1463,  0.3562],
        [-0.3354,  0.1425, -0.2344, -0.0841,  0.3937, -0.0337,  0.1645]],
       requires_grad=True)]


 18%|█▊        | 175/1000 [03:45<33:53,  2.46s/it]

[Parameter containing:
tensor([[ 1.8858e-01, -3.7383e-02,  1.9463e-01,  4.4271e-01, -1.4177e-01,
         -1.7988e-01,  3.5616e-01],
        [-3.6430e-01,  1.1478e-01, -2.8591e-01, -1.1955e-01,  4.3319e-01,
         -5.1435e-05,  1.6450e-01]], requires_grad=True)]


 20%|██        | 200/1000 [04:53<37:43,  2.83s/it]

[Parameter containing:
tensor([[ 0.2210, -0.0047,  0.2556,  0.5098, -0.1920, -0.2207,  0.3562],
        [-0.3967,  0.0821, -0.3469, -0.1866,  0.4834,  0.0408,  0.1645]],
       requires_grad=True)]


 22%|██▎       | 225/1000 [06:09<40:23,  3.13s/it]

[Parameter containing:
tensor([[ 0.2598,  0.0389,  0.3314,  0.6022, -0.2621, -0.2782,  0.3562],
        [-0.4356,  0.0385, -0.4226, -0.2791,  0.5535,  0.0983,  0.1645]],
       requires_grad=True)]


 25%|██▌       | 250/1000 [07:33<42:22,  3.39s/it]

[Parameter containing:
tensor([[ 0.3146,  0.1068,  0.4254,  0.7049, -0.3562, -0.3646,  0.3562],
        [-0.4903, -0.0294, -0.5167, -0.3818,  0.6476,  0.1847,  0.1645]],
       requires_grad=True)]


 28%|██▊       | 275/1000 [09:06<45:51,  3.79s/it]

[Parameter containing:
tensor([[ 0.4004,  0.2042,  0.5190,  0.8010, -0.4509, -0.4672,  0.3562],
        [-0.5761, -0.1268, -0.6103, -0.4778,  0.7423,  0.2873,  0.1645]],
       requires_grad=True)]


 30%|███       | 300/1000 [10:45<46:07,  3.95s/it]

[Parameter containing:
tensor([[ 0.5020,  0.2907,  0.5918,  0.8747, -0.5243, -0.5498,  0.3562],
        [-0.6777, -0.2133, -0.6831, -0.5515,  0.8157,  0.3698,  0.1645]],
       requires_grad=True)]


 32%|███▎      | 325/1000 [12:32<47:38,  4.23s/it]

[Parameter containing:
tensor([[ 0.5837,  0.3590,  0.6522,  0.9356, -0.5850, -0.6156,  0.3562],
        [-0.7594, -0.2816, -0.7435, -0.6124,  0.8764,  0.4357,  0.1645]],
       requires_grad=True)]


 35%|███▌      | 350/1000 [14:26<50:41,  4.68s/it]

[Parameter containing:
tensor([[ 0.6494,  0.4171,  0.7054,  0.9892, -0.6385, -0.6723,  0.3562],
        [-0.8251, -0.3397, -0.7967, -0.6660,  0.9299,  0.4923,  0.1645]],
       requires_grad=True)]


 38%|███▊      | 375/1000 [16:24<50:06,  4.81s/it]

[Parameter containing:
tensor([[ 0.7063,  0.4690,  0.7540,  1.0379, -0.6871, -0.7232,  0.3562],
        [-0.8820, -0.3916, -0.8452, -0.7148,  0.9785,  0.5432,  0.1645]],
       requires_grad=True)]


 40%|████      | 400/1000 [18:31<52:10,  5.22s/it]

[Parameter containing:
tensor([[ 0.7574,  0.5166,  0.7991,  1.0833, -0.7324, -0.7701,  0.3562],
        [-0.9332, -0.4392, -0.8904, -0.7601,  1.0238,  0.5902,  0.1645]],
       requires_grad=True)]


 42%|████▎     | 425/1000 [20:41<50:16,  5.25s/it]

[Parameter containing:
tensor([[ 0.8047,  0.5613,  0.8418,  1.1261, -0.7752, -0.8142,  0.3562],
        [-0.9804, -0.4839, -0.9331, -0.8029,  1.0666,  0.6342,  0.1645]],
       requires_grad=True)]


 45%|████▌     | 450/1000 [22:56<50:41,  5.53s/it]

[Parameter containing:
tensor([[ 0.8491,  0.6035,  0.8824,  1.1668, -0.8159, -0.8560,  0.3562],
        [-1.0248, -0.5261, -0.9737, -0.8437,  1.1073,  0.6760,  0.1645]],
       requires_grad=True)]


 48%|████▊     | 475/1000 [25:18<50:08,  5.73s/it]

[Parameter containing:
tensor([[ 0.8912,  0.6439,  0.9215,  1.2060, -0.8550, -0.8960,  0.3562],
        [-1.0669, -0.5665, -1.0128, -0.8828,  1.1464,  0.7160,  0.1645]],
       requires_grad=True)]


 50%|█████     | 500/1000 [27:45<50:03,  6.01s/it]

[Parameter containing:
tensor([[ 0.9315,  0.6828,  0.9592,  1.2438, -0.8928, -0.9345,  0.3562],
        [-1.1072, -0.6054, -1.0505, -0.9206,  1.1842,  0.7546,  0.1645]],
       requires_grad=True)]


 52%|█████▎    | 525/1000 [30:17<48:50,  6.17s/it]

[Parameter containing:
tensor([[ 0.9704,  0.7204,  0.9958,  1.2805, -0.9294, -0.9718,  0.3562],
        [-1.1461, -0.6430, -1.0871, -0.9573,  1.2208,  0.7919,  0.1645]],
       requires_grad=True)]


 55%|█████▌    | 550/1000 [32:54<48:21,  6.45s/it]

[Parameter containing:
tensor([[ 1.0080,  0.7569,  1.0315,  1.3162, -0.9651, -1.0081,  0.3562],
        [-1.1837, -0.6795, -1.1228, -0.9930,  1.2565,  0.8282,  0.1645]],
       requires_grad=True)]


 57%|█████▊    | 575/1000 [35:37<47:09,  6.66s/it]

[Parameter containing:
tensor([[ 1.0445,  0.7925,  1.0663,  1.3511, -1.0000, -1.0435,  0.3562],
        [-1.2202, -0.7151, -1.1576, -1.0279,  1.2914,  0.8636,  0.1645]],
       requires_grad=True)]


 60%|██████    | 600/1000 [38:24<46:09,  6.92s/it]

[Parameter containing:
tensor([[ 1.0802,  0.8273,  1.1004,  1.3852, -1.0341, -1.0781,  0.3562],
        [-1.2559, -0.7499, -1.1917, -1.0621,  1.3255,  0.8982,  0.1645]],
       requires_grad=True)]


 62%|██████▎   | 625/1000 [41:13<42:20,  6.78s/it]

[Parameter containing:
tensor([[ 1.1150,  0.8614,  1.1339,  1.4188, -1.0676, -1.1120,  0.3562],
        [-1.2907, -0.7840, -1.2252, -1.0956,  1.3591,  0.9321,  0.1645]],
       requires_grad=True)]


 65%|██████▌   | 650/1000 [44:06<40:53,  7.01s/it]

[Parameter containing:
tensor([[ 1.1492,  0.8948,  1.1668,  1.4517, -1.1006, -1.1453,  0.3562],
        [-1.3249, -0.8174, -1.2581, -1.1286,  1.3920,  0.9654,  0.1645]],
       requires_grad=True)]


 68%|██████▊   | 675/1000 [47:04<39:05,  7.22s/it]

[Parameter containing:
tensor([[ 1.1827,  0.9278,  1.1992,  1.4842, -1.1330, -1.1781,  0.3562],
        [-1.3584, -0.8504, -1.2905, -1.1610,  1.4244,  0.9982,  0.1645]],
       requires_grad=True)]


 70%|███████   | 700/1000 [50:07<36:37,  7.33s/it]

[Parameter containing:
tensor([[ 1.2157,  0.9602,  1.2312,  1.5162, -1.1650, -1.2104,  0.3562],
        [-1.3914, -0.8828, -1.3225, -1.1930,  1.4564,  1.0305,  0.1645]],
       requires_grad=True)]


 72%|███████▎  | 725/1000 [53:13<34:07,  7.45s/it]

[Parameter containing:
tensor([[ 1.2482,  0.9922,  1.2627,  1.5477, -1.1966, -1.2423,  0.3562],
        [-1.4239, -0.9148, -1.3540, -1.2246,  1.4880,  1.0623,  0.1645]],
       requires_grad=True)]


 75%|███████▌  | 750/1000 [56:26<32:36,  7.83s/it]

[Parameter containing:
tensor([[ 1.2802,  1.0237,  1.2939,  1.5790, -1.2277, -1.2737,  0.3562],
        [-1.4560, -0.9463, -1.3852, -1.2558,  1.5192,  1.0938,  0.1645]],
       requires_grad=True)]


 78%|███████▊  | 775/1000 [59:42<29:33,  7.88s/it]

[Parameter containing:
tensor([[ 1.3119,  1.0549,  1.3247,  1.6098, -1.2586, -1.3048,  0.3562],
        [-1.4876, -0.9775, -1.4160, -1.2867,  1.5500,  1.1249,  0.1645]],
       requires_grad=True)]


 80%|████████  | 800/1000 [1:03:03<27:09,  8.15s/it]

[Parameter containing:
tensor([[ 1.3432,  1.0858,  1.3553,  1.6404, -1.2891, -1.3356,  0.3562],
        [-1.5189, -1.0084, -1.4465, -1.3172,  1.5806,  1.1557,  0.1645]],
       requires_grad=True)]


 82%|████████▎ | 825/1000 [1:06:42<25:30,  8.74s/it]

[Parameter containing:
tensor([[ 1.3741,  1.1164,  1.3855,  1.6706, -1.3194, -1.3661,  0.3562],
        [-1.5498, -1.0390, -1.4768, -1.3475,  1.6108,  1.1861,  0.1645]],
       requires_grad=True)]


 85%|████████▌ | 850/1000 [1:10:20<21:44,  8.70s/it]

[Parameter containing:
tensor([[ 1.4047,  1.1466,  1.4155,  1.7006, -1.3494, -1.3963,  0.3562],
        [-1.5805, -1.0692, -1.5067, -1.3775,  1.6408,  1.2163,  0.1645]],
       requires_grad=True)]


 88%|████████▊ | 875/1000 [2:24:44<1:33:08, 44.71s/it]  

[Parameter containing:
tensor([[ 1.4351,  1.1766,  1.4452,  1.7304, -1.3791, -1.4262,  0.3562],
        [-1.6108, -1.0992, -1.5365, -1.4072,  1.6705,  1.2463,  0.1645]],
       requires_grad=True)]


 90%|█████████ | 900/1000 [2:28:40<16:12,  9.73s/it]  

[Parameter containing:
tensor([[ 1.4652,  1.2064,  1.4747,  1.7599, -1.4086, -1.4559,  0.3562],
        [-1.6409, -1.1290, -1.5660, -1.4367,  1.7000,  1.2760,  0.1645]],
       requires_grad=True)]


 92%|█████████▎| 925/1000 [2:32:27<11:12,  8.96s/it]

[Parameter containing:
tensor([[ 1.4950,  1.2360,  1.5040,  1.7892, -1.4379, -1.4854,  0.3562],
        [-1.6707, -1.1586, -1.5953, -1.4660,  1.7294,  1.3054,  0.1645]],
       requires_grad=True)]


 95%|█████████▌| 950/1000 [2:36:36<08:13,  9.87s/it]

[Parameter containing:
tensor([[ 1.5246,  1.2653,  1.5331,  1.8183, -1.4670, -1.5146,  0.3562],
        [-1.7003, -1.1879, -1.6244, -1.4952,  1.7585,  1.3347,  0.1645]],
       requires_grad=True)]


 98%|█████████▊| 975/1000 [2:40:28<03:56,  9.45s/it]

[Parameter containing:
tensor([[ 1.5540,  1.2944,  1.5620,  1.8472, -1.4960, -1.5437,  0.3562],
        [-1.7297, -1.2170, -1.6533, -1.5241,  1.7874,  1.3638,  0.1645]],
       requires_grad=True)]


100%|██████████| 1000/1000 [2:44:24<00:00,  9.86s/it]


After training:
[Parameter containing:
tensor([[ 1.5832,  1.3234,  1.5907,  1.8760, -1.5247, -1.5726,  0.3562],
        [-1.7589, -1.2460, -1.6820, -1.5528,  1.8161,  1.3927,  0.1645]],
       requires_grad=True)]





In [27]:
my_data = np.genfromtxt("./experiment/init_param_big.csv", delimiter=',')
init_params=torch.from_numpy(my_data)
# result=play_ground.perform_pure_real(init_params.float(),100)
play_ground.num_of_outerloop=1000
play_ground.num_of_innerloop=1
result=play_ground.perform_pure_fake(1,init_params.float())


Before training:
[Parameter containing:
tensor([[ 0.0174, -0.1990, -0.0372,  0.3741,  0.0769,  0.0338,  0.3562],
        [-0.1931,  0.2764, -0.0541, -0.0509,  0.2145, -0.2138,  0.1645]],
       requires_grad=True)]


  5%|▌         | 51/1000 [00:28<17:44,  1.12s/it]

In [25]:
init_params=torch.Tensor([[ 1.5832,  1.3234,  1.5907,  1.8760, -1.5247, -1.5726,  0.3562],
        [-1.7589, -1.2460, -1.6820, -1.5528,  1.8161,  1.3927,  0.1645]])
init_para=init_params.numpy()
np.savetxt("./experiment/inner_outer_variation/pure_real_inner_1_outer_1000.csv", init_para, delimiter=",")

In [55]:
result[4]

tensor([[0.]])

In [52]:
result[1997]

tensor([[10.]])

In [None]:
my_data = np.genfromtxt("./experiment/init_param_small.csv", delimiter=',')
init_params=torch.from_numpy(my_data)
result=play_ground.perform_pure_fake(1,init_params.float())

In [None]:
init_params

In [None]:
my_data = np.genfromtxt("./experiment/init_param_small.csv", delimiter=',')
init_params=torch.from_numpy(my_data)
result=play_ground.perform_mixed_strategy(1,init_params.float())

In [None]:
init_params

# Primal

In [57]:


fract_list=np.arange(0,1.1,.1)
prim_policy_list=[]
for fract in tqdm(fract_list):
    fract=np.round(fract,2)
    my_data = np.genfromtxt("./experiment/init_param_small.csv", delimiter=',')
    init_params=torch.from_numpy(my_data)
    result=play_ground.perform_mixed_strategy(fract,init_params.float())
    prim_policy_list.append(list(play_ground.A1.model.fc1.parameters()))
prim_policy_list=np.array(prim_policy_list)
np.savetxt("./experiment/exp_rslt_1.csv", prim_policy_list, delimiter=",")
    

  0%|          | 0/11 [00:00<?, ?it/s]


Before training:
[Parameter containing:
tensor([[-0.7012,  0.9787,  0.8131],
        [-0.4855,  0.4138, -0.2821]], requires_grad=True)]




In [None]:
prim_policy_list

# Secondary

In [None]:
fract_list=np.arange(0,1.1,.1)
sec_policy_list=[]
number_of_times=10

for fract in tqdm(fract_list):
    fract=np.round(fract,2)
    policy_for_fract=[]
    for i in range(number_of_times):
        my_data = np.genfromtxt("./experiment/init_param_small.csv", delimiter=',')
        init_params=torch.from_numpy(my_data)
        result=play_ground.perform_mixed_strategy(fract,init_params.float())
        policy_for_fract.append(list(play_ground.A1.model.fc1.parameters()))
    policy_for_fract=np.array(policy_for_fract)
    sec_policy_list.append(policy_for_fract)
sec_policy_list=np.array(sec_policy_list)
np.savetxt("./experiment/exp_rslt_2.csv", sec_policy_list, delimiter=",")

In [None]:
sec_policy_list

In [47]:
true_Q=solver.compute_q_table(max_iterations=10000, all_close=functools.partial(np.allclose, rtol=1e-10, atol=1e-10))
np.savetxt("./experiment/true_Q.csv", true_Q, delimiter=",")
true_Q

array([[9.16666667, 7.52025463],
       [4.49652778, 8.81944444],
       [0.        , 0.        ]])

In [None]:

np.arange(0,1,0.1)


In [21]:
def reward_array_with_a_policy(policy_param,num_of_episodes):
        play_ground.env.reset()
        
        s_t_index=play_ground.env._state.index
        s_t=F.one_hot(torch.tensor(s_t_index),num_classes=play_ground.env.observation_space.n).unsqueeze(dim=0)
        s_t=s_t.type(torch.FloatTensor)

        trajs=[]
        # D_real.flush_all()

        result=[]
        
        for param in play_ground.A1.model.fc1.parameters():
            param.data = nn.parameter.Parameter(policy_param)
        
        
        b_states=[]
        b_log_probs=[]
        b_rewards=[]
        b_actions=[]
        b_nstates=[]
        b_count=[]
        
        
        for traj_id in range(num_of_episodes):
            play_ground.env.reset()
            # display_env()
            s_t_index=play_ground.env._state.index
            
            states=[]
            log_probs=[]
            rewards=[]
            actions=[]
            nstates=[]
        
            for t in range(100):
                s_t=F.one_hot(torch.tensor(s_t_index),num_classes=play_ground.env.observation_space.n).unsqueeze(dim=0)
                s_t=s_t.type(torch.FloatTensor)
                a_t, log_prob = play_ground.A1.action(s_t)
                ns_t, r_t, done, _ = play_ground.env.step(a_t.numpy()[0][0])
                # display_env()
                if t!=0:
                    nstates.append(s_t)
                states.append(s_t)
                actions.append(a_t)
                log_probs.append(log_prob)
                rewards.append(r_t)
                s_t_index=ns_t
                if done:
                    break
            # time.sleep(2)
            s_t=F.one_hot(torch.tensor(s_t_index),num_classes=play_ground.env.observation_space.n).unsqueeze(dim=0)
            s_t=s_t.type(torch.FloatTensor)
            nstates.append(s_t)
            
            b_states.append(states)
            b_actions.append(actions)
            b_rewards.append(rewards)
            b_states.append(nstates)
            b_count.append(len(rewards))
        return b_states,b_actions,b_rewards,b_nstates,sum(b_count)/len(b_count)
    

# Plain-Test 

In [26]:
my_data = np.genfromtxt("./experiment/init_param_big.csv", delimiter=',')
init_params=torch.from_numpy(my_data)
rew_with_bad=reward_array_with_a_policy(init_params.float(),10000)
print("Reward_array with bad parameters ")
print(rew_with_bad[4])

# my_data = np.genfromtxt("./experiment/alpha_variation/after_param_big_alpha_1.csv", delimiter=',')
# init_params=torch.from_numpy(my_data)
# rew_with_good=reward_array_with_a_policy(init_params.float(),10000)
# print("\nReward_array with good parameters : fraction 1")
# print(rew_with_good[4])

# my_data = np.genfromtxt("./experiment/alpha_variation/after_param_big_alpha_point_0.csv", delimiter=',')
# init_params=torch.from_numpy(my_data)
# rew_with_good=reward_array_with_a_policy(init_params.float(),10000)
# print("\nReward_array with good parameters : fraction 0.4")
# print(rew_with_good[4])


# my_data = np.genfromtxt("./experiment/alpha_variation/after_param_big_alpha_point_0_v1.csv", delimiter=',')
# init_params=torch.from_numpy(my_data)
# rew_with_good=reward_array_with_a_policy(init_params.float(),10000)
# print("\nReward_array with good parameters : fraction 0.4")
# print(rew_with_good[4])

# my_data = np.genfromtxt("./experiment/alpha_variation/after_param_big_alpha_point_0_v2.csv", delimiter=',')
# init_params=torch.from_numpy(my_data)
# rew_with_good=reward_array_with_a_policy(init_params.float(),10000)
# print("\nReward_array with good parameters : fraction 0.4")
# print(rew_with_good[4])

# my_data = np.genfromtxt("./experiment/alpha_variation_big/after_param_big_alpha_point_0.csv", delimiter=',')
# init_params=torch.from_numpy(my_data)
# rew_with_good=reward_array_with_a_policy(init_params.float(),10000)
# print("\nReward_array with good parameters : fraction 0.4")
# print(rew_with_good[4])

# my_data = np.genfromtxt("./experiment/alpha_variation_big/after_param_big_alpha_point_0_gain_1_epoch_150.csv", delimiter=',')
# init_params=torch.from_numpy(my_data)
# rew_with_good=reward_array_with_a_policy(init_params.float(),10000)
# print("\nReward_array with good parameters : fraction 0.4")
# print(rew_with_good[4])

# my_data = np.genfromtxt("./experiment/alpha_variation_big/after_param_big_alpha_point_0_v2.csv", delimiter=',')
# init_params=torch.from_numpy(my_data)
# rew_with_good=reward_array_with_a_policy(init_params.float(),10000)
# print("\nReward_array with good parameters : fraction 0.4")
# print(rew_with_good[4])

my_data = np.genfromtxt("./experiment/inner_outer_variation/pure_real_inner_1_outer_200.csv", delimiter=',')
init_params=torch.from_numpy(my_data)
rew_with_good=reward_array_with_a_policy(init_params.float(),10000)
print("\nReward_array with good parameters : fraction 0.4")
print(rew_with_good[4])

my_data = np.genfromtxt("./experiment/inner_outer_variation/pure_real_inner_1_outer_1000.csv", delimiter=',')
init_params=torch.from_numpy(my_data)
rew_with_good=reward_array_with_a_policy(init_params.float(),10000)
print("\nReward_array with good parameters : fraction 0.4")
print(rew_with_good[4])

Reward_array with bad parameters 
24.1017

Reward_array with good parameters : fraction 0.4
19.2233

Reward_array with good parameters : fraction 0.4
5.1448


In [29]:
print(rew_with_good[0][0])

[tensor([[1., 0., 0.]]), tensor([[1., 0., 0.]]), tensor([[1., 0., 0.]]), tensor([[1., 0., 0.]]), tensor([[1., 0., 0.]]), tensor([[1., 0., 0.]]), tensor([[1., 0., 0.]]), tensor([[1., 0., 0.]]), tensor([[1., 0., 0.]])]


In [30]:
print(rew_with_good[1][0])

[tensor([[0]]), tensor([[0]]), tensor([[0]]), tensor([[0]]), tensor([[0]]), tensor([[0]]), tensor([[0]]), tensor([[0]]), tensor([[0]])]


In [31]:
print(rew_with_good[2][0])

[0, 0, 0, 0, 0, 0, 0, 0, 10]


# plotting episodes vs rewards

In [None]:
plt.plot(range(len(result)), result)
plt.ylabel('reward')
plt.xlabel('episodes')
plt.grid(True)
plt.show()

env.close()