# Setup Model

In [59]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from UtilityMethods import utils
import sys
import pickle
import time
import pulp as p
import math
from copy import copy
import pprint as pp
import itertools
from tqdm import tqdm

## Global variables

In [60]:
# Global variables

# IS_VISIT_DEPENDENT = False # whether the above empirical estimates are visit-dependent or not
DATA = '../data/ACCORD_BPBGClass_v2.csv'

## State space and action space

In [61]:
# state space, actions available in each state are always the same

"""
# original fine-grained levels
state_features = ['sbp_discrete','hba1c_discrete','TC_discrete','hdl_discrete','BMI_discrete'] 
sbp_level = ['0', '1', '2', '3'] # possible values for sbp_discrete
hba1c_level = ['0', '1', '2', '3', '4', '5', '6', '7']
TC_level = ['0', '1', '2', '3']
hdl_level = ['0', '1', '2', '3']
BMI_level = ['0', '1', '2', '3']
"""

# here we merge levels
# sbp_level = ['0', '1', '2'] # sbp_discrete, 0: 0, 1:1, 2+3: 2
# hba1c_level = ['0', '1', '2'] # hba1c_discrete, 0+1: 0, 2+3: 1, 4+5+6+7: 2

sbp_level = ['0', '1', ] # sbp_discrete, 0: 0, 1:1, 2+3: 2
hba1c_level = ['0', '1'] # hba1c_discrete, 0+1: 0, 2+3: 1, 4+5+6+7: 2

TC_level = ['0', '1'] # TC_discrete, 0+1: 0, 2+3: 1
hdl_level = ['0', '1'] # hdl_discrete, 0+1: 0, 2+3: 1

# sbp_discrete_code_dict = {'0': '0', '1': '1',
#                           '2': '2', '3': '2',}

sbp_discrete_code_dict = {'0': '0', '1': '0',
                          '2': '1', '3': '1',}

# hba1c_discrete_code_dict = {'0': '0', '1': '0', 
#                             '2': '1', '3': '1', 
#                             '4': '2', '5': '2', 
#                             '6': '2', '7': '2'}

hba1c_discrete_code_dict = {'0': '0', '1': '0', 
                            '2': '0', '3': '0', 
                            '4': '1', '5': '1', 
                            '6': '1', '7': '1'}

TC_discrete_code_dict = {'0': '0', '1': '0',
                         '2': '1', '3': '1'}

hdl_discrete_code_dict = {'0': '0', '1': '0',
                          '2': '1', '3': '1'}

# 4 features, state space = 36
# state_features = ['sbp_discrete', 'hba1c_discrete', 'TC_discrete', 'hdl_discrete'] 
# combinations = itertools.product(sbp_level, hba1c_level, TC_level, hdl_level)

# 3 features, state space = 18
# state_features = ['sbp_discrete', 'hba1c_discrete', 'TC_discrete'] 
# combinations = itertools.product(sbp_level, hba1c_level, TC_level)

# 2 features, state space = 9
combinations = itertools.product(sbp_level, hba1c_level)
state_features = ['sbp_discrete', 'hba1c_discrete'] 

# 1 feature, srtate space = 3
# combinations = itertools.product(hba1c_level)
# state_features = ['hba1c_discrete'] 

states = [''.join(i) for i in combinations]
print('len(states) =', len(states))
print(states[:5])

N_STATES = len(states) 
state_code_to_index = {code: i for i, code in enumerate(states)}
state_index_to_code = {i: code for i, code in enumerate(states)}
for i in range(N_STATES):
    print(states[i], state_code_to_index[states[i]])
print()



# action space, 000000000 means bgclass_none, 111111111 means all bgmed class are precribed
# we donot include 'bgclass_none' as a action, because 000000000 means bgclass_none
# action_features = ['Diur', 'ACE', 'Beta-blocker', 'CCB', 
#                     'Bingu', 'Thiaz', 'Sulfon', 'Meglit'] # pick the top 4 most frequently prescribed BP and BG Med class 
                    
# action_features = ['Diur', 'ACE', 'Beta-blocker',  
#                     'Bingu', 'Thiaz', 'Sulfon', ] # pick the top 3 most frequently prescribed BP and BG Med class 

action_features = ['Diur', 'ACE',   
                    'Bingu', 'Thiaz', ] # pick the top 2 most frequently prescribed BP and BG Med class 
                    

combinations = list(itertools.product('01', repeat=len(action_features)))
actions = [''.join(i) for i in combinations]
print('len(actions) =', len(actions))
N_ACTIONS = len(actions) # number of actions = 512
action_code_to_index = {code: i for i, code in enumerate(actions)}
# print the first 5 action_code_to_index
for i in range(5):
    print(actions[i], action_code_to_index[actions[i]])

# build the action space for each state, assign the same action space to all states
ACTIONS_PER_STATE = {}
for s in range(N_STATES):
    ACTIONS_PER_STATE[s] = [i for i in range(N_ACTIONS)] # this is the action code index
print('Actions for State 0:', ACTIONS_PER_STATE[0])

len(states) = 4
['00', '01', '10', '11']
00 0
01 1
10 2
11 3

len(actions) = 16
0000 0
0001 1
0010 2
0011 3
0100 4
Actions for State 0: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]


## Calculate empirical estimates of P, R, C

In [62]:
df = pd.read_csv(DATA)

In [63]:
print(df.shape)
patients_set = set(df['MaskID'].unique())
print('len(patients_set) =', len(patients_set))

(139005, 74)
len(patients_set) = 3595


In [64]:
# add the state and action code columns
action_code = []
state_code = []
hba1c_discrete_merged = []
for i in range(len(df)):
    row = df.iloc[i]
    s_code = ''
    a_code = ''
    for state_fea in state_features:
        code = str(row[state_fea])

        if state_fea == 'sbp_discrete':
            code = sbp_discrete_code_dict[code]            
        elif state_fea == 'hba1c_discrete':
            code = hba1c_discrete_code_dict[code]
            hba1c_discrete_merged.append(code)
        elif state_fea == 'TC_discrete':
            code = TC_discrete_code_dict[code]
        elif state_fea == 'hdl_discrete':
            code = hdl_discrete_code_dict[code]
        else:
            raise ValueError('state_fea not recognized')
            exit(1)       
        
        s_code += code
    
    for action_fea in action_features:
        a_code += str(row[action_fea])
    
    action_code.append(a_code)
    state_code.append(s_code)

assert len(hba1c_discrete_merged) == len(df)

df['hba1c_discrete_merged'] = hba1c_discrete_merged
df['action_code'] = action_code
df['state_code'] = state_code
print('Finished adding action_code and state_code columns')

DATA_MERGED = DATA[:-4] + '_merged.csv'
# write the merged data to file
df.to_csv(DATA_MERGED, index=False)

Finished adding action_code and state_code columns


In [65]:
for i in range(0):
    print(i)

In [66]:
#------------- calculate the empirical estimate of P, R, C based on entire dataset ----------------
        
count_s_a = {} # count the number of times state s and action a appear in the dataset, sparse format
count_s_a_d = {} # count the number of times state s, action a, and next state s' appear in the dataset
sum_r_s_a = {} # sum of the reward of state s and action a
sum_c1_s_a = {} # sum of the cost of state s and action a, this is the absolute value of SBP feedback
sum_c2_s_a = {} # sum of the cost of state s and action a, this is the absolute value of hba1c feedback
visit_number = [] # number of visits for each patient

# loop through each patient in the dataset
for i in tqdm(range(100001, 110252)):
    df_patient = df[df['MaskID'] == i]

    if len(df_patient) > 0:
        visit_number.append(len(df_patient))
    else:
        continue

    # loop through each visit of the patient
    for j in range(len(df_patient)-1): # loop before last visit
        row = df_patient.iloc[j]
        s_code = row['state_code']
        a_code = row['action_code']
        ns_code = df_patient.iloc[j+1]['state_code']

        # convert from code to index
        s = state_code_to_index[s_code]
        a = action_code_to_index[a_code]
        s_ = state_code_to_index[ns_code]

        r = df_patient.iloc[j]['CVDRisk_feedback']
        hba1c_fb = df_patient.iloc[j]['hba1c_feedback']
        sbp_fb = df_patient.iloc[j]['sbp_feedback']
        c1 = sbp_fb
        c2 = hba1c_fb

        if (s, a) not in count_s_a:
            count_s_a[(s, a)] = 1
            sum_r_s_a[(s, a)] = r 
            sum_c1_s_a[(s, a)] = c1
            sum_c2_s_a[(s, a)] = c2
        else:
            count_s_a[(s, a)] += 1
            sum_r_s_a[(s, a)] += r
            sum_c1_s_a[(s, a)] += c1
            sum_c2_s_a[(s, a)] += c2

        if (s, a, s_) not in count_s_a_d:
            count_s_a_d[(s, a, s_)] = 1
        else:
            count_s_a_d[(s, a, s_)] += 1

print('len(visit_number) =', len(visit_number))
print('averge visit_number =', sum(visit_number)/len(visit_number))

print('len(count_s_a) =', len(count_s_a))
print('len(count_s_a_d) =', len(count_s_a_d))
print('Finished counting by looping through the dataset')

100%|██████████| 10251/10251 [01:31<00:00, 111.74it/s]

len(visit_number) = 3595
averge visit_number = 38.666203059805284
len(count_s_a) = 64
len(count_s_a_d) = 256
Finished counting by looping through the dataset





In [67]:
# calculate the sparsity of state-action pairs
print('Total possible state-action pairs =', N_STATES * N_ACTIONS)
print('Seen state-action pairs =', len(count_s_a))
print('Unseen state-action pairs =', N_STATES * N_ACTIONS - len(count_s_a))
print('Sparsity of state-action pairs =', 1 - len(count_s_a)/(N_STATES * N_ACTIONS))

Total possible state-action pairs = 64
Seen state-action pairs = 64
Unseen state-action pairs = 0
Sparsity of state-action pairs = 0.0


In [68]:
# calculate the empirical estimate of P, R, C using counts

# initialize R, C, P, NOT using sparse matrix format
R = {} # N_STATES * N_ACTIONS, dictionary of reward matrices, this is the CVDRisk empirical estimate based on entire dataset
C1 = {} # N_STATES * N_ACTIONS, dictionary of cost matrices, this is SBP empirical estimate based on entire dataset
C2 = {} # N_STATES * N_ACTIONS, dictionary of cost matrices, this is hba1c empirical estimate based on entire dataset
P = {} # N_STATES * N_ACTIONS * N_STATES, dictionary of transition probability matrices, based on the entire dataset

for s in range(N_STATES):
    l = len(actions)
    R[s] = np.zeros(l)
    C1[s] = np.zeros(l)
    C2[s] = np.zeros(l)
    P[s] = {}    
    for a in range(N_ACTIONS):
        P[s][a] = np.zeros(N_STATES)
        
print('Finished initializing R, C, P')

for (s,a) in count_s_a: # only calculate for the states and actions that appearedin the dataset, for efficiency

    R[s][a] = sum_r_s_a[(s, a)]/max(count_s_a[(s, a)],1)
    C1[s][a] = sum_c1_s_a[(s, a)]/max(count_s_a[(s, a)],1)
    C2[s][a] = sum_c2_s_a[(s, a)]/max(count_s_a[(s, a)],1)

for (s, a, s_) in count_s_a_d:
    P[s][a][s_] = count_s_a_d[(s, a, s_)]/max(count_s_a[(s, a)],1)

print('Finished calculating the empirical estimate of P, R, C')

#------------- check the sparsity of P, R, C
print('\nDetails of P, R, C:')
print('P: {:.6f}% are non-zeros'.format(len(count_s_a_d)*100/(N_STATES*N_ACTIONS*N_STATES)))
print('R: {:.6f}% are non-zeros'.format(len(sum_r_s_a)*100/(N_STATES*N_ACTIONS)))
print('C1: {:.6f}% are non-zeros'.format(len(sum_c1_s_a)*100/(N_STATES*N_ACTIONS)))
print('C2: {:.6f}% are non-zeros'.format(len(sum_c2_s_a)*100/(N_STATES*N_ACTIONS)))

# print sample values of P, R, C
print('\nSample values of P, R, C:')
print('P[0][0][0] =', P[0][0][0])
print('R[0][0] =', R[0][0])
print('C1[0][0] =', C1[0][0])
print('C2[0][0] =', C2[0][0])

# print the min, max, mean, median of R and C, R and C are dictionaries of numpy arrays
print('\nStatistics of R and C:')
print('R: min = {:.6f}, max = {:.6f}, mean = {:.6f}, median = {:.6f}, std = {:.6f}'.format(np.min(list(R.values())), np.max(list(R.values())), np.mean(list(R.values())), np.median(list(R.values())), np.std(list(R.values()))))
print('C1: min = {:.6f}, max = {:.6f}, mean = {:.6f}, median = {:.6f}, std = {:.6f}'.format(np.min(list(C1.values())), np.max(list(C1.values())), np.mean(list(C1.values())), np.median(list(C1.values())), np.std(list(C1.values()))))
print('C2: min = {:.6f}, max = {:.6f}, mean = {:.6f}, median = {:.6f}, std = {:.6f}'.format(np.min(list(C2.values())), np.max(list(C2.values())), np.mean(list(C2.values())), np.median(list(C2.values())), np.std(list(C2.values()))))

Finished initializing R, C, P
Finished calculating the empirical estimate of P, R, C

Details of P, R, C:
P: 100.000000% are non-zeros
R: 100.000000% are non-zeros
C1: 100.000000% are non-zeros
C2: 100.000000% are non-zeros

Sample values of P, R, C:
P[0][0][0] = 0.9473557499398122
R[0][0] = 0.21955534155996564
C1[0][0] = 122.56592568814702
C2[0][0] = 6.5586670411685395

Statistics of R and C:
R: min = 0.195295, max = 0.351946, mean = 0.256444, median = 0.255423, std = 0.045990
C1: min = 118.544858, max = 151.597127, mean = 132.507139, median = 131.618390, std = 9.961479
C2: min = 6.380569, max = 8.585431, mean = 7.467460, median = 7.500988, std = 0.793282


## Check Init states

In [69]:
def check_frequency(df, col_name):
    df = df[col_name]
    df = df.value_counts()
    print(df)
    print()

    # return the first index in the series
    return df.index[0]
    
# get the rows when the visit=='BLR' in df
df_blr = df[df['Visit']=='BLR']
INIT_STATES_LIST = df_blr['state_code'].unique() # we will sample uniformly from this list
print('len(INIT_STATES_LIST) =', len(INIT_STATES_LIST))

print('df_blr.shape =', df_blr.shape)
most_freq_blr_state = check_frequency(df_blr, 'state_code')
print('most_freq_blr_state =', most_freq_blr_state)
INIT_STATE_INDEX = state_code_to_index[most_freq_blr_state]
print('INIT_STATE_INDEX =', INIT_STATE_INDEX)

len(INIT_STATES_LIST) = 4
df_blr.shape = (3595, 77)
01    1561
11    1320
00     414
10     300
Name: state_code, dtype: int64

most_freq_blr_state = 01
INIT_STATE_INDEX = 1


In [70]:
most_freq_state = check_frequency(df, 'state_code')

00    77239
01    36697
10    14845
11    10224
Name: state_code, dtype: int64



## Compute solution.pkl and baseline.pkl files

In [71]:
print('state_code_to_index =', state_code_to_index)
print('state_index_to_code =', state_index_to_code)

state_code_to_index = {'00': 0, '01': 1, '10': 2, '11': 3}
state_index_to_code = {0: '00', 1: '01', 2: '10', 3: '11'}


In [84]:
EPISODE_LENGTH = 20 # average number of visits per patient

CONSTRAINT1_list = [100] * N_STATES # deviation * 20 visits
C1_b_list = [40] * N_STATES # constraint for baseline policy

# CONSTRAINT2_list = [16, 10, 10] * 3 # deviation * 20 visits 
# C2_b_list = [8, 5, 5] * 3  # constraint for baseline policy

CONSTRAINT2_list = [16] * N_STATES # deviation * 20 visits 
C2_b_list = [8] * N_STATES  # constraint for baseline policy

delta = 0.01 # bound

EPS = 0.01 # not used
M = 0 # not used

print('CONSTRAINT1_list =', CONSTRAINT1_list)
print('C1_b_list =', C1_b_list)
print('CONSTRAINT2_list =', CONSTRAINT2_list)
print('C2_b_list =', C2_b_list)

CONSTRAINT1_list = [100, 100, 100, 100]
C1_b_list = [40, 40, 40, 40]
CONSTRAINT2_list = [16, 16, 16, 16]
C2_b_list = [8, 8, 8, 8]


### Save the model settings

In [85]:
# dump the model settings and parameters to a pickle file
with open('output/model.pkl', 'wb') as f:
    pickle.dump([P, R, C1, C2, INIT_STATE_INDEX, INIT_STATES_LIST, state_code_to_index,
                CONSTRAINT1_list, C1_b_list, CONSTRAINT2_list, C2_b_list, N_STATES, N_ACTIONS, ACTIONS_PER_STATE, EPISODE_LENGTH, delta], f)

### Calculate the optimal policy

In [86]:
import importlib
import sys
importlib.reload(sys.modules['UtilityMethods'])
from UtilityMethods import utils

opt_policy_con_list = []
opt_value_LP_con_list = []
opt_cost1_LP_con_list = []
opt_cost2_LP_con_list = []
opt_q_con_list = []

print('Optimal policy:')
for state_idx in range(N_STATES):
    print('\nstate_idx =', state_idx)
    INIT_STATE_INDEX = state_idx
    CONSTRAINT1 = CONSTRAINT1_list[INIT_STATE_INDEX]
    CONSTRAINT2 = CONSTRAINT2_list[INIT_STATE_INDEX]
    C1_b = C1_b_list[INIT_STATE_INDEX]
    C2_b = C2_b_list[INIT_STATE_INDEX]

    print('CONSTRAINT1 =', CONSTRAINT1)
    print('CONSTRAINT2 =', CONSTRAINT2)
    print('C1_b =', C1_b)
    print('C2_b =', C2_b)

    util_methods_1 = utils(EPS, delta, M, P, R, C1, C2, INIT_STATE_INDEX, EPISODE_LENGTH, N_STATES, N_ACTIONS, ACTIONS_PER_STATE, CONSTRAINT1, C1_b, CONSTRAINT2, C2_b)

    # constrained MDP, solve the optimal policy using LP
    opt_policy_con, opt_value_LP_con, opt_cost1_LP_con, opt_cost2_LP_con, opt_q_con, flag = util_methods_1.compute_opt_LP_Constrained(0)

    if flag != 'Optimal':
        raise ValueError('LP not solved to optimality')
        

    # unconstrained = standard MDP, not used in DOPE
    # opt_policy_uncon, opt_value_LP_uncon, opt_cost_LP_uncon, opt_q_uncon = util_methods_1.compute_opt_LP_Unconstrained(0) 

    opt_policy_con_list.append(opt_policy_con)
    opt_value_LP_con_list.append(opt_value_LP_con)
    opt_cost1_LP_con_list.append(opt_cost1_LP_con)
    opt_cost2_LP_con_list.append(opt_cost2_LP_con)
    opt_q_con_list.append(opt_q_con)

    print("opt_value_LP_con[INIT_STATE_INDEX, 0] =",opt_value_LP_con[INIT_STATE_INDEX, 0])
    print("opt_cost1_LP_con[INIT_STATE_INDEX, 0] =",opt_cost1_LP_con[INIT_STATE_INDEX, 0])
    print("opt_cost2_LP_con[INIT_STATE_INDEX, 0] =",opt_cost2_LP_con[INIT_STATE_INDEX, 0])
    

with open('output/solution.pkl', 'wb') as f:
    pickle.dump([opt_policy_con_list, opt_value_LP_con_list, opt_cost1_LP_con_list,  opt_cost2_LP_con_list, opt_q_con_list], f)

Optimal policy:

state_idx = 0
CONSTRAINT1 = 100
CONSTRAINT2 = 16
C1_b = 40
C2_b = 8

Computing optimal policy with constrained LP solver ...
+++++ Optimal
printing best value constrained: 4.0949047878067235

value from the conLPsolver:
value of policy = 4.0949047878067235
cost1 of policy = 23.869942506200008
cost2 of policy = 8.34493858178205
opt_value_LP_con[INIT_STATE_INDEX, 0] = 4.0949047884010055
opt_cost1_LP_con[INIT_STATE_INDEX, 0] = 23.869942485841474
opt_cost2_LP_con[INIT_STATE_INDEX, 0] = 8.344938586684478

state_idx = 1
CONSTRAINT1 = 100
CONSTRAINT2 = 16
C1_b = 40
C2_b = 8

Computing optimal policy with constrained LP solver ...
+++++ Optimal
printing best value constrained: 4.09253400981072

value from the conLPsolver:
value of policy = 4.09253400981072
cost1 of policy = 29.062001631432853
cost2 of policy = 7.014366066960507
opt_value_LP_con[INIT_STATE_INDEX, 0] = 4.092534011685285
opt_cost1_LP_con[INIT_STATE_INDEX, 0] = 29.062001617081314
opt_cost2_LP_con[INIT_STATE_INDEX,

### Calculate the baseline policy

In [87]:
import importlib
import sys
importlib.reload(sys.modules['UtilityMethods'])
from UtilityMethods import utils


# baseline policy
print("Baseline policy:")

policy_b_list = []
value_b_list = []
cost1_b_list = []
cost2_b_list = []
q_b_list = []

for state_idx in range(N_STATES):
    print('\nstate_idx =', state_idx)
    INIT_STATE_INDEX = state_idx
    CONSTRAINT1 = CONSTRAINT1_list[INIT_STATE_INDEX]
    CONSTRAINT2 = CONSTRAINT2_list[INIT_STATE_INDEX]
    C1_b = C1_b_list[INIT_STATE_INDEX]
    C2_b = C2_b_list[INIT_STATE_INDEX]

    print('CONSTRAINT1 =', CONSTRAINT1)
    print('CONSTRAINT2 =', CONSTRAINT2)
    print('C1_b =', C1_b)
    print('C2_b =', C2_b)  

    util_methods_1 = utils(EPS, delta, M, P, R, C1, C2, INIT_STATE_INDEX, EPISODE_LENGTH, N_STATES, N_ACTIONS, ACTIONS_PER_STATE, C1_b, C1_b, C2_b, C2_b)
    policy_b, value_b, cost1_b, cost2_b, q_b, flag = util_methods_1.compute_opt_LP_Constrained(0)

    if flag != 'Optimal':
        raise ValueError('LP not solved to optimality')

    policy_b_list.append(policy_b)
    value_b_list.append(value_b)
    cost1_b_list.append(cost1_b)
    cost2_b_list.append(cost2_b)
    q_b_list.append(q_b)

    print("value_b[INIT_STATE_INDEX, 0] =",value_b[INIT_STATE_INDEX, 0])
    print("cost1_b[INIT_STATE_INDEX, 0] =",cost1_b[INIT_STATE_INDEX, 0])
    print("cost2_b[INIT_STATE_INDEX, 0] =",cost2_b[INIT_STATE_INDEX, 0])

with open('output/base.pkl', 'wb') as f:
    pickle.dump([policy_b_list, value_b_list, cost1_b_list, cost2_b_list, q_b_list], f)

Baseline policy:

state_idx = 0
CONSTRAINT1 = 100
CONSTRAINT2 = 16
C1_b = 40
C2_b = 8

Computing optimal policy with constrained LP solver ...
+++++ Optimal
printing best value constrained: 4.097421393259504

value from the conLPsolver:
value of policy = 4.097421393259504
cost1 of policy = 24.16660403568475
cost2 of policy = 7.999999990383125
value_b[INIT_STATE_INDEX, 0] = 4.09742139800955
cost1_b[INIT_STATE_INDEX, 0] = 24.166604017293324
cost2_b[INIT_STATE_INDEX, 0] = 7.999999999818228

state_idx = 1
CONSTRAINT1 = 100
CONSTRAINT2 = 16
C1_b = 40
C2_b = 8

Computing optimal policy with constrained LP solver ...
+++++ Optimal
printing best value constrained: 4.09253400981072

value from the conLPsolver:
value of policy = 4.09253400981072
cost1 of policy = 29.062001631432853
cost2 of policy = 7.014366066960507
value_b[INIT_STATE_INDEX, 0] = 4.092534011685285
cost1_b[INIT_STATE_INDEX, 0] = 29.062001617081314
cost2_b[INIT_STATE_INDEX, 0] = 7.014366067079475

state_idx = 2
CONSTRAINT1 = 100


# -------------STOP

### Decode calculated optimal and baseline policy

In [76]:
# decode the opt_policy_con [s, h, a]

def action_code_to_med_action(action_code):
    med_action = []
    for i in range(len(action_code)):
        if action_code[i] == '0':
            continue
        elif action_code[i] == '1':
            med_action.append(action_features[i])
    
    if len(med_action) == 0:
        return 'BPBGClass_none'
    else:
        return '+'.join(med_action)

for s in range(N_STATES):
    for h in range(EPISODE_LENGTH):
        for a in range(N_ACTIONS):
            if opt_policy_con[s, h, a] != 0:
                action_code = actions[a]
                med_action = action_code_to_med_action(action_code)
                # print('opt_policy_con[', s, ',', h, ',', a, '] =', opt_policy_con[s, h, a], ', action_code =', actions[a])
                print('state {}, timestep {}, action_code {}, prob {}: {}'.format(s, h, action_code, opt_policy_con[s, h, a], med_action))


state 0, timestep 0, action_code 0000, prob 0.0625: BPBGClass_none
state 0, timestep 0, action_code 0001, prob 0.0625: Thiaz
state 0, timestep 0, action_code 0010, prob 0.0625: Bingu
state 0, timestep 0, action_code 0011, prob 0.0625: Bingu+Thiaz
state 0, timestep 0, action_code 0100, prob 0.0625: ACE
state 0, timestep 0, action_code 0101, prob 0.0625: ACE+Thiaz
state 0, timestep 0, action_code 0110, prob 0.0625: ACE+Bingu
state 0, timestep 0, action_code 0111, prob 0.0625: ACE+Bingu+Thiaz
state 0, timestep 0, action_code 1000, prob 0.0625: Diur
state 0, timestep 0, action_code 1001, prob 0.0625: Diur+Thiaz
state 0, timestep 0, action_code 1010, prob 0.0625: Diur+Bingu
state 0, timestep 0, action_code 1011, prob 0.0625: Diur+Bingu+Thiaz
state 0, timestep 0, action_code 1100, prob 0.0625: Diur+ACE
state 0, timestep 0, action_code 1101, prob 0.0625: Diur+ACE+Thiaz
state 0, timestep 0, action_code 1110, prob 0.0625: Diur+ACE+Bingu
state 0, timestep 0, action_code 1111, prob 0.0625: Diur+A

In [77]:
# decode the policy_b [s, h, a]
for s in range(N_STATES):
    for h in range(EPISODE_LENGTH):
        for a in range(N_ACTIONS):
            if policy_b[s, h, a] != 0:
                # print('policy_b[', s, ',', h, ',', a, '] =', policy_b[s, h, a], ', action_code =', actions[a])
                action_code = actions[a]
                med_action = action_code_to_med_action(action_code)
                # print('opt_policy_con[', s, ',', h, ',', a, '] =', opt_policy_con[s, h, a], ', action_code =', actions[a])
                print('state {}, timestep {}, action_code {}, prob {}: {}'.format(s, h, action_code, opt_policy_con[s, h, a], med_action))                

state 0, timestep 0, action_code 0000, prob 0.0625: BPBGClass_none
state 0, timestep 0, action_code 0001, prob 0.0625: Thiaz
state 0, timestep 0, action_code 0010, prob 0.0625: Bingu
state 0, timestep 0, action_code 0011, prob 0.0625: Bingu+Thiaz
state 0, timestep 0, action_code 0100, prob 0.0625: ACE
state 0, timestep 0, action_code 0101, prob 0.0625: ACE+Thiaz
state 0, timestep 0, action_code 0110, prob 0.0625: ACE+Bingu
state 0, timestep 0, action_code 0111, prob 0.0625: ACE+Bingu+Thiaz
state 0, timestep 0, action_code 1000, prob 0.0625: Diur
state 0, timestep 0, action_code 1001, prob 0.0625: Diur+Thiaz
state 0, timestep 0, action_code 1010, prob 0.0625: Diur+Bingu
state 0, timestep 0, action_code 1011, prob 0.0625: Diur+Bingu+Thiaz
state 0, timestep 0, action_code 1100, prob 0.0625: Diur+ACE
state 0, timestep 0, action_code 1101, prob 0.0625: Diur+ACE+Thiaz
state 0, timestep 0, action_code 1110, prob 0.0625: Diur+ACE+Bingu
state 0, timestep 0, action_code 1111, prob 0.0625: Diur+A