In [11]:
import numpy as np

#Deterministic environment for optimal maintenance problem
#States: {0,1} Healthy, Faulty
#Actions: {0,1} Do nothing, Repair
#Rewards: -1,0,-10 
p = np.zeros((2,2,2)).astype(int)
#p[s,a]=(r,s')
p[0,0]=[0,0]
p[0,1]=[-10,0]
p[1,0]=[-1,1]
p[1,1]=[-10,0]

def epsilon_greedy_pi(Q,state,epsilon):
    if np.random.random() < epsilon:
        return np.random.randint(2)
    else:
        return np.argmax(Q[state,:])

def take_action(state_action):
    return p[state_action]

In [12]:
def DynaQ_plus(n, n_timestep, epsilon, alpha, gamma,k,model):
    #Initialize action-value function
    Q = np.zeros((2,2)).astype(int)
    
    #Model
    """if previous_model == False:
        model = np.zeros((p.shape)).astype(int)
    else:
        model = previous_model
    """
    time_since_taken = {(0,0):0,(0,1):0,(1,0):0,(1,1):0}
    observed_s_a = {}
    
    for i in range(n_timestep):
        #print("Iteration",i)
        #choose state and action
        state = np.random.randint(2)
        action = epsilon_greedy_pi(Q,state,epsilon)

        #track if state action pair was visited
        if state in observed_s_a:
            if action not in observed_s_a[state]:
                observed_s_a[state].append(action)
        else:
            observed_s_a[state] = [action]
        
        #track for how long state actions have not been visited
        for key in time_since_taken:
            if key == (state,action):
                time_since_taken[key] = 0
            else:
                time_since_taken[key] +=1
                
        #Experience
        reward, next_state = take_action((state,action))
        
        #Direct RL
        Q[state,action] += alpha * (reward + gamma * np.argmax(Q[next_state,:])-Q[state,action])
        
        #Model learning
        model[state,action] = [reward,next_state]

        #Add bonus to state action pairs depending on time since visited
        for s in [0,1]:
            for a in [0,1]:
                tau = time_since_taken[s,a]
                bonus = k * np.sqrt(tau)
                model[s,a][0] += bonus
                  
        #Planning
        for j in range(n):
            state = np.random.choice(list(observed_s_a.keys()))
            action = np.random.choice(observed_s_a[state])
            
            reward, next_state = model[state,action]
            Q[state,action] += alpha * (reward + gamma * np.argmax(Q[next_state,:]) - Q[state,action])
    return Q,model
    

In [13]:
model = np.zeros((p.shape)).astype(int)
Q , model= DynaQ_plus(5, 100, 0.3, 0.1, 0.9,0.1,model)

In [14]:
Q

array([[ 0, -1],
       [ 0,  0]])

In [15]:
model

array([[[ 0,  0],
        [-9,  0]],

       [[-1,  1],
        [ 0,  0]]])

In [16]:
#Environment changes. It is now free to repair
p = np.zeros((2,2,2)).astype(int)
#p[s,a]=(r,s')
p[0,0]=[0,0]

#p[0,1]=[-10,0]
p[0,1]=[10,0]

p[1,0]=[-1,1]

#p[1,1]=[-10,0]
p[1,1]=[10,0]

In [17]:
old_model = model

In [18]:
new_Q, new_model = DynaQ_plus(5, 10, 0.1, 0.1, 0.9,0.1, old_model)

In [19]:
new_Q

array([[0, 1],
       [0, 0]])

In [20]:
new_model

array([[[ 0,  0],
        [10,  0]],

       [[-1,  1],
        [ 0,  0]]])