In [1]:
import numpy as np

In [2]:
# Discount factor: scales the best Q-value of the next state inside the
# temporal-difference term of the Q-learning updates below.
gamma = 0.75
# Learning rate: fraction of the temporal-difference term added to a Q-value
# on each update.
alpha = 0.9

In [3]:
# Map each location label 'A'..'L' to its integer state index 0..11,
# numbered in alphabetical order.
location_to_state = {label: index for index, label in enumerate('ABCDEFGHIJKL')}

In [4]:
# Action indices 0..11 (one per state index).
# NOTE(review): no other visible cell reads `actions` — confirm it is still needed.
actions = list(range(12))

In [5]:
# Reward matrix indexed by state: R[s, t] > 0 marks t as a move that can be
# played from state s (state indices come from location_to_state).
# NOTE(review): R[6, 6] is 1 — the only non-zero diagonal entry — so state 6 ('G')
# is allowed to "move" to itself; confirm this self-loop is intended.
R = np.array([[0,1,0,0,0,0,0,0,0,0,0,0],
              [1,0,1,0,0,1,0,0,0,0,0,0],
              [0,1,0,0,0,0,1,0,0,0,0,0],
              [0,0,0,0,0,0,0,1,0,0,0,0],
              [0,0,0,0,0,0,0,0,1,0,0,0],
              [0,1,0,0,0,0,0,0,0,1,0,0],
              [0,0,1,0,0,0,1,1,0,0,0,0],
              [0,0,0,1,0,0,1,0,0,0,0,1],
              [0,0,0,0,1,0,0,0,0,1,0,0],
              [0,0,0,0,0,1,0,0,1,0,1,0],
              [0,0,0,0,0,0,0,0,0,1,0,1],
              [0,0,0,0,0,0,0,1,0,0,1,0]])

In [6]:
# Q-table, one row and one column per state, starting at all zeros.
Q = np.zeros((12, 12))

In [7]:
# Q-learning over the reward matrix R: every iteration picks a random state,
# plays one random move allowed from it, and applies one temporal-difference
# update to the Q-table.
for i in range(1000):
    current_state = np.random.randint(0, 12)
    # Collect ALL playable moves before drawing one. The draw and the update
    # must happen once per iteration on the complete list, not once per move
    # found while the list is still being built.
    playable_actions = [j for j in range(12) if R[current_state, j] > 0]
    if not playable_actions:
        # A state with no outgoing move has nothing to learn from.
        continue
    next_state = np.random.choice(playable_actions)
    # TD = reward + discounted best Q-value of the next state - current estimate.
    TD = R[current_state, next_state] + gamma * np.max(Q[next_state]) - Q[current_state, next_state]
    Q[current_state, next_state] += alpha * TD


In [8]:
# Show the learned Q-table; astype(int) truncates each value toward zero
# (it does not round) to keep the printout compact.
print("Q-values:", Q.astype(int), sep="\n")

Q-values:
[[0 3 0 0 0 0 0 0 0 0 0 0]
 [3 0 3 0 0 3 0 0 0 0 0 0]
 [0 3 0 0 0 0 3 0 0 0 0 0]
 [0 0 0 0 0 0 0 3 0 0 0 0]
 [0 0 0 0 0 0 0 0 3 0 0 0]
 [0 3 0 0 0 0 0 0 0 3 0 0]
 [0 0 3 0 0 0 3 3 0 0 0 0]
 [0 0 0 3 0 0 3 0 0 0 0 3]
 [0 0 0 0 3 0 0 0 0 3 0 0]
 [0 0 0 0 0 3 0 0 3 0 3 0]
 [0 0 0 0 0 0 0 0 0 3 0 3]
 [0 0 0 0 0 0 0 3 0 0 3 0]]


In [9]:
# Inverse lookup of location_to_state: state index -> location label.
state_to_location = dict(zip(location_to_state.values(), location_to_state.keys()))


In [11]:
# Display the state -> location mapping to check the inversion.
print(state_to_location)

{0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E', 5: 'F', 6: 'G', 7: 'H', 8: 'I', 9: 'J', 10: 'K', 11: 'L'}


In [12]:
def route(starting_location, ending_location, n_iterations=1000):
    """Return the learned route between two locations as a list of labels.

    A copy of the global reward matrix ``R`` gets a reward of 1000 on the
    ``ending_location`` self-transition, a fresh Q-table is trained with
    ``n_iterations`` random Q-learning updates (using the global ``gamma``
    and ``alpha``), and the route is read off by repeatedly following the
    highest-valued move, starting from ``starting_location``.

    Args:
        starting_location: Label of the start; a key of ``location_to_state``.
        ending_location: Label of the destination; a key of
            ``location_to_state``.
        n_iterations: Number of Q-learning updates to run (default 1000).

    Returns:
        List of location labels that begins with ``starting_location`` and
        ends with ``ending_location``; a one-element list when both are equal.

    Raises:
        KeyError: If either label is missing from ``location_to_state``.
        RuntimeError: If the greedy walk returns to a state it has already
            visited, i.e. the learned Q-table does not lead to the destination.
    """
    # Resolve both labels up front so a bad label fails before any training.
    current_state = location_to_state[starting_location]
    ending_state = location_to_state[ending_location]
    path = [starting_location]
    if current_state == ending_state:
        return path

    n_states = R.shape[0]
    # Work on a copy so the shared reward matrix is never modified.
    R_new = np.copy(R)
    # Large reward for staying on the destination makes it the attractive state.
    R_new[ending_state, ending_state] = 1000

    Q = np.zeros((n_states, n_states))
    for _ in range(n_iterations):
        state = np.random.randint(0, n_states)
        playable_actions = [j for j in range(n_states) if R_new[state, j] > 0]
        if not playable_actions:
            # A state with no outgoing move has nothing to learn from.
            continue
        next_state = np.random.choice(playable_actions)
        # TD = reward + discounted best Q-value of the next state - current estimate.
        TD = R_new[state, next_state] + gamma * np.max(Q[next_state]) - Q[state, next_state]
        Q[state, next_state] += alpha * TD

    # Greedy walk. The policy is deterministic, so coming back to a visited
    # state means the walk would cycle forever; raise instead of hanging.
    visited = {current_state}
    while current_state != ending_state:
        current_state = int(np.argmax(Q[current_state]))
        if current_state in visited:
            raise RuntimeError(
                "Q-table did not converge to a route from %r to %r; "
                "try a larger n_iterations" % (starting_location, ending_location)
            )
        visited.add(current_state)
        path.append(state_to_location[current_state])
    return path

In [18]:
# Example query: the learned route from 'E' to 'G' (shown as the cell's result).
print('Route:')
route(starting_location='E', ending_location='G')

Route:


['E', 'I', 'J', 'K', 'L', 'H', 'G']