# Program 6: Demonstrate the Q-learning algorithm, with suitable assumptions, for a problem statement

In [1]:
import numpy as np

# --- Environment: a grid_size x grid_size grid world -------------------------
grid_size = 5
goal_state = (grid_size - 1, grid_size - 1)  # (row, col) of the goal cell
obstacles = [(2, 2), (3, 1)]                 # (row, col) obstacle cells

# --- Q-learning hyperparameters ----------------------------------------------
learning_rate = 0.8      # step size applied to each temporal-difference update
discount_factor = 0.95   # weight given to the best next-state Q-value
num_episodes = 1000      # number of training episodes

# Seed NumPy's global RNG so the random exploration draws, and therefore the
# learned Q-table, are identical on every fresh run of the notebook.
random_seed = 42
np.random.seed(random_seed)

# One Q-value per (row, col, action); the last axis holds the 4 actions (0-3).
q_table = np.zeros((grid_size, grid_size, 4))

def take_action(state, epsilon):
    """Pick an action for `state` using an epsilon-greedy rule.

    Args:
        state: Index into the leading axes of `q_table` (the training loop
            passes a (row, col) tuple).
        epsilon: Exploration threshold; a uniform draw from [0, 1) that falls
            below it triggers a random action.

    Returns:
        A randomly chosen value from 0, 1, 2, 3 when exploring; otherwise the
        index of the largest entry of `q_table[state]`.
    """
    explore = np.random.rand() < epsilon
    if not explore:
        # Greedy choice; np.argmax returns the first index when values tie.
        return np.argmax(q_table[state])
    return np.random.choice([0, 1, 2, 3])

def update_q_value(state, action, reward, next_state):
    """Apply one Q-learning update to `q_table[state][action]` in place.

    The entry moves towards `reward + discount_factor * max(q_table[next_state])`
    by a fraction `learning_rate` of the difference.

    Args:
        state: Index of the state the action was taken from.
        action: Index of the action taken (last axis of `q_table`).
        reward: Reward received for the transition.
        next_state: Index of the state reached after the action.

    Returns:
        None; `q_table` is modified as a side effect.
    """
    current = q_table[state][action]
    # Target value: immediate reward plus the discounted best next-state value.
    td_target = reward + discount_factor * np.max(q_table[next_state])
    td_error = td_target - current
    q_table[state][action] = current + learning_rate * td_error
    
def get_next_state(state, action):
    """Return the cell reached by applying `action` in `state`.

    Action encoding: 0 -> row - 1, 1 -> col + 1, 2 -> row + 1, 3 -> col - 1.
    Any other action value leaves the position unchanged.

    Args:
        state: Current (row, col) position.
        action: Action index.

    Returns:
        The new (row, col) position, or `state` itself when the move would
        leave the grid or land on a cell listed in `obstacles`.
    """
    r, c = state
    # Comparisons evaluate to 0/1, so each coordinate shifts by -1, 0 or +1.
    candidate = (
        r + (action == 2) - (action == 0),
        c + (action == 1) - (action == 3),
    )
    inside_grid = 0 <= candidate[0] < grid_size and 0 <= candidate[1] < grid_size
    if not inside_grid or candidate in obstacles:
        return state
    return candidate

def get_reward(state):
    """Return the reward associated with arriving in `state`.

    Returns:
        10 for the goal cell, -1 for a cell listed in `obstacles`, 0 otherwise.

    Note: `get_next_state` in this file never returns an obstacle cell, so the
    -1 case is only reached if an obstacle position is passed in directly.
    """
    if state == goal_state:
        return 10
    return -1 if state in obstacles else 0

def is_goal_state(state):
    """Return True when `state` equals the configured `goal_state`."""
    reached_goal = state == goal_state
    return reached_goal

def is_obstacle_state(state):
    """Return True when `state` is one of the cells listed in `obstacles`."""
    hit_obstacle = state in obstacles
    return hit_obstacle

# Upper bound on steps per episode. Without it, an episode only ends when the
# goal (or an obstacle) is reached, so a goal that cannot be reached -- e.g.
# one walled off by obstacles -- would make this cell loop forever.
max_steps_per_episode = 100 * grid_size * grid_size

for episode in range(num_episodes):
    state = (0, 0)  # every episode starts from cell (0, 0)
    # Exploration rate decays as 1 / (episode + 1): episode 0 is fully random,
    # later episodes increasingly follow the greedy action.
    epsilon = 1.0 / (episode + 1)

    for _ in range(max_steps_per_episode):
        action = take_action(state, epsilon)
        next_state = get_next_state(state, action)
        # The reward depends on the cell that was reached, not the one left.
        reward = get_reward(next_state)

        update_q_value(state, action, reward, next_state)

        state = next_state
        # The episode ends on reaching the goal or an obstacle cell.
        if is_goal_state(state) or is_obstacle_state(state):
            break


print("Final Q-table:")
print(q_table)


Final Q-table:
[[[ 5.30736345  6.98337296  3.5880942   0.        ]
  [ 6.05534459  7.35091891  1.78085566  5.21931695]
  [ 5.88073513  7.73780937  0.          0.        ]
  [ 7.428297    8.1450625   3.14486283  6.7563636 ]
  [ 7.15660378  0.          8.57375     0.        ]]

 [[ 6.25159877  0.          0.          0.        ]
  [ 6.05534183  0.          0.          0.        ]
  [ 4.69879506  0.          0.          0.        ]
  [ 7.73664447  0.          0.          0.        ]
  [ 6.51605     0.          9.025       5.32740455]]

 [[ 0.          0.          0.          0.        ]
  [ 0.          0.          0.          0.        ]
  [ 0.          0.          0.          0.        ]
  [ 7.27883162  0.          0.          0.        ]
  [ 8.17492868  0.          9.5         5.32958793]]

 [[ 0.          0.          0.          0.        ]
  [ 0.          0.          0.          0.        ]
  [ 0.          0.          0.          0.        ]
  [ 4.38525245  0.          0.          0. 