# In [None]:
import numpy as np

# Hyperparameters
grid_size = 4        # The world is a grid_size x grid_size square
num_episodes = 1000  # Number of training episodes
alpha = 0.1          # Learning rate
gamma = 0.9          # Discount factor
epsilon = 0.1        # Exploration rate

# Gridworld layout. Cell codes: 0 - empty, 1 - obstacle, -1 - goal
grid = np.zeros(shape=(grid_size, grid_size), dtype=float)
grid[grid_size - 1, grid_size - 1] = -1  # Goal sits in the bottom-right corner

# Action space. An action's position in this list is its index along the
# last axis of the Q-table.
actions = ['up', 'down', 'left', 'right']

# One Q-value per (row, column, action), all initialised to zero
q_table = np.zeros(grid.shape + (len(actions),))

# Define the transition function
def get_next_state(state, action):
    """Return the cell reached by taking `action` from `state`.

    `state` is a (row, column) pair. A move that would cross the edge of
    the grid_size x grid_size grid leaves the agent in place, as does an
    action name other than 'up', 'down', 'left' or 'right': in both cases
    `state` itself is returned.
    """
    row, col = state

    if action == 'up':
        return (row - 1, col) if row > 0 else state
    if action == 'down':
        return (row + 1, col) if row < grid_size - 1 else state
    if action == 'left':
        return (row, col - 1) if col > 0 else state
    if action == 'right':
        return (row, col + 1) if col < grid_size - 1 else state

    # Unrecognised action: stay where we are
    return state

# Define the reward function
# Cell code that marks the goal; must match the value written into `grid`.
_GOAL_CODE = -1


def _is_goal(state):
    """Return whether the (row, column) cell `state` is the goal cell."""
    return grid[state] == _GOAL_CODE


def get_reward(state):
    """Return the reward for arriving in the (row, column) cell `state`.

    `grid` stores cell codes (0 - empty, 1 - obstacle, -1 - goal), not
    rewards. The codes must not be handed to the learner directly: the
    goal's code is negative, so using it as the reward penalises reaching
    the goal and teaches the agent to avoid it. Entering the goal therefore
    pays +1.0 and every other cell pays 0.0.

    Obstacle cells get no special reward here: the transition function does
    not treat them differently and the visible setup places none.
    """
    return 1.0 if _is_goal(state) else 0.0


# Epsilon-greedy action selection
def choose_action(state):
    """Pick an action name for `state` using an epsilon-greedy policy.

    With probability `epsilon` a uniformly random action is returned
    (exploration). Otherwise one of the actions with the highest Q-value in
    `state` is returned (exploitation), with ties broken at random.
    """
    if np.random.rand() < epsilon:
        return np.random.choice(actions)

    q_values = q_table[state[0], state[1], :]
    # np.argmax always returns the first maximum, so on a zero-initialised
    # table the greedy choice would be 'up' in every unvisited cell.
    # Sampling among the tied best actions removes that bias.
    best_indices = np.flatnonzero(q_values == q_values.max())
    return actions[np.random.choice(best_indices)]


# Q-learning algorithm
for episode in range(num_episodes):
    state = (0, 0)  # Start at top-left corner
    done = False
    while not done:
        action = choose_action(state)
        action_index = actions.index(action)
        next_state = get_next_state(state, action)
        reward = get_reward(next_state)

        # End the episode when the goal cell is reached. This is decided
        # from the cell itself, not from the reward value, so the reward
        # scale can change without breaking termination.
        done = _is_goal(next_state)

        # A terminal state has no future return, so do not bootstrap from it
        if done:
            target = reward
        else:
            target = reward + gamma * np.max(
                q_table[next_state[0], next_state[1], :]
            )

        # Q-learning update: move the estimate a fraction alpha toward target
        current = q_table[state[0], state[1], action_index]
        q_table[state[0], state[1], action_index] = (
            current + alpha * (target - current)
        )

        state = next_state

# Display learned Q-values
print(q_table)
