In [None]:
 import numpy as np

# Define the environment
# Cell codes:
#   0: Empty cell
#   1: Obstacle
#   2: Start (listed for completeness only: the grid below never contains a 2;
#      every episode begins at the position (4, 0) hard-coded in the training loop)
#   3: Goal
# The agent can move in four directions: up, down, left, right
environment = np.array([
    [0, 0, 0, 0, 0],
    [1, 1, 1, 1, 0],
    [0, 0, 0, 0, 0],
    [0, 1, 1, 1, 1],
    [0, 0, 0, 0, 3]
])

# Seed NumPy's global RNG. Exploration below draws from np.random.rand /
# np.random.randint, so without a fixed seed the learned Q-table differs on
# every "Restart & Run All".
SEED = 42
np.random.seed(SEED)

# Define parameters
num_states = np.prod(environment.shape)  # one Q-table row per grid cell
num_actions = 4  # up, down, left, right
learning_rate = 0.1  # step size of each Q-value update
gamma = 0.9  # discount factor
epsilon = 0.1  # exploration rate
num_episodes = 1000

# Initialize Q-table: rows are flattened grid cells, columns are actions
Q = np.zeros((num_states, num_actions))

# Convert 2D coordinates to 1D index
def state_to_index(state):
    """Flatten a (row, col) grid position into its row-major index.

    The grid width is read from the module-level ``environment`` array.
    """
    row, col = state[0], state[1]
    n_cols = environment.shape[1]
    return row * n_cols + col

# Convert 1D index to 2D coordinates
def index_to_state(index):
    """Expand a row-major index into a (row, col) tuple.

    Performs the inverse arithmetic of ``state_to_index``; the grid width is
    read from the module-level ``environment`` array.
    """
    n_cols = environment.shape[1]
    row, col = divmod(index, n_cols)
    return (row, col)

# Choose action using epsilon-greedy policy
def choose_action(state):
    """Pick an action id for ``state`` with an epsilon-greedy rule.

    With probability ``epsilon`` a uniformly random action is returned;
    otherwise the action with the largest Q-value in this state's row of the
    module-level ``Q`` table is returned (np.argmax picks the lowest action id
    on ties).
    """
    explore = np.random.rand() < epsilon
    if explore:
        return np.random.randint(num_actions)  # random action

    q_row = Q[state_to_index(state)]
    return np.argmax(q_row)  # greedy action

# Perform Q-learning
# (row, col) offset for each action id: 0 = up, 1 = down, 2 = left, 3 = right
moves = ((-1, 0), (1, 0), (0, -1), (0, 1))
last_row = environment.shape[0] - 1
last_col = environment.shape[1] - 1

for episode in range(num_episodes):
    state = (4, 0)  # starting state
    done = False

    while not done:
        action = choose_action(state)

        # Apply the move, clamping to the grid so edge moves leave the agent in place
        d_row, d_col = moves[action]
        next_state = (
            min(max(state[0] + d_row, 0), last_row),
            min(max(state[1] + d_col, 0), last_col),
        )

        # Reward: +100 for the goal, -100 for an obstacle, -1 for any other step
        # NOTE(review): a move into an obstacle cell is penalised but still
        # taken, so the agent ends up standing on the obstacle. Confirm that
        # this is intended.
        cell = environment[next_state]
        if cell == 3:
            reward = 100
        elif cell == 1:
            reward = -100
        else:
            reward = -1

        # Temporal-difference update of Q(state, action)
        current_idx = state_to_index(state)
        next_idx = state_to_index(next_state)
        td_target = reward + gamma * np.max(Q[next_idx])
        Q[current_idx, action] += learning_rate * (td_target - Q[current_idx, action])

        # Advance; the episode ends once the agent stands on the goal cell
        state = next_state
        done = cell == 3

# Print learned Q-values
print("Learned Q-values:", Q, sep="\n")


Learned Q-values:
[[ -1.39941645 -34.41760578  -1.39941645  -1.40264873]
 [ -1.39151876 -19.009       -1.39402223  -1.41008906]
 [ -1.39941645 -19.009       -1.39691299  -1.40840482]
 [ -1.30700216 -19.009       -1.35621784  -1.38016411]
 [ -1.29784433  -1.35854774  -1.30347968  -1.30918615]
 [ -0.43371879  -0.2167309  -10.         -10.        ]
 [ -0.23410896  -0.1        -10.009      -10.        ]
 [ -0.35705613  -0.2581454  -10.         -10.        ]
 [ -0.29546366  -0.29777332 -10.          -0.29610723]
 [ -1.26996729  -1.29898156 -27.1171      -1.22478977]
 [-19.01950578   3.43697941  -1.03914323  -1.01562902]
 [-10.         -10.          -1.1510526   -1.16662595]
 [-27.1171     -34.4160829   -1.2673802   -1.23845826]
 [-10.         -10.          -1.38156243  -1.35419788]
 [ -1.26377953 -18.1         -1.34237756  -1.31254187]
 [ -0.92491961  56.46140148  -0.95617925 -29.18979397]
 [ -0.2071      64.4187605   -0.109      -10.        ]
 [ -0.36387303  68.52563478 -10.         -10.  

In [None]:
import numpy as np
import tensorflow as tf

# Define the environment
# Cell codes:
#   0: Empty cell
#   1: Obstacle
#   2: Start (listed for completeness only: the grid below never contains a 2;
#      every episode begins at the position (4, 0) hard-coded in the training loop)
#   3: Goal
# The agent can move in four directions: up, down, left, right
environment = np.array([
    [0, 0, 0, 0, 0],
    [1, 1, 1, 1, 0],
    [0, 0, 0, 0, 0],
    [0, 1, 1, 1, 1],
    [0, 0, 0, 0, 3]
])

# Seed NumPy's global RNG so the epsilon-greedy exploration sequence is the
# same on every run. This does not seed the network's weight initialisation.
SEED = 42
np.random.seed(SEED)

# Define parameters
num_actions = 4  # up, down, left, right
learning_rate = 0.001  # Adam step size
gamma = 0.9  # discount factor
epsilon = 0.1  # exploration rate (decayed after every episode)
num_episodes = 5
batch_size = 32  # NOTE(review): not referenced by the training loop in this cell

# Define neural network architecture
# Input: one unit per grid cell (the one-hot encoded agent position). The width
# is derived from the grid instead of a hard-coded 5*5 so it cannot drift from
# `environment`.
# Output: one linear unit per action, read as that action's Q-value.
# The input is declared with an explicit Input layer rather than the
# `input_shape=` argument on the first Dense layer, which newer Keras releases
# warn about.
model = tf.keras.Sequential([
    tf.keras.Input(shape=(environment.size,)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(num_actions)
])

# Compile the model: mean-squared error between predicted and target Q-values
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
              loss='mse')

# Convert state to one-hot encoding
def state_to_one_hot(state, shape=(5, 5)):
    """Encode a grid position as a flat one-hot vector.

    Args:
        state: (row, col) position of the agent.
        shape: (rows, cols) of the grid. Defaults to the 5x5 grid used in this
            notebook, so existing single-argument calls behave as before.

    Returns:
        1-D float array of length rows * cols that is 1 at the row-major index
        of ``state`` and 0 elsewhere.

    Raises:
        IndexError: if ``state`` lies outside ``shape``.
    """
    one_hot = np.zeros(shape)
    one_hot[state[0], state[1]] = 1
    return one_hot.flatten()

# Perform Deep Q-Learning
# Each step trains the network online on the single transition just observed
# (there is no replay buffer).
for episode in range(num_episodes):
    state = (4, 0)  # starting state
    done = False
    total_reward = 0

    while not done:
        # Current Q-value estimates for every action in this state. They are
        # needed for greedy selection and as the baseline of the training
        # target, so they are computed on every step.
        state_input = np.array([state_to_one_hot(state)])
        q_values = model.predict(state_input, verbose=0)[0]

        # Choose action using epsilon-greedy policy
        if np.random.rand() < epsilon:
            action = np.random.randint(num_actions)  # random action
        else:
            action = np.argmax(q_values)  # greedy action

        # Move agent to next state based on chosen action (clamped to the grid)
        next_state = state
        if action == 0:  # up
            next_state = (max(state[0] - 1, 0), state[1])
        elif action == 1:  # down
            next_state = (min(state[0] + 1, environment.shape[0] - 1), state[1])
        elif action == 2:  # left
            next_state = (state[0], max(state[1] - 1, 0))
        elif action == 3:  # right
            next_state = (state[0], min(state[1] + 1, environment.shape[1] - 1))

        # Calculate reward
        cell = environment[next_state[0], next_state[1]]
        reward = -1 if cell != 1 else -100  # penalize hitting obstacles
        if cell == 3:  # goal
            reward = 100
        total_reward += reward

        # Check if episode is finished
        done = cell == 3

        # TD target for the action taken. The goal is terminal, so there is no
        # future return to bootstrap from once it has been reached.
        if done:
            td_target = reward
        else:
            next_input = np.array([state_to_one_hot(next_state)])
            td_target = reward + gamma * np.max(model.predict(next_input, verbose=0))

        # Train only the output of the chosen action: the other actions keep
        # their current predictions as targets and so contribute zero error.
        # (A single scalar target would be broadcast across all four outputs.)
        target_q = q_values.copy()
        target_q[action] = td_target
        model.fit(state_input, np.array([target_q]), verbose=0)

        # Move to next state
        state = next_state

    # Update epsilon (exploration rate)
    epsilon *= 0.99

    # Print total reward for episode
    print(f"Episode {episode + 1}: Total Reward = {total_reward}")

# Print learned Q-values (all grid cells evaluated in one batched forward pass)
print("Learned Q-values:")
all_states = [(i, j) for i in range(5) for j in range(5)]
q_table = model.predict(np.array([state_to_one_hot(s) for s in all_states]), verbose=0)
for state, q_values in zip(all_states, q_table):
    print(f"State: {state} | Q-values: {q_values}")


Episode 1: Total Reward = 0
Episode 2: Total Reward = 0
Episode 3: Total Reward = 0
Episode 4: Total Reward = 0
Episode 5: Total Reward = 0
Learned Q-values:
State: (0, 0) | Q-values: [-77.84378  -77.55741  -78.59628  -77.687706]
State: (0, 1) | Q-values: [-67.00865  -66.889824 -67.683365 -67.09088 ]
State: (0, 2) | Q-values: [-67.53646  -67.0084   -68.04358  -67.506584]
State: (0, 3) | Q-values: [-43.543514 -43.363342 -43.951645 -43.6535  ]
State: (0, 4) | Q-values: [-46.08116  -45.62758  -45.884518 -45.732025]
State: (1, 0) | Q-values: [-70.91705 -70.25347 -71.09772 -70.4598 ]
State: (1, 1) | Q-values: [-78.77246  -78.22729  -79.49391  -78.230316]
State: (1, 2) | Q-values: [-94.573235 -94.040276 -95.807785 -94.23295 ]
State: (1, 3) | Q-values: [-55.270187 -54.91221  -55.421227 -54.70237 ]
State: (1, 4) | Q-values: [-54.614056 -54.23815  -54.59179  -54.274757]
State: (2, 0) | Q-values: [-69.789566 -69.72066  -70.42731  -69.42039 ]
State: (2, 1) | Q-values: [-69.67618 -69.1549  -70.291