In [13]:
import numpy as np
import random

In [14]:
alpha = 0.1   
gamma = 0.9   
epsilon = 0.1  
num_episodes = 1000
num_states = 4  
num_actions = 4  

In [15]:
rewards = np.array([
    [-1, -1, -1, 0],   
    [-1, -1, -1, 10],  
    [-1, -1, -1, -1],  
    [0, 10, -1, -1]    
])

In [16]:
Q = np.zeros((num_states, num_actions))
V = np.zeros(num_states)

In [17]:
def select_action(state):
    if random.uniform(0, 1) < epsilon:
        return random.randint(0, num_actions - 1)  
    else:
        return np.argmax(Q[state])  

In [18]:
def transition(state, action):
    next_state = action
    reward = rewards[state, action]
    return next_state, reward

In [19]:
def plain_q_learning():
    for episode in range(num_episodes):
        state = random.randint(0, num_states - 1)
        done = False
        while not done:
            action = select_action(state)
            next_state, reward = transition(state, action)
            Q[state, action] += alpha * (reward + gamma * np.max(Q[next_state]) - Q[state, action])
            state = next_state
            if reward == 10:  
                done = True

In [20]:
def td_0():
    for episode in range(num_episodes):
        state = random.randint(0, num_states - 1)
        done = False
        while not done:
            next_state, reward = transition(state, select_action(state))
            V[state] += alpha * (reward + gamma * V[next_state] - V[state])
            state = next_state
            if reward == 10:
                done = True

In [21]:
def td_2():
    n = 2  
    for episode in range(num_episodes):
        states = [random.randint(0, num_states - 1)]
        rewards_episode = []
        T = float('inf')
        t = 0
        while True:
            if t < T:
                action = select_action(states[-1])
                next_state, reward = transition(states[-1], action)
                states.append(next_state)
                rewards_episode.append(reward)
                if reward == 10:
                    T = t + 1

            tau = t - n + 1
            if tau >= 0:
                G = sum([gamma ** (i - tau - 1) * rewards_episode[i] for i in range(tau + 1, min(t + 1, T))])
                if tau + n < T:
                    G += gamma ** n * V[states[tau + n]]
                V[states[tau]] += alpha * (G - V[states[tau]])

            if tau == T - 1:
                break
            t += 1

Q-Learning with TD(0) and TD(2)

In [22]:
def q_learning_td_0():
    for episode in range(num_episodes):
        state = random.randint(0, num_states - 1)
        done = False
        while not done:
            action = select_action(state)
            next_state, reward = transition(state, action)
            Q[state, action] += alpha * (reward + gamma * np.max(Q[next_state]) - Q[state, action])
            V[state] += alpha * (reward + gamma * V[next_state] - V[state])
            state = next_state
            if reward == 10:
                done = True
                
def q_learning_td_2():
    n = 2  # Step size
    for episode in range(num_episodes):
        states = [random.randint(0, num_states - 1)]
        rewards_episode = []
        T = float('inf')
        t = 0
        while True:
            if t < T:
                action = select_action(states[-1])
                next_state, reward = transition(states[-1], action)
                states.append(next_state)
                rewards_episode.append(reward)
                if reward == 10:
                    T = t + 1
            tau = t - n + 1
            if tau >= 0:
                G = sum([gamma ** (i - tau - 1) * rewards_episode[i] for i in range(tau + 1, min(t + 1, T))])
                if tau + n < T:
                    G += gamma ** n * V[states[tau + n]]
                V[states[tau]] += alpha * (G - V[states[tau]])
                Q[states[tau], select_action(states[tau])] += alpha * (G - Q[states[tau], select_action(states[tau])])

            if tau == T - 1:
                break
            t += 1

In [23]:
plain_q_learning()
td_0()
td_2()
q_learning_td_0()
q_learning_td_2()

In [24]:
print("Q-Table:")
print(Q)
print("\nValue Function V:")
print(V)

Q-Table:
[[ 0.26937118  7.25793423 -0.74170404  2.7240472 ]
 [-0.25538632 -0.47348198 -0.23489449  0.75495003]
 [ 4.65454301  2.87573562  2.9833503   9.37319862]
 [-0.33655333  0.17625129 -0.44315168 -0.57413625]]

Value Function V:
[6.74602925 0.55679944 9.54334251 0.11177618]
