## Author

**Full Name: Yiğit Yalın**

**Student ID: 22002178**

### Import the required libraries

In [1]:
from enum import IntEnum

import numpy as np

## Enumerate the actions

In [2]:
class Actions(IntEnum):
    """Moves available to the agent.

    The integer values are used directly as column indices of the Q-table.
    """
    # Decrement the state index by one (``step`` clamps at the lowest state).
    MINUS_1 = 0
    # Increment the state index by one (``step`` clamps at the highest state).
    PLUS_1 = 1

## Enumerate the states

In [3]:
class States(IntEnum):
    """The seven chain states, named by their 1-based label.

    The integer values are the 0-based indices used for Q-table rows, so the
    label of a state is ``value + 1``.
    """
    ONE = 0
    TWO = 1
    THREE = 2
    # Landing on FOUR is the only transition that ``step`` rewards with 0.
    FOUR = 3
    FIVE = 4
    SIX = 5
    SEVEN = 6

## Define the step function

In [4]:
def step(state, action):
    """Apply one deterministic transition of the chain environment.

    Args:
        state: Current state, given either as a ``States`` member or as its
            0-based integer index.
        action: ``Actions.MINUS_1`` moves one state down; any other value is
            treated like ``Actions.PLUS_1`` and moves one state up.

    Returns:
        A ``(reward, next_state)`` tuple. ``reward`` is 0 when the move lands
        on ``States.FOUR`` and -1 otherwise. ``next_state`` is a ``States``
        member; moves past either end of the chain stay on the end state.
    """
    # int() accepts enum members as well as plain / NumPy integer indices,
    # whereas `.value` only exists on enum members.
    index = int(state)
    # Derive the upper bound from the enum so it cannot drift out of sync
    # with the States definition.
    last_index = len(States) - 1
    if action == Actions.MINUS_1:
        next_index = max(0, index - 1)
    else:
        next_index = min(last_index, index + 1)
    next_state = States(next_index)
    reward = 0 if next_state == States.FOUR else -1
    return reward, next_state

# Part a

## Create a function to calculate the temporal difference

In [5]:
def get_td(state, action, q_values, gamma):
    """Return the Q-learning temporal-difference error for one transition.

    Args:
        state: State the action is taken from.
        action: Action taken in ``state``.
        q_values: Q-table indexed as ``q_values[state][action]``.
        gamma: Discount factor applied to the bootstrap term.

    Returns:
        ``r + gamma * max_a' Q(s', a') - Q(s, a)``, where ``r`` and ``s'``
        come from ``step(state, action)``.
    """
    reward, next_state = step(state, action)
    # Greedy value of the successor state, discounted.
    bootstrap = gamma * q_values[next_state].max()
    current_estimate = q_values[state][action]
    return reward + bootstrap - current_estimate

## Perform tabular Q-Learning for the given trajectory

In [6]:
# Hyperparameters for the tabular Q-learning updates in Part a.
gamma = 1    # discount factor
alpha = 0.5  # learning rate

### Initialize Q table

In [7]:
# One row per state and one column per action; every estimate starts at zero.
q = np.zeros(shape=(len(States), len(Actions)), dtype=float)

print(q)

[[0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]]


**1) $(s, a) = (3, -1)$**

In [8]:
# Transition 1: action -1 taken from state 3. Move the stored estimate
# toward the TD target by a fraction alpha of the TD error.
td = get_td(state=States.THREE, action=Actions.MINUS_1, q_values=q, gamma=gamma)
q[States.THREE, Actions.MINUS_1] = q[States.THREE, Actions.MINUS_1] + alpha * td

print(q)

[[ 0.   0. ]
 [ 0.   0. ]
 [-0.5  0. ]
 [ 0.   0. ]
 [ 0.   0. ]
 [ 0.   0. ]
 [ 0.   0. ]]


**2) $(s, a) = (2, 1)$**

In [9]:
# Transition 2: action +1 taken from state 2. Move the stored estimate
# toward the TD target by a fraction alpha of the TD error.
td = get_td(state=States.TWO, action=Actions.PLUS_1, q_values=q, gamma=gamma)
q[States.TWO, Actions.PLUS_1] = q[States.TWO, Actions.PLUS_1] + alpha * td

print(q)

[[ 0.   0. ]
 [ 0.  -0.5]
 [-0.5  0. ]
 [ 0.   0. ]
 [ 0.   0. ]
 [ 0.   0. ]
 [ 0.   0. ]]


**3) $(s, a) = (3, 1)$**

In [10]:
# Transition 3: action +1 taken from state 3. Move the stored estimate
# toward the TD target by a fraction alpha of the TD error.
td = get_td(state=States.THREE, action=Actions.PLUS_1, q_values=q, gamma=gamma)
q[States.THREE, Actions.PLUS_1] = q[States.THREE, Actions.PLUS_1] + alpha * td

print(q)

[[ 0.   0. ]
 [ 0.  -0.5]
 [-0.5  0. ]
 [ 0.   0. ]
 [ 0.   0. ]
 [ 0.   0. ]
 [ 0.   0. ]]


# Part b

$$
\mathbf{J}(\mathbf{w}) = (r + \gamma \max_{a^\prime}\hat{Q}(s^\prime, a^\prime, \mathbf{w^-}) - \hat{Q}(s, a, \mathbf{w}))^2
$$

$$
-\dfrac{1}{2}\nabla_{\mathbf{w}}\mathbf{J}(\mathbf{w}) = (r + \gamma \max_{a^\prime}\hat{Q}(s^\prime, a^\prime, \mathbf{w^-}) - \hat{Q}(s, a, \mathbf{w})) \nabla_{\mathbf{w}} \hat{Q}(s, a, \mathbf{w})
$$

$$
\Delta \mathbf{w} = \alpha(r + \gamma \max_{a^\prime}\hat{Q}(s^\prime, a^\prime, \mathbf{w^-}) - \hat{Q}(s, a, \mathbf{w})) \nabla_{\mathbf{w}} \hat{Q}(s, a, \mathbf{w})
$$

where, for linear function approximation,

$$
\nabla_{\mathbf{w}} \hat{Q}(s, a, \mathbf{w}) = \mathbf{x}
$$

with the feature vector $\mathbf{x}$.

# Part c

In [11]:
# Hyperparameters for the single gradient update in Part c.
gamma = 1     # discount factor
alpha = 0.25  # learning rate

## Construct weight vectors

In [12]:
# Weights of the approximator being trained (w in Part b), one entry per
# component of the feature vector.
w = np.asarray([-1, 1, 1])

print(w)

[-1  1  1]


In [13]:
# Target weights (w^- in Part b), used only for the bootstrap term.
w_target = np.asarray([1, -1, -2])

print(w_target)

[ 1 -1 -2]


## Define a function that constructs feature vectors

In [14]:
def x(state, action):
    """Build the feature vector of a state-action pair.

    Args:
        state: A ``States`` member or its 0-based integer value.
        action: ``Actions.MINUS_1`` is encoded as -1; any other value is
            encoded as +1.

    Returns:
        ``np.array([s, a, 1])`` where ``s`` is the 1-based state label, ``a``
        is the signed action encoding, and the trailing 1 is a constant
        feature.
    """
    # Enum values are 0-based; the features use the 1-based state label.
    label = States(state).value + 1
    direction = -1 if action == Actions.MINUS_1 else 1
    return np.array([label, direction, 1])

## Perform a single gradient update for $(s, a, r, s^\prime) = (2, -1, -1, 1)$

In [15]:
# Replay the sampled pair (s, a) = (2, -1) through the environment to obtain
# its reward and successor state.
state, action = States.TWO, Actions.MINUS_1
reward, next_state = step(state, action)

In [16]:
# Bootstrap term: evaluate every action in the successor state with the
# target weights and keep the largest estimate. The loop variable is named
# `candidate` so it cannot be confused with the `action` actually taken.
next_state_features = np.array([x(next_state, candidate) for candidate in Actions])
next_state_max_q_value = (next_state_features @ w_target).max()

# Current estimate for the (state, action) pair, using the trained weights.
features = x(state, action)
q_value = features @ w

# Weight change: alpha times the TD error, along the feature vector (the
# gradient of a linear approximator with respect to its weights).
td_error = reward + gamma * next_state_max_q_value - q_value
delta_w = alpha * td_error * features

print('Maximum of next state target q values:', next_state_max_q_value)
print('Current state q value:', q_value)
print('Delta w:', delta_w)

Maximum of next state target q values: 0
Current state q value: -2
Delta w: [ 0.5  -0.25  0.25]


In [17]:
# Apply the step out of place: w starts as an integer array while delta_w is
# floating point, so the sum is bound to a new float array.
w = np.add(w, delta_w)

print(w)

[-0.5   0.75  1.25]
