# Deep Q-learning using backprop on small gridworld

In [1]:
import os
# Move the working directory up one level before importing from `src`
# (presumably the notebook lives in a sub-folder of the project root — confirm).
# NOTE: the path is relative, so re-running this cell moves up one more level
# each time; run it once per kernel session.
os.chdir("..")
from src.gym_kalman.env_Gridworld import GridworldEnv

In [2]:
# Build the gridworld environment and derive the state / action index sets
import numpy as np

# Environment configuration: grid side length and the reward mean / std passed to the environment
grid_size = 4
reward_mean = -1
reward_std = 0.2

env = GridworldEnv(grid_size=grid_size, reward_mean=reward_mean, reward_std=reward_std)
num_states = env.observation_space.n
actions = np.arange(env.action_space.n)

In [3]:
import math
import random
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple, deque
from itertools import count

# set up matplotlib
# True when the active backend's name contains 'inline' (typically the case inside a notebook)
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    # Only imported when an inline backend is active
    from IPython import display

# Switch matplotlib to interactive mode
plt.ion()

# set seed
# Seeds NumPy's global RNG (used by the np.random.normal draws in this notebook)
# and Python's `random` module. `np` comes from the earlier `import numpy as np` cell.
np.random.seed(0)
random.seed(0)


In [4]:
# Gaussian belief over Q-values: one (mean, std) entry per (state, action) pair.
Q_mean_table = np.zeros((num_states, len(actions)))
Q_std_table = np.ones_like(Q_mean_table)
# The last state's row starts with zero uncertainty.
Q_std_table[-1, :] = 0

# Discount factor applied to the successor state's estimate.
GAMMA = 1
# Parameters of an exponentially decaying epsilon-greedy schedule.
EPS_START, EPS_END, EPS_DECAY = 0.9, 0.0001, 1000

# Step counter for the epsilon-greedy schedule.
steps_done = 0

def select_action(state):
    """Choose an action for ``state`` by Thompson sampling.

    One Q-value per action is drawn from a normal distribution whose mean and
    standard deviation are the ``state`` rows of the module-level
    ``Q_mean_table`` and ``Q_std_table``; the index of the largest draw is
    returned.

    Alternative exploration strategies that were tried and are currently
    disabled:

    * Epsilon-greedy: with probability
      ``EPS_END + (EPS_START - EPS_END) * math.exp(-1. * steps_done / EPS_DECAY)``
      return ``random.choice(actions)``, otherwise
      ``np.argmax(Q_mean_table[state])``; the global ``steps_done`` is
      incremented on every call.
    * Uniformly random: ``random.choice(actions)``.
    """
    belief_mean = Q_mean_table[state]
    belief_std = Q_std_table[state]
    sampled_q = np.random.normal(belief_mean, belief_std)
    return np.argmax(sampled_q)

def select_greedy_action(state):
    """Return the index of the action with the largest mean Q-value in ``state``."""
    mean_q_row = Q_mean_table[state]
    return np.argmax(mean_q_row)

In [5]:
# Extract current policy
def extract_policy(num_states, episode_i):
    """Print the current greedy policy as a ``grid_size x grid_size`` array.

    Args:
        num_states: Total number of states. The last index
            (``num_states - 1``) is treated as the terminal state.
        episode_i: Episode number shown in the printed heading.

    Returns:
        None. The policy grid is printed; each cell holds the greedy action
        index for that state, and the terminal cell holds the sentinel 10.
    """
    # Derive the terminal index from num_states instead of hard-coding 15,
    # which is only correct for the 4x4 grid.
    terminal_state = num_states - 1

    policy = np.zeros(num_states)
    for state in range(num_states):
        if state == terminal_state:
            policy[state] = 10  # sentinel marking the terminal cell
            continue
        policy[state] = select_greedy_action(state)

    policy_grid = policy.reshape((grid_size, grid_size))
    # Print title of the plot
    print(f"Episode {episode_i}'s policy")
    print(policy_grid)

In [6]:
num_episodes = 50

for i_episode in range(num_episodes):
    # Initialize the environment and get its state
    state, info = env.reset()
    for t in count():
        action = select_action(state)
        observation, reward, terminated, truncated, _ = env.step(action.item())
        done = terminated or truncated

        # A terminated transition has no successor state to bootstrap from.
        next_state = None if terminated else observation

        # Update the Gaussian Q-belief for (state, action).
        # NOTE(review): the target uses the configured reward_mean / reward_std
        # rather than the sampled `reward` returned by env.step — this looks
        # deliberate (known reward model), confirm.
        if next_state is not None:
            best_next_action = np.argmax(Q_mean_table[next_state])
            Q_mean_table[state][action] = (
                reward_mean + GAMMA * Q_mean_table[next_state][best_next_action]
            )
            # Var(r + GAMMA * Q') = Var(r) + GAMMA**2 * Var(Q'), so the discount
            # must scale the successor's std (not its variance) before squaring.
            # Identical to the previous formula when GAMMA == 1.
            discounted_next_std = GAMMA * Q_std_table[next_state][best_next_action]
            Q_std_table[state][action] = np.sqrt(reward_std**2 + discounted_next_std**2)
        else:
            # No successor: the belief is just the one-step reward distribution.
            Q_mean_table[state][action] = reward_mean
            Q_std_table[state][action] = reward_std

        # Move to the next state
        state = next_state

        if done:
            break

    # Find the index of the maximum Q-value for each state
    max_Q_index = np.argmax(Q_mean_table, axis=1)

    # Per-episode progress: greedy value mean and std for every state, laid out as the grid
    print(np.round(Q_mean_table[np.arange(num_states), max_Q_index].reshape((grid_size, grid_size)),2))
    print(np.round(Q_std_table[np.arange(num_states), max_Q_index].reshape((grid_size, grid_size)),2))
    print('-------------------------')

# Extract policy
extract_policy(num_states, i_episode)

print('Complete')

[[ 0.  0.  0.  0.]
 [ 0. -1.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0. -1.  0.  0.]]
[[1.   1.   1.   1.  ]
 [1.   1.02 1.   1.  ]
 [1.   1.   1.   1.  ]
 [1.   1.02 1.   0.  ]]
-------------------------
[[-2. -1. -1.  0.]
 [-1. -1. -1. -1.]
 [ 0. -1. -1.  0.]
 [ 0. -1.  0.  0.]]
[[1.04 1.02 1.02 1.  ]
 [1.02 1.02 1.02 1.02]
 [1.   1.02 1.02 1.  ]
 [1.   1.02 1.   0.  ]]
-------------------------
[[-2. -2. -2. -2.]
 [-2. -2. -2. -2.]
 [-1. -1. -1. -1.]
 [-2. -1.  0.  0.]]
[[1.04 1.04 1.04 1.04]
 [1.04 1.04 1.04 1.04]
 [1.02 1.02 1.02 0.2 ]
 [1.04 1.02 1.   0.  ]]
-------------------------
[[-3. -2. -2. -2.]
 [-3. -3. -2. -2.]
 [-3. -2. -1. -1.]
 [-2. -1. -1.  0.]]
[[1.06 1.04 1.04 1.04]
 [1.06 1.06 1.04 1.04]
 [1.06 1.04 1.02 0.2 ]
 [1.04 1.02 1.02 0.  ]]
-------------------------
[[-3. -3. -2. -2.]
 [-3. -3. -2. -2.]
 [-3. -2. -1. -1.]
 [-2. -2. -1.  0.]]
[[1.06 1.06 1.04 1.04]
 [1.06 1.06 1.04 1.04]
 [1.06 1.04 1.02 0.2 ]
 [1.04 1.04 0.2  0.  ]]
-------------------------
[[-4. -3. -2. -2.]
 

In [7]:
# Greedy state values and their standard deviations, read from the Q-tables.
# The terminal state is the last state index; derive it from num_states rather
# than hard-coding 15 so the cell stays correct if grid_size changes.
terminal_state = num_states - 1

values = np.zeros(num_states)
values_std = np.zeros(num_states)
for state in range(num_states):
    if state == terminal_state:  # Terminal state keeps value 0 and std 0
        continue
    suggested_action = select_greedy_action(state)
    values[state] = Q_mean_table[state][suggested_action]
    values_std[state] = Q_std_table[state][suggested_action]

# Lay the per-state numbers out as the grid and round for display.
value_grid = np.array(values).reshape((grid_size, grid_size))
value_grid = np.round(value_grid, 2)
value_std_grid = np.array(values_std).reshape((grid_size, grid_size))
value_std_grid = np.round(value_std_grid, 2)
print("\nState values:")
print(value_grid)
print("\nState values std:")
print(value_std_grid)


State values:
[[-6. -5. -4. -3.]
 [-5. -4. -3. -2.]
 [-4. -3. -2. -1.]
 [-3. -2. -1.  0.]]

State values std:
[[0.49 0.45 0.4  0.35]
 [0.45 0.4  0.35 0.28]
 [0.4  0.35 0.28 0.2 ]
 [0.35 0.28 0.2  0.  ]]


In [8]:
import numpy as np
import matplotlib.pyplot as plt

# Empirical check: the std of the sum of two independent normal samples
# should be close to sqrt(std_x**2 + std_y**2).
x, y = (np.random.normal(loc=1, scale=0.2, size=5000) for _ in range(2))

# Sample mean and std of the element-wise sum
z = x + y
mu = z.mean()
sigma = z.std()
print(f"mu: {mu}, sigma: {sigma}")
# Closed-form std of the sum, for comparison with the sample estimate above
print(np.sqrt(0.2**2 + 0.2**2))

mu: 1.9954238561302766, sigma: 0.28128872316774894
0.28284271247461906
