Ideas for project:

- Try larger grid sizes.

- Try making the UP and RIGHT moves more often.

In [92]:
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

from envs.GridWorld3x3Env import GridWorld3x3Env

Global constants

In [93]:
GAMMA = 0.95 # discount factor applied to the bootstrapped next-state value; 0.95 was used for all experiments in the original paper
MAX_STEPS = 10000 # NOTE(review): presumably an upper bound on environment steps; not referenced by any cell visible here -- confirm where it is meant to be used

# Learner function implementations

As in the Double Q-learning paper, both the Q-learning and Double Q-learning agents use $\epsilon$-greedy exploration with $\epsilon = 1/\sqrt{n(s)}$, where $n(s)$ is the number of times the agent has visited state $s$.

The learning rate $\alpha$ is determined by $\alpha = 1/n(s,a)$ (mode 1) or $\alpha = 1/n(s,a)^{0.8}$ (mode 2), where $n(s,a)$ corresponds to the number of times a given state-action pair is visited.

In [94]:
def e_greedy(env, Q, state, n_s):
    '''
    Pick an action for `state` with epsilon-greedy exploration.

    Epsilon is 1 / sqrt(n_s[state]). A state whose visit count is zero gets
    epsilon = 1, so the action is always taken from the exploratory branch.

    env:   environment exposing `action_space.sample()`
    Q:     table of action values, indexed as Q[state]
    state: index of the current state
    n_s:   per-state visit counts, indexed as n_s[state]

    Returns the chosen action.
    '''
    visits = n_s[state]
    epsilon = 1 if visits == 0 else 1 / np.sqrt(visits)

    # explore: delegate the choice to the environment's action space
    if np.random.rand() < epsilon:
        return env.action_space.sample()

    # exploit: gather every action tied for the highest value, then break the tie at random
    q_row = Q[state]
    best_actions = np.flatnonzero(np.max(q_row) == q_row)
    return np.random.choice(best_actions)

In [95]:
def get_lr(n, mode):
    '''
    Compute the learning rate alpha from a state-action visit count.

    n:    number of times the state-action pair has been visited
          (callers are expected to pass n >= 1)
    mode: 1 -> linear schedule,      alpha = 1 / n
          2 -> polynomial schedule,  alpha = 1 / n**0.8

    Returns alpha as a float.
    Raises ValueError if `mode` is neither 1 nor 2.
    '''
    if mode == 1:
        return 1 / n
    if mode == 2:
        # `**` is exponentiation; `^` is bitwise XOR in Python and is not defined for floats
        return 1 / n ** 0.8
    raise ValueError(f"unknown learning-rate mode: {mode!r} (expected 1 or 2)")

## Q-learning

In [100]:
def Q_learning(env, n_episodes, mode, max_steps=None):
    '''
    Tabular Q-learning with epsilon-greedy exploration.

    env:        environment exposing `observation_space.n`, `action_space.n`,
                `reset()` (unpacked as a 2-tuple whose first item is the initial
                state) and `step(action)` (unpacked as a 4-tuple whose first
                three items are next state, reward and termination flag)
    n_episodes: number of episodes to run
    mode:       learning-rate schedule selector forwarded to `get_lr`
    max_steps:  optional cap on the number of steps per episode; the default
                None keeps the loop running until the environment reports
                termination

    Returns a tuple (rewards_per_timestep_per_episode, rewards_per_episode, Q):
      - rewards_per_timestep_per_episode: list with one list of step rewards per episode
      - rewards_per_episode: array of length n_episodes with the summed reward of each episode
      - Q: the learned table of shape (n_states, n_actions)
    '''
    n_states = env.observation_space.n
    n_actions = env.action_space.n

    # the Q table persists across episodes
    Q = np.zeros((n_states, n_actions))

    rewards_per_episode = np.zeros(n_episodes)
    rewards_per_timestep_per_episode = []

    for episode in range(n_episodes):

        # visit counts are re-created for every episode:
        # n_s drives epsilon, n_s_a drives alpha
        n_s = np.zeros(n_states)
        n_s_a = np.zeros((n_states, n_actions))

        # reset env and count the visit to the initial state
        state, _ = env.reset()
        n_s[state] += 1

        # rewards collected at each timestep of this episode
        rewards_per_timestep = []

        terminated = False
        steps = 0

        # stop on termination, or once the optional step budget is used up
        while not terminated and (max_steps is None or steps < max_steps):

            steps += 1

            # choose action
            action = e_greedy(env, Q, state, n_s)

            # step and get reward
            next_state, reward, terminated, _ = env.step(action)

            # update visit counts for the new state and the (state, action) pair just taken
            n_s[next_state] += 1
            n_s_a[state, action] += 1

            # learning rate depends on how often this pair has been tried
            alpha = get_lr(n_s_a[state, action], mode)

            # move Q(s, a) towards the one-step target r + GAMMA * max_a' Q(s', a')
            Q[state, action] += alpha * (reward + GAMMA * np.max(Q[next_state]) - Q[state, action])

            state = next_state

            rewards_per_timestep.append(reward)
            rewards_per_episode[episode] += reward

        # episode finished, store its per-timestep rewards
        rewards_per_timestep_per_episode.append(rewards_per_timestep)

    return rewards_per_timestep_per_episode, rewards_per_episode, Q

In [102]:
# Run Q-learning repeatedly and collect the reward traces of every run.

# one entry per experiment; each entry is the first value returned by Q_learning
# (a list holding one list of per-timestep rewards per episode)
rewards_per_experiment = []

# do 10000 independent experiments
for i in tqdm(range(10000)):
    
    # a new environment instance is created for every experiment
    env3x3 = GridWorld3x3Env()

    # single episode per experiment; mode=1 is passed through as the learning-rate mode
    rewards_per_timestep, rewards_per_episode, Q = Q_learning(env=env3x3, n_episodes=1, mode=1)

    rewards_per_experiment.append(rewards_per_timestep)



100%|██████████| 10000/10000 [01:21<00:00, 123.26it/s]


## Double Q-learning

In [None]:
def Double_Q_learning():
    '''
    Placeholder for the Double Q-learning algorithm (not implemented yet).

    Takes no arguments and returns None.
    '''
    return None