In [5]:
import numpy as np

class QLearningAgent:
    """Tabular Q-learning agent with an epsilon-greedy behaviour policy.

    The Q-table ``self.Q`` is a dict mapping each state to a list of
    ``n_actions`` action values. Rows are created lazily, filled with zeros,
    the first time a state is seen. States are used as dict keys, so they
    must be hashable (convert array-like observations to tuples first).
    """

    def __init__(self, epsilon, decay, epsilon_min, alpha, gamma, n_actions):
        """Store the hyperparameters and start with an empty Q-table.

        Args:
            epsilon: Initial probability of picking a random action.
            decay: Factor ``epsilon`` is multiplied by in ``epsilon_decay``.
            epsilon_min: Lower bound that ``epsilon`` never drops below.
            alpha: Learning rate (step size) of the Q update.
            gamma: Discount factor applied to the best next-state value.
            n_actions: Number of discrete actions, i.e. the length of each
                Q-table row.
        """
        self.Q = {}
        self.n_actions = n_actions

        self.epsilon = epsilon
        self.decay = decay
        self.epsilon_min = epsilon_min

        self.alpha = alpha
        self.gamma = gamma

    def _q_values(self, state):
        """Return the Q row for ``state``, creating a zero row if it is new."""
        if state not in self.Q:
            self.Q[state] = [0.0] * self.n_actions
        return self.Q[state]

    def choose_action(self, env, state):
        """Pick an action for ``state`` with an epsilon-greedy rule.

        Args:
            env: Environment whose ``action_space.sample()`` supplies the
                random (exploratory) action. Only touched when exploring.
            state: Hashable state identifier.

        Returns:
            With probability ``epsilon`` the sampled action; otherwise the
            index of the largest Q value for ``state`` (``np.argmax``, so ties
            resolve to the lowest index).
        """
        # Register the state first so an unseen state gets its zero row even
        # when the exploratory branch is taken.
        q_row = self._q_values(state)

        if np.random.random() < self.epsilon:
            return env.action_space.sample()
        return np.argmax(q_row)

    def epsilon_decay(self):
        """Multiply ``epsilon`` by ``decay``, clamped below at ``epsilon_min``."""
        self.epsilon = max(self.epsilon * self.decay, self.epsilon_min)

    def updates_Q(self, state, action, next_state, reward, terminal=False):
        """Apply one Q-learning update for the observed transition.

        Computes
        ``Q[s][a] += alpha * (reward + gamma * max(Q[s']) - Q[s][a])``.

        Args:
            state: Hashable state the action was taken in.
            action: Index of the action taken.
            next_state: Hashable state reached after the action.
            reward: Reward received for the transition.
            terminal: Set to True when ``next_state`` ends the episode for
                good; the target is then just ``reward`` (no bootstrapping).
                Leave False for episodes cut short by a step limit, where
                bootstrapping from ``next_state`` is still appropriate.
                Defaults to False, which reproduces the original behaviour.
        """
        q_row = self._q_values(state)
        next_row = self._q_values(next_state)

        # Value of the best follow-up action, discounted; zero past a
        # terminal state because no further reward can be collected.
        bootstrap = 0.0 if terminal else self.gamma * np.max(next_row)

        td_error = reward + bootstrap - q_row[action]
        q_row[action] = q_row[action] + self.alpha * td_error


In [6]:
import gym

# Build the environment registered under the id "Taxi-v3".
env = gym.make("Taxi-v3")

# Number of discrete actions exposed by the environment's action space;
# it is passed to the agent to fix the length of each Q-table row.
n_actions = env.action_space.n
print(n_actions)

6


In [7]:
# Exploration starts at epsilon=0.99 and is multiplied by 0.9999 after every
# episode until it reaches the floor of 0.01; alpha is the learning rate and
# gamma the discount factor of the Q update.
agent = QLearningAgent(
    epsilon=0.99,
    decay=0.9999,
    epsilon_min=0.01,
    alpha=0.1,
    gamma=0.9,
    n_actions=n_actions,
)


In [8]:
# Undiscounted return of every episode, in order; used for the running average.
returns_list = []

for episode in range(1, 100001):
    # Fresh episode: initial state from the environment, nothing collected yet.
    state, done, returns = env.reset(), False, 0

    while not done:
        action = agent.choose_action(env, state)

        # step() yields a 4-tuple here; the last element is not used.
        next_state, reward, done, _ = env.step(action)

        # Learn from the transition before moving on to the next state.
        agent.updates_Q(state, action, next_state, reward)

        state = next_state
        returns += reward

    # Exploration is annealed once per episode, not once per step.
    agent.epsilon_decay()
    returns_list.append(returns)

    # Report only every 1000th episode, averaging over the last 1000 returns.
    if episode % 1000 != 0:
        continue
    print(f"Episode {episode}: return {returns}, average returns {np.mean(returns_list[-1000:]):.1f}, epsilon {agent.epsilon:.2f}")


Episode 1000: return -536, average returns -689.9, epsilon 0.90
Episode 2000: return -116, average returns -417.3, epsilon 0.81
Episode 3000: return -228, average returns -229.2, epsilon 0.73
Episode 4000: return -66, average returns -139.4, epsilon 0.66
Episode 5000: return -50, average returns -98.5, epsilon 0.60
Episode 6000: return -32, average returns -69.9, epsilon 0.54
Episode 7000: return -90, average returns -53.0, epsilon 0.49
Episode 8000: return -39, average returns -40.4, epsilon 0.44
Episode 9000: return -5, average returns -31.0, epsilon 0.40
Episode 10000: return -11, average returns -25.2, epsilon 0.36
Episode 11000: return -7, average returns -20.4, epsilon 0.33
Episode 12000: return 8, average returns -16.0, epsilon 0.30
Episode 13000: return -12, average returns -13.3, epsilon 0.27
Episode 14000: return -4, average returns -10.4, epsilon 0.24
Episode 15000: return -5, average returns -7.7, epsilon 0.22
Episode 16000: return 2, average returns -4.8, epsilon 0.20
Epis