# Q-learning for Taxi Problem

### Taxi problem

```
Map "+---------+",  
    "|R: | : :G|",  
    "| : | : : |",  
    "| : : : : |",  
    "| | : | : |",  
    "|Y| : |B: |",  
    "+---------+",  
```

Passenger locations:
- 0: R(ed)
- 1: G(reen)
- 2: Y(ellow)
- 3: B(lue)
- 4: in taxi

Destinations:
- 0: R(ed)
- 1: G(reen)
- 2: Y(ellow)
- 3: B(lue)

Actions:
There are 6 discrete deterministic actions:
- 0: move south
- 1: move north
- 2: move east 
- 3: move west 
- 4: pickup passenger
- 5: dropoff passenger

In [1]:
import gym
import numpy as np

In [101]:
class Qlearning:
    """Q learning"""
    def __init__(self, env):
        self.env = env
        self.n_state = env.observation_space.n
        self.n_act = env.action_space.n
        self.Q = np.random.rand(self.n_state, self.n_act)  # intial Q table arbitrarily
        self.old_Q = np.copy(self.Q)
        self.visited = np.zeros((self.n_state, self.n_act), dtype=np.int16)
        self.count_policy_unchange = 0
    
    def __epsGreedy(self, state, epsilon):
        choice = np.random.uniform() < epsilon
        if choice:   # need to explore
            return np.random.randint(0, self.n_act)
        else:        # pick the best move
            return np.argmax(self.Q[state, :])
        
    def train(self, epsilon, alpha, gamma=0.9, e=1e-2):
        count_episodes = 0
        while True:
            cur_state = self.env.reset()  # set initial state
            reward = None
            done = False
            states = [cur_state]
            self.old_Q = np.copy(self.Q)
            while not done:
                action = self.__epsGreedy(cur_state, epsilon)
                new_state, reward, done, _ = self.env.step(action)
                delta = alpha * (reward + gamma * np.max(self.Q[new_state, :]) * (not done) - self.Q[cur_state, action])
                self.Q[cur_state, action] = self.Q[cur_state, action] + delta
                self.visited[cur_state, action] += 1
                cur_state = new_state
                states.append(new_state)
            
            count_episodes += 1
            
            # decay the exploration rate every a few episodes
            if count_episodes % 200 == 0:
                epsilon *= 0.99  
            
            # check if the policy for the visited states in this episode changed after the update, if not, increment the counter
            if np.sum(np.abs(np.argmax(self.Q[states, :], axis=1) - np.argmax(self.old_Q[states, :], axis=1))) == 0:
                self.count_policy_unchange += 1
            else:
                self.count_policy_unchange = 0
            
            # if the policy has not changed after many episodes, complete training
            if self.count_policy_unchange >= 50000:
                print("Training complete!\n-Total episodes: {0}\n".format(count_episodes))
                break
            elif count_episodes % 10000 == 0:           # print some useful information
                print("Current episodes: {0};\tCurrent epsilon: {1}".format(count_episodes, epsilon))
                

In [102]:
# make the environment
env = gym.make("Taxi-v2")

In [103]:
agent = Qlearning(env)
agent.train(1, 0.1)    # set initial epsilon to 1 to allow full exploration at beginning

Current episodes: 10000;	Current epsilon: 0.6050060671375365
Current episodes: 20000;	Current epsilon: 0.36603234127322926
Current episodes: 30000;	Current epsilon: 0.22145178723886094
Current episodes: 40000;	Current epsilon: 0.13397967485796175
Current episodes: 50000;	Current epsilon: 0.08105851616218133
Current episodes: 60000;	Current epsilon: 0.04904089407128576
Current episodes: 70000;	Current epsilon: 0.029670038450977095
Current episodes: 80000;	Current epsilon: 0.017950553275045134
Current episodes: 90000;	Current epsilon: 0.010860193639877886
Training complete!
-Total episodes: 94953



In [105]:
agent.Q

array([[ 5.41396135e-03,  4.97909381e-01,  2.36642217e-02,
         4.93914395e-03,  1.07376891e-01,  3.79644162e-01],
       [ 1.62261467e+00,  2.91401630e+00,  1.62261467e+00,
         2.91401630e+00,  4.34890700e+00, -6.08598370e+00],
       [ 4.34890700e+00,  5.94323000e+00,  4.34890700e+00,
         5.94323000e+00,  7.71470000e+00, -3.05677000e+00],
       ...,
       [ 7.71469932e+00,  9.68300000e+00,  7.71469830e+00,
         5.94323000e+00, -1.28531619e+00, -1.28530521e+00],
       [ 1.62258427e+00,  2.91401630e+00,  1.62235931e+00,
         2.91396598e+00, -7.37742793e+00, -7.37772233e+00],
       [ 1.43000000e+01,  1.18700000e+01,  1.43000000e+01,
         1.70000000e+01,  5.30000000e+00,  5.30000000e+00]])

In [106]:
agent.visited

array([[   0,    0,    0,    0,    0,    0],
       [ 826,  855,  831,  834, 8676,  865],
       [ 871,  938,  975,  918, 8752,  902],
       ...,
       [ 286,  771,  312,  298,  303,  310],
       [ 317,  709,  315,  427,  323,  331],
       [ 659,  642,  628, 8597,  693,  702]], dtype=int16)