In [9]:
import numpy as np
import gym
import torch

In [14]:
# Build the Taxi environment.
# NOTE(review): `.env` takes the inner environment object from whatever
# gym.make returns -- presumably to bypass a wrapper such as an episode step
# limit; confirm against the installed gym version.
env = gym.make('Taxi-v2').env

In [15]:
# Reset the environment and print the value that reset() returns.
print(env.reset())

# Print the environment's current-state attribute; in the captured output it
# equals the value returned by reset() above.
print(env.s)

# Draw the current grid.
env.render()

248
248
+---------+
|[35mR[0m: | : :G|
| : : : : |
| : :[43m [0m: : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+



In [26]:
# Report how many discrete actions and states the environment exposes,
# one per line.
print(
    'Action Space: {}'.format(env.action_space.n),
    'State Space: {}'.format(env.observation_space.n),
    sep='\n'
)


Action Space: 6
State Space: 500


In [21]:
# Transition table for state 1.
#
# env.P[state] -> {action: [(probability, next_state, reward, done), ...]}
#
# Action indices:
#   0 = south
#   1 = north
#   2 = east
#   3 = west
#   4 = pickup
#   5 = dropoff

env.P[1]

{0: [(1.0, 101, -1, False)],
 1: [(1.0, 1, -1, False)],
 2: [(1.0, 21, -1, False)],
 3: [(1.0, 1, -1, False)],
 4: [(1.0, 17, -1, False)],
 5: [(1.0, 1, -10, False)]}

In [18]:
# Draw one action at random from the environment's action space.
env.action_space.sample()

4

In [19]:
# Take one step with action 1.
# Returns (next_state, reward, done, info); in the captured output, info is a
# dict holding the transition probability under the key 'prob'.
env.step(1)

(148, -1, False, {'prob': 1.0})

In [82]:
class QLearning():
    """Tabular Q-learning agent with an epsilon-greedy behaviour policy.

    The environment passed in must provide:

    * ``observation_space.n`` and ``action_space.n`` (table dimensions),
    * ``action_space.sample()`` (random action for exploration),
    * ``reset()`` returning a state usable as a row index of the Q-table,
    * ``step(action)`` returning ``(next_state, reward, done, info)``.
    """

    def __init__(self, env, alpha=0.1, gamma=0.6, epsilon=0.1):
        """Store hyperparameters and allocate a zero-initialised Q-table.

        Args:
            env: Environment to learn on (see class docstring for the
                attributes and methods that are used).
            alpha: Learning rate of the Q-value update.
            gamma: Discount factor applied to the next state's best Q-value.
            epsilon: Probability of taking a random action while training.
        """
        self.env = env
        self.alpha = alpha  # learning rate
        self.gamma = gamma  # discount factor
        self.epsilon = epsilon  # probability of exploring instead of exploiting
        self.q_table = np.zeros([env.observation_space.n, env.action_space.n])
        self.timesteps = []  # number of steps taken in each training episode

    def train(self, iterations=1000):
        """Run ``iterations`` training episodes, updating the Q-table in place.

        The number of steps of every episode is appended to
        ``self.timesteps``; repeated calls keep extending that list.

        Args:
            iterations: Number of episodes to run.
        """
        for _ in range(iterations):
            state = self.env.reset()

            done = False
            timestep = 0

            while not done:
                if np.random.uniform(0, 1) < self.epsilon:
                    action = self.env.action_space.sample()  # explore
                else:
                    action = np.argmax(self.q_table[state, :])  # exploit

                # The fourth element is the info dict; it is not needed here.
                next_state, reward, done, _info = self.env.step(action)

                current_val = self.q_table[state, action]
                # A terminal transition has no future return, so the update
                # must not bootstrap from the next state's Q-values.
                next_max = 0.0 if done else np.max(self.q_table[next_state, :])

                self.q_table[state, action] = (
                    (1 - self.alpha) * current_val
                    + self.alpha * (reward + self.gamma * next_max)
                )

                state = next_state
                timestep += 1

            self.timesteps.append(timestep)

    def eval(self, episodes=1000, max_steps=10000):
        """Run the greedy policy and return the mean episode length.

        Args:
            episodes: Number of evaluation episodes.
            max_steps: Upper bound on the steps of a single episode. A greedy
                policy that never reaches a terminal state would otherwise
                loop forever; a truncated episode contributes ``max_steps``
                steps to the average. Pass ``None`` to disable the bound.

        Returns:
            Average number of steps per episode, or ``0.0`` when
            ``episodes`` is not positive.
        """
        if episodes <= 0:
            # Nothing to average over; avoid dividing by zero.
            return 0.0

        total_timestep = 0

        for _ in range(episodes):
            state = self.env.reset()

            done = False
            timestep = 0

            while not done and (max_steps is None or timestep < max_steps):
                action = np.argmax(self.q_table[state, :])
                state, _reward, done, _info = self.env.step(action)

                timestep += 1

            total_timestep += timestep

        return total_timestep / episodes
            
                

In [83]:
# Create an agent with the default hyperparameters and train it.
agent = QLearning(env)
agent.train(iterations=100000)

In [84]:
# Run 100 greedy episodes and display the mean number of steps per episode.
agent.eval(100)


Episode 1: 18
Episode 2: 10
Episode 3: 18
Episode 4: 12
Episode 5: 15
Episode 6: 11
Episode 7: 15
Episode 8: 16
Episode 9: 15
Episode 10: 7
Episode 11: 12
Episode 12: 13
Episode 13: 16
Episode 14: 13
Episode 15: 13
Episode 16: 13
Episode 17: 11
Episode 18: 11
Episode 19: 12
Episode 20: 10
Episode 21: 14
Episode 22: 11
Episode 23: 14
Episode 24: 11
Episode 25: 10
Episode 26: 8
Episode 27: 12
Episode 28: 16
Episode 29: 14
Episode 30: 14
Episode 31: 16
Episode 32: 11
Episode 33: 15
Episode 34: 11
Episode 35: 7
Episode 36: 11
Episode 37: 15
Episode 38: 17
Episode 39: 16
Episode 40: 11
Episode 41: 12
Episode 42: 13
Episode 43: 9
Episode 44: 12
Episode 45: 11
Episode 46: 15
Episode 47: 12
Episode 48: 12
Episode 49: 13
Episode 50: 11
Episode 51: 13
Episode 52: 9
Episode 53: 12
Episode 54: 11
Episode 55: 15
Episode 56: 14
Episode 57: 11
Episode 58: 14
Episode 59: 11
Episode 60: 11
Episode 61: 14
Episode 62: 14
Episode 63: 10
Episode 64: 13
Episode 65: 14
Episode 66: 10
Episode 67: 13
Episode 6

12.52