In [1]:
#Setting up an environment using OpenAI Gym

In [70]:
import gym
env = gym.make('Taxi-v3') #creates and returns an environment object
state = env.reset() #resets the environment's state and returns the state

In [71]:
print(state) #states are represented as number between 0-499, you can display that number using this command
#try resetting the environment to verify that it starts in a different state

474


In [72]:
env.render() #you can "see" the current state through env.render()

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[34;1m[43mB[0m[0m: |
+---------+



Yellow = empty taxi
Green = taxi with a passenger
R, G, B, Y = four possible pick up/drop off locations

In [73]:
env.observation_space #aka state space
env.action_space # action and observation space attributes are both type gym.spaces.discrete.Discrete

Discrete(6)

In [74]:
print("Actions space {}".format(env.action_space))
print("State space {}".format(env.observation_space))

Actions space Discrete(6)
State space Discrete(500)


In [75]:
observation, reward, done, info =  env.step(1) # take the action 1 = Go North. It returns (state, reward, done, info) 
env.render()
print(observation, reward, done, info)

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : |[43m [0m: |
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (North)
374 -1 False {'prob': 1.0}


## Creating a baseline agent that acts randomly and does not learn 
## (code name: nincompoop)

In [84]:
def randomeAgent():
    state = env.reset()
    epoch = 0
    reward = 0
    while reward != 20:
        observation, reward, done, info = env.step(env.action_space.sample())
        epoch +=1
#     env.render()
    return epoch

In [85]:
taxi_actions = {
    0 : 'South',
    1 : 'North',
    2 : 'East',
    3 : 'West',
    4 : 'Pickup',
    5 : 'Dropoff',
}
taxi_actions.get(env.action_space.sample())

'East'

### Creating a Q-Learning Agent

In [86]:
import gym
import numpy as np

### Simplified Bellman

In [87]:
def initateEnv():    #create environment
    env = gym.make('Taxi-v3')

    #initialize Q-table with all zeros
    Q = np.zeros([env.observation_space.n, env.action_space.n]) 

    #set parameters
    gamma = 0.1

    #intialize reward
    reward = 0

    #initialize environment
    state = env.reset()
    
    return env, Q , gamma, reward, state

In [88]:
def simplifiedAgent():
    env, Q , gamma, reward, state = initateEnv()
    
    epoch = 0
    #create update loop
    while reward != 20: #while dropoff state has not been reached
        #choose current highest-valued action
        action = np.argmax(Q[state])

        #obtain reward and next state resulting from taking
        next_state, reward, done, info = env.step(action)

        #update Q-value for state-action pair
        Q[state, action] = reward + gamma * np.max(Q[next_state])

        #update state
        state = next_state
        
        epoch += 1

    #render final dropoff state
#     env.render()
#     print('Counter: {}'.format(count))
    return epoch
    

In [89]:
def QlearningAgent(epsilon, alpha): 
    env, Q , gamma, reward, state = initateEnv()
    epoch = 0 
    while reward != 20:
        if np.random.rand() < epsilon:
            #exploration option
            action = env.action_space.sample()
        else:
            #exploitation option
            action = np.argmax(Q[state])

        #obtain reward and next state resulting from taking
        next_state, reward, done, info = env.step(action)

        #update Q-value for state-action pair
        Q[state, action] = Q[state, action] + alpha * (reward + gamma * np.max(Q[next_state]) - Q[state, action])

        #update state
        state = next_state
        epoch +=1

    #render final dropoff state
#     env.render()
#     print('Counter: {}'.format(count))
    return epoch

In [100]:
import numpy
score= []
score2= []
score3= []
for _ in range(100):
    score.append(randomeAgent())
    score2.append(simplifiedAgent())
    score3.append(QlearningAgent(0.1, 0.3))
print(numpy.average(score))
print(numpy.average(score2))
print(numpy.average(score3))

2683.02
555.88
722.64


In [112]:
s_randomA100 = []
for _ in range(100):
    s_randomA100.append(randomeAgent())
print(numpy.average(s_randomA100))

s_randomA1000 = []
for _ in range(1000):
    s_randomA1000.append(randomeAgent())
print(numpy.average(s_randomA1000))

2539.23
2451.327


In [113]:
s_QA = []
for _ in range(100):
    s_QA.append(QlearningAgent(0.1, 0.3))
print(numpy.average(s_QA))

671.14


In [114]:
result={}
for eps in [100, 1000, 10000]:
    s_QA = []
    for _ in range(eps):
        s_QA.append(QlearningAgent(0.1, 0.3))
    result[eps] = numpy.average(s_QA)
print(result)

{100: 738.9, 1000: 683.599}


In [119]:
env, Q , gamma, reward, state = initateEnv()

gamma = 0.1
alpha = 0.1
epsilon = 0.1
epsilon_decay = 0.99 #decay factor

total_epochs = 0
episodes = 100

eps = {}
for episode in range(episodes):
    epochs = 0
    reward = 0
    epsilon = epsilon * epsilon_decay #decay step
    state = env.reset()
    
    while reward != 20:
        if np.random.rand() < epsilon:
            #exploration option
            action = env.action_space.sample()
        else:
            #exploitation option
            action = np.argmax(Q[state])

        #obtain reward and next state resulting from taking
        next_state, reward, done, info = env.step(action)

        #update Q-value for state-action pair
        Q[state, action] = Q[state, action] + alpha * (reward + gamma * np.max(Q[next_state]) - Q[state, action])

        #update state
        state = next_state
        epochs +=1
    eps[episode] = epochs

print(eps) 
    


{0: 1101, 1: 333, 2: 485, 3: 381, 4: 549, 5: 1366, 6: 849, 7: 1999, 8: 1277, 9: 319, 10: 1204, 11: 1081, 12: 1056, 13: 632, 14: 927, 15: 601, 16: 484, 17: 855, 18: 756, 19: 948, 20: 987, 21: 691, 22: 626, 23: 667, 24: 778, 25: 695, 26: 711, 27: 1302, 28: 642, 29: 1471, 30: 545, 31: 989, 32: 886, 33: 1156, 34: 1738, 35: 501, 36: 904, 37: 957, 38: 360, 39: 700, 40: 1717, 41: 645, 42: 933, 43: 1200, 44: 822, 45: 831, 46: 1367, 47: 1152, 48: 1311, 49: 905, 50: 800, 51: 1185, 52: 483, 53: 551, 54: 1084, 55: 1249, 56: 853, 57: 601, 58: 1182, 59: 868, 60: 620, 61: 484, 62: 357, 63: 733, 64: 668, 65: 1356, 66: 587, 67: 665, 68: 575, 69: 928, 70: 281, 71: 627, 72: 1057, 73: 524, 74: 478, 75: 894, 76: 219, 77: 960, 78: 743, 79: 1085, 80: 302, 81: 533, 82: 600, 83: 715, 84: 401, 85: 265, 86: 710, 87: 489, 88: 413, 89: 756, 90: 836, 91: 481, 92: 228, 93: 439, 94: 385, 95: 571, 96: 709, 97: 747, 98: 502, 99: 908}
