In [1]:
import gym
import numpy
import scipy
import matplotlib.pyplot as plt
%matplotlib inline
env = gym.make('Taxi-v2')
new_state = env.reset()
from copy import copy

[2017-12-02 18:53:47,800] Making new env: Taxi-v2


In [2]:
observation = env.reset()
for i in range(100):
    #env.render()
    #print (observation)
    action = env.action_space.sample()
    observation, reward, done, info = env.step(action)
    if done:
        print ("Done")
        break

In [3]:
q_values = numpy.random.rand(env.observation_space.n, env.action_space.n)
s_values = numpy.random.rand(env.observation_space.n)
gamma = 0.9
alpha = 0.5

In [4]:
def q_learn(iterations, q_values):
    total_reward = 0
    for i in range(iterations):
        new_state = env.reset()
        state = new_state 
        while True:
            action = numpy.argmax(q_values[state])
            new_state, reward, done, info = env.step(action)
            q_values[state][action] = reward + 0.5*numpy.max(q_values[new_state]) + 0.5*q_values[state][action]
            state = new_state
            total_reward += reward
            if done:
                break
    return total_reward

In [5]:
def dls(env_copy_1, new_state, curr_depth, max_depth):
    value_list = list()
    env_copy_2 = copy(env_copy_1)
    if curr_depth == max_depth:
        return s_values[new_state]
    else:
        for i in range(env.action_space.n):
            new_state_1, reward, done, info = env_copy_2.step(i)
                
            if done:
                value = s_values[new_state_1]
            else:
                value = reward + s_values[new_state] + gamma * dls(env_copy_2, new_state_1, curr_depth+1, max_depth)
            value_list.append(value)
            env_copy_2 = copy(env_copy_1)
        value = numpy.mean(value_list)
    return value

    

def dls_action(env_copy, state, max_depth):
    # Do depth Limited Search till max_depth on the mb_internal_environment
    value_list = numpy.zeros(env.action_space.n)
    for i in range(env.action_space.n):
        env_copy_1 = copy(env_copy)
        new_state = state
        value = 0
        for _ in range(max_depth):
            new_state, reward, done, info = env_copy.step(numpy.argmax(q_values[new_state]))
            value += reward
        value_list[i] = value
    action = numpy.random.choice(numpy.flatnonzero(value_list == value_list.max()))
    return action

In [15]:
def dual(env, observation, max_depth, num_iters):
    j = 0
    sum_reward = 0
    reward_list = list()
    for i in range(num_iters):
        new_state = env.reset()  
        env_copy = copy(env)        
        state = new_state
        reward_list.append(sum_reward)
        sum_reward = 0
        while True:
            j += 1
            if True:
                action = dls_action(env_copy, state, max_depth)
                new_state, reward, done, info = env.step(action)
                q_values[state][action] = reward + 0.5*numpy.max(q_values[new_state]) + 0.5*q_values[state][action]
                s_values[state] = (1/env.action_space.n) * q_values[state][action] + (env.action_space.n - 1)/env.action_space.n * s_values[state]
                sum_reward += reward
                state = new_state
            else:
                action = numpy.argmax(q_values[state])
                new_state, reward, done, info = env.step(action)
                q_values[state][action] = reward + 0.5*numpy.max(q_values[new_state]) + 0.5*q_values[state][action]
                s_values[state] = (1/env.action_space.n) * q_values[state][action] + (env.action_space.n - 1)/env.action_space.n * s_values[state]
                state = new_state
                sum_reward += reward
            if done:
                break
    return reward_list
                

In [16]:
action = env.action_space.sample()
observation = env.reset()


In [22]:
values = dual(env, observation, max_depth=50, num_iters=50)

In [25]:
print (numpy.mean(values))

-343.56


In [None]:
print (dls_action(env, env.reset(), 4))

In [None]:
env.reset()
print (q_learn(1, q_values))

In [26]:
print (s_values)

[  5.26739424e-01  -5.63214027e+00  -3.08683987e+00  -2.05078067e+00
   2.63808291e-01   7.86796922e-02  -7.78317505e+00  -1.24506897e+01
  -5.30854853e+00  -1.52319740e+01   6.02901361e-01  -1.11126357e+01
   3.27684499e-01  -1.18185935e+01  -4.81844059e+00   6.21234152e-01
   2.50100615e+02  -2.34493505e+00  -2.53467296e+00  -3.84951140e+00
   1.31355958e-01  -6.54885352e+00  -7.20532608e+00  -1.65111128e+00
   7.62186350e-01   9.50180469e-01  -1.09567607e+01  -1.33823697e+01
  -1.34766667e+01  -1.11626643e+01   4.56874928e-01  -9.52630498e+00
   1.24014757e+00  -7.99742500e+00  -2.06193406e+00   1.26484740e-01
   2.22874809e+02  -4.56161536e+00   9.23500230e+00  -4.22901383e+00
   9.26261032e-01  -5.75115922e+00  -7.38284563e+00  -6.33683898e+00
   2.82766580e+01   7.35115195e-01  -1.11061062e+01  -1.15235228e+01
  -7.32857780e+00  -1.18067531e+01   5.27492917e-01  -7.31919979e+00
   7.92334757e+00  -8.25718691e+00   3.97933069e-01   8.15471865e-01
   1.27940229e+02   2.53694241e+01