In [1]:
import gym
import numpy as np

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [3]:
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple
from itertools import count

In [4]:
# Build the Taxi-v3 environment and keep the object found on the `.env`
# attribute of what gym.make() returns, not the returned object itself.
# NOTE(review): presumably `.env` strips gym's outer wrapper (and with it any
# built-in episode step limit) so later cells can use env.s / env.decode /
# env.encode directly -- confirm against the installed gym version.
env = gym.make("Taxi-v3").env

In [5]:
# One environment step as stored in the replay buffer.
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))


class ReplayMemory(object):
    """Fixed-capacity ring buffer of ``Transition`` tuples.

    Once ``capacity`` transitions have been stored, each new one overwrites
    the oldest entry.

    Attributes:
        capacity: maximum number of transitions kept.
        memory: list holding the stored transitions (at most ``capacity``).
        position: index of the slot the next ``push`` writes to.
    """

    def __init__(self, capacity):
        """Create an empty buffer.

        Args:
            capacity: maximum number of transitions to keep; must be > 0.

        Raises:
            ValueError: if ``capacity`` is not positive.
        """
        if capacity <= 0:
            raise ValueError(
                "capacity must be a positive integer, got %r" % (capacity,))
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        """Saves a transition.

        Args:
            *args: the four ``Transition`` fields, in order
                ``(state, action, next_state, reward)``.

        Raises:
            TypeError: if the number of fields is wrong. The buffer is left
                untouched in that case.
        """
        # Build the record before touching the buffer so that a malformed call
        # cannot leave a None placeholder behind for sample() to return.
        transition = Transition(*args)
        if len(self.memory) < self.capacity:
            self.memory.append(transition)
        else:
            self.memory[self.position] = transition
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored transitions chosen at random.

        Raises:
            ValueError: (from ``random.sample``) if ``batch_size`` exceeds the
                number of stored transitions.
        """
        return random.sample(self.memory, batch_size)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

In [6]:
class DQN(nn.Module):
    """Fully connected Q-network: input_dim -> 48 -> 32 -> 32 -> output_dim.

    ReLU follows each of the three hidden layers; the output layer is linear,
    so the result holds one unbounded value per output unit.
    """

    def __init__(self, input_dim, output_dim):
        """Build the four linear layers.

        Args:
            input_dim: size of the last dimension of the input tensor.
            output_dim: number of values produced per input row.
        """
        super().__init__()
        self._layer1 = nn.Linear(input_dim, 48)
        self._layer2 = nn.Linear(48, 32)
        self._layer3 = nn.Linear(32, 32)
        self._output_layer = nn.Linear(32, output_dim)

    def forward(self, x):
        """Run the network on a single state or on a batch of states.

        Args:
            x: float tensor whose last dimension is ``input_dim``.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of ``output_dim``.
        """
        for hidden_layer in (self._layer1, self._layer2, self._layer3):
            x = F.relu(hidden_layer(x))
        return self._output_layer(x)

In [7]:
# Training hyperparameters read by select_action(), optimize_model() and the
# training loop.
BATCH_SIZE = 1  # transitions sampled from replay memory per optimize_model() call
GAMMA = 0.999  # discount applied to the target network's next-state value
EPS_START = 0.9  # exploration probability when steps_done == 0
EPS_END = 0.05  # value the exploration probability decays toward
EPS_DECAY = 200  # exponential decay constant, measured in select_action() calls
TARGET_UPDATE = 10  # copy policy_net into target_net every this many episodes

In [8]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU. Networks
# and tensors created in later cells are placed on this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [9]:
# Size of the environment's discrete action space; used as the network's
# output width and as the range for random exploratory actions.
n_actions = env.action_space.n

In [10]:
# Width of the state vector fed to the network. A state is represented by the
# components returned by env.decode(), listed by the original author as
# (taxi_row, taxi_col, passenger_location, destination).
# NOTE(review): component order and meaning come from the environment --
# confirm against the Taxi-v3 documentation.
state_dimension = 4

In [11]:
# Number of select_action() calls so far; drives the epsilon decay schedule.
steps_done = 0
# Two networks with the same architecture: policy_net is the one being
# trained, target_net provides the next-state values used as training targets.
policy_net = DQN(state_dimension, n_actions).to(device)
target_net = DQN(state_dimension, n_actions).to(device)
# Start the target network as an exact copy of the policy network.
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

# Only policy_net's parameters are optimized (RMSprop with library defaults).
optimizer = optim.RMSprop(policy_net.parameters())
# Replay buffer that keeps only the 10 most recent transitions.
# NOTE(review): 10 is a magic number and very small for experience replay --
# consider a named constant alongside the other hyperparameters.
memory = ReplayMemory(10)

In [12]:
def select_action(state):
    """Choose an action for ``state`` with an epsilon-greedy policy.

    The exploration probability starts at EPS_START and decays exponentially
    toward EPS_END as the global ``steps_done`` counter grows; the counter is
    incremented on every call.

    Args:
        state: input tensor accepted by ``policy_net`` (a single row).

    Returns:
        A (1, 1) tensor holding the chosen action index: either a uniformly
        random action or the index of the largest policy_net output.
    """
    global steps_done
    roll = random.random()
    decay = math.exp(-1. * steps_done / EPS_DECAY)
    epsilon = EPS_END + (EPS_START - EPS_END) * decay
    steps_done += 1

    # Explore: uniformly random action.
    if roll <= epsilon:
        return torch.tensor([[random.randrange(n_actions)]], device=device, dtype=torch.long)

    # Exploit: max(1) yields (values, indices) per row; the index of the
    # largest output is the greedy action.
    with torch.no_grad():
        return policy_net(state).max(1)[1].view(1, 1)

In [13]:
def optimize_model():
    """Run one gradient step on ``policy_net`` from a sampled replay batch.

    Does nothing until the replay memory holds at least BATCH_SIZE
    transitions. Otherwise samples BATCH_SIZE transitions, forms the target
    ``reward + GAMMA * max_a target_net(next_state)[a]`` (with the next-state
    term equal to 0 for transitions whose ``next_state`` is None), and
    minimizes the Huber loss between that target and
    ``policy_net(state)[action]``. Gradients are clipped element-wise to
    [-1, 1] before the optimizer step.

    Returns:
        None.
    """
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
    # detailed explanation). This converts batch-array of Transitions
    # to Transition of batch-arrays.
    batch = Transition(*zip(*transitions))

    # A transition is final when its next_state is None (the state after
    # which the episode ended). Build a mask of the non-final ones.
    non_final_mask = torch.tensor([s is not None for s in batch.next_state],
                                  device=device, dtype=torch.bool)
    non_final_next_states = [s for s in batch.next_state if s is not None]
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Q(s_t, a): evaluate every action with policy_net, then pick the column
    # of the action that was actually taken in each row.
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # V(s_{t+1}) from the "older" target_net; stays 0 for final states.
    # Sized from the sample itself rather than the global constant.
    next_state_values = torch.zeros(len(transitions), device=device)
    # torch.cat() raises on an empty list, so skip the target network
    # entirely when every sampled transition is final.
    if non_final_next_states:
        with torch.no_grad():
            next_state_values[non_final_mask] = target_net(
                torch.cat(non_final_next_states)).max(1)[0]
    # Compute the expected Q values
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Compute Huber loss
    loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1))

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    # Element-wise clip to [-1, 1]; the utility skips parameters without grads.
    nn.utils.clip_grad_value_(policy_net.parameters(), 1)
    optimizer.step()

In [14]:
# Sanity check of the state representation: decode the environment's current
# integer state (env.s) into its components, store them as a one-row float
# tensor, then re-encode that row back to a single integer. The last
# expression is displayed as the cell output.
env.reset()
kk = torch.tensor(np.asarray(list(env.decode(env.s))).reshape(1, -1), dtype=torch.float)
int(env.encode(*kk.tolist()[0]))

166

In [15]:
def state_as_tensor(state):
    """Convert an encoded environment state into a one-row float tensor.

    Args:
        state: encoded state accepted by ``env.decode``.

    Returns:
        Float tensor on ``device`` with a single row holding the decoded
        state components.
    """
    components = list(env.decode(state))
    row = np.asarray(components).reshape(1, -1)
    return torch.tensor(row, device=device, dtype=torch.float)

def decode_tensor_state(state):
    """Turn a state tensor back into the environment's integer state id.

    Despite the name, this is the inverse of ``state_as_tensor``: it passes
    the first row of ``state`` to ``env.encode`` and casts the result to int.

    Args:
        state: tensor whose first row holds the state components.

    Returns:
        The encoded state as a Python ``int``.
    """
    first_row = state.tolist()[0]
    encoded = env.encode(*first_row)
    return int(encoded)

In [16]:
# Train for num_episodes episodes, recording how many steps each one took.
episode_durations = []
num_episodes = 50
for i_episode in range(num_episodes):
    # Reset the environment and convert its integer state to a tensor.
    env.reset()
    state = env.s
    state = state_as_tensor(state)
    # count() is unbounded: an episode ends only when env.step reports done.
    # NOTE(review): there is no step cap here; the captured output shows
    # single episodes running past 100,000 steps.
    for t in count():
        # Progress message every 10,000 steps.
        if t % 10000 == 0:
            print("Count: %d for episode: %d" % (t, i_episode))
        # Select and perform an action
        action = select_action(state)
        next_state, reward, done, _ = env.step(action.item())
        next_state = state_as_tensor(next_state)
        reward = torch.tensor([reward], device=device)

        # Store the transition in memory.
        # NOTE(review): next_state is stored as a tensor even when done is
        # True, so optimize_model()'s `next_state is not None` mask never marks
        # a transition as final and terminal steps still bootstrap from
        # target_net. Storing None for terminal steps also requires
        # optimize_model() to cope with a batch that has no non-final states.
        memory.push(state, action, next_state, reward)

        # Move to the next state
        state = next_state

        # Perform one optimization step (updates policy_net via the optimizer)
        optimize_model()
        if done:
            episode_durations.append(t + 1)
            break
    # Every TARGET_UPDATE episodes, copy policy_net's weights and biases
    # into target_net.
    if i_episode % TARGET_UPDATE == 0:
        target_net.load_state_dict(policy_net.state_dict())

Count: 0 for episode: 0
Count: 0 for episode: 1
Count: 10000 for episode: 1
Count: 20000 for episode: 1
Count: 30000 for episode: 1
Count: 40000 for episode: 1
Count: 0 for episode: 2
Count: 0 for episode: 3
Count: 10000 for episode: 3
Count: 20000 for episode: 3
Count: 30000 for episode: 3
Count: 40000 for episode: 3
Count: 50000 for episode: 3
Count: 60000 for episode: 3
Count: 70000 for episode: 3
Count: 80000 for episode: 3
Count: 90000 for episode: 3
Count: 100000 for episode: 3
Count: 110000 for episode: 3
Count: 0 for episode: 4
Count: 0 for episode: 5
Count: 0 for episode: 6
Count: 0 for episode: 7
Count: 10000 for episode: 7
Count: 20000 for episode: 7
Count: 30000 for episode: 7
Count: 0 for episode: 8
Count: 10000 for episode: 8
Count: 20000 for episode: 8
Count: 30000 for episode: 8
Count: 40000 for episode: 8
Count: 0 for episode: 9
Count: 10000 for episode: 9
Count: 20000 for episode: 9
Count: 30000 for episode: 9
Count: 40000 for episode: 9
Count: 50000 for episode: 9
Co

In [17]:
# Zero-initialized table with env.nS rows and env.nA columns, to be filled
# with the trained network's output for every encoded state.
q_table = np.zeros((env.nS, env.nA))

In [18]:
# Evaluate the trained policy network on every encoded state and store the
# resulting per-action values as one row of q_table.
with torch.no_grad():
    for idx in range(env.nS):
        s = state_as_tensor(idx)
        # policy_net returns one row per input; drop that batch dimension and
        # move the result to the CPU first, because Tensor.numpy() raises for
        # tensors that live on a CUDA device.
        q_values = policy_net(s).squeeze(0).cpu().numpy()
        q_table[idx] = q_values

In [22]:
import matplotlib.pyplot as plt

In [None]:
# Plot how many steps each training episode took. The original cell referenced
# an undefined name (`ben`) and raised NameError on a fresh run;
# episode_durations is the per-episode series collected by the training loop.
fig, ax = plt.subplots()
ax.plot(range(1, len(episode_durations) + 1), episode_durations)
ax.set(title="DQN on Taxi-v3: episode duration",
       xlabel="Episode",
       ylabel="Steps until done")
plt.show()