In [2]:
import math
import os
import random
from collections import namedtuple
from itertools import count

# comet_ml is imported ahead of torch: Comet asks to be imported before the ML
# frameworks it instruments so that its automatic logging can hook into them.
from comet_ml import Experiment
import gym
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.modules.padding as padding
import torch.optim as optim
from torch.autograd import Variable
%matplotlib inline

ImportError: No module named torch.autograd

In [None]:
# The Comet API key is a credential, so it is read from the COMET_API_KEY
# environment variable instead of being committed with the notebook. When the
# variable is unset, api_key is None and comet_ml falls back to its own
# configuration lookup.
# NOTE(review): the key that used to be hard-coded here has been published with
# the notebook and should be treated as leaked and rotated.
experiment = Experiment(
    api_key=os.environ.get("COMET_API_KEY"),
    project_name="pytorch",
)

In [9]:
# Build the Gym environment registered under the id "MsPacman-v0". `env` is the
# handle that the training loop resets at the start of each episode and steps
# with the selected action.
env = gym.make("MsPacman-v0")

In [10]:
# Channel mean of the RGB triple (210, 164, 74). preprocess_observation compares
# its greyscale (channel-mean) image against this value to pick the pixels it
# sets to 0 - presumably the colour of the Ms. Pac-Man sprite, going by the name.
mspacman_color = np.array([210, 164, 74]).mean()

In [11]:
def preprocess_observation(obs, pacman_color=None):
    """Crop, downsample, greyscale and normalise one raw game frame.

    Args:
        obs: Frame as a (height, width, channels) array. The final reshape
            requires the cropped, downsampled image to be exactly 88x80, which
            a 210x160 frame satisfies.
        pacman_color: Greyscale value whose pixels are set to 0 before
            normalising. Defaults to the module-level ``mspacman_color``.

    Returns:
        Float array of shape (88, 80, 1). For 8-bit input the values lie in
        [-1, 1).
    """
    if pacman_color is None:
        pacman_color = mspacman_color
    # Rows 1, 3, ..., 175 and every second column: crops the bottom of the
    # frame and halves the resolution in both directions.
    img = obs[1:176:2, ::2]
    # Average the colour channels to get a greyscale image.
    img = img.mean(axis=2)
    # Improve contrast: pixels matching the sprite colour become 0.
    img[img == pacman_color] = 0
    # Map [0, 255] onto [-1, 1). The previous "(img - 128) / 128 - 1" shifted
    # everything to [-2, 0), contradicting the intended -1..1 range.
    img = (img - 128) / 128
    return img.reshape(88, 80, 1)

## Replay Memory

Store the transitions that the agent observes so they can be reused later. Sampling from this memory at random decorrelates the transitions that make up a batch, which improves training stability.

In [16]:
# One step of experience: the state the agent was in, the action it took, the
# state it ended up in and the reward it received.
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))


class ReplayMemory(object):
    """Fixed-capacity buffer of Transition tuples.

    Once ``capacity`` transitions are stored, each new one overwrites the
    oldest entry (ring-buffer behaviour driven by ``position``).
    """

    def __init__(self, capacity):
        """Create an empty memory holding at most ``capacity`` transitions."""
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        """Store a transition built from ``args``.

        ``args`` are the Transition fields in order: state, action,
        next_state, reward.
        """
        # Build the transition first so that bad arguments raise before the
        # buffer is touched, rather than leaving a None placeholder behind.
        transition = Transition(*args)
        if len(self.memory) < self.capacity:
            self.memory.append(transition)
        else:
            self.memory[self.position] = transition
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored transitions chosen at random.

        Raises:
            ValueError: if ``batch_size`` exceeds the number of stored
                transitions (raised by ``random.sample``).
        """
        # np.random.sample only accepts an output size and cannot draw from a
        # list, so the previous call raised TypeError; random.sample is the
        # sampling-without-replacement primitive needed here.
        return random.sample(self.memory, batch_size)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

In [17]:
class DQN(nn.Module):
    """
    Our network takes in an image and tries to predict the quality
    of taking each of our actions given that state (the image).

    Args:
        n_actions: Number of actions, i.e. the size of the output layer.
        input_height: Height in pixels of the single-channel input image.
        input_width: Width in pixels of the single-channel input image.

    The defaults (88, 80) match the frames produced by
    ``preprocess_observation``. The linear head is sized from these values;
    the previous hard-coded 448 input features only fit a 40x80 image and made
    ``forward`` fail with a shape mismatch on 88x80 frames.
    """

    def __init__(self, n_actions, input_height=88, input_width=80):
        super(DQN, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=5, stride=2)
        self.bn1 = nn.BatchNorm2d(16)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=5, stride=2)
        self.bn2 = nn.BatchNorm2d(32)
        self.conv3 = nn.Conv2d(32, 32, kernel_size=5, stride=2)
        self.bn3 = nn.BatchNorm2d(32)

        # Spatial size after the three unpadded convolutions above.
        conv_h = input_height
        conv_w = input_width
        for _ in range(3):
            conv_h = self._conv_out_size(conv_h)
            conv_w = self._conv_out_size(conv_w)
        # 32 is the channel count produced by conv3.
        self.head = nn.Linear(conv_h * conv_w * 32, n_actions)

    @staticmethod
    def _conv_out_size(size, kernel_size=5, stride=2):
        """Output length of an unpadded convolution along one dimension."""
        return (size - kernel_size) // stride + 1

    def forward(self, x):
        """Map a (batch, 1, height, width) image batch to (batch, n_actions) values."""
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))
        # Flatten everything but the batch dimension for the linear head.
        return self.head(x.view(x.size(0), -1))

In [20]:
# --- Hyperparameters -------------------------------------------------------
BATCH_SIZE = 32      # transitions sampled from replay memory per optimisation step
GAMMA = 0.99         # discount applied to the next-state value
EPS_START = 0.9      # initial exploration threshold for epsilon-greedy selection
EPS_END = 0.05       # value the exploration threshold decays towards
EPS_DECAY = 200      # decay constant, in steps, of the exponential epsilon schedule
TARGET_UPDATE = 10   # episodes between copies of policy_net into target_net
num_episodes = 50    # number of training episodes
N_ACTIONS = 9        # size of the network's output layer

# These values are fixed configuration, not measurements that evolve during
# training, so they are recorded as experiment parameters rather than metrics.
experiment.log_parameters({
    "batch size": BATCH_SIZE,
    "gamma": GAMMA,
    "eps start": EPS_START,
    "eps end": EPS_END,
    "eps decay": EPS_DECAY,
    "target update": TARGET_UPDATE,
    "num episodes": num_episodes,
})


# Run on the GPU when one is available and fall back to the CPU otherwise,
# instead of failing outright on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The policy network is used to play the game - aka the actor.
policy_net = DQN(N_ACTIONS).to(device)
# The target net is used to predict Q values for the next action.
# We need two networks, otherwise the same network would produce both the
# predicted and the target values of the loss function.
target_net = DQN(N_ACTIONS).to(device)
target_net.load_state_dict(policy_net.state_dict())
# Put the target network in evaluation mode; it is never trained directly.
target_net.eval()

optimizer = optim.RMSprop(policy_net.parameters())
memory = ReplayMemory(10000)

# Number of actions selected so far; drives the epsilon decay in select_action.
steps_done = 0

def select_action(state):
    """Pick an action for ``state`` with an epsilon-greedy policy.

    The exploration threshold decays exponentially from EPS_START towards
    EPS_END as the global ``steps_done`` counter grows; the counter is
    incremented on every call.

    Args:
        state: Input accepted by ``policy_net``.

    Returns:
        Tensor of shape (1, 1) holding the chosen action index.
    """
    global steps_done
    # Threshold which decays from start to end.
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        math.exp(-1. * steps_done / EPS_DECAY)
    steps_done += 1

    if random.random() > eps_threshold:
        # Exploit: no gradients because we are not learning here, just
        # reading off the action with the highest predicted value.
        with torch.no_grad():
            return policy_net(state).max(1)[1].view(1, 1)

    # Explore: uniformly random action, created on whatever device the policy
    # network lives on (a hard-coded .cuda() fails on CPU-only machines).
    device = next(policy_net.parameters()).device
    return torch.tensor([[random.randrange(N_ACTIONS)]],
                        dtype=torch.long, device=device)

In [22]:
def optimize_model():
    """Run one optimisation step of ``policy_net`` on a sampled batch.

    Returns:
        The loss of this step as a Python float, or None when the replay
        memory holds fewer than BATCH_SIZE transitions and no step was taken.
    """
    if len(memory) < BATCH_SIZE:
        return None

    device = next(policy_net.parameters()).device

    transitions = memory.sample(BATCH_SIZE)
    # Transpose the batch: a list of Transitions becomes one Transition whose
    # fields are tuples of BATCH_SIZE entries.
    batch = Transition(*zip(*transitions))

    # Boolean mask of the transitions which did not end the game. It has to be
    # a bool tensor: an integer tensor used as an index selects positions 0/1
    # instead of acting as a mask.
    non_final_mask = torch.tensor([s is not None for s in batch.next_state],
                                  dtype=torch.bool, device=device)

    state_batch = torch.cat(batch.state).to(device)
    action_batch = torch.cat(batch.action).to(device)
    reward_batch = torch.cat(batch.reward).to(device=device,
                                              dtype=torch.float32)

    # Q(s, a) predicted by the policy net for the actions that were actually
    # taken: gather picks, per row, the output column given by the action.
    # These are the predictions being trained.
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # Value of the best action in each next state according to the target
    # net; terminal transitions keep a value of 0.
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    if non_final_mask.any():
        # Guarded because torch.cat raises on an empty list, which happens
        # when every sampled transition is terminal.
        non_final_next_states = torch.cat(
            [s for s in batch.next_state if s is not None]).to(device)
        # No gradients are needed for the target values.
        with torch.no_grad():
            next_state_values[non_final_mask] = \
                target_net(non_final_next_states).max(1)[0]

    # Expected Q(s, a): the reward plus the discounted value of taking the
    # best action at the next step.
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    loss = F.smooth_l1_loss(state_action_values,
                            expected_state_action_values.unsqueeze(1))

    # Optimize the policy net to become better at predicting Q values.
    optimizer.zero_grad()
    loss.backward()
    # Clip gradients to [-1, 1] to prevent exploding gradients; parameters
    # that received no gradient are skipped.
    for param in policy_net.parameters():
        if param.grad is not None:
            param.grad.data.clamp_(-1, 1)
    optimizer.step()

    # The loss is a 0-dim tensor; indexing it with [0] is an error on current
    # PyTorch, .item() is the supported way to extract the scalar.
    return loss.item()

In [None]:
# Device the policy network lives on; states and rewards are created there.
_train_device = next(policy_net.parameters()).device


def _frame_to_state(frame):
    """Convert a preprocessed (88, 80, 1) frame to a (1, 1, 88, 80) float32 tensor.

    The network expects batch-first, channel-first float tensors, whereas
    preprocess_observation returns a channel-last numpy array.
    """
    tensor = torch.from_numpy(frame).float().permute(2, 0, 1).unsqueeze(0)
    return tensor.to(_train_device)


for i_episode in range(num_episodes):
    state = _frame_to_state(preprocess_observation(env.reset()))
    losses = []
    for t in count():
        action = select_action(state)
        next_state, reward, done, _ = env.step(action.item())
        if not done:
            next_state = _frame_to_state(preprocess_observation(next_state))
        else:
            # A None next_state marks the transition as terminal.
            next_state = None
        reward = torch.tensor([reward], dtype=torch.float32,
                              device=_train_device)
        memory.push(state, action, next_state, reward)
        state = next_state

        # optimize_model returns None until the replay memory holds a full
        # batch; those steps must not enter the loss average.
        loss = optimize_model()
        if loss is not None:
            losses.append(loss)

        if done:
            experiment.log_metric("duration", t + 1, step=i_episode)
            if losses:
                experiment.log_metric("avg loss", float(np.mean(losses)),
                                      step=i_episode)
            break
    # Periodically sync the target network with the policy network.
    if i_episode % TARGET_UPDATE == 0:
        target_net.load_state_dict(policy_net.state_dict())

In [None]:
# torch.save does not create missing directories; make sure ./models exists so
# the trained weights are not lost to a FileNotFoundError after training.
os.makedirs("./models", exist_ok=True)
# NOTE(review): "dqn_polich" looks like a typo for "dqn_policy"; the file name
# is left unchanged in case other code loads this exact path.
torch.save(policy_net.state_dict(), "./models/dqn_polich.state")