## PyTorch DDQN
#### Wendell Luckow - 8/15/18

In [None]:
#Import Dependencies
import copy
import random
import sys
from collections import deque

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from unityagents import UnityEnvironment

In [None]:
#Initialize Unity Environment executable
#Raw string: the backslash in the path is kept literally instead of being
#parsed as the (invalid) escape sequence "\B"; the resulting path is unchanged
env = UnityEnvironment(file_name=r"./Banana_Windows_x86_64\Banana_Windows_x86_64/Banana.exe")
#Use the first brain exposed by the environment
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

In [None]:
#Create the neural network
class DDQN(nn.Module):
    """
    Fully connected Q-network: input -> 512 -> 256 -> output.

    The two hidden layers use ReLU; the output layer is linear and yields one
    value per action. All weight matrices are Xavier-uniform initialized.
    """

    def __init__(self, input_size, output_size):
        super().__init__()

        #layer widths from input to output; consecutive pairs define each Linear
        widths = (input_size, 512, 256, output_size)
        for index, (fan_in, fan_out) in enumerate(zip(widths[:-1], widths[1:])):
            linear = nn.Linear(fan_in, fan_out)
            nn.init.xavier_uniform_(linear.weight)
            #registered as layer_0, layer_1, layer_2
            setattr(self, 'layer_{}'.format(index), linear)

    def forward(self, state, is_training = True):
        """
        Input: state tensor whose last dimension is input_size
        Output: tensor of action values (last dimension output_size)

        is_training is accepted but not used by the computation
        """
        activation = state
        for hidden_layer in (self.layer_0, self.layer_1):
            activation = F.relu(hidden_layer(activation))
        return self.layer_2(activation)

In [None]:
#Train and run on the first CUDA GPU when one is available, otherwise on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [None]:
class DDQN_run(object):
    """
    Double DQN agent for the Unity environment held in the module-level `env`.

    Owns an online and a target Q-network, an Adam optimizer for the online
    network, and a bounded experience replay memory. `train()` runs the
    training loop; `run_saved_model()` replays one episode from a checkpoint.
    """

    def __init__(self):
        env_info = env.reset(train_mode=True)[brain_name]
        self.state_size = env_info.vector_observations[0].shape[0]
        self.action_size = brain.vector_action_space_size

        #use the device selected by the notebook so CPU-only machines work too
        self.device = device
        self.Q_online = DDQN(self.state_size, self.action_size).to(self.device)
        self.Q_target = DDQN(self.state_size, self.action_size).to(self.device)
        self.optimizer = torch.optim.Adam(self.Q_online.parameters(), lr = .0001)
        #start both networks from identical weights
        self.copy_weights()

        self.update_every = 2000 #number of steps before copying weights
        self.max_replay_size = 75000 #maximum number of stored transitions
        #deque with maxlen drops the oldest transition in O(1) once full
        self.D = deque(maxlen = self.max_replay_size) #initialize experience replay
        self.batch_size = 1
        self.epsilon = 1.0 #initial random action probability
        self.gamma = 0.9
        self.step = 0

    def decay_epsilon(self, step):
        """
        Decay epsilon probability (linear)
        Input: step number to calculate epsilon decay (int)

        Epsilon falls from 1.0 by 1/100000 per step and is floored at 0.1
        """
        slope = (-1.0)/(100000)
        self.epsilon = max(0.1, slope * step + 1.0)

    def update_replay_memory(self, state, action, reward, next_state, done):
        """
        input is the <state, action, reward, state+1, done> values
        Adds as a list to the experience replay memory

        The size limit is enforced by the replay deque's maxlen: once full,
        appending discards the oldest transition
        """
        #negative reward when no bananas are picked up to create urgency
        if reward == 0:
            reward = -0.03

        #add to experience replay
        self.D.append([state, action, reward, next_state, done])

    def sample_replay_memory(self):
        """
        returns a batch for training of size self.batch_size
        (sampled uniformly without replacement)
        """
        return random.sample(self.D, k = self.batch_size)

    def copy_weights(self):
        """
        copy weight parameters from Q_online to Q_target
        """
        self.Q_target.load_state_dict(self.Q_online.state_dict())

    def pick_action(self, state):
        """
        Input: State feature array (numpy)
        Output: Action (int)

        Picked randomly with an epsilon probability, otherwise the action
        with the highest online Q-value
        """
        if random.random() < self.epsilon:
            return random.randrange(self.action_size)

        state = torch.from_numpy(state).float().to(self.device)
        #inference only, so skip building the autograd graph
        with torch.no_grad():
            return int(torch.argmax(self.Q_online(state)))

    def train_batch(self):
        """
        update online weights using a batch sampled from experience replay

        Double DQN target: the online network chooses the best next action,
        the target network evaluates it.
        """
        batch = self.sample_replay_memory()
        states, actions, rewards, next_states, dones = zip(*batch)

        #every per-sample quantity is shaped (batch, 1) so that it lines up
        #with the gathered Q-values instead of broadcasting to (batch, batch)
        states = torch.from_numpy(np.array(states)).float().to(self.device)
        actions = torch.from_numpy(np.array(actions)).long().to(self.device).unsqueeze(1)
        rewards = torch.from_numpy(np.array(rewards)).float().to(self.device).unsqueeze(1)
        next_states = torch.from_numpy(np.array(next_states)).float().to(self.device)
        dones = torch.from_numpy(np.array(dones).astype(int)).float().to(self.device).unsqueeze(1)

        #targets are constants for the loss, so no gradients are tracked here
        with torch.no_grad():
            best_next_actions = torch.argmax(self.Q_online(next_states), dim = 1, keepdim = True)
            Q_targets_next = self.Q_target(next_states).gather(1, best_next_actions)
            #(1 - dones) removes the bootstrap term on terminal transitions
            Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))

        Q_expected = self.Q_online(states).gather(1, actions)

        loss = F.mse_loss(Q_expected, Q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def load_weights(self):
        """
        load the saved weights and copy them to both Q nets
        """
        #map_location lets a checkpoint saved on a GPU load on a CPU-only machine
        self.Q_online.load_state_dict(torch.load('checkpoint.pth', map_location = self.device))
        self.copy_weights()

    def run_saved_model(self):
        """
        Watch a game played by a trained agent
        (one episode, with epsilon fixed at 0.1)
        """
        self.load_weights()
        self.epsilon = 0.1
        env_info = env.reset(train_mode=False)[brain_name]
        state = env_info.vector_observations[0]
        done = False
        while not done:
            action = self.pick_action(state)
            env_info = env.step(action)[brain_name]
            state = env_info.vector_observations[0]
            done = env_info.local_done[0]

    def _write_progress(self, num_eps, total_eps, score, best_score, rolling_avg):
        """
        overwrite the current console line with the training stats
        """
        printout = '{}/{} | {:.2f}% | Step {} | Score: {} | Best score: {} | Epsilon: {:.3f} | Rolling 100 avg: {:.3f}'.format(
            num_eps,
            total_eps,
            100 * num_eps / total_eps,
            self.step,
            score,
            best_score,
            self.epsilon,
            rolling_avg)
        sys.stdout.write('\r{}          '.format(printout))
        sys.stdout.flush()

    def train(self):
        """
        train the agent according to DDQN

        Runs up to 1800 episodes. Stops early, saving the online weights to
        'checkpoint.pth', once the average score of the last 100 episodes
        exceeds 13. No checkpoint is written if that never happens.
        """
        best_score = float('-inf') #initialize best score
        total_eps = 1800 #the number of games to simulate
        start_train_step = 10000 #transitions to collect before learning starts
        one_hundred_episode_average = []
        rolling_episode_averages = []

        for num_eps in range(1, total_eps + 1):
            score = 0 #initialize score
            env_info = env.reset(train_mode=True)[brain_name] #set to False to watch slower
            state = env_info.vector_observations[0]
            done = False

            #the rolling average only changes between episodes
            if rolling_episode_averages:
                printout_avg = rolling_episode_averages[-1]
            else:
                printout_avg = 0

            while not done:
                self.step += 1 #keep track of total steps executed by the agent
                action = self.pick_action(state)
                env_info = env.step(action)[brain_name]
                next_state = env_info.vector_observations[0]
                reward = env_info.rewards[0]
                done = env_info.local_done[0]
                score += reward

                #update replay memory
                self.update_replay_memory(state, action, reward, next_state, done)

                #sample batch from replay memory and train network
                if len(self.D) > self.batch_size * 10 and len(self.D) > start_train_step:
                    self.decay_epsilon(step = self.step - start_train_step)
                    self.train_batch()

                #copy weights from online network to target network
                if self.step % self.update_every == 0:
                    self.copy_weights()

                #transition to the next state
                state = next_state

                #print out current stats
                self._write_progress(num_eps, total_eps, score, best_score, printout_avg)

            #keep track of ongoing best score
            if score > best_score:
                best_score = score

            #keep track of ongoing 100-game average score
            one_hundred_episode_average.append(score)
            one_hundred_episode_average = one_hundred_episode_average[-100:]
            if len(one_hundred_episode_average) >= 100:
                rolling_avg = np.mean(one_hundred_episode_average)
                rolling_episode_averages.append(rolling_avg)
                if rolling_avg > 13:
                    #if the 100-game average is greater than 13, print stats and terminate program
                    #(report the average that was just computed, not the previous one)
                    self._write_progress(num_eps, total_eps, score, best_score, rolling_avg)
                    torch.save(self.Q_online.state_dict(), 'checkpoint.pth')
                    print("")
                    print("Weights saved as 'checkpoint.pth'")
                    print("Solved in {} episodes!".format(num_eps))
                    break

        print("")
        print("__________________")
        print("Done.")
        

In [None]:
#Run this cell to train from scratch
#Builds a fresh agent and runs the training loop; the loop saves 'checkpoint.pth'
#only if the 100-episode average score exceeds 13
DDQN_run().train()

In [None]:
#Run this cell to run and watch a saved model
#Loads the weights from 'checkpoint.pth' and plays one episode with train_mode=False
DDQN_run().run_saved_model()

In [None]:
#Close the Unity environment once training or watching is finished
env.close()