In [None]:
import tensorflow as tf
import gym
import numpy as np
import matplotlib.pyplot as plt
import random as rand

In [None]:
class DQN:
    """Single-hidden-layer Q-network built with the TensorFlow 1.x graph API.

    Environment and layer settings are stored on the class by `create`, so
    every instance (the notebook builds a trainable and a non-trainable one)
    shares them along with the class-level session.
    """
    # Class Attributes
    # One session shared by all instances; created when the class body runs.
    sess = tf.Session()

    @classmethod
    def create(cls, env, neurons, activ):
        """Create the gym environment and record the shared network settings.

        Args:
            env: environment id passed to `gym.make`.
            neurons: number of units in the hidden layer.
            activ: activation function for the hidden layer.
        """
        # Create Environment
        cls.env = gym.make(env)
        cls.n_inputs = cls.env.observation_space.shape[0]
        cls.n_actions = cls.env.action_space.n
        cls.n_layers = [neurons]
        cls.activ = activ

    def __init__(self, name, trainable):
        """Create the placeholders for one network.

        Args:
            name: prefix used for this network's layer and scope names.
            trainable: True creates the action placeholder and its one-hot
                encoding; False creates the reward and done placeholders.
        """
        # Instance Attributes
        self.name = name
        self.trainable = trainable
        self.X = tf.placeholder(tf.float32, (None, DQN.n_inputs), name='state')
        if trainable:
            self.A = tf.placeholder(tf.int32, name='action')
            # One Hot Encoding Op
            self.onehot = tf.one_hot(self.A, DQN.n_actions)
        else:
            self.R = tf.placeholder(tf.float32, name='reward')
            self.D = tf.placeholder(tf.float32, name='done')

    def buildNet(self):
        """Add the dense layers and the Q-value ops to the graph.

        Trainable networks expose `action` (index of the largest logit per
        row) and `value` (logit of the fed action, one entry per row).
        Non-trainable networks expose `max_q`, the largest logit per row,
        also available under the legacy attribute name `argmax`.
        """
        # Network
        self.layer1 = tf.layers.dense(
            inputs=self.X,
            units=DQN.n_layers[0],
            activation=DQN.activ,
            trainable=self.trainable,
            name=self.name + '_layer1')

        self.logits = tf.layers.dense(
            inputs=self.layer1,
            units=DQN.n_actions,
            trainable=self.trainable,
            name=self.name + '_logits')

        with tf.variable_scope(self.name + '_q_values'):
            if self.trainable:
                self.action = tf.argmax(self.logits, axis=1)
                # Reduce over the action axis only. Without `axis` the sum
                # collapsed the whole minibatch into one scalar, which is
                # only correct when the batch holds a single transition.
                self.value = tf.reduce_sum(self.onehot * self.logits, axis=1)
            else:
                self.max_q = tf.reduce_max(self.logits, axis=1)
                # Legacy name kept for existing callers; this is the maximum
                # value, not an index.
                self.argmax = self.max_q

In [None]:
class OPS:
    """Loss, optimizer, target-sync and summary ops for the two networks.

    Reads the module-level `pol` and `tar` network objects and the shared
    `DQN.sess`; all of them must exist, with `buildNet` already called,
    before an instance is created.
    """

    def __init__(self, gamma, alpha, logdir="/tmp/DQN"):
        """Build every op and initialise all global variables.

        Args:
            gamma: factor multiplying the target network's maximum value.
            alpha: learning rate passed to the Adam optimizer.
            logdir: directory for the summary FileWriter; the default is the
                path that was previously hard-coded.

        Raises:
            ValueError: if the two networks do not hold the same number of
                variables, in which case a pairwise copy would be wrong.
        """
        # Loss Ops
        # tar.D is fed as 0 for terminal transitions (see the training feed),
        # which removes the bootstrap term for them.
        self.get_yj = tar.R + gamma * tar.argmax * tar.D
        self.loss = tf.losses.huber_loss(self.get_yj, pol.value)

        # Training Ops
        self.optimizer = tf.train.AdamOptimizer(alpha)
        self.train_op = self.optimizer.minimize(self.loss)

        # Weight Switcher Op
        # minimize() has already added Adam slot variables whose names start
        # with 'pol' to GLOBAL_VARIABLES. Collect the policy weights from
        # TRAINABLE_VARIABLES so the lists pair up one-to-one instead of
        # relying on zip() truncating the longer list.
        self.tar_w = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 'tar')
        self.pol_w = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'pol')
        if len(self.tar_w) != len(self.pol_w):
            raise ValueError(
                'target/policy variable count mismatch: {} vs {}'.format(
                    len(self.tar_w), len(self.pol_w)))
        self.assign_op = [tf.assign(a, b) for a, b in zip(self.tar_w, self.pol_w)]

        # Summary Ops
        self.writer = tf.summary.FileWriter(logdir)
        # Index the per-network lists (kernel, bias, kernel, bias) rather than
        # tf.global_variables(), whose positions depend on build order.
        tf.summary.histogram('target_weights', self.tar_w[0])
        tf.summary.histogram('target_logits', self.tar_w[2])
        tf.summary.histogram('policy_weights', self.pol_w[0])
        tf.summary.histogram('policy_logits', self.pol_w[2])
        tf.summary.scalar('Total_Loss', self.loss)
        self.write_op = tf.summary.merge_all()
        self.writer.add_graph(DQN.sess.graph)

        DQN.sess.run(tf.global_variables_initializer())

In [None]:
class Train:
    """Epsilon-greedy training loop with a replay memory.

    Uses the module-level `pol`, `tar` and `ops` objects and the shared
    `DQN.env` / `DQN.sess`.
    """

    def __init__(self, target_len, memory, batch_max,
                 batch_min, epsilon, e_decay, e_min):
        """Store hyperparameters and reset the bookkeeping state.

        Args:
            target_len: the target network is synced when the step counter is
                a multiple of this value.
            memory: maximum number of transitions kept in the replay memory.
            batch_max: upper bound on the minibatch size.
            batch_min: training runs once more than this many transitions
                are stored.
            epsilon: initial probability of taking a random action.
            e_decay: amount subtracted from epsilon on each random action.
            e_min: epsilon stops decaying once it is no longer above this.
        """
        # Training Hyperparameters
        self.target_len = target_len
        self.memory = memory
        self.batch_max = batch_max
        self.batch_min = batch_min
        self.batch_size = 0
        self.epsilon = epsilon
        self.e_decay = e_decay
        self.e_min = e_min

        # Bookkeeping
        self.batch = []    # replay memory: [state, state2, action, reward, done]
        self.step = 1      # environment steps taken, plus one
        self.n = 1         # episodes finished, plus one
        self.Rhist = []    # total reward of each finished episode
        self.Lhist = []

    def egreedy(self, state):
        """Return a random action with probability epsilon, else the policy's.

        Epsilon is reduced only on steps where the random branch is taken.
        """
        if rand.random() < self.epsilon:
            action = DQN.env.action_space.sample()
            if self.epsilon > self.e_min:
                self.epsilon -= self.e_decay
        else:
            action = DQN.sess.run(pol.action, feed_dict={pol.X: [state]})[0]
        return action

    def backPass(self, batch, end):
        """Sample a minibatch, run one training step and write a summary.

        Args:
            batch: sequence of [state, state2, action, reward, done] rows; a
                plain list or a 2-D object array both work.
            end: unused; kept so existing callers keep working.
        """
        n_stored = len(batch)
        # Never ask for more rows than exist (replace=False would raise).
        n_draw = min(self.batch_size, n_stored)
        if n_draw <= 0:
            return
        picks = np.random.choice(n_stored, n_draw, replace=False)
        rows = [batch[i] for i in picks]

        # Arrange data into a feed. Only the sampled rows are converted, so
        # the ragged memory is never turned into one big array.
        state = np.asarray([row[0] for row in rows], dtype=np.float32)
        state2 = np.asarray([row[1] for row in rows], dtype=np.float32)
        action = np.asarray([row[2] for row in rows], dtype=np.int32)
        reward = np.asarray([row[3] for row in rows], dtype=np.float32)
        # 0 for terminal transitions, 1 otherwise.
        done = np.asarray([0.0 if row[4] else 1.0 for row in rows],
                          dtype=np.float32)
        feed = {pol.X: state,
                tar.X: state2,
                pol.A: action,
                tar.R: reward,
                tar.D: done}

        # Run training op and summary writer
        DQN.sess.run(ops.train_op, feed_dict=feed)
        summary = DQN.sess.run(ops.write_op, feed_dict=feed)
        ops.writer.add_summary(summary, self.step)
        ops.writer.flush()

    def train(self):
        """Return True once more than `batch_min` transitions are stored."""
        return len(self.batch) > self.batch_min

    def start(self, episodes, render):
        """Run `episodes` episodes, training after every environment step.

        Args:
            episodes: number of episodes to play.
            render: when true, render the environment before each step.
        """
        for episode in range(episodes):
            # Reset Environment
            state = DQN.env.reset()
            # Bookkeeping
            rewardHist = []

            while True:
                if render:
                    DQN.env.render()

                # Get action from policy net and perform
                action = self.egreedy(state)
                state2, reward, done, _ = DQN.env.step(action)

                # More Bookkeeping
                rewardHist.append(reward)
                self.batch.append([state, state2, action, reward, done])

                # Trim Memory on every step, terminal ones included, so the
                # buffer cannot grow past `memory`.
                while len(self.batch) > self.memory:
                    del self.batch[0]
                self.batch_size = min(len(self.batch), self.batch_max)

                if self.train():
                    self.backPass(self.batch, done)
                    # Copy policy weights to target
                    if self.step % self.target_len == 0:
                        DQN.sess.run(ops.assign_op)

                # Count every environment step so each summary gets its own
                # step value, including the last step of an episode.
                self.step += 1
                if done:
                    break
                state = state2

            self.n += 1
            self.Rhist.append(np.sum(rewardHist))
            finished = len(self.Rhist)
            if finished % 50 == 0:
                print('Average Reward: {:.2f} Epsilon {:.4f}'.format(np.mean(self.Rhist), self.epsilon))
                if finished >= 100:
                    # Negative slice takes exactly the last 100 episodes.
                    print('Last 100 reward: {:.2f}'.format(np.mean(self.Rhist[-100:])))

In [None]:
# Register the environment and the layer settings shared by both networks.
DQN.create(env='CartPole-v0', neurons=100, activ=tf.nn.relu)

# Placeholders: `pol` is the trainable network, `tar` the non-trainable one.
pol = DQN(name='pol', trainable=True)
tar = DQN(name='tar', trainable=False)

# Graph construction: the target network is built before the policy network.
tar.buildNet()
pol.buildNet()

# Loss / optimizer / sync / summary ops (this also initialises the variables).
ops = OPS(gamma=0.999, alpha=0.0002)

# Training-loop hyperparameters, gathered in one place.
train_settings = dict(
    target_len=2500,
    memory=10000,
    batch_max=1,
    batch_min=1,
    epsilon=0.8,
    e_decay=0.00001,
    e_min=0.015,
)
train = Train(**train_settings)
train.start(episodes=20000, render=False)

In [None]:
# Greedy rollout of the policy network for 2000 rendered steps, resetting the
# environment whenever an episode ends.
state = DQN.env.reset()
for i in range(2000):
    DQN.env.render()
    # The network object is bound to `pol`; the name `policy` used here
    # before is not defined anywhere and raised NameError.
    action = np.argmax(DQN.sess.run(pol.logits, feed_dict={pol.X: [state]}))
    state, reward, done, info = DQN.env.step(action)
    if done:
        state = DQN.env.reset()