In [10]:
import gym
import gym.spaces
import logging
import matplotlib.pyplot as plt
import numpy as np
import os
import random
import tensorflow as tf

from collections import deque

In [11]:
def preprocess(image):
    """Convert a raw 210x160x3 uint8 frame into an 80x80 float array in [0, 1].

    Steps: keep rows 35..194 (160 rows), take every second row and column of
    channel 0 only, then scale the uint8 values by 1/255.

    Args:
        image: array indexed as [row, column, channel]; the final reshape
            requires the cropped, downsampled result to hold exactly 80*80
            values, which is the case for a 210x160x3 frame.

    Returns:
        np.ndarray of shape (80, 80) and dtype float64.

    Raises:
        ValueError: if the cropped/downsampled frame is not 6400 elements.
    """
    image = image[35:195]          # crop to 160 rows
    image = image[::2, ::2, 0]     # downsample by 2, keep channel 0
    image = image / 255.0          # scale to [0, 1]
    # np.float was removed in NumPy 1.24; np.float64 is the dtype it aliased.
    return np.reshape(image.astype(np.float64), [80, 80])

In [12]:
#Experience Replay
class Replay():
    """Fixed-capacity experience store with uniform random mini-batch sampling.

    Once `buff_size` items are held, each new item evicts the oldest one.
    """

    def __init__(self, buff_size = 100, n = 32):
        # n is the number of items returned by every sample() call.
        self.n = n
        self.buffer = deque(maxlen = buff_size)

    def add(self, exp):
        """Store one experience at the newest end of the buffer."""
        self.buffer.append(exp)

    def sample(self):
        """Return a new list of `n` distinct stored experiences.

        Raises ValueError (from random.sample) when fewer than `n` items
        are currently stored.
        """
        batch = random.sample(self.buffer, self.n)
        return batch
        
    

In [13]:
class History():
    """Sliding window over the most recent `history_size` frames."""

    def __init__(self, history_size = 4):
        # The bounded deque drops the oldest frame automatically when full.
        self.buffer = deque(maxlen = history_size)

    def add(self, frame):
        """Push one frame, evicting the oldest if the window is full."""
        self.buffer.append(frame)

    def output(self):
        """Return the held frames stacked along a new trailing axis.

        The result is a freshly allocated array (oldest frame at index 0 of
        the last axis), so later add() calls do not alter it.
        """
        frames = tuple(self.buffer)
        return np.stack(frames, axis = -1)

In [14]:
class DQN():
    """Convolutional Q-network (TensorFlow 1.x graph mode) with epsilon-greedy action selection.

    The network maps a stack of frames shaped [batch, y_shape, 80, history_size]
    to one Q-value per action. Training minimises the Huber loss between the
    Q-value of the action taken and a caller-supplied target.

    Args:
        lr: learning rate passed to RMSProp.
        action_size: number of discrete actions (width of the output layer).
        history_size: number of stacked frames (input channels).
        y_shape: input height; the input width is fixed at 80.
        name: variable scope holding the network weights; also stored as
            `self.scope` so the weights can be located by name prefix.
    """

    def __init__(self, lr = 0.01, action_size = 4, history_size = 4, y_shape = 80, name = 'DQN'):
        self.action_size = action_size
        # Exploration schedule used by predict_next():
        # epsilon = epsilon_min + (1 - epsilon_min) * exp(-epsilon_decay * epsilon_step)
        self.epsilon_decay = 0.01
        self.epsilon_step = 0
        self.epsilon_min = 0.01

        with tf.variable_scope(name):
            self.scope = name
            self.inputs_ = tf.placeholder(tf.float32, [None, y_shape, 80, history_size], name = "inputs")
            self.expected_rewards_  = tf.placeholder(tf.float32,[None, ], name = "expected_rewards")
            # One-hot encoding of the action taken for each sample in the batch.
            self.actions_ = tf.placeholder(tf.float32,shape = [None, action_size], name = "actions" )
            # Scalars that exist only to be written out through summary().
            self.avg_max_Q_ = tf.placeholder(tf.float32, name="avg_max_Q")
            self.reward_ = tf.placeholder(tf.float32, name="reward")
            self.epoch_loss_ = tf.placeholder(tf.float32, name = "epoch_loss")

            #CNN

            self.conv1 = tf.layers.conv2d(
                inputs = self.inputs_,
                filters = 16,
                kernel_size = [8,8],
                strides = [4,4],
                kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                activation = tf.nn.relu
            )

            self.conv2 = tf.layers.conv2d(
                inputs = self.conv1,
                filters = 8,
                kernel_size = [4,4],
                strides = [2,2],
                kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                activation = tf.nn.relu
            )

            self.flatten = tf.layers.flatten(self.conv2)

            self.fc1 = tf.layers.dense(self.flatten,units = 512, activation = tf.nn.relu,
                kernel_initializer=tf.contrib.layers.xavier_initializer(),
                name="fc1")

            # Linear output layer: Q-values are unbounded and may be negative,
            # so no activation is applied here (a ReLU would clamp them at 0).
            self.preds = tf.layers.dense(
                self.fc1, units=action_size,
                kernel_initializer=tf.variance_scaling_initializer(),
                activation=None)

        with tf.variable_scope("Q"):
            # Mask the predictions with the one-hot action and sum, leaving
            # only the Q-value of the action that was actually taken.
            self.Q = tf.reduce_sum(tf.multiply(self.preds, self.actions_), axis=1)

        with tf.variable_scope("loss"):
            self.loss = tf.reduce_mean(tf.losses.huber_loss(self.expected_rewards_, self.Q))

        with tf.variable_scope("train"):
            self.optimizer = tf.train.RMSPropOptimizer(lr, momentum=0.95, epsilon=0.01)
            self.train = self.optimizer.minimize(self.loss)

        with tf.variable_scope("summaries"):
            # Merge only this instance's summaries. merge_all() would also pull
            # in the summaries of every DQN built earlier in the same graph,
            # and those depend on placeholders that summary() does not feed.
            self.summary_op = tf.summary.merge([
                tf.summary.scalar("epoch_loss", self.epoch_loss_),
                tf.summary.scalar("avg_max_Q", self.avg_max_Q_),
                tf.summary.scalar("reward", self.reward_),
            ])


    def predict(self, sess, state):
        """Run the network on a batch of states and return the Q-values for every action."""
        result = sess.run(self.preds, feed_dict={self.inputs_: state})
        return result

    def update(self, sess, state, action, expected_rewards):
        """Run one optimiser step on a batch.

        Args:
            sess: session to run in.
            state: batch of input states.
            action: batch of one-hot actions.
            expected_rewards: batch of scalar training targets.

        Returns:
            The result of sess.run for [loss, train op]; the loss value is the
            first element.
        """
        feed_dict = {self.inputs_: state,
                    self.actions_: action,
                    self.expected_rewards_: expected_rewards}
        loss = sess.run([self.loss, self.train], feed_dict=feed_dict)
        return loss

    def predict_next(self, sess, state):
        """Pick an action for a single (un-batched) state, epsilon-greedily.

        Epsilon decays exponentially from 1.0 towards `epsilon_min` as
        `epsilon_step` grows; every call advances `epsilon_step` by one.
        """
        epsilon = self.epsilon_min + (1-self.epsilon_min) * np.exp(-self.epsilon_decay*self.epsilon_step)
        sample = np.random.rand()

        if sample < epsilon:
            # Explore: uniform random action.
            action = np.random.randint(0,self.action_size)
        else:
            # Exploit: wrap the state in a batch of one and take the best action.
            action = np.argmax(self.predict(sess, [state]))
        self.epsilon_step += 1
        return action

    def summary(self, sess, loss, avg_max_Q, reward):
        """Evaluate this network's merged summaries for the given scalars.

        Returns:
            (loss, summary): the loss passed in, and the evaluated summary.
        """
        summary = sess.run(self.summary_op, feed_dict = {self.epoch_loss_: loss, self.avg_max_Q_: avg_max_Q, self.reward_ :reward})
        return loss, summary


        

In [15]:
def param_copier(sess, q_network, target_network):
    """Copy every trainable weight of `q_network` into `target_network`.

    Variables are found by name prefix (`<scope>/`) and paired after sorting
    by name, so both networks must have been built with the same layer layout.

    Args:
        sess: session in which the assignments run.
        q_network: source network; only its `.scope` attribute is read.
        target_network: destination network; only its `.scope` attribute is read.

    Raises:
        ValueError: if the two scopes hold a different number of trainable
            variables (pairing them would silently skip some weights).

    Note:
        Each call creates new assign ops in the graph.
    """
    def scope_variables(scope):
        # The trailing "/" stops a scope from matching siblings that merely
        # share its prefix (e.g. "Target" vs "Target2").
        prefix = scope + "/"
        matches = [v for v in tf.trainable_variables() if v.name.startswith(prefix)]
        return sorted(matches, key=lambda v: v.name)

    q_params = scope_variables(q_network.scope)
    t_params = scope_variables(target_network.scope)

    if len(q_params) != len(t_params):
        raise ValueError(
            "Cannot copy parameters: scope %r has %d trainable variables but scope %r has %d"
            % (q_network.scope, len(q_params), target_network.scope, len(t_params)))

    # Assign Q-Parameters to Target Network
    updates = [t.assign(q) for q, t in zip(q_params, t_params)]

    sess.run(updates)

In [16]:
# Expose only the CUDA device with index 0 to this process.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

# Hyper-parameters read by the training cell.
n_epochs = 100          # iterations of the outer training loop
batch_size = 32         # number of sampled transitions the target loop walks over
discount_rate = 0.95    # multiplies the target network's max prediction for the next state
target_updates = 100    # param_copier runs whenever the training counter is a multiple of this

# Atari Breakout environment; the training cell calls env.reset() itself,
# and `obs` is not referenced again in the visible cells.
env = gym.make('Breakout-v0')
obs = env.reset()



  result = entry_point.load(False)


In [17]:
# Start from an empty default graph so re-running this cell rebuilds both networks from scratch.
tf.reset_default_graph()

# Two DQN instances under separate variable scopes: QNetwork is the one the
# training cell updates; target supplies the next-state predictions and
# receives QNetwork's weights through param_copier.
QNetwork = DQN(name='QNetwork')
target = DQN(name='Target')


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Use keras.layers.conv2d instead.
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Use tf.cast instead.


In [18]:
# Saver over the graph's variables; it is not used in the visible cells.
saver = tf.train.Saver()

# Replay memory holding up to 1000 transitions; sample() returns Replay's default of 32 per call.
buffer = Replay(buff_size = 1000)
# Rolling window of the most recent frames (History's default size is 4).
history = History()

In [None]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    # Start with the target network as an exact copy of the online network.
    param_copier(sess, QNetwork, target)

    # Number of gradient steps taken so far; drives the periodic target sync.
    count = 0

    # ------------------------------------------------------------------
    # Warm-up: fill the replay buffer before any training happens.
    # NOTE: predict_next() advances the epsilon schedule on every call,
    # including these warm-up steps.
    # ------------------------------------------------------------------
    state = env.reset()
    state_pro = preprocess(state)
    # Fill the whole frame window with the first frame of the episode.
    for _ in range(history.buffer.maxlen):
        history.add(state_pro)

    for step in range(1000):
        old_hist = history.output()
        action = QNetwork.predict_next(sess, old_hist)
        new_state, reward, done, _ = env.step(action)

        new_pro = preprocess(new_state)
        history.add(new_pro)
        new_hist = history.output()

        one_hot_action = np.zeros(QNetwork.action_size)
        one_hot_action[action] = 1

        # Same flat [state, action, next_state, reward, done] layout as in the
        # training loop, so zip(*batch) can unpack every stored transition.
        buffer.add([old_hist, one_hot_action, new_hist, reward, done])

        # If the episode ended, restart it and refill the frame window.
        if done:
            state = env.reset()
            state_pro = preprocess(state)
            for _ in range(history.buffer.maxlen):
                history.add(state_pro)

    # ------------------------------------------------------------------
    # Training: one episode per epoch, one gradient step per env step.
    # ------------------------------------------------------------------
    for epoch in range(n_epochs):
        result = []        # max Q-value of each sampled batch, for progress tracking
        total_reward = 0   # undiscounted reward collected this episode

        state = env.reset()
        state_pro = preprocess(state)
        for _ in range(history.buffer.maxlen):
            history.add(state_pro)

        while True:
            old_hist = history.output()
            action = QNetwork.predict_next(sess, old_hist)
            new_state, reward, done, _ = env.step(action)
            total_reward += reward

            # The window must receive the preprocessed 80x80 frame; the raw
            # frame has a different shape and cannot be stacked with it.
            new_pro = preprocess(new_state)
            history.add(new_pro)
            new_hist = history.output()

            one_hot_action = np.zeros(QNetwork.action_size)
            one_hot_action[action] = 1

            buffer.add([old_hist, one_hot_action, new_hist, reward, done])

            # Unzip the sampled transitions column-wise. This avoids building a
            # ragged object array out of mixed-shape transition fields.
            state_2, action_2, new_state_2, reward_2, done_2 = zip(*buffer.sample())
            state_2 = np.stack(state_2)
            action_2 = np.stack(action_2)
            new_state_2 = np.stack(new_state_2)

            # Find max Q-Value per batch for progress
            Q_preds = sess.run(QNetwork.Q,
                               feed_dict={QNetwork.inputs_: state_2,
                                          QNetwork.actions_: action_2})
            result.append(np.max(Q_preds))

            # Training targets: r for terminal transitions, otherwise
            # r + discount * max_a' Q_target(s', a').
            Total_preds_batch = target.predict(sess, new_state_2)
            Total_preds = [
                r if terminal else r + discount_rate * np.max(next_q)
                for r, terminal, next_q in zip(reward_2, done_2, Total_preds_batch)
            ]

            # Update Q-Network
            loss, _ = QNetwork.update(sess, state_2, action_2, Total_preds)

            # Sync the target network every `target_updates` gradient steps.
            count += 1
            if count % target_updates == 0:
                param_copier(sess, QNetwork, target)

            # Play the episode to its end instead of stopping after one step.
            if done:
                break
        
    
    
    