In [None]:
import gym
import gym.spaces
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import os
from collections import namedtuple, deque
import itertools
import sys
import random
from tensorflow.python.tools import inspect_checkpoint as chkp
# Select GPU: expose only CUDA device index 2 to this process.
# NOTE(review): this runs after `import tensorflow`; presumably it still takes
# effect because no session/device has been created yet — confirm.
os.environ["CUDA_VISIBLE_DEVICES"]= "2"

#### Preprocess function from tutorial

In [None]:
def preprocess(image):
    """Turn a raw 210x160x3 uint8 frame into an 80x80 binary float array.

    The frame is cropped to rows 35..194, downsampled by a factor of 2 in both
    spatial directions, and reduced to its first colour channel. Pixels equal
    to 0, 144 or 109 (the latter two being the background colours) become 0.0;
    every other pixel becomes 1.0.

    Args:
        image: numpy array holding the frame; it is not modified.

    Returns:
        A new numpy array of shape (80, 80) and dtype float64.

    Raises:
        ValueError: if the cropped, downsampled frame does not contain exactly
            80 * 80 pixels (raised by np.reshape).
    """
    cropped = image[35:195]          # crop
    channel = cropped[::2, ::2, 0]   # downsample by factor of 2, first channel only
    # `channel` is a view of the caller's frame, so build a fresh mask instead
    # of assigning into it; otherwise the input frame would be overwritten.
    foreground = (channel != 0) & (channel != 144) & (channel != 109)
    # np.float was removed from NumPy; np.float64 is the type it aliased.
    return np.reshape(foreground.astype(np.float64), [80, 80])

# Breakout environment that the evaluation loop in this notebook resets and steps.
env = gym.envs.make("Breakout-v0")


In [None]:
def make_epsilon_greedy_policy(q_predictions, q_X, nA):
    """
    Build an epsilon-greedy action-probability function on top of a Q-network.

    NOTE: this notebook defines make_epsilon_greedy_policy in two cells; once
    both cells have run, the later definition is the one in effect.

    Args:
        q_predictions: tf node evaluated to obtain the Q-value of every action for a state.
        q_X: Input placeholder of the network; the observation is fed through it.
        nA: Number of actions in the environment.
    Returns:
        A function policy_fn(sess, observation, epsilon) that returns a numpy
        array of length nA holding the probability of selecting each action.
    """
    def policy_fn(sess, observation, epsilon):
        # Every action starts with an equal exploration share of epsilon / nA.
        action_probs = np.ones(nA, dtype=float) * epsilon / nA
        # Add a leading axis of length 1 so the single observation is fed as a
        # one-element batch, then read back the first (only) row of Q-values.
        batched_observation = np.expand_dims(observation, 0)
        q_rows = sess.run(q_predictions, feed_dict={q_X: batched_observation})
        greedy_action = np.argmax(q_rows[0])
        # The highest-valued action additionally receives the remaining 1 - epsilon.
        action_probs[greedy_action] += (1.0 - epsilon)
        return action_probs
    return policy_fn

In [None]:
# --- Evaluation settings ---
# Number of games played by the evaluation loop.
num_games = 1000
# Action ids scored by the network: 0, 1, 2 and 3.
poss_actions = [0, 1, 2, 3]
# Step size handed to the Adam optimizer in build_model().
learning_rate = 2e-4

# --- Checkpoint locations (relative to the working directory) ---
experiment_dir = os.path.relpath("./breakout_experiments")
checkpoint_dir = os.path.join(experiment_dir, "checkpoints")
checkpoint_path = os.path.join(experiment_dir, "checkpoints", "model")

In [None]:
def build_model(scope, summaries_dir="./breakout_summaries"):
    """
    Build the Q-network graph under the variable scope `scope` and return its components.

    The network feeds a batch of 80x80x4 float inputs through three
    convolutional layers and one 512-unit fully connected layer, and outputs
    one value per entry of the module-level `poss_actions`. The loss is the
    mean squared difference between the `y` placeholder and the prediction for
    the action given in the `actions` placeholder, minimised with Adam at the
    module-level `learning_rate`.

    Args:
        scope: Name of the tf.variable_scope the graph is built in; also used
            as the suffix of the summaries sub-directory.
        summaries_dir: Folder under which "summaries_<scope>" is created for a
            tf.summary.FileWriter. Pass None or "" to skip creating a writer.

    Returns:
        Tuple (predictions, train_op, X_pl, y_pl, actions_pl, summaries,
        summary_writer, loss). `summary_writer` is None when `summaries_dir`
        is falsy.
    """
    # Bound up front so the return statement is valid even without a summaries folder.
    summary_writer = None

    with tf.variable_scope(scope):
        # Placeholders for our input
        X_pl = tf.placeholder(shape=[None, 80, 80, 4], dtype=tf.float32, name="X")
        # The TD target value
        y_pl = tf.placeholder(shape=[None], dtype=tf.float32, name="y")
        # Index of selected action
        actions_pl = tf.placeholder(shape=[None], dtype=tf.int32, name="actions")

        X = X_pl

        # Three convolutional layers; positional args are
        # (inputs, num_outputs, kernel_size, stride).
        conv1 = tf.contrib.layers.conv2d(
            X, 32, 8, 2, activation_fn=tf.nn.relu)
        conv2 = tf.contrib.layers.conv2d(
            conv1, 64, 4, 2, activation_fn=tf.nn.relu)
        conv3 = tf.contrib.layers.conv2d(
            conv2, 64, 3, 1, activation_fn=tf.nn.relu)
        # Fully connected layers - 1 hidden layer and an output layer.
        flattened = tf.contrib.layers.flatten(conv3)
        fc1 = tf.contrib.layers.fully_connected(flattened, 512)
        predictions = tf.contrib.layers.fully_connected(fc1, len(poss_actions), activation_fn=None)

        # Only take Q for the action we take (multiply with one hot vector)
        action_one_hot = tf.one_hot(actions_pl, len(poss_actions), 1.0, 0.0, name='action_one_hot')
        action_predictions = tf.reduce_sum(tf.multiply(predictions, action_one_hot), axis=1)

        # Calculate the loss
        losses = tf.squared_difference(y_pl, action_predictions)
        loss = tf.reduce_mean(losses)
        # Adam optimizer to reduce MSE
        optimizer = tf.train.AdamOptimizer(learning_rate)
        train_op = optimizer.minimize(loss, global_step=tf.contrib.framework.get_global_step())

        # Summaries for tensorboard
        summaries = tf.summary.merge([
            tf.summary.scalar("loss", loss),
            tf.summary.histogram("loss_hist", losses),
            tf.summary.histogram("q_values_hist", predictions),
            tf.summary.scalar("max_q_value", tf.reduce_max(predictions))
        ])

        # Create summary_dir when building the model
        if summaries_dir:
            summary_dir = os.path.join(summaries_dir, "summaries_{}".format(scope))
            if not os.path.exists(summary_dir):
                os.makedirs(summary_dir)
            summary_writer = tf.summary.FileWriter(summary_dir)

    return predictions, train_op, X_pl, y_pl, actions_pl, summaries, summary_writer, loss


In [None]:
def make_epsilon_greedy_policy(q_predictions, q_X, nA):
    """
    Return a function that turns Q-network outputs into epsilon-greedy action probabilities.

    NOTE: make_epsilon_greedy_policy is also defined in an earlier cell of this
    notebook; this later definition replaces it once this cell has run.

    Args:
        q_predictions: tf node evaluated to obtain the Q-value of every action for a state.
        q_X: Input placeholder of the network; the observation is fed through it.
        nA: Number of actions in the environment.
    Returns:
        A function policy_fn(sess, observation, epsilon) that returns a numpy
        array of length nA holding the probability of selecting each action.
    """
    def policy_fn(sess, observation, epsilon):
        # Spread the exploration mass epsilon evenly over all nA actions.
        probabilities = np.ones(nA, dtype=float) * epsilon / nA
        # Feed the observation as a batch of one and keep the single row of Q-values.
        feed = {q_X: np.expand_dims(observation, 0)}
        action_values = sess.run(q_predictions, feed_dict=feed)[0]
        # Give the remaining 1 - epsilon to the action with the largest Q-value.
        top_action = np.argmax(action_values)
        probabilities[top_action] += (1.0 - epsilon)
        return probabilities
    return policy_fn

In [None]:
tf.reset_default_graph()
# Where we save our checkpoints and graphs
experiment_dir = os.path.relpath("./breakout_experiments")
# Create a global step variable for training iteration count
global_step = tf.Variable(0, name='global_step', trainable=False)

# Build both networks before the Saver is created. Only the "q" network is
# used to choose actions below.
# NOTE(review): presumably the checkpoint holds variables for both the "q" and
# "target_q" scopes, which is why both are built here — confirm.
q_estimator = build_model(scope="q")
target_estimator = build_model(scope="target_q")

# Exploration rate used while evaluating the restored policy.
eval_epsilon = 0.1
# Each finished game's total reward is appended to this file, one value per line.
rewards_log_path = 'breakout_1000_2.txt'

allrewards = []  # reward of every individual policy step, across all games
totrewards = []  # total reward of each finished game

with tf.Session() as sess:
    print("SESSION STARTED")
    # The estimator tuples never change, so unpack them once instead of on
    # every game and every step.
    (q_predictions, q_train_op, q_X, q_y, q_actions,
     q_summaries, q_summary_writer, q_loss) = q_estimator
    (target_predictions, target_train_op, target_X, target_y, target_actions,
     target_summaries, t_summary_writer, target_loss) = target_estimator

    # latest_checkpoint() returns None when the directory holds no checkpoint;
    # fail with a message that names the directory instead of passing None on.
    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
    if latest_checkpoint is None:
        raise ValueError(
            "No checkpoint found in {!r}; train a model first or fix checkpoint_dir.".format(
                checkpoint_dir))

    # Variables get their values from the checkpoint, so no initializer is run.
    new_saver = tf.train.Saver()
    new_saver.restore(sess, latest_checkpoint)

    print("Loading model checkpoint {}...\n".format(latest_checkpoint))

    policy = make_epsilon_greedy_policy(
        q_predictions, q_X,
        len(poss_actions))

    for game in range(num_games):
        # start game
        state = env.reset()
        # Action 1 is sent once before the policy takes over; the reward and
        # done flag of this step are not recorded.
        state, _, done, _ = env.step(1)
        state = preprocess(state)
        # Stack the initial frame 4 times initially.
        state = np.stack([state] * 4, axis=2)
        reward_total = 0
        while True:
            action_probs = policy(sess, state, eval_epsilon)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            new_state, reward, done, _ = env.step(action)
            allrewards.append(reward)
            # Update reward total (the final step's reward is included too).
            reward_total = reward_total + reward

            next_state = preprocess(new_state)
            # Slide the frame window: drop the oldest frame (index 0) and
            # append the new frame as the last channel. np.append returns a
            # new array, so no extra copy is needed.
            next_state = np.append(state[:, :, 1:], np.expand_dims(next_state, 2), axis=2)
            state = next_state

            if done:
                totrewards.append(reward_total)
                print("episode " + str(game) + ": Reward = " + str(reward_total))
                with open(rewards_log_path, 'a') as f:
                    f.write(str(reward_total) + "\n")
                break



In [None]:
# Sum of the per-step rewards recorded in `allrewards` by the evaluation loop.
sum(allrewards)

In [None]:
# with open('breakout_1000_2.txt', 'w') as f:
#     for rew in allrewards:
#         f.write(str(rew) + "\n")

    