In [None]:
# TODO: revisit deque(maxlen=...) for the replay memory; compare pooling vs. strides
from LossHistory import LossHistory
# Make sure this process does not hog all the video memory: create the
# TensorFlow session with `allow_growth` enabled and register it as the
# session used by the Keras backend. This must run before any Keras model
# is built so the model is created inside this session.
import tensorflow as tf
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
from keras import backend as K
K.set_session(sess)
###################################

import random
import gym
import numpy as np
from collections import deque
from keras.optimizers import RMSprop
from keras.layers import Input, Dense, Dropout, BatchNormalization, Flatten, ELU, Activation, Conv2D, MaxPooling2D
from keras.models import Model, load_model


from skimage.transform import resize
from skimage.color import rgb2gray

# import matplotlib
# matplotlib.use('TkAgg')
import matplotlib.pyplot as plt

# Number of episodes the training loop in the `__main__` section runs.
EPISODES = 1000

class DQNAgent:
    """Deep Q-learning agent with a small convolutional Q-network.

    The agent stores single preprocessed frames in a replay memory and
    builds network inputs by stacking ``STACK_SIZE`` consecutive frames
    along the channel axis.

    Args:
        state_size: ``(height, width)`` of one preprocessed frame.
        action_size: number of discrete actions (width of the Q-value output).
        loss_history: Keras callback passed to ``model.fit``; it must expose a
            ``batch_loss`` attribute (defined in the ``LossHistory`` module,
            not shown here).
    """

    # Number of consecutive frames stacked into one network input.
    STACK_SIZE = 4

    def __init__(self, state_size, action_size, loss_history):
        self.action_size = action_size
        # Each entry is (state, action, reward, next_state, done).
        self.memory = deque(maxlen=1000000)
        self.gamma = 0.95    # discount rate
        self.epsilon = 0.7  # initial exploration rate
        self.epsilon_min = 0.1
        self.epsilon_decay = 0.000001  # subtracted once per replay1() call
        self.learning_rate = 0.001
        self.dilation_rate = 1
        self.strides = (4, 2)  # strides of the first and second conv layer
        self.change_action_every = 4
        self.h = []  # loss recorded after every single-sample fit
        self.target_shape = state_size
        self.loss_history = loss_history
        self.model = self._build_model()

    def _build_model(self):
        """Build and compile the convolutional Q-network.

        Returns:
            A compiled Keras ``Model`` mapping a
            ``(height, width, STACK_SIZE)`` frame stack to ``action_size``
            Q-values.
        """
        # Adapt this if using `channels_first` image data format.
        input_img = Input(shape=(self.target_shape[0], self.target_shape[1], self.STACK_SIZE))

        x = Conv2D(16, (8, 8), activation='relu', padding='same', dilation_rate=self.dilation_rate, strides=self.strides[0])(input_img)
        x = BatchNormalization()(x)
        x = Conv2D(32, (4, 4), activation='relu', padding='same', dilation_rate=self.dilation_rate, strides=self.strides[1])(x)
        x = BatchNormalization()(x)

        x = Flatten()(x)
        x = Dropout(0.01)(x)
        x = Dense(70, activation='relu')(x)
        x = BatchNormalization()(x)
        # Q-values are unbounded regression targets (reward + gamma * max Q),
        # so the head must be linear: a softmax would confine the outputs to
        # [0, 1] summing to 1 and the MSE loss could never fit the targets.
        x = Dense(self.action_size, activation='linear')(x)

        model = Model(input_img, x)
        model.summary()
        model.compile(loss='mse', optimizer=RMSprop(lr=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Pick an action epsilon-greedily.

        Args:
            state: batched frame stack of shape ``(1, height, width, STACK_SIZE)``.

        Returns:
            The chosen action index as a plain ``int`` in
            ``[0, action_size)`` for both the random and the greedy branch.
        """
        if np.random.rand() <= self.epsilon:
            return int(np.random.randint(self.action_size))
        act_values = self.model.predict(state)
        return int(np.argmax(act_values))

    def get_state4(self, i, k):
        """Stack the frames leading up to memory entry ``i``.

        Args:
            i: index into the replay memory of the most recent frame.
            k: field of the stored transition to read
                (0 = state, 3 = next_state).

        Returns:
            Array of shape ``(height, width, STACK_SIZE)`` ordered oldest to
            newest. When fewer than ``STACK_SIZE`` frames of the same episode
            precede ``i``, the earliest available frame is repeated instead of
            wrapping around the memory or reaching into the previous episode.
        """
        if i < 0:
            i += len(self.memory)
        indices = [i]
        for _ in range(self.STACK_SIZE - 1):
            prev = indices[-1] - 1
            # A transition flagged `done` ended an episode, so anything at or
            # before it belongs to a different episode.
            if prev >= 0 and not self.memory[prev][4]:
                indices.append(prev)
            else:
                indices.append(indices[-1])
        frames = [self.memory[j][k] for j in reversed(indices)]
        return np.stack(frames, axis=2)

    def replay1(self, batch_size):
        """Train on ``batch_size`` random transitions, one fit per sample.

        After the pass, epsilon is decreased linearly by ``epsilon_decay``
        and clamped at ``epsilon_min``.

        Args:
            batch_size: number of transitions to sample without replacement;
                capped at the current memory size.
        """
        if not self.memory:
            return
        batch_size = min(batch_size, len(self.memory))
        samp_addr = np.random.choice(len(self.memory), batch_size, replace=False)

        for i in samp_addr:
            state = self.get_state4(i, 0)[None, :, :, :]
            _, action, reward, _, done = self.memory[i]

            if done:
                target = reward
            else:
                next_state = self.get_state4(i, 3)[None, :, :, :]
                target = reward + self.gamma * np.amax(self.model.predict(next_state))
            # Only the Q-value of the action actually taken is moved towards
            # the target; the other outputs keep their current predictions.
            target_f = self.model.predict(state)
            target_f[0, action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0, callbacks=[self.loss_history])
            self.h.append(self.loss_history.batch_loss)
        if self.epsilon > self.epsilon_min:
            # Linear epsilon decay, never dropping below the floor.
            self.epsilon = max(self.epsilon_min, self.epsilon - self.epsilon_decay)

    def fi(self, observation):
        """Preprocess a raw RGB observation into a single grayscale frame.

        Rows 34..193 of the observation are kept, the crop is resized to
        ``target_shape`` with anti-aliasing and then converted to grayscale.
        """
        observation = observation[34:194, :, :]
        observation = resize(observation, self.target_shape, anti_aliasing=True)
        observation = rgb2gray(observation)
        return observation

    def load(self, name):
        """Load model weights from the file ``name``."""
        self.model.load_weights(name)

    def save(self, name):
        """Save model weights to the file ``name``."""
        self.model.save_weights(name)


# Training script: plays Pong episodes with the DQN agent, trains from the
# replay memory after every step, and periodically checkpoints the weights
# and appends a progress line to log.txt.
if __name__ == "__main__":
    env = gym.make("Pong-v0")
    state_size = (80,80)
    action_size = 3
    loss_history = LossHistory()
    agent = DQNAgent(state_size, action_size, loss_history)
    # To resume from a checkpoint, call agent.load(<weights file>) here.
    done = False
    batch_size = 32

    # Start with one random action; it is repeated until the model-driven
    # policy takes over inside the episode loop.
    action = np.random.randint(action_size)

    for e in range(EPISODES):
        state = env.reset()
        state = agent.fi(state)

        for time in range(2000):
            # Every `change_action_every`-th timestep (after the first few
            # steps of the episode) the action is re-chosen from the last
            # four stored frames; in between, the previous action is repeated.
            if time % agent.change_action_every == 0 and time > agent.change_action_every:
                last_state_addr = len(agent.memory)-1
                last_state4 = agent.get_state4(last_state_addr,0)[None,:,:,:]
                action = agent.act(last_state4)

            # Agent actions 0..2 are shifted to environment actions 1..3.
            # NOTE(review): presumably this skips the environment's no-op
            # action 0 -- confirm against the Pong action set.
            next_state, reward, done, _ = env.step(action + 1)
            next_state = agent.fi(next_state)
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                # NOTE: the value printed as "score" is the episode length in
                # timesteps, not the game score.
                print("episode: {}/{}, score: {}, e: {:.2}"
                      .format(e, EPISODES, time, agent.epsilon))
                break
            if len(agent.memory) > batch_size:
                agent.replay1(batch_size)
        if e % 5 == 0:
            # Checkpoint every fifth episode; the mean loss is computed once
            # and reused for both the file name and the log line.
            mean_loss = np.mean(agent.h)
            namestr = "Pong-v0.h5" + str(mean_loss) + "_eps_" + str(agent.epsilon)
            agent.save(namestr)
            with open("log.txt", "a") as myfile:
                myfile.write("episode " + str(e) + "\t" +
                             "loss " + str(mean_loss) + "\t" +
                             " epsilon " + str(agent.epsilon) + "\t" +
                             " mesize " + str(len(agent.memory)) + "\t" +
                             " time " + str(time) +
                             "\n")

    # Plot the per-sample training loss. Kept under the guard because `agent`
    # only exists when the module is run as a script.
    plt.plot(agent.h)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 80, 80, 4)         0         
_________________________________________________________________
conv2d_5 (Conv2D)            (None, 20, 20, 16)        4112      
_________________________________________________________________
batch_normalization_7 (Batch (None, 20, 20, 16)        64        
_________________________________________________________________
conv2d_6 (Conv2D)            (None, 10, 10, 32)        8224      
_________________________________________________________________
batch_normalization_8 (Batch (None, 10, 10, 32)        128       
_________________________________________________________________
flatten_3 (Flatten)          (None, 3200)              0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 3200)              0         
__________

  warn("The default mode, 'constant', will be changed to 'reflect' in "


episode: 0/1000, score: 1339, e: 0.7
episode: 1/1000, score: 1183, e: 0.7
episode: 2/1000, score: 1184, e: 0.7
episode: 3/1000, score: 1345, e: 0.69
episode: 4/1000, score: 1179, e: 0.69
episode: 5/1000, score: 1263, e: 0.69
episode: 6/1000, score: 1585, e: 0.69
episode: 7/1000, score: 1186, e: 0.69
episode: 8/1000, score: 1106, e: 0.69
episode: 9/1000, score: 1248, e: 0.69
episode: 10/1000, score: 1356, e: 0.69
episode: 11/1000, score: 1078, e: 0.68
episode: 12/1000, score: 1354, e: 0.68
episode: 13/1000, score: 1303, e: 0.68
episode: 14/1000, score: 1239, e: 0.68
episode: 15/1000, score: 1428, e: 0.68
episode: 16/1000, score: 1005, e: 0.68
episode: 17/1000, score: 1113, e: 0.68
episode: 18/1000, score: 1326, e: 0.68
episode: 19/1000, score: 1215, e: 0.67
episode: 20/1000, score: 1346, e: 0.67
episode: 21/1000, score: 1254, e: 0.67
episode: 22/1000, score: 1388, e: 0.67
episode: 23/1000, score: 1187, e: 0.67
episode: 24/1000, score: 1332, e: 0.67
episode: 25/1000, score: 1146, e: 0.67