In [36]:
import sys
#change it to your own path to where 'gym' is installed
sys.path.append('/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages')
import random
import gym
import numpy as np

from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from keras.utils import multi_gpu_model

#from JSAnimation.IPython_display import display_animation
from matplotlib import animation
from IPython.display import display
from multiprocessing import Pool

In [37]:
EPISODES = 1000
TIME_LIMIT = 1000

In [38]:
from keras.utils import Sequence

class DataSequence(Sequence):
    def __init__(self, dataset, model, batch_size):
        self.data=dataset
        self.batch_size = batch_size
        self.model=model
        self.gamma = 0.95 # discount rate

    def __len__(self):
        return int(np.ceil(len(self.data) / float(self.batch_size)))
    

    def __getitem__(self, idx):        
        minibatch=self.data[idx * self.batch_size:(idx + 1) * self.batch_size]

        states=[]
        targets_f=[]

        for state, action, reward, next_state, done in minibatch:
            states.append(state)
            target = reward
            if not done:
                target = (reward + self.gamma *
                          np.amax(self.model.predict(next_state)[0]))

            target_f = self.model.predict(state)
            target_f[0][action] = target

            targets_f.append(target_f)
        return np.array(states), np.array(targets_f)


In [39]:
class DQNAgent:
    def __init__(self, state_size, action_size,batch_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=5000) # maximum number of samples stored in dataset
        self.gamma = 0.95 # discount rate
        self.epsilon = 0.2 # exploration rate
        #self.epsilon = 1.0 # exploration rate
        self.epsilon_min = 0.01 # minimum exploration rate
        #self.epsilon_decay = 0.995 # decay rate for exploration
        self.epsilon_decay = 1 # decay rate for exploration
        self.learning_rate = 0.001
        self.batch_size=batch_size
        self.model = self.normal_model(4)

    def _build_model_2L(self):
        """2-layer Neural Net for Deep-Q learning Model."""
        model = Sequential()
        model.add(Dense(units=24, input_dim=self.state_size, activation='relu')) # input layer
        model.add(Dense(units=self.action_size, activation='linear')) # output layer
        #model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate)) # loss function = mean squared error
        return model
    
    def _build_model_3L(self):
        """3-layer Neural Net for Deep-Q learning Model."""
        model = Sequential()
        model.add(Dense(units=24, input_dim=self.state_size, activation='relu')) # input layer
        model.add(Dense(units=24, activation='relu'))
        model.add(Dense(units=self.action_size, activation='linear')) # output layer
        #model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate)) # loss function = mean squared error
        return model

    def _build_model_4L(self):
        """4-layer Neural Net for Deep-Q learning Model."""
        model = Sequential()
        model.add(Dense(units=24, input_dim=self.state_size, activation='relu')) # input layer
        model.add(Dense(units=24, activation='relu'))
        model.add(Dense(units=24, activation='relu'))
        model.add(Dense(units=self.action_size, activation='linear')) # output layer
        #model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate)) # loss function = mean squared error
        return model
    
    def normal_model(self,layer=4):
        if layer==2:
            model=self._build_model_2L()
        elif layer==3:
            model = self._build_model_3L()
        elif layer==4:
            model = self._build_model_4L()
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate)) # loss function = mean squared error
        return model
        
    def parallel_model(self,layer=4):
        if layer==2:
            model=self._build_model_2L()
        elif layer==3:
            model = self._build_model_3L()
        elif layer==4:
            model = self._build_model_4L()
        # Not needed to change the device scope for model definition:
        p_model = multi_gpu_model(model, cpu_relocation=True)
        p_model.compile(loss='mse',optimizer='Adam(lr=self.learning_rate)')
        return p_model

    def remember(self, state, action, reward, next_state, done):
        """Store s,a,r,s' by appending to self.memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose action randomly (explore) or by model prediction (exploit)."""
        if np.random.rand() <= self.epsilon: # explore with probabiluty self.epsilon
            return random.randrange(self.action_size)

        act_values = self.model.predict(state)
        return np.argmax(act_values[0])  # returns action
    
    def fit_model(self,minibatch):
#         for state, action, reward, next_state, done in minibatch:
#             target = reward
#             if not done:
#                 target = (reward + self.gamma *
#                           np.amax(self.model.predict(next_state)[0]))

#             target_f = self.model.predict(state)
#             target_f[0][action] = target
#         multiprocess_batchsize=500
    
        seq=DataSequence(minibatch,self.model,self.batch_size)
        
        self.model.fit_generator(generator=seq, 
                epochs = 1, 
                verbose=1,
                workers=8,
                use_multiprocessing=True)
            #self.model.fit(state, target_f, epochs=1, verbose=0) # epochs = number of iterations over the minibatch
            
    def replay(self, batch_size):
        """Train the neural net on the episodes in self.memory. 
           Only N samples defined by batch_size are sampled from self.memory for training.
        """
        minibatch = random.sample(self.memory, 5000)
        #chunks = [minibatch[i:i + 100] for i in range(0, len(minibatch), 100)]
        #pool=Pool()
        #pool.map(self.fit_model, chunks)

        self.fit_model(self.minibatch)
              
        # Decaying exploration rate
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)



In [40]:
env = gym.make('Phoenix-ram-v0')

In [None]:
# state_size = env.observation_space.shape[0]
# action_size = env.action_space.n
# agent = DQNAgent(state_size, action_size)
# done = False
# batch_size = 500
# scores = [] # store the score for each completed episode

In [None]:
import time
if __name__ == '__main__':
    env = gym.make('Phoenix-ram-v0')
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    agent = DQNAgent(state_size, action_size,batch_size)
    done = False
    batch_size = 5000
    scores = [] # store the score for each completed episode

    for episode in range(EPISODES):
        #start_time=time.clock()
        
        print('episode = {}'.format(episode))
        state = env.reset()
        state = np.reshape(state, [1, state_size])
        score = 0

        for time in range(TIME_LIMIT):
            # env.render()
            action = agent.act(state) # DQN agent chooses next action 
            next_state, reward, done, _ = env.step(action) # observe rewards and successor state
            score += reward # keep track of game score
            next_state = np.reshape(next_state, [1, state_size])
            agent.remember(state, action, reward, next_state, done) # add s,a,r,s' to dataset (agent.memory)
            state = next_state        
                    
            if done or time==TIME_LIMIT:
                print('episode: {}/{}, scores: {}, exploration rate: {:.2}'
                          .format(episode, EPISODES, scores, agent.epsilon))
                scores.append(score)
                #print('scores = {}'.format(scores))
                break

            # Train NN after each episode or timeout by randomly sampling a batch from the dataset in agent.memory
            if len(agent.memory) > batch_size:
                agent.replay(batch_size)
                
        #print ('time elpased={}'.format(time.clock()-start_time))
        
        if episode%20==0: #save every 20 episodes
            # Save weights after training is complete
            agent.save('phoenix_dqn_3L.h5')


episode = 0
episode: 0/1000, scores: [], exploration rate: 0.2
episode = 1
episode = 2
episode: 2/1000, scores: [520.0], exploration rate: 0.2
episode = 3
episode: 3/1000, scores: [520.0, 420.0], exploration rate: 0.2
episode = 4
episode: 4/1000, scores: [520.0, 420.0, 460.0], exploration rate: 0.2
episode = 5
episode: 5/1000, scores: [520.0, 420.0, 460.0, 560.0], exploration rate: 0.2
episode = 6
episode: 6/1000, scores: [520.0, 420.0, 460.0, 560.0, 140.0], exploration rate: 0.2
episode = 7
episode = 8
episode: 8/1000, scores: [520.0, 420.0, 460.0, 560.0, 140.0, 460.0], exploration rate: 0.2
episode = 9
episode: 9/1000, scores: [520.0, 420.0, 460.0, 560.0, 140.0, 460.0, 380.0], exploration rate: 0.2
episode = 10
episode: 10/1000, scores: [520.0, 420.0, 460.0, 560.0, 140.0, 460.0, 380.0, 440.0], exploration rate: 0.2
episode = 11
episode = 12
episode = 13
episode: 13/1000, scores: [520.0, 420.0, 460.0, 560.0, 140.0, 460.0, 380.0, 440.0, 660.0], exploration rate: 0.2
episode = 14
episod

In [None]:
# Calculate average score
print('AVERAGE SCORE = {}'.format(np.mean(np.asarray(scores))))