In [1]:
'''
Deep Q-learning approach to the cartpole problem
using OpenAI's gym environment.
As part of the basic series on reinforcement learning @
https://github.com/vmayoral/basic_reinforcement_learning
This code implements the algorithm described at:
Mnih, V., Kavukcuoglu, K., Silver, D., Rusu, A. A., Veness, J., Bellemare, M. G., ... & Petersen,
S. (2015). Human-level control through deep reinforcement learning. Nature, 518(7540), 529-533.
Code based on @wingedsheep's work at https://gist.github.com/wingedsheep/4199594b02138dd427c22a540d6d6b8d
        @author: Victor Mayoral Vilches <victor@erlerobotics.com>
'''

import gym
import random
import numpy as np
from keras.models import Sequential
from keras import optimizers
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import LeakyReLU
from keras.regularizers import l2
from hearts.single import SingleEnv
from gym import spaces
import numpy as np
import random

# import os
# os.environ["THEANO_FLAGS"] = "mode=FAST_RUN,device=gpu,floatX=float32"
# import theano



  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
class Memory:
    """
    This class provides an abstraction to store the [s, a, r, s'] elements of each iteration.
    Instead of using tuples (as other implementations do), the information is stored in lists
    that get returned as another list of dictionaries with each key corresponding to either
    "state", "action", "reward", "nextState" or "isFinal".
    """

    def __init__(self, size):
        self.size = size
        self.currentPosition = 0
        self.states = []
        self.actions = []
        self.rewards = []
        self.newStates = []
        self.finals = []

    def getMiniBatch(self, size):
        indices = random.sample(np.arange(len(self.states)).tolist(), min(size, len(self.states)))
        miniBatch = []
        for index in indices:
            miniBatch.append({'state': self.states[index], 'action': self.actions[index], 'reward': self.rewards[index],
                              'newState': self.newStates[index], 'isFinal': self.finals[index]})
        return miniBatch

    def getCurrentSize(self):
        return len(self.states)

    def getMemory(self, index):
        return {'state': self.states[index], 'action': self.actions[index], 'reward': self.rewards[index],
                'newState': self.newStates[index], 'isFinal': self.finals[index]}

    def addMemory(self, state, action, reward, newState, isFinal):
        if self.currentPosition >= self.size - 1:
            self.currentPosition = 0
        if len(self.states) > self.size:
            self.states[self.currentPosition] = state
            self.actions[self.currentPosition] = action
            self.rewards[self.currentPosition] = reward
            self.newStates[self.currentPosition] = newState
            self.finals[self.currentPosition] = isFinal
        else:
            self.states.append(state)
            self.actions.append(action)
            self.rewards.append(reward)
            self.newStates.append(newState)
            self.finals.append(isFinal)

        self.currentPosition += 1
    
    def dump_memory(self,file_name='temp.npz'):
        data_size = len(self.states)
        s = np.array(self.states[:data_size])
        a = np.array(self.actions[:data_size])
        r = np.array(self.rewards[:data_size])
        ns = np.array(self.newStates[:data_size])
        f = np.array(self.finals[:data_size])
        np.savez_compressed(file_name,
                           states=s,
                           actions=a,
                           rewards =r,
                           newStates=ns,
                           finals=f)
        
    def load_from_npz(self, file_name='temp.npz'):
        memory_dump = np.load(file_name)
        s = memory_dump['states']
        a = memory_dump['actions']
        r = memory_dump['rewards']
        ns = memory_dump['newStates']
        f = memory_dump['finals']
        
        for _ in s:
            self.states.append(_)
        
        for _ in a:
            self.actions.append(_)
        
        for _ in r:
            self.rewards.append(_)
        
        for _ in ns:
            self.newStates.append(_)
        
        for _ in f:
            self.finals.append(_)

In [3]:
def observation_to_flatten(observation):
    num_players = 4
    game_info_size = 8
    player_states = observation[0]
    table_states = observation[1]
    
    scroes = np.zeros(num_players)
    for i in range(num_players-1):
        scroes[i]=player_states[0][2*i]
    scroes[num_players-1] = player_states[1]
    
    hand = np.array(player_states[2]).flatten()
    
    income = np.zeros((4,13,4))
    for i in range(num_players-1):
        for num, suit in player_states[0][2*i+1]:
            if num != -1:
                income[i, int(num), int(suit)] = 1
            else:
                break

    for num, suit in player_states[3]:
        if num != -1:
            income[num_players-1, int(num), int(suit)] = 1
        else:
            break
    
    game_info = np.zeros(game_info_size)
    for i in range(game_info_size):
        game_info[i] = table_states[i]
        # n_round
        # start_pos
        # cur_pos
        # exchanged
        # heart_occured
        # n_games
        # finish_expose
        # heart_exposed

    board = np.array(table_states[8]).flatten()
    first_draw = np.array(table_states[9]).flatten()
    backup = np.array(table_states[10]).flatten()
    
    flatten = np.concatenate((scroes, hand.flatten(), income.flatten(), game_info, board, first_draw,  backup))
    return flatten

In [4]:
def get_candidate_cards(hand_cards, n_round, first_draw, heart_broken):
    '''
    hand_cards: 2d array
    n_round: int
    first_draw: 1d array
    heart_broken: int
    
    return: 2d array
    '''
    hand_cards_available = []
    if [0, 3] in hand_cards.tolist():
        hand_cards_available.append([0,3])
    else:
        if first_draw[1] != -1:
            for c in hand_cards:
                 if c[1] == first_draw[1]:
                        hand_cards_available.append(c)
        
        if not hand_cards_available:
            if n_round == 0:
                for c in hand_cards:
                    if c[1] !=1 and not np.array_equal(c,np.array([10,0])):
                        hand_cards_available.append(c)
            elif heart_broken!=1: # not heart break
                for c in hand_cards:
                    if c[1] !=1:
                        hand_cards_available.append(c)
    
    if not hand_cards_available:
        hand_cards_available = hand_cards
        
    return np.array(hand_cards_available)

In [5]:
# if action is none, it should new new state case
# which is uded for evaluate feature case
def flatten_state_to_all_feature_with_action(state, action=None):
    '''
    state: 1d array (flatten observation)
    action: 1d array
    '''
    income = state[30:238]
    game_info = state[238:246]
    board_cards = np.array(state[246:254]).reshape(4,2)
    
    start_pos = game_info[1]
    cur_pos = game_info[2]
    cards_on_board = cur_pos - start_pos
    if cur_pos<start_pos:
        cards_on_board+=4
    
    board = np.zeros(18)
    for i in range(int(cards_on_board)):
        card_pos = int((start_pos +i)%4)
        num, suit = board_cards[card_pos]
        #print(num, suit)
        board[6*i + 0] = 1 # card existed
        board[6*i + 1 + int(suit)] = 1 # suit
        board[6*i + 5] = num
    
    if action is not None:
        all_feature = np.concatenate((income, game_info, board, action))
    else:
        all_feature = np.concatenate((income, game_info, board))
        
    return all_feature

In [6]:
# action of new state should be based on gaming rule
# return all posible (s,a) pairs
def flatten_state_to_all_feature(state):
    '''
    state: 1d array (flatten observation)
    '''

    feature_1d = flatten_state_to_all_feature_with_action(state, None)
    
    game_info = state[238:246]
    n_round = int(game_info[0])
    heart_broken = int(game_info[4])
    first_draw = state[254:256]
    hand_cards_2d = state[4:30].reshape(13,2)
    hand_cards = hand_cards_2d[:-n_round] if n_round>0 else hand_cards_2d

    candidate_cards = get_candidate_cards(hand_cards, n_round, first_draw, heart_broken)
    
    feature_2d = feature_1d.reshape(1,-1).repeat(candidate_cards.shape[0], axis=0)
    
    return np.concatenate((feature_2d,candidate_cards),axis=1)

In [7]:
def get_hand_cards_available_from_flatten(state):
    '''
    state: 1d array (flatten observation)
    '''
    game_info = state[238:246]
    n_round = int(game_info[0])
    heart_broken = int(game_info[4])
    first_draw = state[254:256]
    hand_cards_2d = state[4:30].reshape(13,2)
    hand_cards = hand_cards_2d[:-n_round] if n_round>0 else hand_cards_2d

    candidate_cards = get_candidate_cards(hand_cards, n_round, first_draw, heart_broken)
    
    return candidate_cards

In [10]:
class DeepQ:
    """
    DQN abstraction.
    As a quick reminder:
        traditional Q-learning:
            Q(s, a) += alpha * (reward(s,a) + gamma * max(Q(s') - Q(s,a))
        DQN:
            target = reward(s,a) + gamma * max(Q(s')
    """

    def __init__(self, inputs, outputs, memorySize, discountFactor, learningRate, learnStart):
        """
        Parameters:
            - inputs: input size
            - outputs: output size
            - memorySize: size of the memory that will store each state
            - discountFactor: the discount factor (gamma)
            - learningRate: learning rate
            - learnStart: steps to happen before for learning. Set to 128
        """
        self.input_size = inputs
        self.output_size = outputs
        self.memory = Memory(memorySize)
        self.discountFactor = discountFactor
        self.learningRate = learningRate
        self.learnStart = learnStart

    def initNetworks(self, hiddenLayers):
        model = self.createModel(self.input_size, self.output_size, hiddenLayers, "relu", self.learningRate)
        self.model = model

        targetModel = self.createModel(self.input_size, self.output_size, hiddenLayers, "relu", self.learningRate)
        self.targetModel = targetModel

    def createRegularizedModel(self, inputs, outputs, hiddenLayers, activationType, learningRate):
        bias = True
        dropout = 0
        regularizationFactor = 0.01
        model = Sequential()
        if len(hiddenLayers) == 0:
            model.add(Dense(self.output_size, input_shape=(self.input_size,), init='lecun_uniform', bias=bias))
            model.add(Activation("linear"))
        else:
            if regularizationFactor > 0:
                model.add(Dense(hiddenLayers[0], input_shape=(self.input_size,), init='lecun_uniform',
                                W_regularizer=l2(regularizationFactor), bias=bias))
            else:
                model.add(Dense(hiddenLayers[0], input_shape=(self.input_size,), init='lecun_uniform', bias=bias))

            if (activationType == "LeakyReLU"):
                model.add(LeakyReLU(alpha=0.01))
            else:
                model.add(Activation(activationType))

            for index in range(1, len(hiddenLayers)):
                layerSize = hiddenLayers[index]
                if regularizationFactor > 0:
                    model.add(Dense(layerSize, init='lecun_uniform', W_regularizer=l2(regularizationFactor), bias=bias))
                else:
                    model.add(Dense(layerSize, init='lecun_uniform', bias=bias))
                if (activationType == "LeakyReLU"):
                    model.add(LeakyReLU(alpha=0.01))
                else:
                    model.add(Activation(activationType))
                if dropout > 0:
                    model.add(Dropout(dropout))
            model.add(Dense(self.output_size, init='lecun_uniform', bias=bias))
            model.add(Activation("linear"))
        optimizer = optimizers.RMSprop(lr=learningRate, rho=0.9, epsilon=1e-06)
        model.compile(loss="mse", optimizer=optimizer)
        model.summary()
        return model

    def createModel(self, inputs, outputs, hiddenLayers, activationType, learningRate):
        model = Sequential()
        if len(hiddenLayers) == 0:
            model.add(Dense(self.output_size, input_shape=(self.input_size,), init='lecun_uniform'))
            model.add(Activation("linear"))
        else:
            model.add(Dense(hiddenLayers[0], input_shape=(self.input_size,), init='lecun_uniform'))
            if (activationType == "LeakyReLU"):
                model.add(LeakyReLU(alpha=0.01))
            else:
                model.add(Activation(activationType))

            for index in range(1, len(hiddenLayers)):
                # print("adding layer "+str(index))
                layerSize = hiddenLayers[index]
                model.add(Dense(layerSize, init='lecun_uniform'))
                if (activationType == "LeakyReLU"):
                    model.add(LeakyReLU(alpha=0.01))
                else:
                    model.add(Activation(activationType))
            model.add(Dense(self.output_size, init='lecun_uniform'))
#             model.add(Activation("linear"))
        optimizer = optimizers.RMSprop(lr=learningRate, rho=0.9, epsilon=1e-06)
        model.compile(loss="mse", optimizer=optimizer)
        model.summary()
        return model

    def printNetwork(self):
        i = 0
        for layer in self.model.layers:
            weights = layer.get_weights()
            print("layer ", i, ": ", weights)
            i += 1

    def backupNetwork(self, model, backup):
        weightMatrix = []
        for layer in model.layers:
            weights = layer.get_weights()
            weightMatrix.append(weights)
        i = 0
        for layer in backup.layers:
            weights = weightMatrix[i]
            layer.set_weights(weights)
            i += 1

    def updateTargetNetwork(self):
        self.backupNetwork(self.model, self.targetModel)
    
    # predict Q values for all the actions
    def getQValues(self, state, is_flatten=True):
        if is_flatten:
            predicted = self.model.predict(flatten_state_to_all_feature(state))
        else:
            predicted = self.model.predict(get_all_feature_by_observation(state))
        return predicted
    
    # predict Q values for all the actions using target network
    def getTargetQValues(self, state, is_flatten=True):
        if is_flatten:
            predicted = self.targetModel.predict(flatten_state_to_all_feature(state))
        else:
            predicted = self.targetModel.predict(get_all_feature_by_observation(state))
        return predicted

    def getMaxQ(self, qValues):
        return np.max(qValues)

    def getMaxIndex(self, qValues):
        return np.argmax(qValues)

    # calculate the target function
    def calculateTarget(self, qValuesNewState, reward, isFinal):
        """
        target = reward(s,a) + gamma * max(Q(s')
        """
        if isFinal:
            return reward
        else:
            return reward + self.discountFactor * self.getMaxQ(qValuesNewState)

    # select the action with the highest Q value
    def selectAction(self, qValues, explorationRate,action_space,hand_cards):
        rand = random.random()
        if rand < explorationRate:
            action = action_space.sample()
        else:
            card_index = self.getMaxIndex(qValues)
            action = [hand_cards[card_index]]
        return action

    def selectActionByProbability(self, qValues, bias):
        qValueSum = 0
        shiftBy = 0
        for value in qValues:
            if value + shiftBy < 0:
                shiftBy = - (value + shiftBy)
        shiftBy += 1e-06

        for value in qValues:
            qValueSum += (value + shiftBy) ** bias

        probabilitySum = 0
        qValueProbabilities = []
        for value in qValues:
            probability = ((value + shiftBy) ** bias) / float(qValueSum)
            qValueProbabilities.append(probability + probabilitySum)
            probabilitySum += probability
        qValueProbabilities[len(qValueProbabilities) - 1] = 1

        rand = random.random()
        i = 0
        for value in qValueProbabilities:
            if (rand <= value):
                return i
            i += 1

    def addMemory(self, state, action, reward, newState, isFinal):
        self.memory.addMemory(state, action, reward, newState, isFinal)

    def learnOnLastState(self):
        if self.memory.getCurrentSize() >= 1:
            return self.memory.getMemory(self.memory.getCurrentSize() - 1)

    def learnOnMiniBatch(self, miniBatchSize, useTargetNetwork=True):
        # Do not learn until we've got self.learnStart samples
        if self.memory.getCurrentSize() > self.learnStart:
            # learn in batches of 128
            miniBatch = self.memory.getMiniBatch(miniBatchSize)
            X_batch = np.empty((0, self.input_size), dtype=np.float64)
            Y_batch = np.empty((0, self.output_size), dtype=np.float64)
            for sample in miniBatch:
                isFinal = sample['isFinal']
                state = sample['state']
                action = sample['action']
                reward = sample['reward']
                newState = sample['newState']
                
                if newState[238] == 0: # new state is new round
                    isFinal = True

                # qValues = self.getQValues(state) # not used
                if useTargetNetwork:
                    qValuesNewState = self.getTargetQValues(newState, True)
                else:
                    qValuesNewState = self.getQValues(newState, True)
                targetValue = self.calculateTarget(qValuesNewState, reward, isFinal)

                # prepare training dataset
                X_batch = np.append(X_batch, np.array([flatten_state_to_all_feature_with_action(state.copy(), action.copy())]), axis=0)
                Y_sample = targetValue
                Y_batch = np.append(Y_batch, np.array([Y_sample]))

            self.model.fit(X_batch, Y_batch, batch_size=len(miniBatch), nb_epoch=1, verbose=0)

In [11]:
env = SingleEnv()

epochs = 1000
steps = 1000

updateTargetNetwork = 10000
explorationRate = 1
minibatch_size = 128

learnStart = 1300
learningRate = 0.00025
discountFactor = 0.99
memorySize = 1000000

last100Scores = [0] * 100
last100ScoresIndex = 0
last100Filled = False

deepQ = DeepQ(236, 1, memorySize, discountFactor, learningRate, learnStart)
# deepQ.initNetworks([30,30,30])
# deepQ.initNetworks([30,30])
deepQ.initNetworks([300, 300])

stepCounter = 0

# number of reruns
for epoch in range(epochs):
    observation = env.reset()
    done = False
#     print(explorationRate)
    # number of timesteps
    for t in range(steps):
        
        # end of game
        if done:
            env.render()
            last100Scores[last100ScoresIndex] = t
            last100ScoresIndex += 1
            if last100ScoresIndex >= 100:
                last100Filled = True
                last100ScoresIndex = 0
            if not last100Filled:
                print("Episode ", epoch, " finished after {} timesteps".format(t + 1))
            else:
                print("Episode ", epoch, " finished after {} timesteps".format(t + 1), 
                      " last 100 average: ",(sum(last100Scores) / len(last100Scores)))
            break
        
        if observation[1][6]==0:
            action = env.action_space.sample()
            # do the action and get the reward and go to next state
            # this is an exange case
            # Unable to be trained now
            newObservation, reward, done, info = env.step(action)
            observation = newObservation
            continue

        # select action based on highest Q value
        observation_f = observation_to_flatten(observation)
        qValues = deepQ.getQValues(observation_f,is_flatten=True)
        hand_cards = get_hand_cards_available_from_flatten(observation_f)
        action = deepQ.selectAction(qValues, explorationRate,env.action_space,hand_cards)


        # do the action and get the reward and go to next state
        newObservation, reward, done, info = env.step(action)
            

        # put states and rewareds into memory buffer
        newObservation_f = observation_to_flatten(newObservation)
        deepQ.addMemory(observation_f, action[0], reward, newObservation_f, done)

        # regularly update network
        if stepCounter >= learnStart:
            if stepCounter <= updateTargetNetwork:
                deepQ.learnOnMiniBatch(minibatch_size, False)
            else:
                deepQ.learnOnMiniBatch(minibatch_size, True)

        observation = newObservation

        

        stepCounter += 1
        if stepCounter % updateTargetNetwork == 0:
            deepQ.updateTargetNetwork()
            print("updating target network")

    explorationRate *= 0.995
    # explorationRate -= (2.0/epochs)
    explorationRate = max(0.05, explorationRate)



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_7 (Dense)              (None, 300)               71100     
_________________________________________________________________
activation_5 (Activation)    (None, 300)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 300)               90300     
_________________________________________________________________
activation_6 (Activation)    (None, 300)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 301       
Total params: 161,701
Trainable params: 161,701
Non-trainable params: 0
_________________________________________________________________
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dens



Game 16
Round 13
  Player 0 score -175
  
   [30m🂩[39m [30m🂪[39m [31m🂾[39m [34m🃂[39m [34m🃅[39m [34m🃇[39m [34m🃈[39m [34m🃋[39m [34m🃍[39m [34m🃎[39m [32m🃔[39m [32m🃗[39m 

  Player 1 score -227
  
   [30m🂨[39m [30m🂫[39m [30m🂭[39m [30m🂮[39m [32m🃖[39m [32m🃘[39m [32m🃛[39m [32m🃞[39m 

> Player 2 score -165
  
   [30m🂢[39m [30m🂣[39m [30m🂤[39m [30m🂥[39m [30m🂦[39m [31m🂵[39m [31m🂶[39m [31m🂷[39m [31m🂸[39m [31m🂹[39m [31m🂻[39m [34m🃄[39m [34m🃆[39m [34m🃉[39m [34m🃊[39m [32m🃒[39m [32m🃓[39m [32m🃕[39m [32m🃚[39m [32m🃑[39m 

  Player 3 score -243
  
   [30m🂧[39m [30m🂡[39m [31m🂲[39m [31m🂳[39m [31m🂴[39m [31m🂺[39m [31m🂽[39m [31m🂱[39m [34m🃃[39m [34m🃁[39m [32m🃙[39m [32m🃝[39m 

 NA NA NA NA 

Episode  6  finished after 226 timesteps
Game 16
Round 13
> Player 0 score -122
  
   [30m🂩[39m [30m🂡[39m [31m🂲[39m [31m🂴[39m [31m🂽[39m [31m🂾[39m [31m🂱[39m [34m🃃[39m [34m🃍[39m [34m🃁[39m [32m🃘[39m [32m🃝

Game 16
Round 13
  Player 0 score -148
  
   [30m🂣[39m [30m🂦[39m [30m🂨[39m [30m🂩[39m [30m🂭[39m [30m🂮[39m [30m🂡[39m [32m🃗[39m 

  Player 1 score -195
  
   [30m🂢[39m [30m🂤[39m [30m🂥[39m [30m🂧[39m [30m🂪[39m [30m🂫[39m [31m🂲[39m [31m🂴[39m [31m🂸[39m [31m🂹[39m [31m🂱[39m [34m🃂[39m [34m🃈[39m [34m🃋[39m [34m🃎[39m [32m🃔[39m 

  Player 2 score 125
  
   [31m🂳[39m [31m🂵[39m [31m🂷[39m [31m🂺[39m [31m🂻[39m [32m🃒[39m [32m🃓[39m [32m🃖[39m [32m🃙[39m [32m🃛[39m [32m🃝[39m [32m🃑[39m 

> Player 3 score -199
  
   [31m🂶[39m [31m🂽[39m [31m🂾[39m [34m🃃[39m [34m🃄[39m [34m🃅[39m [34m🃆[39m [34m🃇[39m [34m🃉[39m [34m🃊[39m [34m🃍[39m [34m🃁[39m [32m🃕[39m [32m🃘[39m [32m🃚[39m [32m🃞[39m 

 NA NA NA NA 

Episode  16  finished after 227 timesteps
Game 16
Round 13
> Player 0 score -332
  
   [30m🂢[39m [30m🂣[39m [30m🂤[39m [30m🂦[39m [30m🂡[39m [31m🂴[39m [31m🂵[39m [31m🂶[39m [31m🂺[39m [31m🂻[39m [31m🂽[39m [31m🂾

Game 16
Round 13
  Player 0 score -205
  
   [30m🂦[39m [30m🂨[39m [30m🂭[39m [30m🂡[39m [34m🃉[39m [32m🃓[39m [32m🃕[39m [32m🃚[39m 

  Player 1 score -182
  
   [30m🂢[39m [30m🂤[39m [30m🂥[39m [30m🂧[39m [30m🂩[39m [30m🂪[39m [30m🂫[39m [30m🂮[39m [31m🂲[39m [31m🂳[39m [31m🂵[39m [31m🂶[39m [31m🂺[39m [31m🂻[39m [31m🂽[39m [31m🂱[39m [34m🃃[39m [34m🃄[39m [34m🃅[39m [34m🃆[39m [34m🃇[39m [34m🃈[39m [34m🃊[39m [34m🃋[39m [34m🃍[39m [34m🃎[39m [34m🃁[39m [32m🃘[39m [32m🃙[39m [32m🃛[39m [32m🃝[39m [32m🃞[39m 

  Player 2 score -200
  
   

> Player 3 score -184
  
   [30m🂣[39m [31m🂴[39m [31m🂷[39m [31m🂸[39m [31m🂹[39m [31m🂾[39m [34m🃂[39m [32m🃒[39m [32m🃔[39m [32m🃖[39m [32m🃗[39m [32m🃑[39m 

 NA NA NA NA 

Episode  26  finished after 224 timesteps
Game 16
Round 13
  Player 0 score -148
  
   [30m🂤[39m [30m🂥[39m [30m🂧[39m [30m🂨[39m [31m🂲[39m [31m🂵[39m [31m🂷[39m [31m🂹[39m [34m🃂[39m [34m🃄[39m [34m🃆[39m [34m

Game 16
Round 13
> Player 0 score -242
  
   [30m🂢[39m [30m🂣[39m [30m🂦[39m [30m🂩[39m [30m🂫[39m [31m🂴[39m [31m🂵[39m [31m🂶[39m [31m🂸[39m [31m🂻[39m [34m🃃[39m [32m🃖[39m 

  Player 1 score -246
  
   [30m🂤[39m [30m🂧[39m [30m🂨[39m [30m🂪[39m [30m🂭[39m [30m🂮[39m [30m🂡[39m [31m🂱[39m [34m🃂[39m [34m🃆[39m [34m🃈[39m [34m🃎[39m 

  Player 2 score 220
  
   [32m🃕[39m [32m🃗[39m [32m🃚[39m [32m🃛[39m 

  Player 3 score -125
  
   [30m🂥[39m [31m🂲[39m [31m🂳[39m [31m🂷[39m [31m🂹[39m [31m🂺[39m [31m🂽[39m [31m🂾[39m [34m🃄[39m [34m🃅[39m [34m🃇[39m [34m🃉[39m [34m🃊[39m [34m🃋[39m [34m🃍[39m [34m🃁[39m [32m🃒[39m [32m🃓[39m [32m🃔[39m [32m🃘[39m [32m🃙[39m [32m🃝[39m [32m🃞[39m [32m🃑[39m 

 NA NA NA NA 

Episode  36  finished after 226 timesteps
Game 16
Round 13
  Player 0 score -181
  
   [34m🃂[39m [34m🃃[39m [34m🃆[39m [34m🃈[39m [34m🃊[39m [34m🃋[39m [34m🃎[39m [34m🃁[39m [32m🃒[39m [32m🃚[39m [32m🃛[39m [32m🃝

Game 16
Round 13
  Player 0 score -251
  
   [30m🂩[39m [30m🂪[39m [30m🂭[39m [31m🂳[39m [31m🂵[39m [31m🂶[39m [31m🂸[39m [31m🂻[39m [31m🂱[39m [32m🃔[39m [32m🃕[39m [32m🃘[39m 

  Player 1 score -196
  
   [30m🂧[39m [30m🂨[39m [30m🂫[39m [30m🂮[39m 

  Player 2 score -67
  
   [34m🃂[39m [34m🃃[39m [34m🃈[39m [34m🃉[39m [32m🃒[39m [32m🃚[39m [32m🃞[39m [32m🃑[39m 

> Player 3 score -202
  
   [30m🂢[39m [30m🂣[39m [30m🂤[39m [30m🂥[39m [30m🂦[39m [30m🂡[39m [31m🂲[39m [31m🂴[39m [31m🂷[39m [31m🂹[39m [31m🂺[39m [31m🂽[39m [31m🂾[39m [34m🃄[39m [34m🃅[39m [34m🃆[39m [34m🃇[39m [34m🃊[39m [34m🃋[39m [34m🃍[39m [34m🃎[39m [34m🃁[39m [32m🃓[39m [32m🃖[39m [32m🃗[39m [32m🃙[39m [32m🃛[39m [32m🃝[39m 

 NA NA NA NA 

Episode  46  finished after 226 timesteps
Game 16
Round 13
> Player 0 score -263
  
   [30m🂢[39m [30m🂣[39m [30m🂤[39m [30m🂦[39m [30m🂪[39m [30m🂫[39m [30m🂡[39m [31m🂷[39m [31m🂸[39m [31m🂺[39m [31m🂻[39m [31m🂽

Game 16
Round 13
  Player 0 score -117
  
   [30m🂢[39m [30m🂣[39m [30m🂤[39m [30m🂧[39m [30m🂩[39m [31m🂳[39m [31m🂴[39m [31m🂵[39m [31m🂶[39m [31m🂷[39m [31m🂸[39m [31m🂱[39m [32m🃒[39m [32m🃔[39m [32m🃚[39m [32m🃑[39m 

  Player 1 score -209
  
   

  Player 2 score -161
  
   [30m🂥[39m [30m🂦[39m [30m🂨[39m [30m🂫[39m [30m🂭[39m [30m🂮[39m [30m🂡[39m [31m🂽[39m [34m🃃[39m [34m🃄[39m [34m🃉[39m [34m🃎[39m [32m🃘[39m [32m🃙[39m [32m🃛[39m [32m🃝[39m 

> Player 3 score -338
  
   [30m🂪[39m [31m🂲[39m [31m🂹[39m [31m🂺[39m [31m🂻[39m [31m🂾[39m [34m🃂[39m [34m🃅[39m [34m🃆[39m [34m🃇[39m [34m🃈[39m [34m🃊[39m [34m🃋[39m [34m🃍[39m [34m🃁[39m [32m🃓[39m [32m🃕[39m [32m🃖[39m [32m🃗[39m [32m🃞[39m 

 NA NA NA NA 

Episode  56  finished after 225 timesteps
Game 16
Round 13
> Player 0 score -138
  
   [30m🂤[39m [30m🂩[39m [30m🂭[39m [30m🂡[39m [31m🂲[39m [31m🂵[39m [31m🂷[39m [31m🂻[39m [34m🃈[39m [32m🃔[39m [32m🃗[39m [32m

Game 16
Round 13
> Player 0 score -235
  
   [30m🂥[39m [30m🂦[39m [30m🂧[39m [30m🂮[39m [30m🂡[39m [31m🂷[39m [31m🂺[39m [31m🂻[39m [31m🂾[39m [31m🂱[39m [34m🃂[39m [34m🃃[39m [34m🃄[39m [34m🃅[39m [34m🃆[39m [34m🃇[39m [34m🃈[39m [34m🃉[39m [34m🃊[39m [34m🃋[39m [34m🃍[39m [34m🃎[39m [34m🃁[39m [32m🃒[39m [32m🃓[39m [32m🃔[39m [32m🃕[39m [32m🃖[39m [32m🃗[39m [32m🃘[39m [32m🃙[39m [32m🃚[39m [32m🃛[39m [32m🃝[39m [32m🃞[39m [32m🃑[39m 

  Player 1 score -203
  
   [30m🂢[39m [30m🂣[39m [30m🂪[39m [30m🂭[39m 

  Player 2 score -203
  
   

  Player 3 score -121
  
   [30m🂤[39m [30m🂨[39m [30m🂩[39m [30m🂫[39m [31m🂲[39m [31m🂳[39m [31m🂴[39m [31m🂵[39m [31m🂶[39m [31m🂸[39m [31m🂹[39m [31m🂽[39m 

 NA NA NA NA 

Episode  66  finished after 222 timesteps
Game 16
Round 13
  Player 0 score -185
  
   [31m🂲[39m [31m🂴[39m [31m🂷[39m [31m🂸[39m [31m🂺[39m [31m🂻[39m [31m🂽[39m [31m🂾[39m 

> Player 1 score -31
  
   [30m🂭[39m 

Game 16
Round 13
  Player 0 score -223
  
   [30m🂣[39m [30m🂤[39m [30m🂧[39m [30m🂨[39m [30m🂩[39m [30m🂪[39m [30m🂫[39m [31m🂲[39m [31m🂵[39m [31m🂷[39m [31m🂸[39m [31m🂹[39m [31m🂽[39m [31m🂾[39m [34m🃂[39m [34m🃃[39m [34m🃆[39m [34m🃍[39m [34m🃎[39m [32m🃑[39m 

> Player 1 score -197
  
   [30m🂥[39m [30m🂮[39m [31m🂳[39m [31m🂴[39m [31m🂶[39m [31m🂺[39m [31m🂻[39m [31m🂱[39m [34m🃄[39m [34m🃇[39m [34m🃈[39m [34m🃁[39m [32m🃔[39m [32m🃕[39m [32m🃘[39m [32m🃙[39m [32m🃚[39m [32m🃛[39m [32m🃝[39m [32m🃞[39m 

  Player 2 score -129
  
   [32m🃒[39m [32m🃓[39m [32m🃖[39m [32m🃗[39m 

  Player 3 score -117
  
   [30m🂢[39m [30m🂦[39m [30m🂭[39m [30m🂡[39m [34m🃅[39m [34m🃉[39m [34m🃊[39m [34m🃋[39m 

 NA NA NA NA 

Episode  76  finished after 227 timesteps
Game 16
Round 13
  Player 0 score -34
  
   [30m🂥[39m [30m🂧[39m [30m🂡[39m [31m🂳[39m [31m🂵[39m [31m🂶[39m [31m🂷[39m [31m🂻[39m [31m🂱[39m [34m🃂[39m [34m🃄[39m [34m🃅

Game 16
Round 13
> Player 0 score -307
  
   [31m🂴[39m [31m🂶[39m [31m🂷[39m [31m🂸[39m [31m🂻[39m [34m🃂[39m [34m🃄[39m [34m🃉[39m [34m🃊[39m [34m🃎[39m [34m🃁[39m [32m🃚[39m 

  Player 1 score -216
  
   [31m🂲[39m [31m🂳[39m [31m🂺[39m [31m🂾[39m 

  Player 2 score -149
  
   [30m🂣[39m [30m🂤[39m [30m🂦[39m [30m🂨[39m [30m🂪[39m [30m🂮[39m [34m🃅[39m [34m🃆[39m [34m🃇[39m [34m🃈[39m [34m🃋[39m [34m🃍[39m [32m🃒[39m [32m🃓[39m [32m🃘[39m [32m🃛[39m 

  Player 3 score -115
  
   [30m🂢[39m [30m🂥[39m [30m🂧[39m [30m🂩[39m [30m🂫[39m [30m🂭[39m [30m🂡[39m [31m🂵[39m [31m🂹[39m [31m🂽[39m [31m🂱[39m [34m🃃[39m [32m🃔[39m [32m🃕[39m [32m🃖[39m [32m🃗[39m [32m🃙[39m [32m🃝[39m [32m🃞[39m [32m🃑[39m 

 NA NA NA NA 

Episode  86  finished after 222 timesteps
Game 16
Round 13
  Player 0 score -198
  
   [34m🃄[39m [34m🃈[39m [34m🃋[39m [32m🃓[39m 

  Player 1 score -161
  
   [31m🂲[39m [31m🂳[39m [31m🂴[39m [31m🂵[39m [31m🂹[39m

updating target network
Game 16
Round 13
  Player 0 score -188
  
   [30m🂢[39m [30m🂣[39m [30m🂦[39m [30m🂨[39m [30m🂩[39m [30m🂪[39m [30m🂡[39m [31m🂵[39m [31m🂶[39m [31m🂷[39m [31m🂸[39m [31m🂹[39m [31m🂻[39m [31m🂽[39m [31m🂱[39m [34m🃂[39m [34m🃅[39m [34m🃇[39m [34m🃊[39m [32m🃒[39m [32m🃓[39m [32m🃕[39m [32m🃛[39m [32m🃑[39m 

  Player 1 score -207
  
   [30m🂤[39m [30m🂫[39m [30m🂭[39m [34m🃉[39m [34m🃎[39m [34m🃁[39m [32m🃔[39m [32m🃗[39m [32m🃘[39m [32m🃙[39m [32m🃝[39m [32m🃞[39m 

> Player 2 score -284
  
   [31m🂲[39m [31m🂳[39m [31m🂴[39m [31m🂺[39m [31m🂾[39m [34m🃃[39m [34m🃆[39m [34m🃈[39m [34m🃋[39m [34m🃍[39m [32m🃖[39m [32m🃚[39m 

  Player 3 score -166
  
   [30m🂥[39m [30m🂧[39m [30m🂮[39m [34m🃄[39m 

 NA NA NA NA 

Episode  96  finished after 223 timesteps


KeyboardInterrupt: 

In [170]:
# dump previous gaming states
deepQ.memory.dump_memory("20180823.npz")
# dumped gaming state chould be shared with others
# so that our bots could increase diversity of gaming state

In [167]:
deepQ.memory.getCurrentSize()

4085

In [None]:
learnStart = 9999
learningRate = 0.00025
discountFactor = 0.99
memorySize = 1000000

deepQ = DeepQ(87, 1, memorySize, discountFactor, learningRate, learnStart)
deepQ.initNetworks([512, 300])

# load models whose network arch should be the same as your definitation
# deepQ.model = load_model('my_model_87_0830.h5')
# deepQ.targetModel = load_model('my_target_model_87_0830.h5')

# load dumped memory(previous gaming status) for re-trianin model
deepQ.memory.load_from_npz('20180823.npz')

In [None]:
# re-trianin model
# re_training_time could be modified based on requirements
re_training_time = 20
use_target_network = False
for i in range(re_training_time):
    print("%s th training..." %i)
    deepQ.learnOnMiniBatch(12800, use_target_network)
    deepQ.updateTargetNetwork()
    use_target_network = True
    print("%s th done"%i)