In [2]:
from Connect4 import Connect4
from Robots import Robots
from GamePlay import GamePlay
from tensorflow.keras.models import Sequential, save_model, load_model
import numpy as np
import random

import datetime


import keras
from keras import layers
from keras import models

In [83]:
#class that creates the connect4 environment
class DeepQLAgent():

    def __init__(self,player=1,gameplay=GamePlay(),memory=None):
        self.player=player
        
        self.gameplay=gameplay
        self.ROWS = self.gameplay.ROWS
        self.COLUMNS = self.gameplay.COLUMNS
        
        self.batch_size = 100
        self.lr=.01
        self.INPUT_SHAPE=(self.ROWS, self.COLUMNS, 1)
        self.action_size =self.COLUMNS
        self.gamma=.5
        self.epsilon = 1.0 #starting explore probability
        self.explore_stop = .01 #stopping explore rate
        self.decay_rate = 0.00001 #decay rate # 0.00001

        self.memory_size = 5000 # number of experiences to keep
        self.memory_start = 200 # starting size of memory bank
        self.memory=self._memory_initiate()
        self.avgrewards=[]
        self.model=self._build_model()
        self.losses=[]

    #build model
    def _build_model(self):
        model=models.Sequential()
        
        model.add(layers.Flatten(input_shape=self.INPUT_SHAPE))
        #model.add(layers.Dense(128, activation = 'relu'))
        #model.add(layers.Dense(100, activation = 'relu'))
        model.add(layers.Dense(64, activation = 'relu'))
        model.add(layers.Dense(32, activation = 'relu'))
        #model.add(layers.Dense(16, activation = 'relu'))
        model.add(layers.Dense(self.action_size,activation='linear'))

        '''
        model.add(layers.Conv2D(4, kernel_size = (4, 4), activation='relu',
                                padding='same',strides=(2,2), input_shape=self.INPUT_SHAPE))
        #model.add(layers.Dropout(0.25))
        model.add(layers.Flatten()) 
        model.add(layers.Dense(self.action_size,activation='softmax'))'''

        optimizer = keras.optimizers.Adam(lr=self.lr)
        model.compile(loss='mse', optimizer=optimizer)
        
        return model


    def _memory_initiate(self):
        memory=[]
        turn = 0
        #player=1
        status = None
        while len(memory)<self.memory_start:
            if status != 'Keep Playing!': #if we are in a terminal state restart game
                self.gameplay.reset()
                turn = 0

            state = self.gameplay.BOARD.copy()
            actions = self.gameplay.Get_Legal_Moves(self.gameplay.BOARD)
            action = np.random.choice(actions,1)[0]
            #take random actions. record their states,actions, rewards, next states, and status

            #player1
            if turn % 2 ==0:
                self.gameplay.Add_Piece(1,action,self.gameplay.BOARD)
                #turn +=1

            #player2
            elif turn % 2 ==1:
                self.gameplay.Add_Piece(2,action,self.gameplay.BOARD)
                #turn +=1


            status = self.gameplay.Check_Goal(self.gameplay.BOARD)
            if status !='Keep Playing!':
                state_prime = np.zeros((self.gameplay.ROWS,self.gameplay.COLUMNS))
                state_prime[state_prime < 1] = self.player
            else:
                state_prime = self.gameplay.BOARD.copy() #gamestate
            reward = self.gameplay.Get_Score(self.player,state_prime)

            if turn % 2 ==0 and self.player==1:
                state_prime = self.gameplay.BOARD.copy()
                reward = self.gameplay.Get_Score(self.player,state_prime)
                memory.append((state,action,reward,state_prime,status)) #memorize this
            if turn % 2 ==1 and self.player==2:
                state_prime = self.gameplay.BOARD.copy()
                reward = self.gameplay.Get_Score(self.player,state_prime)
                memory.append((state,action,reward,state_prime,status)) #memorize this
            turn +=1
        return memory    
    

    #function to hold previous states/actions/rewards/nextstates/status
    def memorize(self,state,action,reward,state_prime,done):
        if len(self.memory)>=self.memory_size: #if memory is full remove random element before adding new one
            self.memory.pop(random.randrange(len(self.memory)))  
        self.memory.append((state,action,reward,state_prime,done)) #append memory

    def load(self,name):
        self.model.load_weights(name)

    def save(self,name):
        self.model.save_weights(name)
    

    #predict values of state using model
    #I hate shapes/shaping with numpy/keras; can never get it to work so this is a function to reshape it for me
    def model_predict(self,state):
        state_reshape=np.expand_dims(np.expand_dims(state, axis=3),axis=0)
        return self.model.predict(state_reshape)
    
    #function to take action given state                  
    def make_move(self,state): 
        if np.random.rand() <=self.epsilon:
            #explore
            #print('explore, epsilon is',self.epsilon)
            legal_actions=self.gameplay.Get_Legal_Moves(state)
            return random.randrange(len(legal_actions))
        else:
            #don't explore
            qvals = self.model_predict(state)
            return np.argmax(qvals)


    def replay(self,batch_size):
        batch = random.sample(self.memory,self.batch_size)
        for el in batch:
            if self.player==1: #replace 2's with -1s
                el[0][el[0]>1]=-1
                el[3][el[3]>1]=-1
            if self.player==2: #replace 1s with -1s. replace 2's with 1s
                el[0][(el[0] < 2) & (el[0] > 0)] = -1
                el[0][el[0]>1]=1                
                el[3][(el[3] < 2) & (el[3] > 0)] = -1
                el[3][el[3]>1]=1                
        
        states = []
        targets_f = []
        #working
        #for state,action, reward, state_prime,status in batch:
        #    qs=self.model_predict(state)[0]
        #    legal=gameplay.Get_Legal_Moves(state)
        #    #update qs... i think
        #    for i in range(len(qs)): 
        #        sp=gameplay.Add_Piece(self.player,i,state)
        #        r=gameplay.get_reward(self.player,sp)
        #        newstatus=gameplay.Check_Goal(sp)
        #        if i not in legal: #if not a legal move
        #            qs[i]=r #q equals reward
        #        elif newstatus != 'Keep Playing!' or status != 'Keep Playing!': #if terminal status
        #            qs[i]=r #q equals reward
        #        elif action==i: #otherwise, do the q update
        #            qs[action]=(reward + self.gamma * np.argmax(self.model_predict(state_prime)[0]))
        #    
        #    targets_f.append(qs)
        #    states.append(state)
                           
        
        for state,action, reward, state_prime,status in batch:
            legal=gameplay.Get_Legal_Moves(state)
            if status != 'Keep Playing!':
                target = reward 
            elif status == 'Keep Playing!':
                if action not in gameplay.Get_Legal_Moves(state): #if not a legal move make target the negative reward
                    target = reward
                else:
                    target = (reward + self.gamma * np.argmax(self.model_predict(state_prime)[0])) #idk why they have first element.
            
            self.avgrewards.append(reward)
            target_f = self.model_predict(state_prime)
            target_f[0][action]=target #update target value
            for i in range(len(target_f[0])):
                if i not in legal:
                    target_f[0][i]=min(target_f[0])
                #if
            targets_f.append(target_f[0])
            states.append(state)
            
        states=np.expand_dims(np.array(states), axis=3) #reshaping to train model
        targets_f=np.array(targets_f)

        history = self.model.fit(states,targets_f,epochs=1,verbose=0)
        loss = history.history['loss'][0]
        self.losses.append(loss)
        if self.epsilon <= self.explore_stop:
            self.epsilon=self.explore_stop
        else:
            self.epsilon = self.epsilon-self.decay_rate
        #print('epsilon:',self.epsilon)



In [84]:
gameplay=GamePlay()
agent=DeepQLAgent()
agent.replay(agent.batch_size)

agent2=DeepQLAgent(player=2)
agent2.replay(agent.batch_size)


from Robots import Robots
bot=Robots(depth=1)
#bot.MiniMaxAlphaBeta_bot(state)
agent.model.summary()



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_48 (Flatten)         (None, 42)                0         
_________________________________________________________________
dense_90 (Dense)             (None, 64)                2752      
_________________________________________________________________
dense_91 (Dense)             (None, 32)                2080      
_________________________________________________________________
dense_92 (Dense)             (None, 7)                 231       
Total params: 5,063
Trainable params: 5,063
Non-trainable params: 0
_________________________________________________________________


In [None]:
#secondgo
EPISODES = 200000
episode=0
threshold=60

#batch_size=50
turn=0
p1wins=0
p2wins=0

p1prev=0
p2prev=0

agent.epsilon=.5

while episode<EPISODES:

    #player1
    if turn % 2 ==0:
        state=gameplay.BOARD.copy() #get state
        action=agent.make_move(state) #get action
        gameplay.Add_Piece(1,action,gameplay.BOARD) #do action
        status=gameplay.Check_Goal(gameplay.BOARD) #get status

    #if it's player 1's turn and player 1 wins, 
    #state prime/reward is immediately after action instead after the other player's action
        if status == 'Player 1 wins!':
            state_prime = gameplay.BOARD.copy()
            reward = gameplay.Get_Score(1,state_prime)
            agent.memorize(state,action,reward,state_prime,status)


    if turn % 2 ==1:
        action2=bot.Rando_bot(state)
        #action2=agent2.make_move(state)
        #action2=bot.MiniMaxAlphaBeta_bot(state,2)
        gameplay.Add_Piece(2,action2,gameplay.BOARD) #do action for other player
        state_prime = gameplay.BOARD.copy() #get state prime 
        if action not in gameplay.Get_Legal_Moves(state): #punish agent for trying to make an illegal move
            reward=-5
            state_prime = state.copy()
        else:
            reward = gameplay.Get_Score(1,state_prime) #get reward for bot
        status=gameplay.Check_Goal(gameplay.BOARD) #get status
        
        agent.memorize(state,action,reward,state_prime,status) #memorize state/reward/stateprime/status


    #check if it is the end of the episode, then reset (either status change or threshold moves played)
    if status !='Keep Playing!' or turn>threshold:
        if status == 'Player 1 wins!':
            p1wins=p1wins+1
        if status == 'Player 2 wins!':
            p2wins=p2wins+1
        
        #reset game
        turn=0
        gameplay.reset()
        episode=episode+1
        
        loss=agent.replay(agent.batch_size) # do the thing
        agent2.epsilon=agent.epsilon #update agent2
        agent.save('weights.h5')
        agent2.load('weights.h5') # load weights of the updated agent for player 2


    turn = turn + 1 #next turn
    
    #the prints and stuff
    if episode % 1000==0 and status !='Keep Playing!':
        print('--------------------------------------------------------')
        print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
        print('EPISODE: ',episode)
        print('Avg loss last 1000:',round(sum(agent.losses)/(len(agent.losses)+.001),2),'epsilon:',round(agent.epsilon,3))
        print('Avg reward:',round(sum(agent.avgrewards)/(len(agent.avgrewards)+.001),2))
        
        agent.losses=[]
        agent.avgrewards=[]
        winrate=round((p1wins-p1prev)/(p1wins+p2wins-p1prev-p2prev+.001),2)
        print('player 1 wins:',p1wins,'player 2 wins:',p2wins, 'winrate:',winrate)
        p1prev=p1wins
        p2prev=p2wins

    if episode==100 and status !='Keep Playing!':
        print('EPISODE:',episode,'epsilon:',round(agent.epsilon,3))
        print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
        keras.models.save_model(agent.model,'mymodel_'+str(episode)+'.h5')
        
    if episode==10000 and status !='Keep Playing!':
        print('EPISODE:',episode,'epsilon:',round(agent.epsilon,3))
        print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
        keras.models.save_model(agent.model,'mymodel_'+str(episode)+'.h5')
    if episode==20000 and status !='Keep Playing!':
        print('EPISODE:',episode,'epsilon:',round(agent.epsilon,3))
        print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
        keras.models.save_model(agent.model,'mymodel_'+str(episode)+'.h5')
    if episode==50000 and status !='Keep Playing!':
        print('EPISODE:',episode,'epsilon:',round(agent.epsilon,3))
        print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
        keras.models.save_model(agent.model,'mymodel_'+str(episode)+'.h5')
        
    if episode % 100000==0 and status !='Keep Playing!':
        print('EPISODE:',episode,'epsilon:',round(agent.epsilon,3))
        print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
        keras.models.save_model(agent.model,'mymodel_'+str(episode)+'.h5')


In [76]:
for mem in agent.memory:
    print('MEMORY:')
    print(mem[0])
    print('ACTION:',mem[1],'REWARD:',mem[2])
    print('Qs:')
    print(agent.model_predict(mem[0]))
    print('pred:',np.argmax(agent.model_predict(mem[0])))

MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 1.  1.  0. -1. -1.  0.  0.]]
ACTION: 5 REWARD: 0.25
Qs:
[[-2.4810693 -2.5444367 -1.474736  -2.645671  -2.377594  -2.0255926
  -2.9881983]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0. -1.  0.]
 [ 0.  1.  0. -1.  0. -1.  0.]
 [ 1.  1. -1.  1. -1. -1.  0.]
 [ 1.  1.  1. -1. -1.  1.  0.]]
ACTION: 3 REWARD: -0.75
Qs:
[[-2.8857534 -4.0283165 -1.1920003 -4.230981  -4.8621635 -1.8137944
  -3.4866014]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0. -1.  1.  0.  0.  0.]]
ACTION: 4 REWARD: 1.25
Qs:
[[-0.78141797 -1.5069183  -0.63121045 -0.40267828  0.0372297  -1.2855872
  -1.17481   ]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  



pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  1.  0.  0.  0.]
 [ 0.  0.  0. -1. -1. -1.  0.]
 [ 0.  0.  0. -1.  1. -1.  0.]
 [ 1.  0.  0. -1.  1.  1.  0.]
 [ 1.  0.  0.  1.  1. -1. -1.]]
ACTION: 3 REWARD: -0.75
Qs:
[[-2.7082512 -4.6455913 -1.8734235 -5.106502  -3.8728323 -3.8680334
  -6.9673934]]
pred: 2
MEMORY:
[[ 0.  0.  0.  1.  0.  0.  0.]
 [ 0.  0.  0.  1.  0. -1.  0.]
 [ 0.  0.  0. -1. -1. -1.  0.]
 [ 1.  0.  0. -1.  1. -1.  0.]
 [ 1.  0.  0. -1.  1.  1.  0.]
 [ 1. -1.  0.  1.  1. -1. -1.]]
ACTION: 4 REWARD: -1.5
Qs:
[[-1.9321607  -5.9095364  -0.84055203 -3.8652787  -4.592725   -3.5506787
  -5.284964  ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  1.  0.]
 [-1.  1. -1. -1.  1.  1. -1.]]
ACTION: 6 REWARD: 1.75
Qs:
[[-1.339597  -1.9502689 -1.446936  -1.0955474 -0.3590261 -1.9070122
  -2.503065 ]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  

[[-4.4632773 -4.237311  -2.2465336 -3.165017  -4.198529  -4.3684144
  -5.4427633]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  1.  0.  0.  0.]
 [ 0.  0.  1. -1.  0. -1. -1.]]
ACTION: 3 REWARD: -50
Qs:
[[-1.7934859  -1.498861   -0.6918976  -2.1563487  -0.93463844 -0.7760475
  -2.229115  ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  1.  0.  0.  0.]
 [ 0.  0.  1. -1.  0. -1. -1.]]
ACTION: 3 REWARD: -1.75
Qs:
[[-1.7934859  -1.498861   -0.6918976  -2.1563487  -0.93463844 -0.7760475
  -2.229115  ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0. -1.  0.  0.]
 [ 0.  0.  0.  1. -1.  0.  0.]
 [ 1. -1.  0. -1. -1.  1.  1.]]
ACTION: 3 REWARD: -1.25
Qs:
[[-4.670171  -3.8005915 -2.2717404 -3.5159738 -3

pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0. -1.  0.  0.]
 [ 0.  0.  0.  0. -1.  0.  0.]
 [ 1.  0.  1. -1. -1.  1.  0.]]
ACTION: 4 REWARD: -1.5
Qs:
[[-5.1819987 -4.780803  -2.6994402 -4.3242097 -4.9166026 -3.4228284
  -5.2011886]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  1.  0.  0.]
 [ 0.  0.  0.  0. -1.  0.  0.]
 [ 1.  0. -1.  0. -1.  0.  1.]
 [ 1. -1.  1. -1. -1.  1. -1.]]
ACTION: 3 REWARD: -1.0
Qs:
[[-3.5667827 -4.0896134 -1.3369988 -5.4424424 -5.6728435 -2.2975676
  -5.570394 ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0. -1.  0.  0.]
 [ 0.  0.  0.  0.  1.  0.  0.]
 [ 0.  0.  0.  0. -1.  0. -1.]
 [ 1.  0. -1.  1. -1.  1.  1.]
 [ 1. -1.  1. -1. -1.  1. -1.]]
ACTION: 1 REWARD: -1.25
Qs:
[[-3.799624  -4.828444  -1.5129625 -6.1880965 -6.0716586 -2.3719423
  -5.8900046]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [-1.  0.  0.

  -3.60471  ]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0. -1.  0.  0.]
 [ 0.  0.  0.  0. -1.  0.  0.]
 [ 0.  0.  0. -1. -1.  1.  0.]
 [ 1.  0.  0.  1.  1.  1.  0.]
 [ 1.  0.  0. -1.  1.  1. -1.]
 [-1.  0.  0. -1.  1. -1. -1.]]
ACTION: 1 REWARD: -0.25
Qs:
[[-1.4881235 -4.0880313 -1.9691749 -5.953031  -5.7422748 -2.40653
  -5.5923524]]
pred: 0
MEMORY:
[[ 0.  0.  0.  0. -1.  0.  0.]
 [-1.  0.  0.  1. -1.  0.  0.]
 [ 1.  0.  0. -1. -1.  1.  0.]
 [ 1.  1.  0.  1.  1.  1. -1.]
 [ 1. -1.  0. -1.  1.  1. -1.]
 [-1.  1. -1. -1.  1. -1. -1.]]
ACTION: 1 REWARD: 0.25
Qs:
[[-0.40762955 -6.976906   -3.4696088  -8.426712   -8.573736   -3.6798635
  -8.0797825 ]]
pred: 0
MEMORY:
[[ 0.  0.  0. -1. -1.  0.  0.]
 [-1. -1.  0.  1. -1. -1.  0.]
 [ 1.  1.  0. -1. -1.  1.  0.]
 [ 1.  1.  0.  1.  1.  1. -1.]
 [ 1. -1.  0. -1.  1.  1. -1.]
 [-1.  1. -1. -1.  1. -1. -1.]]
ACTION: 3 REWARD: -10
Qs:
[[-0.07747336 -7.464117   -4.0932946  -8.430638   -6.431956   -3.8257842
  -8.558926  ]]
pred: 0
MEMORY:
[[ 0. -1.  0. -1. -1

pred: 2
MEMORY:
[[ 0.  0.  0. -1.  0.  0.  0.]
 [ 0.  0.  1.  1.  0.  0.  0.]
 [-1.  0.  1. -1.  0.  0.  0.]
 [-1.  0. -1.  1.  0. -1. -1.]
 [ 1.  0.  1.  1.  0.  1.  1.]
 [-1. -1. -1.  1.  1. -1. -1.]]
ACTION: 2 REWARD: -50
Qs:
[[-5.00526    -1.1793355   0.34942174 -3.5449028   0.15121737 -1.7788278
  -3.1742702 ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 1. -1.  0.  0.  0.  0. -1.]]
ACTION: 6 REWARD: -1.0
Qs:
[[-0.5980759  -1.0296255   1.0210894  -1.2123096  -0.80695504 -1.3990756
  -2.908945  ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0. -1.]
 [ 0.  0.  0.  0.  0.  0.  1.]
 [ 1. -1.  0.  0.  0.  0. -1.]]
ACTION: 3 REWARD: -0.25
Qs:
[[-1.0929453 -1.6826825  1.59988   -2.0884979 -2.2503548 -1.9942504
  -3.945912 ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [

Qs:
[[-1.4592439 -1.7610384 -1.0052913 -1.5769441 -1.1778297 -1.7143553
  -2.533153 ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0. -1. -1.  0.]
 [ 0.  0.  0.  1. -1.  1. -1.]
 [ 0.  1.  0. -1.  1.  1. -1.]]
ACTION: 3 REWARD: -1.25
Qs:
[[-4.9126196 -4.5344543 -2.6327572 -2.2273922 -2.447166  -4.28275
  -4.0627995]]
pred: 3
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0. -1.]
 [ 0.  0.  0.  1.  0. -1.  1.]
 [ 0.  0.  0.  1. -1. -1.  1.]
 [ 0.  0.  0.  1. -1.  1. -1.]
 [-1.  1. -1. -1.  1.  1. -1.]]
ACTION: 3 REWARD: 50
Qs:
[[-3.841506   -2.8384938  -1.3457973   0.64674723  0.9565669  -4.400093
  -2.9826274 ]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  1.  0.]
 [ 0.  1.  0.  0. -1. -1.  0.]
 [ 0.  1. -1. -1. -1.  1.  0.]]
ACTION: 2 REWARD: 0.75
Qs:
[[-1.1529598 -1.4841704 -1.3000946 -3.245776  -2.764360

pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0. -1.  1.  0.  1.]
 [ 1. -1.  0. -1. -1.  1.  1.]
 [-1. -1.  0.  1. -1. -1.  1.]]
ACTION: 3 REWARD: 50
Qs:
[[-3.6978722 -3.8627937 -3.557836  -1.3204373 -1.4395055 -3.5463827
  -3.0994444]]
pred: 3
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0. -1.  0.  0.]]
ACTION: 5 REWARD: -1.25
Qs:
[[-1.6178929  -1.0849923  -0.28347576 -0.3953192  -0.7436177  -1.5500398
  -1.5076101 ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [-1. -1.  0.  0.  1.  0.  0.]
 [ 1. -1.  0.  0.  1.  0.  0.]
 [-1. -1.  1.  0. -1.  1.  0.]]
ACTION: 3 REWARD: -1.5
Qs:
[[-4.019757   -2.059308   -0.9113974  -0.83440316 -0.9921928  -2.8384132
  -2.9403625 ]]
pred: 3
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 

[[-3.7684886 -2.899655  -1.3567635 -3.2817938 -3.5499663 -3.2333534
  -3.6253343]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  1.  0.  0.  0.  0.  0.]
 [ 0. -1.  0. -1. -1.  0.  0.]
 [ 0.  1.  0.  1.  1.  0.  0.]
 [-1. -1.  0. -1. -1.  0.  0.]
 [ 1.  1.  1. -1. -1.  0.  1.]]
ACTION: 3 REWARD: -3.25
Qs:
[[-5.310476  -4.7157545 -3.8303914 -2.9929225 -1.334799  -5.439517
  -5.100792 ]]
pred: 4
MEMORY:
[[ 0.  0.  0. -1.  0.  0.  0.]
 [ 0.  1.  0.  1.  0.  0.  0.]
 [ 0. -1.  0. -1. -1.  0.  0.]
 [ 0.  1.  0.  1.  1.  0.  0.]
 [-1. -1.  1. -1. -1.  0. -1.]
 [ 1.  1.  1. -1. -1.  0.  1.]]
ACTION: 4 REWARD: -1.75
Qs:
[[-5.2548227 -4.1461964 -3.6786776 -7.168537  -3.684106  -5.619519
  -7.662073 ]]
pred: 2
MEMORY:
[[ 0.  0.  0. -1.  0.  0.  0.]
 [ 0.  1.  0.  1.  1.  0.  0.]
 [ 0. -1.  0. -1. -1.  0.  0.]
 [-1.  1.  0.  1.  1.  0. -1.]
 [-1. -1.  1. -1. -1.  0. -1.]
 [ 1.  1.  1. -1. -1.  1.  1.]]
ACTION: 0 REWARD: -2.5
Qs:
[[-6.9509325 -5.7314053 -3.8566153 -7.263051  -3.932158  -6.4

[[-0.650052   -0.05318867 -0.5240102  -0.12704049  1.3538506  -0.5153664
  -0.78985876]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0. -1.  1.  0.  0.]
 [-1.  0. -1.  1.  1.  0. -1.]]
ACTION: 3 REWARD: 1.5
Qs:
[[-0.39684108 -1.9221514  -0.69254565 -0.46787637  0.87447435 -1.5882888
  -1.7358023 ]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  1.  0.  0.  0.]
 [ 0.  0.  0.  1.  0.  0.  0.]
 [ 0.  0.  1. -1.  1. -1. -1.]
 [-1.  0. -1.  1.  1. -1. -1.]]
ACTION: 5 REWARD: 4.25
Qs:
[[-1.1902038   0.5882169   2.045604    0.88373643  5.902816   -1.0541686
  -1.0377585 ]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0. -1. -1.]]
ACTION: 6 REWARD: -1.25
Qs:
[[-0.96239233 -1.28163     0.06022802 -1.57

ACTION: 2 REWARD: 1.75
Qs:
[[-2.4383385 -2.9318616 -1.2086271 -4.397947  -5.4596553 -1.4422308
  -4.112471 ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  1.  0.  0.  0.]
 [ 0.  1.  0.  1.  0.  0.  0.]
 [ 0.  1.  1. -1.  0. -1.  0.]
 [ 0.  1. -1. -1. -1.  1. -1.]
 [ 0. -1.  1. -1. -1.  1. -1.]]
ACTION: 4 REWARD: -50
Qs:
[[-2.9168758 -3.1064487 -1.6008229 -4.6224947 -5.7001777 -1.4965914
  -4.3462486]]
pred: 5
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0. -1.]
 [ 0.  0.  0.  1.  1. -1. -1.]]
ACTION: 5 REWARD: 0.25
Qs:
[[-1.3901013  -0.44945356 -0.98639745 -0.95461845  1.9007633  -1.5501088
  -2.3639803 ]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0. -1.  1.  0.  0.  0. -1.]
 [ 1.  1.  1.  0.  0.  0. -1.]
 [ 1. -1.  1. -1. -1.  0. -1.]]
ACTION: 1 REWARD: -50
Qs:
[[-4.258823   -6.3233976  -0.3

Qs:
[[-2.3419116 -1.3779604 -2.509526  -2.3315547 -1.2663776 -2.5479624
  -3.7763987]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0. -1.  0.  0.  1.  0.  0.]
 [ 0.  1.  0.  0.  1.  0.  0.]
 [-1. -1.  0. -1. -1.  0.  1.]]
ACTION: 4 REWARD: 0.25
Qs:
[[-2.7468219  -1.8201121  -0.33674967 -1.2087067  -2.2841752  -2.365493
  -2.6971588 ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0. -1.  0.  0.]
 [ 0.  0.  0.  0.  1.  0.  0.]
 [ 0. -1.  0.  0.  1.  0.  0.]
 [ 0.  1.  0.  0.  1.  0.  0.]
 [-1. -1.  0. -1. -1.  0.  1.]]
ACTION: 6 REWARD: 0.5
Qs:
[[-2.3643508  -2.0758488  -0.12301625 -0.69628197 -2.0337303  -2.4245467
  -2.3050394 ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0. -1.  0.  0.]
 [ 0.  0.  0.  0.  1.  0.  0.]
 [ 0. -1.  0.  0.  1.  0.  0.]
 [ 0.  1.  0.  0.  1.  0.  1.]
 [-1. -1.  0. -1. -1. -1.  1.]]
ACTION: 6 REWARD: 0.75
Qs:
[[-2.5505774  -1.9329059  -0.17867506 -0.945563

Qs:
[[-1.8311615  -2.1744068  -1.1273527  -0.42560852 -1.2925493  -1.906899
  -1.4832244 ]]
pred: 3
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0. -1.  0.  0.  0.  0.]
 [ 1.  0. -1.  0.  0.  0.  0.]
 [ 1.  0.  1.  0.  0.  0.  0.]
 [ 1.  0. -1.  1.  0. -1.  0.]
 [-1.  1.  1. -1. -1. -1.  0.]]
ACTION: 2 REWARD: -1.0
Qs:
[[-3.5948112 -4.3325434 -2.5876524 -2.4734917 -3.4543808 -4.108915
  -4.4280243]]
pred: 3
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [-1.  0.  0.  0.  1. -1.  0.]
 [-1.  0.  1.  0.  1. -1.  0.]]
ACTION: 6 REWARD: -1.5
Qs:
[[-1.8307033   1.1544491   0.27130342 -0.7450339   2.2838452  -1.2643747
  -0.9132199 ]]
pred: 4
MEMORY:
[[ 1.  0.  0.  0.  0.  0.  0.]
 [-1.  0.  0.  0.  0. -1.  0.]
 [ 1.  0.  0.  0.  0.  1.  0.]
 [-1.  0.  0.  0.  0. -1.  0.]
 [-1.  0.  0.  0.  1. -1.  0.]
 [-1. -1.  1.  0.  1. -1.  1.]]
ACTION: 5 REWARD: -1.0
Qs:
[[-2.563014   -1.9159273  -0.5785821   0.287745

pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0. -1.  0.  0.  0.]
 [ 0.  0.  0.  1.  0.  0.  0.]
 [ 0.  0.  0. -1.  0.  0.  0.]
 [ 0.  1.  0. -1.  0.  0.  0.]]
ACTION: 2 REWARD: -3.25
Qs:
[[-3.0338256 -1.7801193 -1.163358  -2.9347327 -1.1901954 -3.3986962
  -3.4366045]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0. -1.  1.  0. -1.  0. -1.]
 [ 0.  1. -1.  0.  1.  1. -1.]
 [ 0.  1.  1. -1. -1.  1. -1.]]
ACTION: 1 REWARD: -1.5
Qs:
[[-6.3003044 -7.2346225 -2.5361745 -5.1407795 -7.34889   -4.826524
  -5.264975 ]]
pred: 2
MEMORY:
[[ 0. -1.  0.  0.  0.  0.  0.]
 [ 0.  1.  0.  0.  0.  0.  0.]
 [ 0.  1.  0.  0.  0.  0.  0.]
 [ 0. -1.  1.  0. -1.  0. -1.]
 [ 0.  1. -1.  0.  1.  1. -1.]
 [-1.  1.  1. -1. -1.  1. -1.]]
ACTION: 4 REWARD: -50
Qs:
[[-7.0343738 -7.61058   -2.8159657 -6.0125813 -8.704238  -4.743847
  -6.244987 ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0

[[-2.394647  -1.8816831 -2.9119325 -2.008646  -1.1020956 -2.7226925
  -3.4687881]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0. -1.  0.  0.  0.  0.  0.]
 [ 0. -1.  1.  0.  0.  0.  0.]
 [ 0.  1. -1. -1.  1.  0.  0.]]
ACTION: 6 REWARD: -2.5
Qs:
[[-4.386474  -3.0662498 -3.858564  -2.6468983 -1.5712799 -4.7113867
  -5.0474534]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0. -1.  0.  0.  0.  0.  0.]
 [ 0. -1.  0.  0.  0.  0.  0.]
 [ 0. -1.  1.  0.  0.  0. -1.]
 [ 1.  1. -1. -1.  1.  0.  1.]]
ACTION: 4 REWARD: -50
Qs:
[[-7.2520967 -7.1631575 -6.350601  -4.342494  -3.652283  -7.881989
  -8.09994  ]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0. -1.  0.  0.  0.]
 [ 0.  0. -1.  1.  0.  0.  0.]]
ACTION: 1 REWARD: -2.25
Qs:
[[-1.2463983 -2.2385056 -0.8885614 -1.3581474 -0.9310816 -2.23

[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0. -1.  0.  0.  0.]]
ACTION: 2 REWARD: -3.0
Qs:
[[-1.763893  -1.3727279 -1.0032021 -1.0312546 -1.2208196 -1.7102317
  -1.9565617]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0. -1.  0.  0.  0.]
 [ 0.  0.  1. -1.  0.  0.  0.]]
ACTION: 1 REWARD: -5.75
Qs:
[[-3.2608814 -2.3388925 -1.2092099 -1.9374083 -2.411853  -2.7694077
  -2.808601 ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0. -1.  0.  0.  0.]
 [ 0. -1.  0. -1.  0.  0.  0.]
 [ 1.  1.  1. -1.  0.  0.  0.]]
ACTION: 3 REWARD: -4.0
Qs:
[[-5.150483  -4.796318  -4.5688066 -4.104102  -3.456561  -5.000302
  -5.4531207]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]

[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0. -1.  1. -1.]]
ACTION: 3 REWARD: -0.25
Qs:
[[-1.4270413 -1.5940162 -0.0359631 -1.0988595 -1.3969386 -1.7622948
  -2.171649 ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  1.]
 [ 0.  0.  0.  0.  0. -1. -1.]
 [ 1. -1.  0.  0.  1.  1. -1.]
 [ 1. -1. -1.  1. -1.  1. -1.]]
ACTION: 4 REWARD: 1.5
Qs:
[[-1.3813236  -5.330943   -0.05608763 -1.84464    -3.485726   -4.844476
  -5.728121  ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0. -1.]
 [ 0.  0.  0.  0.  0.  0. -1.]
 [ 0.  0.  0.  0. -1.  1.  1.]
 [ 0.  1.  0. -1.  1. -1. -1.]
 [ 1. -1.  0.  1.  1.  1. -1.]
 [ 1. -1. -1.  1. -1.  1. -1.]]
ACTION: 2 REWARD: -1.0
Qs:
[[-0.92050856 -6.7250986   1.982758   -2.352806   -5.659437   -6.341023
  -7.217859  ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0

pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0. -1.  0.  0.  0.]
 [ 0.  0.  0.  1. -1.  0.  0.]
 [ 0.  0.  1. -1.  1.  0. -1.]]
ACTION: 3 REWARD: -0.75
Qs:
[[-2.0105748 -2.0334458 -2.4365273 -4.351653  -2.4982426 -1.1181326
  -3.8687239]]
pred: 5
MEMORY:
[[ 0.  0.  0. -1.  0.  0.  0.]
 [ 0.  0.  0. -1. -1.  0.  0.]
 [ 0.  0.  0.  1. -1.  0.  0.]
 [ 0.  0.  0. -1.  1.  0.  1.]
 [ 1.  0.  0.  1. -1.  0. -1.]
 [ 1.  1.  1. -1.  1. -1. -1.]]
ACTION: 2 REWARD: -2.0
Qs:
[[-3.3884203 -3.254313  -5.619036  -8.052042  -3.629882  -2.5405102
  -7.3680444]]
pred: 5
MEMORY:
[[ 0.  0.  0. -1.  0.  0.  0.]
 [ 0.  0.  0. -1. -1.  0.  0.]
 [ 0.  0.  0.  1. -1.  0.  0.]
 [ 0.  0.  0. -1.  1.  0.  1.]
 [ 1.  0.  1.  1. -1. -1. -1.]
 [ 1.  1.  1. -1.  1. -1. -1.]]
ACTION: 0 REWARD: -2.5
Qs:
[[-3.6018696 -2.7213678 -5.1054807 -8.594361  -3.5010386 -2.3316827
  -7.7250667]]
pred: 5
MEMORY:
[[ 0.  0.  0. -1.  0.  0.  0.]
 [ 1.  0.  0.

pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [-1.  0.  0. -1.  0.  0.  1.]
 [ 1.  1. -1.  1.  1.  0. -1.]
 [ 1.  1. -1.  1. -1.  1. -1.]
 [-1. -1. -1.  1. -1. -1.  1.]]
ACTION: 4 REWARD: -1.0
Qs:
[[-1.5985167 -2.7083876 -0.7056639 -2.4200008 -2.9570262 -1.372949
  -2.2757235]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0. -1.  0.]]
ACTION: 3 REWARD: -0.25
Qs:
[[-1.2080454  -0.6345537  -0.63385075 -1.2155232  -0.19525144 -1.1228237
  -1.8709403 ]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  1.  0.  0.  0.  0.  0.]
 [ 0. -1.  0.  1. -1. -1.  0.]]
ACTION: 1 REWARD: 0.0
Qs:
[[-0.99548906 -1.0024402   1.0563806  -1.0304189  -0.48717514 -0.48133796
  -1.7161535 ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [

pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0. -1.  0.  0.  0.  0.  0.]
 [ 0. -1.  0. -1.  1.  1.  0.]
 [-1.  1.  0. -1.  1. -1.  1.]]
ACTION: 3 REWARD: -1.0
Qs:
[[-6.3563395 -4.7926016 -6.18928   -4.4493437 -2.7677178 -6.2372155
  -6.0272794]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0. -1.  0. -1.  0.]
 [ 0. -1.  0. -1.  0.  1.  0.]
 [ 0. -1.  0.  1.  1. -1.  0.]
 [ 1. -1.  0. -1.  1.  1.  0.]
 [-1.  1.  1. -1.  1. -1.  1.]]
ACTION: 5 REWARD: -4.25
Qs:
[[-6.289825  -6.266194  -5.6962957 -3.7172558 -1.3809104 -6.4860244
  -5.885495 ]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  1.  0.]
 [ 0.  0.  0. -1.  0. -1.  0.]
 [ 0. -1.  0. -1.  0.  1.  0.]
 [ 0. -1.  0.  1.  1. -1.  0.]
 [ 1. -1.  0. -1.  1.  1. -1.]
 [-1.  1.  1. -1.  1. -1.  1.]]
ACTION: 2 REWARD: -50
Qs:
[[-7.3908787 -6.549496  -6.351446  -3.826888  -1.2899616 -6.316264
  -5.495312 ]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  

pred: 1
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0. -1. -1.  0.  0.]
 [-1.  1.  0. -1.  1.  1. -1.]]
ACTION: 4 REWARD: -1.5
Qs:
[[-2.3138015 -2.8073468 -1.7853689 -2.53855   -2.441843  -3.1226277
  -3.4591165]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0. -1.  0.  0.]
 [ 0.  0.  0.  0.  1.  0.  0.]
 [ 1.  1.  0. -1. -1.  0.  0.]
 [-1.  1. -1. -1.  1.  1. -1.]]
ACTION: 5 REWARD: -1.5
Qs:
[[-1.6089245 -1.6940144 -1.4643103 -3.951116  -2.7409434 -1.274099
  -4.048005 ]]
pred: 5
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0. -1.  0.  0.]
 [ 0.  1. -1.  0.  1.  0.  0.]
 [ 1.  1. -1. -1. -1.  1.  0.]
 [-1.  1. -1. -1.  1.  1. -1.]]
ACTION: 3 REWARD: 0.0
Qs:
[[-1.3166854 -2.8027682 -1.8635027 -4.3953032 -2.974347  -1.0351293
  -4.023159 ]]
pred: 5
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0

pred: 3
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  1.  0.  0.  0.  0.  0.]
 [ 0. -1.  0.  0.  0. -1.  0.]
 [ 0.  1.  1.  0.  0.  1.  0.]
 [ 0. -1. -1. -1.  0. -1.  1.]]
ACTION: 5 REWARD: -50
Qs:
[[-4.4450655 -3.1091335 -1.6626903 -3.0253255 -3.7991197 -3.2588422
  -4.959878 ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  1.  0.  0.  0.  0.  0.]
 [ 0. -1.  0.  0.  0. -1.  0.]
 [ 0.  1.  1.  0.  0.  1.  0.]
 [ 0. -1. -1. -1.  0. -1.  1.]]
ACTION: 5 REWARD: -0.75
Qs:
[[-4.4450655 -3.1091335 -1.6626903 -3.0253255 -3.7991197 -3.2588422
  -4.959878 ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0. -1.  0.  0.  0.]
 [ 0.  1.  0. -1.  1.  0. -1.]]
ACTION: 4 REWARD: -1.5
Qs:
[[-2.0113866 -1.6798092 -2.367726  -3.0782301 -1.4543571 -2.2339547
  -3.6092663]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0. 

pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0. -1.  0.  0.  0.  0.  0.]
 [ 0.  1.  0.  0. -1.  0.  0.]
 [ 1. -1. -1.  0. -1.  1. -1.]
 [ 1.  1. -1.  0.  1.  1. -1.]]
ACTION: 0 REWARD: -5.25
Qs:
[[-3.3788774 -7.4581475 -3.4316528 -2.148635  -3.0937362 -6.3156695
  -5.824873 ]]
pred: 3
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  1.  1.  0.]
 [ 0.  1.  0.  0.  1. -1.  0.]
 [-1. -1.  1. -1. -1. -1.  0.]]
ACTION: 2 REWARD: 2.75
Qs:
[[-1.0707518  -0.05042185  1.6426631  -0.8311982   0.41181344 -0.4539081
  -1.7607852 ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0. -1.  0.  0.]
 [ 0.  0.  0.  0.  1.  1.  0.]
 [ 0.  1.  1.  0.  1. -1. -1.]
 [-1. -1.  1. -1. -1. -1.  1.]]
ACTION: 2 REWARD: 4.0
Qs:
[[-2.396455    0.12216983  1.0923599  -2.4516666  -1.2042153  -0.9465722
  -2.4783168 ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [

MEMORY:
[[ 0.  0. -1. -1.  0.  1. -1.]
 [-1.  1. -1. -1.  0.  1.  1.]
 [ 1.  1. -1. -1.  0. -1. -1.]
 [-1. -1.  1.  1.  0.  1.  1.]
 [ 1.  1. -1. -1. -1.  1. -1.]
 [ 1. -1. -1.  1.  1.  1. -1.]]
ACTION: 2 REWARD: -10
Qs:
[[-8.647099  -7.989908  -7.444168  -6.195356  -4.8431525 -6.7980223
  -9.663912 ]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0. -1.  0.  0.  0.  0.  0.]]
ACTION: 3 REWARD: 0.25
Qs:
[[-0.8159964  -0.7810423   0.51443243 -0.44507203 -0.9356542  -1.2616115
  -1.7801524 ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [-1. -1. -1.  1.  1.  0.  0.]]
ACTION: 6 REWARD: 0.0
Qs:
[[-0.94041306 -1.7048415  -0.6442593  -0.4944471  -0.15381837 -1.5798345
  -1.5210227 ]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0

pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [-1.  0.  0.  0.  0.  0.  0.]
 [-1.  1. -1.  1.  1. -1.  0.]]
ACTION: 3 REWARD: 0.0
Qs:
[[-1.3256545  -0.45814273 -2.058402   -1.5936422   1.9442291  -1.8663042
  -2.4454694 ]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [-1.  0. -1.  1.  0.  0.  0.]
 [-1.  1. -1.  1.  1. -1.  0.]]
ACTION: 6 REWARD: -0.5
Qs:
[[-1.2970449 -0.6362475 -2.072383  -1.5579302  1.7293968 -1.7828366
  -2.4960718]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  1.]
 [-1.  0. -1.  0.  0.  0.  1.]
 [-1. -1. -1.  1.  1.  0. -1.]
 [-1.  1. -1.  1.  1. -1.  1.]]
ACTION: 3 REWARD: -50
Qs:
[[-3.4088008 -2.3968523 -5.000394  -2.7669683  2.9243507 -3.1768181
  -3.8536034]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0. 

  -1.6897998 ]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0. -1.  1.  0.  0.  0.  0.]
 [ 0.  1. -1.  1.  0.  0. -1.]
 [ 0. -1.  1.  1.  0. -1. -1.]]
ACTION: 2 REWARD: 2.75
Qs:
[[-2.2233956  -2.617305    0.20430093 -1.1182433  -0.41243747 -0.9177635
  -2.4005728 ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  1.  0.  0.  0.  0.]
 [ 0. -1.  1.  0.  0.  0. -1.]
 [ 0. -1.  1.  1.  0. -1.  1.]
 [ 1.  1. -1.  1.  1. -1. -1.]
 [-1. -1.  1.  1. -1. -1. -1.]]
ACTION: 2 REWARD: 50
Qs:
[[-4.172608   -1.4616677   2.235359    0.10375731  1.5310013  -2.303078
  -1.6642982 ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [-1.  0.  0.  0.  0.  0.  0.]]
ACTION: 6 REWARD: -1.75
Qs:
[[-0.9370887  -0.6978083  -0.67560554  0.13985325  0.4011968  -1.1431375
  -0.67387974]]
pred: 4
MEMORY:
[[ 0.  0.  0.

Qs:
[[-2.0847528  -1.4842349  -0.42870808 -1.2411891  -1.2090347  -2.1332164
  -2.2677712 ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  1.  0.  0.]
 [ 1. -1. -1.  0. -1. -1.  1.]]
ACTION: 3 REWARD: 0.75
Qs:
[[-2.0427501 -1.4345939  0.5184947 -2.4829466 -2.5542622 -2.0192077
  -3.400467 ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0. -1.  0.]]
ACTION: 1 REWARD: -1.75
Qs:
[[-1.2080454  -0.6345537  -0.63385075 -1.2155232  -0.19525144 -1.1228237
  -1.8709403 ]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0. -1.  0.  0.  0.  0. -1.]
 [ 0.  1.  0.  0.  1.  1.  1.]
 [-1.  1.  0. -1. -1. -1.  1.]
 [-1. -1.  0. -1.  1.  1.  1.]
 [-1.  1.  1. -1.  1. -1. -1.]]
ACTION: 3 REWARD: 50
Qs:
[[-3.5366538  -2.501732   -4.262019   -3.01065

ACTION: 4 REWARD: -2.25
Qs:
[[-2.2063475 -1.9760685 -2.3807402 -1.7637879 -0.7962367 -2.95647
  -3.1987967]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0. -1. -1.  1.  0.  0.]]
ACTION: 1 REWARD: -2.25
Qs:
[[-1.5312219 -1.6631725 -1.7925075 -1.9522873 -1.3267648 -2.2974029
  -3.460998 ]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  1.  0.  0.  0.  0.]
 [ 0. -1. -1.  0.  0.  0.  0.]
 [ 1.  1. -1. -1.  1.  0. -1.]]
ACTION: 1 REWARD: -1.25
Qs:
[[-1.4007719  -3.5345647  -2.5781512  -1.8967936  -0.45879048 -3.9272122
  -4.7370653 ]]
pred: 4
MEMORY:
[[ 1.  0. -1.  0.  0. -1.  0.]
 [ 1.  0. -1. -1. -1.  1.  0.]
 [-1.  0.  1. -1.  1. -1.  0.]
 [-1.  0.  1. -1.  1.  1.  0.]
 [ 1.  0. -1.  1. -1. -1. -1.]
 [-1.  1.  1. -1.  1. -1.  1.]]
ACTION: 1 REWARD: -1.25
Qs:
[[-7.5962467 -5.7485833 -8

[[-0.4398154 -1.8179911 -1.5078241 -2.7280726 -1.6294909 -1.3970068
  -3.3666806]]
pred: 0
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.]
 [ 1.  1. -1. -1.  0. -1. -1.]]
ACTION: 4 REWARD: -1.75
Qs:
[[-0.55231583 -2.2194448  -1.9173036  -2.8578513  -1.7140201  -1.914947
  -3.9725523 ]]
pred: 0
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0. -1.]
 [ 1.  0.  0.  0.  0.  0. -1.]
 [-1.  0.  0.  0. -1.  1. -1.]
 [ 1.  1.  0.  0. -1.  1.  1.]
 [ 1.  1. -1. -1.  1. -1. -1.]]
ACTION: 5 REWARD: -0.25
Qs:
[[-4.2835054 -3.3625255 -2.0924954 -5.035268  -3.590149  -3.9764948
  -6.8301635]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0. -1.]
 [ 1.  0.  0.  0.  0.  1. -1.]
 [-1.  0.  0.  0. -1.  1. -1.]
 [ 1.  1. -1.  0. -1.  1.  1.]
 [ 1.  1. -1. -1.  1. -1. -1.]]
ACTION: 4 REWARD: 0.0
Qs:
[[-3.922362  -3.694304  -1.7208027 -5.2273774 -4.045809

  -1.1840297 ]]
pred: 2
MEMORY:
[[ 0.  1.  0.  0.  0.  0.  0.]
 [ 0. -1.  0.  0.  0.  0.  0.]
 [ 0.  1.  0.  0.  0.  0.  0.]
 [ 0. -1.  0.  0.  0.  0.  0.]
 [ 0.  1.  0.  0.  1.  0.  0.]
 [-1. -1.  1. -1.  1. -1.  0.]]
ACTION: 4 REWARD: 3.0
Qs:
[[-1.6623017  -1.9794154  -0.3868158   0.13786492  0.6173437  -2.2804885
  -2.5717492 ]]
pred: 4
MEMORY:
[[ 0.  1.  0.  0.  0.  0.  0.]
 [ 0. -1.  0.  0.  0.  0.  0.]
 [ 0.  1.  0.  0.  0.  0.  0.]
 [ 0. -1.  0.  0.  1.  0.  0.]
 [ 0.  1. -1.  0.  1. -1.  0.]
 [-1. -1.  1. -1.  1. -1.  0.]]
ACTION: 5 REWARD: 1.25
Qs:
[[-1.5910718  -1.1996698   0.378546   -0.07399891  1.7018249  -1.7767118
  -2.5458403 ]]
pred: 4
MEMORY:
[[ 0.  1. -1.  0.  0.  0.  0.]
 [ 0. -1.  1.  0.  0.  0.  0.]
 [ 0.  1. -1.  0.  0. -1.  0.]
 [ 1. -1.  1.  0.  1.  1.  0.]
 [-1.  1. -1.  0.  1. -1.  0.]
 [-1. -1.  1. -1.  1. -1. -1.]]
ACTION: 3 REWARD: 50
Qs:
[[-0.80630004 -1.1063803  -0.9187166   1.8392565   3.455987   -2.4010024
  -2.3259897 ]]
pred: 4
MEMORY:
[[ 0.  0.  0. 

[[-3.0384367  -1.5120096   0.06167518 -1.5830274  -0.01297161 -2.5513444
  -3.3162446 ]]
pred: 2
MEMORY:
[[ 0.  0.  1.  0.  0.  0.  0.]
 [ 0.  0.  1.  0.  0.  0.  0.]
 [ 0.  0. -1.  0.  0.  0.  0.]
 [-1.  0.  1.  0.  0. -1.  0.]
 [ 1.  0.  1.  1.  0. -1. -1.]
 [ 1.  1. -1. -1.  0. -1. -1.]]
ACTION: 1 REWARD: 50
Qs:
[[-3.1970525  -1.5299337   0.19714345 -2.836197   -1.5136755  -2.8942342
  -3.9463549 ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0. -1.  1. -1.  0.]]
ACTION: 2 REWARD: -2.25
Qs:
[[-2.0465944 -1.3980561 -2.270122  -2.5764349 -1.2772007 -2.3194976
  -3.60471  ]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0. -1.  0.  0.  0.  0.]
 [ 0.  0.  1. -1.  1. -1.  0.]]
ACTION: 3 REWARD: -1.25
Qs:
[[-2.6440349 -2.0931478 -2.734035  -2.9578288 -1.

ACTION: 1 REWARD: -0.25
Qs:
[[-6.38319    -2.6043715  -0.06936974 -2.2400327   0.47514722 -3.173623
  -2.8107839 ]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [-1.  0.  0.  0.  0.  0.  0.]
 [-1.  0.  0.  0. -1.  0.  0.]
 [-1.  1.  1.  0.  1. -1.  0.]
 [ 1. -1.  1.  0. -1.  1. -1.]
 [-1. -1.  1.  1.  1. -1.  1.]]
ACTION: 2 REWARD: 50
Qs:
[[-4.9750147  -1.9216247   0.76164794 -1.5366023  -0.43142468 -2.886644
  -2.4685779 ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0. -1.  0. -1.  0.  1.]]
ACTION: 5 REWARD: -1.5
Qs:
[[-2.392     -1.7515825 -1.4528409 -1.0123839 -1.5849503 -2.2050743
  -2.2761106]]
pred: 3
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0. -1.]
 [ 1.  0.  0.  0.  0. -1.  1.]
 [ 1.  0.  0.  0.  0. -1. -1.]]
ACTION: 3 REWARD: -1.0
Qs:
[[-1.3661298  -2.173721  

ACTION: 6 REWARD: -1.75
Qs:
[[-1.6348482  -0.47932115 -0.21157585 -0.51965857 -0.4479415  -1.3485327
  -1.6187052 ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0. -1.  0. -1.  0.]
 [-1. -1.  1.  1. -1.  1. -1.]
 [-1. -1.  1. -1.  1.  1.  1.]]
ACTION: 5 REWARD: -3.0
Qs:
[[-4.778856  -3.65636   -3.2561028 -3.6541877 -4.2156763 -3.9411561
  -4.812201 ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [-1.  1.  0.  0. -1.  0.  0.]]
ACTION: 4 REWARD: 0.25
Qs:
[[-1.85895    -2.0514479  -1.3384625  -0.24522655 -0.6260986  -1.9997317
  -1.4257073 ]]
pred: 3
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0. -1.  1.  0.  0.]
 [ 1.  1.  0. -1.  1.  0.  0.]
 [-1.  1. -1. -1. -1.  0.  0.]]
ACTION: 5 REWARD: -1.5
Qs:
[[-1.7366774 -2.27296

 [ 1.  0.  1. -1.  1. -1.  1.]]
ACTION: 5 REWARD: -1.75
Qs:
[[-5.6662974 -2.5364995 -4.7356596 -8.02947   -2.3438315 -1.3493084
  -5.4226375]]
pred: 5
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0. -1.]
 [ 1.  0.  0.  0.  0.  0.  1.]
 [-1.  0. -1. -1.  1.  1.  1.]
 [-1.  0. -1.  1. -1. -1. -1.]
 [ 1.  0.  1. -1.  1. -1.  1.]]
ACTION: 0 REWARD: -3.5
Qs:
[[ -5.2931623  -2.9403784  -7.330581  -10.232075   -3.784342   -2.457702
   -7.6480603]]
pred: 5
MEMORY:
[[-1.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0. -1.]
 [ 1.  0.  0.  0.  0.  0.  1.]
 [-1.  0. -1. -1.  1.  1.  1.]
 [-1.  0. -1.  1. -1. -1. -1.]
 [ 1.  0.  1. -1.  1. -1.  1.]]
ACTION: 0 REWARD: -10
Qs:
[[ -7.814199   -5.244941   -9.881689  -11.335178   -4.63343    -5.2074656
  -10.019612 ]]
pred: 4
MEMORY:
[[-1.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0. -1.]
 [ 1.  0.  0. -1.  0.  0.  1.]
 [-1.  0. -1. -1.  1.  1.  1.]
 [-1.  0. -1.  1. -1. -1. -1.]
 [ 1.  0.  1. -1.  1. -1.  1.]]
ACTION: 4 REWARD

pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0. -1.  0.]]
ACTION: 0 REWARD: -1.5
Qs:
[[-1.2080454  -0.6345537  -0.63385075 -1.2155232  -0.19525144 -1.1228237
  -1.8709403 ]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0. -1. -1.  0.]]
ACTION: 0 REWARD: -2.0
Qs:
[[-1.3754666  -1.3615986   0.26137948 -1.797777   -1.0552969  -1.2596484
  -2.4084454 ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0. -1.  0.  0.  0.  0.]]
ACTION: 0 REWARD: -1.5
Qs:
[[-0.80901515 -1.3022456  -0.84834325 -0.8192767  -0.8225403  -1.4638016
  -1.8977557 ]]
pred: 0
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  

[[-1.0470115  -1.1877669  -0.585796   -0.9848192   0.2881569  -0.60733503
  -1.5005506 ]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0. -1.  0.  0.  0.]]
ACTION: 3 REWARD: -0.25
Qs:
[[-1.763893  -1.3727279 -1.0032021 -1.0312546 -1.2208196 -1.7102317
  -1.9565617]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0. -1. -1.  0.  0.]
 [ 0. -1.  1.  1.  1.  0.  0.]
 [-1.  1.  1.  1. -1.  0.  0.]
 [-1. -1.  1. -1.  1. -1.  0.]]
ACTION: 4 REWARD: -1.25
Qs:
[[-4.670513   -0.82607347  1.6333476  -0.80627376  0.14282742 -3.7706985
  -3.2769895 ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  1.  0.  0.]
 [ 0.  0. -1. -1. -1.  0.  0.]
 [ 0. -1.  1.  1.  1.  0.  0.]
 [-1.  1.  1.  1. -1.  0.  0.]
 [-1. -1.  1. -1.  1. -1.  0.]]
ACTION: 5 REWARD: -50
Qs:
[[-5.165164   -0.91853076  1.9481494  -0.712909

[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0. -1.  0.  0.  0.  0.]
 [ 0.  0. -1.  1.  1.  0. -1.]]
ACTION: 4 REWARD: -0.75
Qs:
[[-0.33689493 -1.6223465  -1.0515865  -0.46934712  1.2121612  -1.3228693
  -1.7269878 ]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0. -1.  1.  1.  0. -1.]
 [ 0.  0. -1.  1.  1. -1. -1.]]
ACTION: 5 REWARD: -1.75
Qs:
[[ 0.02998835 -0.7257058  -0.9661263   0.1232612   2.7729092  -0.84057635
  -1.9260219 ]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0. -1.  0.  0.  0.]
 [ 0.  0. -1.  1.  1.  1. -1.]
 [ 0.  0. -1.  1.  1. -1. -1.]]
ACTION: 3 REWARD: -2.0
Qs:
[[-0.49441826 -1.9436091  -2.868641   -2.1227574   0.30492768 -1.1364455
  -3.409481  ]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.

MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0. -1.  0.  0.  0.]
 [ 0.  0.  0.  1.  1.  0.  0.]
 [-1. -1.  0.  1. -1. -1.  0.]
 [-1.  1.  1.  1. -1. -1.  1.]]
ACTION: 0 REWARD: 0.25
Qs:
[[-2.890727   -0.6678552   0.09359561  0.24171291  2.2878222  -2.9548907
  -1.4003384 ]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0. -1.  0.  0.  0.]
 [ 1.  0.  0.  1.  1. -1.  0.]
 [-1. -1.  0.  1. -1. -1.  0.]
 [-1.  1.  1.  1. -1. -1.  1.]]
ACTION: 3 REWARD: 1.75
Qs:
[[-3.3081274  -1.309558   -0.3729577   0.23770241  1.7789125  -3.1853752
  -1.7972625 ]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  1.  0.  0.  0.]
 [ 0.  0.  0. -1.  0.  0.  0.]
 [ 1.  0.  0.  1.  1. -1.  0.]
 [-1. -1.  0.  1. -1. -1. -1.]
 [-1.  1.  1.  1. -1. -1.  1.]]
ACTION: 0 REWARD: -0.75
Qs:
[[-3.4090648e+00 -1.0491279e+00  3.2973140e-03 -1.8870186e-02
   1.3547046e+00 -3.2383866e+00 -1.8952678e+00]]
pred: 4
MEMORY:
[[ 0.  0.  0

MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0. -1.  0.  0.  0.  0.]
 [-1.  0. -1. -1.  0.  0.  0.]
 [-1.  0.  1.  1.  0.  1.  0.]
 [ 1.  1. -1.  1.  0. -1.  0.]]
ACTION: 5 REWARD: -4.5
Qs:
[[-3.9499881 -3.1446476 -4.0235043 -5.798724  -1.8610497 -0.804942
  -4.3665915]]
pred: 5
MEMORY:
[[ 0.  0.  0. -1.  0.  0.  0.]
 [ 0.  0. -1.  1.  0.  0.  0.]
 [ 1.  0. -1. -1.  0.  1.  0.]
 [-1.  0. -1. -1.  0.  1.  0.]
 [-1.  0.  1.  1.  0.  1.  0.]
 [ 1.  1. -1.  1.  0. -1. -1.]]
ACTION: 2 REWARD: -1.75
Qs:
[[ -1.3420674  -2.8953457  -3.826956  -10.6048     -5.1549544  -0.1383204
   -7.605453 ]]
pred: 5
MEMORY:
[[ 0.  0.  1. -1.  0.  0.  0.]
 [ 0.  0. -1.  1.  0. -1.  0.]
 [ 1.  0. -1. -1.  0.  1.  0.]
 [-1.  0. -1. -1.  0.  1.  0.]
 [-1.  0.  1.  1.  0.  1.  0.]
 [ 1.  1. -1.  1.  0. -1. -1.]]
ACTION: 1 REWARD: -1.25
Qs:
[[-1.0493193  -2.9652886  -3.3245895  -9.952281   -4.4964957  -0.02137847
  -6.7298484 ]]
pred: 5
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  

[[-3.4714801  -0.60068643 -1.0982225  -2.2996182   0.2905557  -1.8745492
  -1.7773963 ]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0. -1.  0.  0.]
 [-1.  0.  0.  0. -1.  0.  0.]
 [ 1.  0.  0.  0.  1.  0.  0.]
 [-1.  1.  0.  0. -1.  0.  0.]
 [-1. -1.  1.  0.  1.  1.  0.]]
ACTION: 6 REWARD: 0.0
Qs:
[[-2.6573038  -0.6091245   1.1430769  -1.7201962  -0.45100346 -0.6812101
  -1.2929054 ]]
pred: 2
MEMORY:
[[ 1.  0.  0.  0.  1.  0.  0.]
 [-1.  0.  0.  0. -1.  0.  0.]
 [-1.  0.  0.  0. -1.  0.  0.]
 [ 1.  0.  0.  0.  1.  0. -1.]
 [-1.  1. -1.  0. -1.  0.  1.]
 [-1. -1.  1.  0.  1.  1.  1.]]
ACTION: 0 REWARD: -10
Qs:
[[-2.3898823  -2.2954292  -0.10240103 -2.5366964  -1.7050626  -1.6521617
  -2.4031925 ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0. -1.  0.  0.  0.]]
ACTION: 2 REWARD: -1.25
Qs:
[[-1.763893  -1.3727279 -1.0032021 -1.031254

Qs:
[[-0.3046831  -0.3246235   0.6372239  -1.0468038   1.0319645  -0.36876583
  -2.0069566 ]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  1.  0.  0.  0.  0.  0.]
 [ 1.  1.  0.  0.  0.  1. -1.]
 [-1. -1. -1.  1. -1. -1.  1.]
 [ 1. -1. -1. -1.  1.  1. -1.]]
ACTION: 1 REWARD: 1.0
Qs:
[[ 0.684052   -1.2151212   0.23090257 -2.0819278  -1.0760086  -0.810541
  -3.0924873 ]]
pred: 0
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  1.  0.  0.  0.  0.  0.]
 [ 1.  1.  0.  0.  0. -1.  0.]
 [ 1.  1.  0. -1.  0.  1. -1.]
 [-1. -1. -1.  1. -1. -1.  1.]
 [ 1. -1. -1. -1.  1.  1. -1.]]
ACTION: 4 REWARD: -0.75
Qs:
[[ 0.2682528  -0.01783582 -1.8600447  -5.257946   -2.202206   -0.30534446
  -4.5842576 ]]
pred: 0
MEMORY:
[[ 0. -1.  0.  0.  0.  0.  0.]
 [ 1.  1. -1.  0.  1. -1.  0.]
 [ 1.  1.  1. -1.  1. -1. -1.]
 [ 1.  1. -1. -1.  1.  1. -1.]
 [-1. -1. -1.  1. -1. -1.  1.]
 [ 1. -1. -1. -1.  1.  1. -1.]]
ACTION: 1 REWARD: -10
Qs:
[[-0.98546773 -1.9278805  -2.9085155  -

pred: 4
MEMORY:
[[ 0.  0.  0.  1.  0.  0.  0.]
 [ 0.  0.  0. -1.  0.  0.  0.]
 [-1.  0.  0.  1.  0.  0.  0.]
 [ 1.  0.  0. -1.  0. -1.  0.]
 [ 1.  1. -1. -1.  0. -1.  0.]
 [ 1.  1. -1. -1. -1.  1.  1.]]
ACTION: 3 REWARD: -10
Qs:
[[-5.9151554 -4.7240043 -3.8735392 -7.3758693 -6.374191  -3.7611458
  -5.810326 ]]
pred: 5
MEMORY:
[[ 0.  0.  0.  1.  0.  0.  0.]
 [ 0.  0.  0. -1.  0.  0.  0.]
 [-1.  0.  0.  1.  0. -1.  0.]
 [ 1.  0.  0. -1.  0. -1.  0.]
 [ 1.  1. -1. -1.  0. -1.  0.]
 [ 1.  1. -1. -1. -1.  1.  1.]]
ACTION: 1 REWARD: -50
Qs:
[[-6.672086  -5.189135  -3.8454144 -7.5839844 -6.6871285 -4.903519
  -6.197834 ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [-1.  0.  0.  0.  0.  0.  0.]]
ACTION: 3 REWARD: 0.0
Qs:
[[-0.9370887  -0.6978083  -0.67560554  0.13985325  0.4011968  -1.1431375
  -0.67387974]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0

pred: 3
MEMORY:
[[ 0.  0.  0.  0.  0.  0. -1.]
 [-1.  0.  0.  0.  0.  0. -1.]
 [-1.  0.  1.  0.  0.  0.  1.]
 [ 1.  1. -1.  0.  1.  0. -1.]
 [ 1. -1. -1.  1.  1.  1. -1.]
 [-1.  1.  1. -1. -1.  1. -1.]]
ACTION: 5 REWARD: 2.25
Qs:
[[-2.098303  -7.15966   -1.8168417 -1.7493143 -3.609634  -4.3272734
  -4.356195 ]]
pred: 3
MEMORY:
[[ 0.  0.  0.  0.  0.  0. -1.]
 [-1.  0.  0.  0.  0.  0. -1.]
 [-1.  0.  1.  0.  0. -1.  1.]
 [ 1.  1. -1.  0.  1.  1. -1.]
 [ 1. -1. -1.  1.  1.  1. -1.]
 [-1.  1.  1. -1. -1.  1. -1.]]
ACTION: 3 REWARD: -1.25
Qs:
[[-1.9980037 -6.727939  -2.4302642 -1.2499733 -2.1091015 -5.033037
  -4.6182933]]
pred: 3
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [-1.  1. -1.  0.  0.  0.  0.]]
ACTION: 3 REWARD: -0.25
Qs:
[[-0.8825321 -0.7759795 -1.3403289 -0.6424884  0.6700322 -1.5351837
  -1.6111784]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.

  -1.7801524 ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.]
 [-1. -1.  0.  0. -1.  1.  0.]]
ACTION: 3 REWARD: -1.75
Qs:
[[-2.1892393 -2.064438  -1.1564196 -1.0635288 -2.1473944 -2.143171
  -2.572853 ]]
pred: 3
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0. -1.  0.  0.  0.  0.  0.]
 [ 1.  1.  0. -1.  0.  0.  0.]
 [-1. -1.  0.  1. -1.  1.  0.]]
ACTION: 6 REWARD: -2.75
Qs:
[[-3.321351  -3.6730187 -1.0617092 -2.7171762 -4.233979  -2.4314227
  -3.8797255]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0. -1.  0.  0.  0.  0.  1.]
 [ 1.  1.  0. -1.  0.  0. -1.]
 [-1. -1. -1.  1. -1.  1.  1.]]
ACTION: 5 REWARD: -2.75
Qs:
[[-3.7343013 -3.8821537 -2.0316238 -2.3709378 -3.728578  -3.215772
  -3.3748074]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0

pred: 1
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0. -1.  0.  0.  1.]
 [ 0.  1. -1. -1.  0. -1.  1.]]
ACTION: 0 REWARD: -50
Qs:
[[-3.8907387 -2.1605844 -3.974373  -4.019025  -2.3029532 -3.5816398
  -4.877911 ]]
pred: 1
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0. -1.  0.  0.  0.  0.]
 [ 0.  0.  1. -1.  0.  0.  0.]]
ACTION: 1 REWARD: -4.5
Qs:
[[-2.609616  -2.652784  -1.6948584 -2.2715442 -2.0028753 -1.6759182
  -2.1576195]]
pred: 5
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0. -1.  0.  0.  0.  0.]
 [ 0.  0. -1.  0.  0.  0.  0.]
 [ 0.  1.  1. -1.  0.  0.  0.]]
ACTION: 2 REWARD: -2.5
Qs:
[[-2.7409747 -3.1187088 -2.2945633 -3.2300406 -2.4250128 -1.9442586
  -2.9539223]]
pred: 5
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  

pred: 3
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0. -1.  0.  1.  0.  0.]
 [ 1.  0. -1.  1.  1.  1. -1.]
 [-1. -1.  1. -1. -1. -1.  1.]]
ACTION: 5 REWARD: -0.25
Qs:
[[-1.8981166 -2.2248106 -1.9263608 -1.951847  -1.0125487 -1.028723
  -1.9187753]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0. -1.  0.  0.]
 [ 0. -1. -1.  0.  1.  1.  0.]
 [ 1.  1. -1.  1.  1.  1. -1.]
 [-1. -1.  1. -1. -1. -1.  1.]]
ACTION: 0 REWARD: -0.5
Qs:
[[-2.9656355 -2.789591  -1.7081984 -3.3451545 -2.2639837 -1.1818814
  -2.7100105]]
pred: 5
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0. -1. -1.  0.  1.]]
ACTION: 1 REWARD: -2.25
Qs:
[[-2.9748344 -2.155647  -1.5999011 -1.4948272 -2.197565  -2.289408
  -2.5629954]]
pred: 3
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0. 

Qs:
[[-2.1032038  -1.0112691   4.2964425  -4.475784    0.11277547 -2.7232556
  -3.3203459 ]]
pred: 2
MEMORY:
[[ 0.  0.  0. -1. -1.  0.  0.]
 [-1.  0.  0.  1.  1.  0.  0.]
 [-1.  0.  0. -1.  1.  1. -1.]
 [ 1.  1.  0.  1.  1. -1. -1.]
 [ 1. -1. -1.  1. -1. -1.  1.]
 [-1. -1. -1.  1.  1.  1. -1.]]
ACTION: 2 REWARD: 50
Qs:
[[-1.9225894 -1.9067043  4.106947  -4.815933  -1.2891605 -2.8551807
  -3.3885913]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [-1.  0.  0.  0. -1.  0.  1.]
 [-1.  0.  0.  0. -1.  0. -1.]
 [-1.  1.  0.  0.  1.  1. -1.]
 [ 1.  1.  1.  0. -1.  1. -1.]]
ACTION: 4 REWARD: -50
Qs:
[[-7.478419  -6.4002666 -3.3354015 -6.5506897 -6.343534  -4.473204
  -6.658673 ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0. -1.  0.  0.]
 [-1.  0.  0.  0. -1.  1.  1.]]
ACTION: 2 REWARD: -3.5
Qs:
[[-2.548877   -2.4878151  -1.9148781  -0.41334814 -1.21

ACTION: 2 REWARD: -0.5
Qs:
[[-0.9370887  -0.6978083  -0.67560554  0.13985325  0.4011968  -1.1431375
  -0.67387974]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [-1.  0.  0.  0.  0.  0.  0.]
 [-1.  0.  1.  0.  0.  0.  0.]]
ACTION: 0 REWARD: -0.25
Qs:
[[-1.4497651  -0.4476627  -0.4054063  -0.28696162  0.67565864 -0.8236231
  -0.3237258 ]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.]
 [-1.  0.  0.  0.  0.  0.  0.]
 [-1.  0.  1.  1. -1. -1.  0.]]
ACTION: 5 REWARD: -0.25
Qs:
[[-1.1004492  -1.5826199  -0.96910924  0.1487602   0.28653085 -1.22207
  -0.7372091 ]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.]
 [ 1.  0. -1. -1.  0.  0.  0.]
 [-1.  0. -1.  1.  0.  1.  0.]
 [-1.  0.  1.  1. -1. -1.  0.]]
ACTION: 0 REWARD: -3.5
Qs:
[[-1.3378285  -2

[[-0.45499212 -1.1093165  -0.35765147 -2.650099   -1.13754    -0.13474904
  -2.4928126 ]]
pred: 5
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  1.]
 [ 0.  0.  0.  0.  0.  0.  1.]
 [ 0.  0.  0.  0.  0.  0.  1.]
 [ 0. -1. -1.  1. -1. -1. -1.]]
ACTION: 6 REWARD: -1.75
Qs:
[[-0.45499212 -1.1093165  -0.35765147 -2.650099   -1.13754    -0.13474904
  -2.4928126 ]]
pred: 5
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [-1.  1.  0.  0.  0.  0.  0.]
 [ 1. -1.  0. -1.  0.  0.  0.]]
ACTION: 2 REWARD: -0.75
Qs:
[[-2.630365   -1.2246007   0.50836784 -3.0346415  -2.4397826  -1.76398
  -3.7779565 ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [-1.  1.  0.  0.  0.  0.  0.]
 [ 1. -1.  1. -1. -1.  0.  0.]]
ACTION: 2 REWARD: 0.25
Qs:
[[-2.776487  -1.6298193  0.4605211 -3.552

[[ 0.  0.  0. -1.  0.  0.  0.]
 [ 1.  0.  0.  1.  0.  0.  0.]
 [-1.  0.  0.  1.  0.  0.  0.]
 [-1. -1. -1.  1.  0.  0.  0.]
 [ 1.  1. -1. -1. -1.  1.  0.]
 [ 1. -1. -1.  1. -1.  1.  0.]]
ACTION: 0 REWARD: -50
Qs:
[[-7.731756  -5.6069818 -2.9335384 -5.5861626 -4.59301   -3.8435683
  -6.87328  ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0. -1.  0.  0.  1.  0. -1.]]
ACTION: 6 REWARD: -0.5
Qs:
[[-0.7140798  -0.5502216   0.6356318  -0.3513021   0.471605   -0.92071724
  -1.6448    ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  1.]
 [ 0. -1. -1.  1.  1. -1. -1.]]
ACTION: 2 REWARD: 0.25
Qs:
[[-1.2563002  -1.8162323  -0.19267957 -1.7030298  -0.3864584  -1.3556914
  -2.817779  ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0. 

ACTION: 0 REWARD: -1.25
Qs:
[[-3.17092   -6.2492003 -2.5526817 -2.7937474 -3.8172784 -6.137048
  -5.8733997]]
pred: 2
MEMORY:
[[0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 2. 0. 0. 0. 0.]]
ACTION: 1 REWARD: -1.25
Qs:
[[-2.9315464 -1.6577369 -0.4673646 -1.341218  -0.8400289 -1.4491274
  -1.4179529]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  1.  0.  0.]
 [ 0.  0.  0.  0. -1.  0.  0.]
 [ 0.  0.  0.  0.  1.  0. -1.]
 [ 1.  1. -1.  0. -1.  0. -1.]]
ACTION: 0 REWARD: -1.25
Qs:
[[-1.5969682 -2.6017776 -1.0880405 -2.1639476 -1.2156138 -2.572837
  -3.1855135]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  1.  0.  0.]
 [ 0.  0.  0.  0. -1.  0.  0.]
 [ 0.  0.  0.  0.  1.  0.  0.]
 [ 0.  0.  0.  0. -1.  0.  0.]
 [ 1. -1.  0.  0.  1.  0. -1.]
 [ 1.  1. -1.  0. -1.  0. -1.]]
ACTION: 4 REWARD: -10
Qs:
[[-2.6410062 -4.606956  -1.7673491 -1.5102886 -1.4889363 -4.6383085
  -3.849

pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [-1.  0.  1.  0.  0.  0. -1.]]
ACTION: 2 REWARD: 0.75
Qs:
[[-1.2399086  -0.8507038  -0.03263955 -0.5364945   0.02895221 -0.6506706
  -0.66194266]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  1.  0.  0.  0.  0.]
 [-1.  0.  1.  0.  0. -1. -1.]]
ACTION: 2 REWARD: 2.75
Qs:
[[-1.7440004  -0.11933772  0.38203412 -1.0566924   0.13102384 -0.6709678
  -1.1576242 ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  1.  0.  0.  0.  0.]
 [ 0.  0.  1.  0.  0.  0.  0.]
 [-1. -1.  1.  0.  0. -1. -1.]]
ACTION: 0 REWARD: 0.25
Qs:
[[-1.5676227  -0.46734288  1.1492748  -0.42211422  0.21069829 -1.3216833
  -1.555236  ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  

pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0. -1.  0.  0.  0.]
 [ 0.  1.  0. -1.  0. -1.  0.]
 [ 0. -1.  0.  1.  0.  1.  0.]
 [ 1.  1. -1. -1.  1.  1.  0.]
 [ 1. -1. -1.  1. -1. -1.  0.]]
ACTION: 2 REWARD: -1.0
Qs:
[[-4.1196747 -4.4978104 -1.5862885 -2.89488   -2.058241  -4.824065
  -5.886899 ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0. -1.  0. -1.  0.  0.  0.]
 [ 0.  1.  0. -1.  0. -1.  0.]
 [ 0. -1.  1.  1.  0.  1.  0.]
 [ 1.  1. -1. -1.  1.  1.  0.]
 [ 1. -1. -1.  1. -1. -1.  0.]]
ACTION: 0 REWARD: -1.5
Qs:
[[-2.9793093 -4.9819655 -1.484003  -2.113167  -1.3202538 -5.336923
  -5.7582035]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0. -1.  0. -1.  0.  0.  0.]
 [-1.  1.  0. -1.  0. -1.  0.]
 [ 1. -1.  1.  1.  0.  1.  0.]
 [ 1.  1. -1. -1.  1.  1.  0.]
 [ 1. -1. -1.  1. -1. -1.  0.]]
ACTION: 5 REWARD: -50
Qs:
[[-3.937728  -4.925846  -1.8375931 -2.4349127 -1.7879581 -5.6913934
  -5.236178 ]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0. -1.  0. -1.

pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  1.]
 [ 0.  0.  0.  0.  0. -1.  1.]
 [-1.  1.  0.  1. -1.  1. -1.]
 [-1.  1.  0. -1. -1. -1.  1.]]
ACTION: 4 REWARD: 0.75
Qs:
[[-4.979082  -3.9204104 -3.521342  -3.6418586 -2.902987  -3.3165433
  -4.1936183]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0. -1.  0.  0.  0.  0.]
 [ 0.  1.  1.  0.  0.  0.  0.]
 [ 0.  1. -1.  0.  0.  0.  0.]
 [-1.  1. -1. -1.  0.  1. -1.]]
ACTION: 3 REWARD: 2.25
Qs:
[[-0.6767927 -2.4855878 -1.1970232 -0.904138  -0.5327257 -1.8756963
  -2.4499393]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0. -1. -1.  0.  0.  0.]
 [ 0.  1.  1.  1.  0.  0.  0.]
 [ 0.  1. -1.  1.  0.  0.  0.]
 [-1.  1. -1. -1. -1.  1. -1.]]
ACTION: 2 REWARD: 2.0
Qs:
[[-1.8105179  -3.1785522  -0.59692174 -1.3511088  -0.70031816 -2.510958
  -2.5791016 ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0. 

pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0. -1.  0.]
 [ 0.  0.  0.  0.  0.  1.  1.]
 [-1.  0.  0. -1.  0. -1.  1.]]
ACTION: 1 REWARD: -3.25
Qs:
[[-3.839477  -2.8306894 -3.0464962 -1.9172759 -1.8114128 -3.3255615
  -3.1584496]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  1.  0.]
 [ 0.  0.  0.  0.  0. -1.  0.]
 [-1.  0.  0. -1.  0.  1.  1.]
 [-1.  1.  0. -1.  0. -1.  1.]]
ACTION: 2 REWARD: -3.25
Qs:
[[-4.5114317 -3.3240006 -3.9521382 -2.82083   -1.3884141 -3.7554417
  -3.7741394]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  1.  0.]
 [-1.  0.  0.  0.  0. -1.  0.]
 [-1.  0.  0. -1.  0.  1.  1.]
 [-1.  1.  1. -1.  0. -1.  1.]]
ACTION: 0 REWARD: -2.75
Qs:
[[-5.747567  -3.6875796 -3.6636102 -3.506851  -1.5861658 -3.4923236
  -3.562858 ]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  

 [ 1. -1. -1.  1.  0.  0.  0.]]
ACTION: 2 REWARD: -0.75
Qs:
[[-1.3674033 -2.7339342 -0.4828022 -1.6950694 -1.6089926 -2.418536
  -2.9717712]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0. -1.  0.  0.  0.  0.]
 [ 0.  0. -1.  0.  0.  0.  0.]
 [ 0.  0.  1.  0.  0.  0.  0.]
 [ 0.  0. -1.  0.  0.  0.  0.]
 [ 1. -1. -1.  1.  1.  0.  0.]]
ACTION: 0 REWARD: -3.0
Qs:
[[-0.9235046  -4.4463744  -0.57968813 -1.0815006  -1.9263923  -4.078417
  -3.9602714 ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0. -1.  0.  0.  0.  0.]
 [ 0.  0. -1.  0.  0.  0.  0.]
 [ 0.  0.  1.  0.  0.  0.  0.]
 [ 1.  0. -1. -1.  0.  0.  0.]
 [ 1. -1. -1.  1.  1.  0.  0.]]
ACTION: 4 REWARD: -3.5
Qs:
[[-2.1802552 -5.723576  -0.6162205 -2.0058439 -3.4555323 -5.5800223
  -5.393578 ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0. -1.  0.  0.  0.  0.]
 [ 0.  0. -1.  0.  0.  0.  0.]
 [ 0.  0.  1.  0. -1.  0.  0.]
 [ 1.  0. -1. -1.  1.  0.  0.]
 [ 1. -1. -1.  1.  1.  0.  0.]]
ACTION: 2 REWARD: -3.75


[[ 0.  0. -1.  0.  0.  0.  0.]
 [ 0.  0.  1.  0.  0.  0.  0.]
 [ 0.  0.  1.  0. -1.  0.  0.]
 [-1.  0.  1.  0. -1.  1. -1.]
 [ 1.  0. -1. -1.  1. -1.  1.]
 [-1. -1.  1. -1.  1.  1. -1.]]
ACTION: 0 REWARD: -2.0
Qs:
[[-3.7983    -3.0721056 -1.8278533 -1.4032226 -2.0933564 -3.5887282
  -4.1788063]]
pred: 3
MEMORY:
[[ 0.  0. -1.  0.  0.  0.  0.]
 [ 0.  0.  1.  0.  0.  0.  0.]
 [ 1.  0.  1.  0. -1.  0. -1.]
 [-1.  0.  1.  0. -1.  1. -1.]
 [ 1.  0. -1. -1.  1. -1.  1.]
 [-1. -1.  1. -1.  1.  1. -1.]]
ACTION: 2 REWARD: -10
Qs:
[[-4.098792  -3.3610914 -2.0737424 -1.3713194 -2.2879024 -3.5956647
  -4.155891 ]]
pred: 3
MEMORY:
[[ 0.  0. -1.  0.  0.  0.  0.]
 [ 0.  0.  1.  0.  0.  0.  0.]
 [ 1.  0.  1.  0. -1. -1. -1.]
 [-1.  0.  1.  0. -1.  1. -1.]
 [ 1.  0. -1. -1.  1. -1.  1.]
 [-1. -1.  1. -1.  1.  1. -1.]]
ACTION: 5 REWARD: -2.5
Qs:
[[-4.6283975 -3.4066048 -2.3035853 -1.1427778 -1.8370982 -4.463609
  -4.3545117]]
pred: 3
MEMORY:
[[ 0.  0. -1.  0.  0.  0.  0.]
 [ 0.  0.  1.  0.  0.  1.  0.]
 

[[-1.1758015  -1.7254161  -0.9835726  -1.8326423  -0.05256146 -1.2038847
  -2.173596  ]]
pred: 4
MEMORY:
[[ 0.  0.  0. -1.  0.  0.  0.]
 [ 0.  0.  0.  1.  0.  0.  0.]
 [ 0.  0.  0. -1.  0.  0.  0.]
 [ 0.  0.  0.  1.  0.  0.  0.]
 [ 0.  0.  0. -1.  0.  0.  1.]
 [-1.  0. -1.  1.  1.  0. -1.]]
ACTION: 4 REWARD: -0.25
Qs:
[[-1.1418703 -2.2869601 -0.4355936 -5.747547  -1.2469454 -1.6283435
  -5.001997 ]]
pred: 2
MEMORY:
[[ 0.  0.  0. -1.  0.  0.  0.]
 [ 0.  0.  0.  1.  0.  0.  0.]
 [ 0.  0.  0. -1.  0.  0. -1.]
 [ 0.  0.  0.  1. -1.  0. -1.]
 [ 0.  0.  1. -1.  1.  0.  1.]
 [-1.  1. -1.  1.  1.  0. -1.]]
ACTION: 5 REWARD: -2.25
Qs:
[[-2.735003  -2.5262878 -0.5677363 -7.1772223 -2.8356707 -2.5805342
  -5.8020287]]
pred: 2
MEMORY:
[[ 0.  0.  0. -1.  0.  0.  0.]
 [ 0.  0.  0.  1.  0.  0. -1.]
 [ 0.  0.  0. -1.  0.  0. -1.]
 [ 0.  0.  0.  1. -1.  0. -1.]
 [ 0.  0.  1. -1.  1.  0.  1.]
 [-1.  1. -1.  1.  1.  1. -1.]]
ACTION: 5 REWARD: -2.25
Qs:
[[-3.5233657 -3.7744365 -0.9175711 -7.552272  -4.902

pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  1.  0. -1.  0.  0.  0.]
 [ 0. -1.  0.  1.  1.  0.  0.]
 [ 1.  1. -1. -1. -1.  0. -1.]]
ACTION: 1 REWARD: -0.75
Qs:
[[-0.6282461 -2.0731792 -1.7794447 -2.2348323 -0.8637437 -2.0578132
  -3.6540885]]
pred: 0
MEMORY:
[[ 0. -1.  0.  0.  0.  0.  0.]
 [ 0.  1.  0.  0.  0.  0.  0.]
 [ 0.  1.  0.  0.  0.  0.  0.]
 [ 0.  1.  0. -1.  0.  0.  0.]
 [-1. -1.  0.  1.  1.  0.  0.]
 [ 1.  1. -1. -1. -1.  0. -1.]]
ACTION: 1 REWARD: -10
Qs:
[[-1.8725168  -0.91404    -2.2558312  -5.652569   -2.5189168  -0.30961812
  -4.084373  ]]
pred: 5
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0. -1.]]
ACTION: 4 REWARD: -1.0
Qs:
[[-0.5866834  -0.63385135  0.22351299 -0.18692547  0.53011113 -0.656272
  -1.0228806 ]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [

[[-4.2637424 -2.6483958 -3.824975  -2.5921187 -2.4804637 -5.4590063
  -5.236698 ]]
pred: 4
MEMORY:
[[-1.  0.  0.  0.  0. -1.  1.]
 [-1.  0.  0.  0.  0.  1. -1.]
 [-1.  0. -1.  1.  0. -1. -1.]
 [ 1.  1.  1. -1.  0.  1.  1.]
 [ 1.  1. -1.  1. -1.  1. -1.]
 [-1.  1. -1. -1.  1.  1. -1.]]
ACTION: 0 REWARD: -10
Qs:
[[-3.754174  -1.8284061 -3.238009  -2.9083252 -2.702024  -4.5929174
  -4.9555454]]
pred: 1
MEMORY:
[[-1.  0.  1. -1.  0. -1.  1.]
 [-1.  0.  1. -1.  0.  1. -1.]
 [-1.  0. -1.  1.  0. -1. -1.]
 [ 1.  1.  1. -1. -1.  1.  1.]
 [ 1.  1. -1.  1. -1.  1. -1.]
 [-1.  1. -1. -1.  1.  1. -1.]]
ACTION: 1 REWARD: -1.0
Qs:
[[-4.200211  -1.0394728 -3.4685147 -6.506254  -5.0085053 -3.9424415
  -6.4495997]]
pred: 1
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0. -1.  0.  0.  0.  1. -1.]]
ACTION: 4 REWARD: -1.75
Qs:
[[-1.0285163 -0.8587299  0.660247  -0.8196335 -1.1706176 -1.4

pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.]
 [-1. -1.  0.  0.  0.  1.  0.]
 [-1.  1.  0.  0.  0. -1.  1.]
 [ 1. -1.  0.  0. -1. -1.  1.]]
ACTION: 3 REWARD: -1.25
Qs:
[[-3.429991   -1.5378712   0.21747778 -3.9535065  -2.5959327  -1.7115954
  -4.716425  ]]
pred: 2
MEMORY:
[[-1.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.]
 [-1. -1.  0.  0.  1.  1. -1.]
 [-1.  1.  0.  0. -1. -1.  1.]
 [ 1. -1. -1.  1. -1. -1.  1.]]
ACTION: 2 REWARD: -0.5
Qs:
[[-5.861251  -3.8523207 -1.5150576 -4.278887  -2.0828314 -3.3584843
  -5.640695 ]]
pred: 2
MEMORY:
[[-1.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  1.  0. -1.]
 [-1. -1.  0.  0.  1.  1. -1.]
 [-1.  1.  1. -1. -1. -1.  1.]
 [ 1. -1. -1.  1. -1. -1.  1.]]
ACTION: 5 REWARD: 0.0
Qs:
[[-5.976992  -4.106305  -1.7520717 -4.577087  -2.810274  -3.465646
  -6.580481 ]]
pred: 2
MEMORY:
[[-1.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.

pred: 1
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0. -1.  0.  0.  0.  0.]
 [ 1.  0.  1.  0.  0. -1.  0.]
 [ 1.  0.  1.  0.  0. -1.  0.]
 [-1.  1. -1. -1.  0. -1.  1.]]
ACTION: 2 REWARD: -1.25
Qs:
[[-3.6678176 -2.682684  -3.4216104 -3.231137  -3.3375857 -4.6140733
  -5.5041847]]
pred: 1
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  1.  0.  0.  0.  0.]
 [ 0.  0. -1.  0.  0.  0.  0.]
 [ 1.  0.  1.  0.  0. -1.  0.]
 [ 1.  0.  1.  0.  0. -1. -1.]
 [-1.  1. -1. -1.  0. -1.  1.]]
ACTION: 3 REWARD: -50
Qs:
[[-3.7823386 -2.295049  -3.5732918 -4.0147057 -3.6906946 -4.1991844
  -5.5236   ]]
pred: 1
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  1.  0.  0.  0.  0.]
 [ 0.  0. -1.  0.  0.  0.  0.]
 [ 1.  0.  1.  0.  0. -1.  0.]
 [ 1.  0.  1.  0.  0. -1. -1.]
 [-1.  1. -1. -1.  0. -1.  1.]]
ACTION: 3 REWARD: -1.0
Qs:
[[-3.7823386 -2.295049  -3.5732918 -4.0147057 -3.6906946 -4.1991844
  -5.5236   ]]
pred: 1
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0. 

pred: 3
MEMORY:
[[ 0.  0.  0.  0.  0. -1.  0.]
 [ 0.  0.  0.  0.  0. -1.  0.]
 [ 0.  0.  0.  0.  0.  1.  0.]
 [ 0.  0.  0.  0.  0.  1.  1.]
 [ 0.  1. -1.  0.  0. -1. -1.]
 [-1. -1. -1.  0.  1.  1.  1.]]
ACTION: 3 REWARD: 50
Qs:
[[-1.0912496  -2.7850413  -1.610027   -0.95672476 -1.2007095  -2.481782
  -2.2485557 ]]
pred: 3
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0. -1.  0.  0.  0.  0.  0.]]
ACTION: 1 REWARD: -0.25
Qs:
[[-0.8159964  -0.7810423   0.51443243 -0.44507203 -0.9356542  -1.2616115
  -1.7801524 ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  1.  0.  0.  0.  0.  0.]
 [ 0. -1.  0.  0.  0.  0. -1.]]
ACTION: 3 REWARD: -0.25
Qs:
[[-1.0233043  -0.67050225  1.1740668  -0.8673027  -0.76035744 -0.80012053
  -2.0949726 ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  

pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.]
 [-1. -1.  0.  0.  0. -1.  0.]
 [ 1. -1. -1.  0.  1.  1.  0.]]
ACTION: 5 REWARD: -1.25
Qs:
[[-1.5832127  -2.5828466  -0.04830833 -1.1347902  -1.6008668  -3.6278546
  -3.2016077 ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  1.  0.]
 [-1. -1.  0.  0.  0. -1.  0.]
 [ 1. -1. -1.  0.  1.  1. -1.]]
ACTION: 1 REWARD: 0.25
Qs:
[[ 0.13893251 -2.0355623   0.19514684 -0.9358814  -0.6279038  -2.5220432
  -3.059939  ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 1.  1.  0.  0.  0.  1.  0.]
 [-1. -1.  0.  0.  0. -1.  0.]
 [ 1. -1. -1. -1.  1.  1. -1.]]
ACTION: 2 REWARD: 0.5
Qs:
[[ 0.5306683  -2.6240876  -0.12602276 -1.408972   -1.079241   -2.5337842
  -3.92304   ]]
pred: 0
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  

[[-1.733421   -2.5412652  -1.8675569  -2.77079    -0.20188466 -1.731329
  -3.266324  ]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0. -1.  0.  0.  0.  0.]
 [ 0.  0.  1.  0. -1.  0.  0.]
 [-1. -1. -1.  0.  1.  1.  0.]
 [ 1. -1.  1.  1.  1. -1. -1.]]
ACTION: 6 REWARD: -2.75
Qs:
[[-1.3761256 -3.437717  -2.207181  -1.9935265 -0.9670838 -2.0686085
  -3.1595907]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0. -1.  0.  0.  0.  0.]
 [-1.  0.  1.  0. -1.  0.  0.]
 [-1. -1. -1.  0.  1.  1.  1.]
 [ 1. -1.  1.  1.  1. -1. -1.]]
ACTION: 2 REWARD: -3.25
Qs:
[[-2.4903233  -2.3492193  -1.5103974  -2.467307    0.22364417 -1.436166
  -3.2969432 ]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  1.  0.  0.  0.  0.]
 [ 0.  0. -1.  0.  0.  0.  0.]
 [-1.  0.  1.  0. -1. -1.  0.]
 [-1. -1. -1.  0.  1.  1.  1.]
 [ 1. -1.  1.  1.  1. -1. -1.]]
ACTION: 3 REWARD: 50
Qs:
[[-3.6387906 -2.004452  -1.6274143 -2.2238262  1.08

Qs:
[[-3.4225197 -3.5427089 -3.2664883 -6.6669416 -3.8302193 -3.0202513
  -6.0971746]]
pred: 5
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [-1.  0.  0.  0.  0. -1.  0.]
 [ 1.  0.  1.  0.  0. -1.  0.]]
ACTION: 3 REWARD: -0.5
Qs:
[[-2.6205354  -0.21737602  0.28802907 -2.2812495   0.2688858  -1.360456
  -1.9435185 ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.]
 [-1.  0.  0.  0.  0. -1. -1.]
 [ 1.  0.  1.  1.  0. -1. -1.]]
ACTION: 1 REWARD: -1.75
Qs:
[[-1.1684338   0.15636241  0.38482624 -1.011336    1.6056194  -1.4517106
  -1.854876  ]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0. -1.  0.  0.  0.]]
ACTION: 0 REWARD: -2.25
Qs:
[[-1.763893  -1.3727279 -1.0032021 -1.031254

 [-1.  0.  0. -1.  1.  1.  0.]]
ACTION: 5 REWARD: -2.5
Qs:
[[-2.6198647  -1.922064   -1.3889023  -1.0700715  -0.93549746 -3.266065
  -2.5663924 ]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0. -1.  0.  1.  0.]
 [-1.  0. -1. -1.  1.  1.  0.]]
ACTION: 4 REWARD: -4.0
Qs:
[[-2.6042094 -3.382234  -2.0504158 -2.1838462 -2.5647929 -3.4976063
  -3.7196999]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0. -1. -1.  0.]
 [ 0.  0.  0. -1.  1.  1.  0.]
 [-1.  0. -1. -1.  1.  1.  1.]]
ACTION: 1 REWARD: -6.0
Qs:
[[-5.142821  -4.466866  -3.4755409 -2.8581336 -4.311112  -5.3064113
  -5.05558  ]]
pred: 3
MEMORY:
[[0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 2. 2. 0.]
 [0. 0. 2. 2. 1. 1. 0.]
 [2. 1. 2. 2. 1. 1. 1.]]
ACTION: 2 REWARD: -3.75
Qs:
[[-6.0136876 -3.96054   -3.7723465 -2.

 [2. 1. 1. 1. 2. 1. 2.]]
ACTION: 5 REWARD: -0.75
Qs:
[[-1.9621322 -0.773871  -0.5134807  1.6436635  3.678266  -2.3580668
  -1.2289046]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [-1. -1.  0.  1.  0.  0.  0.]]
ACTION: 3 REWARD: 1.75
Qs:
[[-0.52625954 -0.8728821   0.10567854 -0.22861443 -0.003288   -0.578599
  -0.46210927]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  1.  0.  0.  0.]
 [-1. -1.  0.  1.  0. -1.  0.]]
ACTION: 3 REWARD: 4.5
Qs:
[[-0.35983464 -0.21553522  0.0084462  -0.59392035  0.6610407  -0.09829719
  -0.74574697]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  1.  0.  0.  0.]
 [ 0.  0.  0.  1.  0.  0.  0.]
 [-1. -1. -1.  1.  0. -1.  0.]]
ACTION: 1 REWARD: 4.75
Q

ACTION: 0 REWARD: -10
Qs:
[[-1.5110185 -4.6528525 -3.589868  -5.285574  -1.9530582 -1.4360316
  -6.629963 ]]
pred: 5
MEMORY:
[[0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 2. 0. 0. 0. 0.]]
ACTION: 0 REWARD: -1.5
Qs:
[[-2.9315464 -1.6577369 -0.4673646 -1.341218  -0.8400289 -1.4491274
  -1.4179529]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [-1.  0.  0.  0.  0.  0.  0.]
 [ 1.  0. -1.  0.  0.  0.  0.]]
ACTION: 5 REWARD: -2.0
Qs:
[[-1.6351074  -1.3840102  -0.88298357 -2.7765357  -1.603791   -1.3537481
  -2.8655777 ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [-1.  0.  1.  0.  0. -1.  0.]
 [ 1.  0. -1.  0.  0.  1. -1.]]
ACTION: 3 REWARD: -0.25
Qs:
[[-0.96839356  0.75325596  1.6550863  -1.7153295   0.00772069 -0.79812

 [-1. -1.  1.  1.  1. -1.  1.]]
ACTION: 4 REWARD: -0.75
Qs:
[[-5.689909  -0.6831122  1.3781449 -1.7433677  1.621436  -2.4972267
  -1.9811821]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [-1.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0. -1.]
 [ 1.  0.  0.  0.  0.  0. -1.]]
ACTION: 1 REWARD: -1.75
Qs:
[[-2.935034  -3.3634963 -0.7025631 -1.9306824 -1.3960934 -2.7553384
  -3.764586 ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [-1.  0.  0.  0.  0.  0. -1.]
 [ 1.  0.  0.  0.  0.  0. -1.]
 [ 1.  1.  0.  0.  0.  0. -1.]]
ACTION: 1 REWARD: -50
Qs:
[[-3.5681167 -4.830001  -2.0045972 -1.8920555 -1.7710783 -3.942142
  -4.387488 ]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [-1.  0.  0.  0.  0.  0. -1.]
 [ 1.  0.  0.  0.  0.  0. -1.]
 [ 1.  1.  0.  0.  0.  0. -1.]]
ACTION: 1 REWARD: -1.0
Qs:
[[

[[-2.925027  -1.6956111 -1.8757433 -1.8470972 -2.0380654 -2.687025
  -3.1060796]]
pred: 1
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [-1.  0.  0.  0.  0.  0.  0.]
 [ 1.  0. -1. -1.  0.  0.  1.]]
ACTION: 1 REWARD: -2.25
Qs:
[[-3.8293357 -2.442046  -2.339233  -3.6363788 -2.9724393 -2.6957016
  -4.1254196]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [-1.  0.  0.  0.  0.  0. -1.]
 [ 1.  1. -1. -1.  0.  0.  1.]]
ACTION: 6 REWARD: -2.75
Qs:
[[-4.3777795 -3.0144987 -3.8969235 -4.1047664 -2.6901963 -3.4948096
  -4.4226236]]
pred: 4
MEMORY:
[[0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1.]
 [2. 2. 0. 0. 0. 0. 2.]
 [1. 1. 2. 2. 0. 0. 1.]]
ACTION: 3 REWARD: -0.75
Qs:
[[-5.2508388 -5.867814  -1.1821686 -1.4568853 -2.4911892 -4.8090835
  -3.4186726]]
pred: 2
MEMORY:
[[

[[-4.798081  -4.852295  -3.5571258 -2.3235242 -1.6384368 -4.1889586
  -3.407333 ]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  1.  0.  0.  1. -1.]
 [ 0.  0. -1. -1.  0. -1.  1.]]
ACTION: 1 REWARD: 0.0
Qs:
[[-3.4295034 -2.9042842 -2.6562338 -2.9803479 -3.0172796 -3.2705564
  -4.334401 ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0. -1.]
 [ 0.  0.  1.  0.  0.  1. -1.]
 [ 0.  1. -1. -1.  0. -1.  1.]]
ACTION: 0 REWARD: -1.0
Qs:
[[-3.6494224 -2.9331489 -3.3954628 -3.4160433 -2.7126672 -3.791869
  -4.786387 ]]
pred: 4
MEMORY:
[[0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 2. 2.]
 [0. 2. 1. 0. 0. 1. 2.]
 [1. 1. 2. 2. 0. 2. 1.]]
ACTION: 3 REWARD: -0.75
Qs:
[[-3.608088  -4.564905  -2.9523609 -4.5417943 -3.5448804 -3.2117777
  -4.025054 ]]
pred: 2
MEMORY:
[[ 0.

ACTION: 1 REWARD: 0.25
Qs:
[[-0.3695379   0.20479846 -1.8155249  -2.033684    2.3185005  -1.4586612
  -2.7334228 ]]
pred: 4
MEMORY:
[[0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 2. 0. 0. 0. 0.]]
ACTION: 1 REWARD: -1.0
Qs:
[[-2.9315464 -1.6577369 -0.4673646 -1.341218  -0.8400289 -1.4491274
  -1.4179529]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  1.  0.  0.  0.  0.]
 [-1.  1. -1.  0.  0.  0. -1.]]
ACTION: 3 REWARD: -0.75
Qs:
[[-0.5634904  -0.13369733 -1.161851   -1.0988988   0.6316974  -0.6246902
  -1.3999091 ]]
pred: 4
MEMORY:
[[0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 2. 0. 0. 0. 0.]
 [0. 0. 2. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0.]
 [2. 1. 2. 1. 0. 1. 2.]]
ACTION: 1 REWARD: -2.0
Qs:
[[-6.949621   -6.7115335  -3.751149   -0.52375716 -1.8224679  -8.661851
  -6.296606  ]]
pred: 3
MEMORY:
[

pred: 3
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0. -1.  0.  0.]]
ACTION: 0 REWARD: -1.5
Qs:
[[-1.6178929  -1.0849923  -0.28347576 -0.3953192  -0.7436177  -1.5500398
  -1.5076101 ]]
pred: 2
MEMORY:
[[0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [2. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 2. 0. 0.]]
ACTION: 1 REWARD: -1.5
Qs:
[[-2.872283   -4.1740108  -2.4986432  -0.80776834 -0.97445506 -5.0069704
  -4.4006248 ]]
pred: 3
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [-1.  0.  0.  0.  0.  0.  0.]
 [-1.  0.  0.  0.  0.  0.  0.]
 [ 1.  1.  0.  0. -1.  0.  0.]]
ACTION: 1 REWARD: -0.75
Qs:
[[-3.8467858 -1.3958174 -0.7706284 -3.35361   -1.0937119 -2.0343735
  -3.329949 ]]
pred: 2
MEMORY:
[[0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [

ACTION: 2 REWARD: -1.25
Qs:
[[-0.13848308 -0.8494602   0.07653002  0.22991036  2.0218134  -0.4241069
  -0.1681841 ]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0. -1.  0.  0.  0.  0.]]
ACTION: 1 REWARD: -2.25
Qs:
[[-0.80901515 -1.3022456  -0.84834325 -0.8192767  -0.8225403  -1.4638016
  -1.8977557 ]]
pred: 0
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  1. -1.  0.  0.  0.  0.]
 [ 0.  1. -1.  0.  0.  0. -1.]]
ACTION: 5 REWARD: -1.25
Qs:
[[-0.36743402 -1.4676024  -0.7336234  -1.2720134  -0.05075547 -0.56941354
  -2.1308126 ]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  1. -1.  0.  0.  0.  0.]
 [ 0.  1. -1. -1.  0.  1. -1.]]
ACTION: 2 REWARD: -0.5
Qs:
[[-0.8982830

 [-1.  1. -1.  1.  0.  1.  1.]]
ACTION: 4 REWARD: 50
Qs:
[[-5.8840985  -4.495026   -3.71292    -0.07643439  3.2912855  -6.556233
  -3.864317  ]]
pred: 4
MEMORY:
[[ 0.  0. -1.  0.  0.  0.  0.]
 [ 0.  0.  1. -1.  0.  0.  0.]
 [ 0.  0.  1. -1.  0.  1. -1.]
 [-1.  0.  1.  1.  0.  1. -1.]
 [ 1. -1. -1. -1.  0. -1. -1.]
 [-1.  1. -1.  1.  0.  1.  1.]]
ACTION: 4 REWARD: -1.0
Qs:
[[-5.8840985  -4.495026   -3.71292    -0.07643439  3.2912855  -6.556233
  -3.864317  ]]
pred: 4
MEMORY:
[[0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 2. 0. 0. 0. 0. 0.]]
ACTION: 4 REWARD: -0.75
Qs:
[[-2.3988702  -1.7177547  -1.9467471  -0.8874283   0.37287128 -2.94737
  -2.3015108 ]]
pred: 4
MEMORY:
[[0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0.]
 [0. 2. 0. 0. 1. 2. 2.]]
ACTION: 3 REWARD: -1.25
Qs:
[[-5.1926165e+00 -4.5462208e+00 -3.1910493e+00 -1.1411068e+00
   6.73679

[[-2.0337696  -1.7436182  -0.61295044 -4.0742164  -4.159082   -2.0398562
  -3.9448767 ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0. -1.  0. -1.]
 [ 0.  0.  0.  0.  1.  0.  1.]
 [ 0.  0.  0.  0. -1.  0. -1.]
 [ 1.  0.  0.  1. -1. -1.  1.]
 [ 1. -1. -1.  1.  1. -1.  1.]
 [-1. -1.  1. -1.  1.  1. -1.]]
ACTION: 0 REWARD: -0.75
Qs:
[[-3.0309749 -1.6172825  1.7221516 -2.0871105 -1.2931631 -2.4739141
  -2.1600013]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0. -1.  0. -1.]
 [ 0.  0.  0.  0.  1.  0.  1.]
 [ 1.  0.  0.  0. -1. -1. -1.]
 [ 1.  0.  0.  1. -1. -1.  1.]
 [ 1. -1. -1.  1.  1. -1.  1.]
 [-1. -1.  1. -1.  1.  1. -1.]]
ACTION: 4 REWARD: -10
Qs:
[[-2.293146  -2.294445   0.7141705 -2.4021635 -2.252959  -2.804759
  -3.1577082]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0. -1.  0. -1.]
 [ 0.  0.  0.  0.  1.  0.  1.]
 [ 1.  0.  0.  0. -1. -1. -1.]
 [ 1.  0.  0.  1. -1. -1.  1.]
 [ 1. -1. -1.  1.  1. -1.  1.]
 [-1. -1.  1. -1.  1.  1. -1.]]
ACTION: 4 REWARD: -10
Qs:
[[-2.293146  -2.294445   0.7141705 -2.4021635 -2.252959  

 [2. 1. 2. 2. 1. 2. 1.]]
ACTION: 2 REWARD: 3.25
Qs:
[[-3.6216245  -6.6874437  -0.29448724 -2.7692442  -3.3158767  -5.073983
  -3.0747833 ]]
pred: 2
MEMORY:
[[0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 2. 0. 0. 0.]
 [0. 2. 0. 1. 0. 0. 0.]
 [2. 1. 1. 1. 0. 1. 2.]
 [2. 2. 2. 1. 2. 1. 1.]
 [2. 1. 2. 2. 1. 2. 1.]]
ACTION: 3 REWARD: 0.0
Qs:
[[-2.3086822 -4.7118807  1.1408207 -2.0125237 -2.5116315 -3.365783
  -1.8530225]]
pred: 2
MEMORY:
[[ 0.  0.  0.  1.  0.  0.  0.]
 [ 0.  0.  0. -1.  0.  0.  0.]
 [ 0. -1.  0.  1.  0.  0.  0.]
 [-1.  1.  1.  1. -1.  1. -1.]
 [-1. -1. -1.  1. -1.  1.  1.]
 [-1.  1. -1. -1.  1. -1.  1.]]
ACTION: 1 REWARD: -0.5
Qs:
[[-4.088519  -2.8932142 -3.5525382 -2.7502687 -0.5982384 -3.437642
  -4.023217 ]]
pred: 4
MEMORY:
[[ 0.  0.  0.  1.  0.  0.  0.]
 [ 0.  1.  0. -1.  0.  0.  0.]
 [ 0. -1.  0.  1. -1.  0.  0.]
 [-1.  1.  1.  1. -1.  1. -1.]
 [-1. -1. -1.  1. -1.  1.  1.]
 [-1.  1. -1. -1.  1. -1.  1.]]
ACTION: 2 REWARD: -1.25
Qs:
[[-6.0471034 -4.801956  -4.7368994 -3.6700287 -2

MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  1.  0.  0.  0.]
 [ 0.  0.  0. -1.  0.  0.  0.]
 [ 0. -1.  0. -1.  0.  1.  0.]
 [ 0. -1.  1.  1.  1. -1. -1.]]
ACTION: 4 REWARD: -2.25
Qs:
[[-1.8619035 -2.6590552 -2.6115775 -3.30116   -0.9520342 -1.1766316
  -3.134706 ]]
pred: 4
MEMORY:
[[0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0.]
 [0. 2. 0. 2. 2. 2. 0.]
 [0. 2. 0. 2. 1. 1. 1.]
 [0. 2. 1. 1. 1. 2. 2.]]
ACTION: 4 REWARD: -3.25
Qs:
[[-0.4651857  -2.786004   -0.3739553   0.9524574   3.4143503  -2.4338284
   0.07014531]]
pred: 4
MEMORY:
[[0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 1. 0. 0.]
 [0. 2. 0. 2. 2. 2. 0.]
 [0. 2. 2. 2. 1. 1. 1.]
 [0. 2. 1. 1. 1. 2. 2.]]
ACTION: 1 REWARD: -1.0
Qs:
[[-0.84425294 -1.8650736   0.27367187  1.0536759   3.9761426  -2.121455
   0.5936878 ]]
pred: 4
MEMORY:
[[0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0.]
 [0. 1. 0. 1. 1. 0. 0.]
 [0. 2. 0. 2. 2. 2. 0.]
 [0. 2. 2. 2. 1. 1. 1.]
 [2.

pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0. -1.]
 [ 0.  1.  0.  0.  0.  0. -1.]]
ACTION: 2 REWARD: -0.75
Qs:
[[-1.6908476  -1.4316728  -1.5656551  -1.244625   -0.07193656 -1.7301157
  -1.9469675 ]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0. -1.]
 [ 0.  0.  0.  0.  0.  0. -1.]
 [ 0.  1.  1.  0.  0.  0. -1.]]
ACTION: 3 REWARD: -0.25
Qs:
[[-2.5403695 -2.8433669 -1.3615556 -1.589129  -1.238892  -2.424816
  -2.6052513]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0. -1.]
 [ 0.  0.  0.  0.  0.  0. -1.]
 [-1.  1.  1.  1.  0.  0. -1.]]
ACTION: 0 REWARD: -0.75
Qs:
[[-2.0454295  -2.002363   -0.96090424 -1.0153471  -0.02223101 -2.0131755
  -1.609823  ]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]


pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0. -1.  0.]
 [ 0.  0.  0.  0.  0.  1.  0.]
 [ 0.  0.  0.  0.  0. -1.  0.]
 [-1.  0.  1.  0.  0.  1. -1.]]
ACTION: 6 REWARD: -1.0
Qs:
[[-1.4382449  -0.65989125  0.43668127 -0.85117495  0.19701216 -1.7991943
  -1.4082072 ]]
pred: 2
MEMORY:
[[0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 2. 0.]
 [0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 2. 1.]
 [2. 0. 1. 2. 0. 1. 2.]]
ACTION: 1 REWARD: -2.0
Qs:
[[-6.7046895 -7.5849466 -4.8336906 -4.2369766 -4.7598147 -5.6919713
  -6.526525 ]]
pred: 3
MEMORY:
[[0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 2. 0.]
 [0. 0. 0. 0. 0. 1. 0.]
 [0. 2. 0. 0. 0. 2. 1.]
 [2. 1. 1. 2. 0. 1. 2.]]
ACTION: 0 REWARD: -2.0
Qs:
[[-6.0702376 -7.562297  -4.445973  -4.574625  -5.436267  -4.7776875
  -5.773178 ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0. -1.  0.]
 [ 0.  0.  0.  0.  0. -1.  0.]
 [ 1.  0.  0.  0.  0.  1.  0.]

pred: 3
MEMORY:
[[ 0.  0. -1.  0.  0.  0.  0.]
 [ 0.  0.  1.  0.  0.  0.  0.]
 [ 0.  0. -1.  0.  0.  0.  0.]
 [ 0.  0.  1.  0. -1.  0. -1.]
 [ 0.  0. -1.  0.  1.  0.  1.]
 [-1.  0.  1.  1. -1.  1. -1.]]
ACTION: 0 REWARD: -2.0
Qs:
[[-2.9794724 -4.3889337 -2.2843723 -0.523727  -1.1888036 -3.0134995
  -2.7362747]]
pred: 3
MEMORY:
[[ 0.  0. -1.  0.  0.  0.  0.]
 [ 0.  0.  1.  0.  0.  0.  0.]
 [ 0.  0. -1.  0. -1.  0.  0.]
 [ 0.  0.  1.  0. -1.  0. -1.]
 [ 1.  0. -1.  1.  1. -1.  1.]
 [-1.  0.  1.  1. -1.  1. -1.]]
ACTION: 2 REWARD: -10
Qs:
[[-3.3746707  -4.0952287  -0.8411392  -0.44349897 -1.6551675  -2.912065
  -2.545395  ]]
pred: 3
MEMORY:
[[0. 0. 2. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 2. 0. 2. 0. 0.]
 [0. 0. 1. 0. 2. 2. 2.]
 [1. 1. 2. 1. 1. 2. 1.]
 [2. 2. 1. 1. 2. 1. 2.]]
ACTION: 3 REWARD: -2.25
Qs:
[[-3.7053535  -3.6319718  -3.509353   -2.4516587   0.52672917 -3.4264607
  -2.8076773 ]]
pred: 4
MEMORY:
[[ 0.  0. -1.  0.  0.  0.  0.]
 [ 0.  0.  1.  0.  0.  0.  0.]
 [ 0.  0. -1. 

pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [-1.  0.  0.  0.  0.  0.  0.]
 [ 1.  1. -1.  0.  0. -1.  0.]]
ACTION: 6 REWARD: -2.5
Qs:
[[-1.9825077 -1.222917  -2.6234066 -3.7739055 -1.0980078 -1.3567743
  -3.4747472]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [-1.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  1.]
 [-1.  0.  0.  0.  0. -1. -1.]
 [ 1.  1. -1.  0.  0. -1.  1.]]
ACTION: 0 REWARD: -2.5
Qs:
[[-4.4249067 -1.2467989 -3.6725314 -4.5253096 -2.0671458 -3.9428332
  -4.381629 ]]
pred: 1
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.]
 [-1.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  1.]
 [-1.  0.  0.  0.  0. -1. -1.]
 [ 1.  1. -1.  0. -1. -1.  1.]]
ACTION: 3 REWARD: -3.0
Qs:
[[-5.3814654 -2.260706  -3.3047585 -4.7214313 -2.7868047 -4.606742
  -4.864694 ]]
pred: 1
MEMORY:
[[0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.

  -4.652357 ]]
pred: 4
MEMORY:
[[0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 2. 2. 0.]
 [0. 1. 1. 2. 1. 2. 0.]]
ACTION: 4 REWARD: -1.0
Qs:
[[-3.6498318  -5.884809   -3.4510665  -0.33562204 -0.24700242 -5.1770406
  -2.4539256 ]]
pred: 4
MEMORY:
[[0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 2. 2. 0.]
 [0. 1. 1. 2. 1. 2. 2.]]
ACTION: 6 REWARD: -0.25
Qs:
[[-5.052432   -6.9803452  -4.4665475  -0.5131167   0.07475825 -6.774148
  -3.2124372 ]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  1.  0.  0.]
 [ 0.  0.  0.  0.  1.  0.  0.]
 [ 0. -1.  0.  0. -1. -1.  1.]
 [-1.  1.  1. -1.  1. -1. -1.]]
ACTION: 1 REWARD: 0.0
Qs:
[[-0.9371409  0.3297722 -1.3208691 -1.872622   1.7353616 -0.8804563
  -2.371975 ]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  1.  0.  0.]
 [ 0.  1. 

[[-3.9839518 -3.810488  -1.9180838 -3.3608909 -3.0325317 -3.290563
  -4.995608 ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0. -1.  0.  0.]
 [ 0.  1.  0.  0. -1.  0.  0.]
 [ 1. -1.  1. -1.  1.  0. -1.]]
ACTION: 2 REWARD: -3.75
Qs:
[[-3.2969933  -3.855599   -0.68793464 -3.378504   -3.3906946  -2.6587043
  -4.832839  ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0. -1.  0.  0.]
 [ 0.  0.  0.  0. -1.  0.  0.]
 [ 0.  1.  1.  0. -1.  0.  0.]
 [ 1. -1.  1. -1.  1.  0. -1.]]
ACTION: 3 REWARD: -50
Qs:
[[-4.492817  -4.1733885 -1.2299519 -4.2966533 -4.083017  -3.51511
  -5.9635706]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0. -1.  0.  0.]
 [ 0.  0.  0.  0. -1.  0.  0.]
 [ 0.  1.  1.  0. -1.  0.  0.]
 [ 1. -1.  1. -1.  1.  0. -1.]]
ACTION: 3 REWARD: -1.25
Qs:
[[-4.492817  -4.1733885 -1.2299519 -4.2966533 -4.083017  

Qs:
[[-3.8804364 -4.6127996 -1.8130914  0.3318266  1.3689017 -3.5820699
  -1.9347186]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0. -1.  0.]
 [ 0.  0.  0. -1.  0.  1.  0.]
 [ 1.  0.  0. -1.  1.  1.  0.]
 [ 1. -1. -1. -1.  1. -1.  0.]
 [-1. -1.  1.  1.  1. -1.  0.]]
ACTION: 5 REWARD: -2.5
Qs:
[[-1.4076185 -4.32295   -3.6149187 -1.4572657 -1.0597749 -4.17954
  -3.9040542]]
pred: 4
MEMORY:
[[0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 2. 0.]
 [0. 0. 0. 2. 0. 1. 0.]
 [1. 0. 0. 2. 1. 1. 0.]
 [1. 2. 2. 2. 1. 2. 0.]
 [2. 2. 1. 1. 1. 2. 2.]]
ACTION: 3 REWARD: -0.75
Qs:
[[-5.7968807  -4.6396523  -2.2432451   0.18699168  2.4346929  -5.736641
  -2.7476    ]]
pred: 4
MEMORY:
[[0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 2. 0.]
 [0. 0. 0. 2. 0. 1. 0.]
 [1. 2. 0. 2. 1. 1. 0.]
 [1. 2. 2. 2. 1. 2. 0.]
 [2. 2. 1. 1. 1. 2. 2.]]
ACTION: 0 REWARD: -1.5
Qs:
[[-3.0546026  -2.050498    0.445607    0.46449447  2.5687716  -3.210833
  -0.9063983 ]]
pred: 4
MEMORY:
[[ 0.  0.  0. -1.  0.  1.  0.]
 [

Qs:
[[-3.295144   -3.3462007   0.6278786  -0.79130405  1.6767905  -2.4664776
  -0.7912789 ]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0. -1.  0.  0.  0.]
 [ 0.  0.  0.  1.  0.  0.  0.]
 [ 0.  0.  0. -1.  0. -1.  0.]
 [ 0.  0.  0. -1.  0.  1.  1.]
 [ 0.  1. -1. -1.  1. -1.  1.]]
ACTION: 2 REWARD: -3.25
Qs:
[[-6.5205665 -4.5857635 -5.965918  -5.949372  -4.0638833 -4.526584
  -5.9691253]]
pred: 4
MEMORY:
[[0. 0. 0. 2. 0. 0. 0.]
 [0. 0. 0. 2. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 2. 0. 2. 0.]
 [0. 0. 1. 2. 0. 1. 1.]
 [0. 1. 2. 2. 1. 2. 1.]]
ACTION: 3 REWARD: -10
Qs:
[[-4.7444944  -5.0503087  -1.6497811  -0.42394564  0.7358011  -4.903338
  -1.9435651 ]]
pred: 4
MEMORY:
[[0. 0. 0. 2. 0. 0. 0.]
 [0. 0. 0. 2. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 2. 2. 0. 2. 0.]
 [0. 0. 1. 2. 0. 1. 1.]
 [0. 1. 2. 2. 1. 2. 1.]]
ACTION: 5 REWARD: -6.25
Qs:
[[-5.6725283  -5.467633   -2.3850245   0.47462124  1.0126209  -5.6612635
  -2.5806441 ]]
pred: 4
MEMORY:
[[ 0.  0.  0. -1.  0.  0.

ACTION: 0 REWARD: -10
Qs:
[[-5.731345  -2.4946132 -5.7480884 -7.0976973 -3.5646627 -4.4160037
  -5.7714815]]
pred: 1
MEMORY:
[[0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 2. 2. 0. 0. 0. 1.]]
ACTION: 4 REWARD: -1.0
Qs:
[[-5.130168   -4.277314   -2.71814    -0.86246765 -0.6499171  -5.2221313
  -3.115902  ]]
pred: 4
MEMORY:
[[0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 2.]
 [0. 2. 2. 0. 1. 0. 1.]]
ACTION: 5 REWARD: -2.0
Qs:
[[-4.6354356  -3.2529895  -1.7220739  -0.37001908  1.2589831  -4.928193
  -2.3280077 ]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0. -1.]
 [ 0. -1. -1. -1.  1.  1.  1.]]
ACTION: 6 REWARD: -2.5
Qs:
[[-2.988013  -3.2358317 -1.4993824 -1.1234185 -2.1745093 -4.0757923
  -3.9937644]]
pred: 3
MEMORY:
[[0. 0. 0

Qs:
[[-2.5002801  -3.5157762   1.8057795   0.17810105  0.17208081 -4.6298156
  -3.1104507 ]]
pred: 2
MEMORY:
[[2. 0. 0. 0. 0. 0. 0.]
 [2. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 2. 1. 0. 1. 0. 0.]
 [1. 2. 1. 2. 2. 1. 0.]
 [2. 2. 1. 2. 1. 2. 0.]]
ACTION: 0 REWARD: -10
Qs:
[[-0.90211636 -3.3478937   1.2592735   0.05156099  0.30468377 -4.389853
  -3.227375  ]]
pred: 2
MEMORY:
[[2. 0. 0. 0. 0. 0. 0.]
 [2. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 2. 1. 2. 1. 0. 0.]
 [1. 2. 1. 2. 2. 1. 0.]
 [2. 2. 1. 2. 1. 2. 0.]]
ACTION: 5 REWARD: -50
Qs:
[[-1.553814   -2.2539935   2.6221092   0.66603297  3.0193765  -3.7515879
  -1.5403975 ]]
pred: 4
MEMORY:
[[2. 0. 0. 0. 0. 0. 0.]
 [2. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 2. 1. 2. 1. 0. 0.]
 [1. 2. 1. 2. 2. 1. 0.]
 [2. 2. 1. 2. 1. 2. 0.]]
ACTION: 5 REWARD: -1.75
Qs:
[[-1.553814   -2.2539935   2.6221092   0.66603297  3.0193765  -3.7515879
  -1.5403975 ]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]


pred: 5
MEMORY:
[[0. 2. 2. 0. 2. 0. 0.]
 [0. 1. 2. 0. 2. 0. 0.]
 [0. 2. 1. 2. 1. 1. 1.]
 [2. 1. 2. 2. 1. 1. 2.]
 [1. 1. 2. 1. 2. 2. 1.]
 [2. 1. 2. 2. 2. 1. 1.]]
ACTION: 1 REWARD: -10
Qs:
[[-1.3851163  -2.4153326  -0.75498927  1.3652213   3.377332   -2.5523834
  -0.31205332]]
pred: 4
MEMORY:
[[ 0. -1. -1.  0. -1.  0.  0.]
 [ 0.  1. -1.  0. -1.  0. -1.]
 [ 0. -1.  1. -1.  1.  1.  1.]
 [-1.  1. -1. -1.  1.  1. -1.]
 [ 1.  1. -1.  1. -1. -1.  1.]
 [-1.  1. -1. -1. -1.  1.  1.]]
ACTION: 2 REWARD: -10
Qs:
[[-3.3625426 -5.357221  -2.3925383 -5.913336  -7.296691  -2.1535444
  -6.1493382]]
pred: 5
MEMORY:
[[0. 2. 2. 0. 2. 0. 0.]
 [0. 1. 2. 0. 2. 0. 2.]
 [2. 2. 1. 2. 1. 1. 1.]
 [2. 1. 2. 2. 1. 1. 2.]
 [1. 1. 2. 1. 2. 2. 1.]
 [2. 1. 2. 2. 2. 1. 1.]]
ACTION: 3 REWARD: 50
Qs:
[[-0.81086075 -3.3519638  -1.4075481   1.0972477   2.5491629  -2.1634388
  -0.37281013]]
pred: 4
MEMORY:
[[0. 2. 2. 0. 2. 0. 0.]
 [0. 1. 2. 0. 2. 0. 2.]
 [2. 2. 1. 2. 1. 1. 1.]
 [2. 1. 2. 2. 1. 1. 2.]
 [1. 1. 2. 1. 2. 2. 1.]
 

ACTION: 3 REWARD: 0.25
Qs:
[[-5.9312487 -5.155564  -4.114111  -2.807221  -2.0907598 -5.6480355
  -5.0401464]]
pred: 4
MEMORY:
[[0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 2.]
 [2. 0. 0. 1. 1. 0. 2.]]
ACTION: 6 REWARD: 1.0
Qs:
[[-5.8129745 -5.495607  -3.6701314 -3.4865086 -3.0820172 -6.273945
  -5.6213927]]
pred: 4
MEMORY:
[[0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 2.]
 [0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 2.]
 [2. 0. 0. 1. 1. 0. 2.]]
ACTION: 5 REWARD: 0.0
Qs:
[[-6.2947593 -6.9618144 -5.065525  -4.1416144 -3.9319148 -6.491307
  -6.5545163]]
pred: 4
MEMORY:
[[0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 2.]
 [0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 2. 0. 2.]
 [2. 0. 0. 1. 1. 1. 2.]]
ACTION: 0 REWARD: 0.0
Qs:
[[-5.149252  -6.9212637 -4.4294662 -3.5393023 -4.4613557 -6.6090646
  -6.592643 ]]
pred: 3
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0

pred: 2
MEMORY:
[[0. 0. 0. 0. 0. 0. 1.]
 [1. 0. 0. 2. 1. 0. 2.]
 [2. 2. 1. 2. 1. 1. 2.]
 [1. 2. 2. 2. 1. 2. 1.]
 [1. 1. 2. 1. 2. 1. 1.]
 [2. 2. 2. 1. 2. 2. 1.]]
ACTION: 5 REWARD: 0.25
Qs:
[[ 0.7049698  -2.0725727  -1.1865717   0.6345569   4.8152766  -1.0620747
  -0.46654797]]
pred: 4
MEMORY:
[[-1.  0.  0.  0.  0.  0.  1.]
 [ 1.  0.  0. -1.  1.  1. -1.]
 [-1. -1.  1. -1.  1.  1. -1.]
 [ 1. -1. -1. -1.  1. -1.  1.]
 [ 1.  1. -1.  1. -1.  1.  1.]
 [-1. -1. -1.  1. -1. -1.  1.]]
ACTION: 4 REWARD: 50
Qs:
[[-8.769889  -8.139695  -4.9026237 -7.5884223 -6.7298255 -6.7572722
  -7.1347575]]
pred: 2
MEMORY:
[[-1.  0.  0.  0.  0.  0.  1.]
 [ 1.  0.  0. -1.  1.  1. -1.]
 [-1. -1.  1. -1.  1.  1. -1.]
 [ 1. -1. -1. -1.  1. -1.  1.]
 [ 1.  1. -1.  1. -1.  1.  1.]
 [-1. -1. -1.  1. -1. -1.  1.]]
ACTION: 4 REWARD: -1.25
Qs:
[[-8.769889  -8.139695  -4.9026237 -7.5884223 -6.7298255 -6.7572722
  -7.1347575]]
pred: 2
MEMORY:
[[0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0.

ACTION: 1 REWARD: -0.75
Qs:
[[-5.0624533 -4.5858355 -3.287505  -1.1149545 -0.8040785 -5.0788236
  -3.216156 ]]
pred: 4
MEMORY:
[[0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0.]
 [0. 1. 2. 2. 0. 0. 2.]]
ACTION: 5 REWARD: -1.5
Qs:
[[-4.923205   -4.295577   -2.3083956  -0.9332511   0.49395823 -4.833579
  -2.3576922 ]]
pred: 4
MEMORY:
[[0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 2. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0.]
 [0. 1. 2. 2. 0. 1. 2.]]
ACTION: 0 REWARD: -2.0
Qs:
[[-2.7695558 -2.4213998 -1.5474012 -0.5151534  0.6963486 -3.6631722
  -1.3242248]]
pred: 4
MEMORY:
[[0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 2. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 2.]
 [1. 1. 2. 2. 0. 1. 2.]]
ACTION: 5 REWARD: -1.75
Qs:
[[-3.5462024 -3.230819  -1.2043194 -1.3969815 -1.6083474 -4.369277
  -2.7332234]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0

pred: 4
MEMORY:
[[0. 0. 0. 0. 0. 0. 0.]
 [0. 2. 0. 0. 0. 0. 0.]
 [0. 1. 2. 0. 2. 0. 1.]
 [0. 1. 1. 0. 2. 1. 2.]
 [0. 1. 1. 0. 2. 2. 2.]
 [1. 2. 2. 0. 1. 1. 2.]]
ACTION: 3 REWARD: -50
Qs:
[[-5.382389  -2.7630134 -3.0335045 -2.808989  -1.1533469 -5.1223803
  -4.805076 ]]
pred: 4
MEMORY:
[[0. 0. 0. 0. 0. 0. 0.]
 [0. 2. 0. 0. 0. 0. 0.]
 [0. 1. 2. 0. 2. 0. 1.]
 [0. 1. 1. 0. 2. 1. 2.]
 [0. 1. 1. 0. 2. 2. 2.]
 [1. 2. 2. 0. 1. 1. 2.]]
ACTION: 3 REWARD: -0.75
Qs:
[[-5.382389  -2.7630134 -3.0335045 -2.808989  -1.1533469 -5.1223803
  -4.805076 ]]
pred: 4
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0. -1.]]
ACTION: 6 REWARD: -1.0
Qs:
[[-0.5866834  -0.63385135  0.22351299 -0.18692547  0.53011113 -0.656272
  -1.0228806 ]]
pred: 4
MEMORY:
[[0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1.]
 [0.

[[-5.2917557 -6.1673865 -3.871925  -0.0214913  1.3338301 -8.077615
  -3.9906535]]
pred: 4
MEMORY:
[[0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 2. 0. 0.]
 [0. 0. 0. 0. 2. 0. 0.]
 [2. 0. 0. 0. 1. 2. 0.]
 [1. 1. 2. 0. 2. 1. 0.]
 [1. 1. 2. 0. 2. 2. 1.]]
ACTION: 2 REWARD: -2.0
Qs:
[[-4.0709634  -6.2788634  -2.785205   -0.22459574 -0.31133857 -7.596698
  -3.63771   ]]
pred: 3
MEMORY:
[[0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 2. 0. 0.]
 [0. 0. 0. 0. 2. 2. 0.]
 [2. 0. 1. 0. 1. 2. 0.]
 [1. 1. 2. 0. 2. 1. 0.]
 [1. 1. 2. 0. 2. 2. 1.]]
ACTION: 3 REWARD: -1.0
Qs:
[[-3.0611165  -6.0315366  -0.52749926 -0.32244834 -1.0428774  -7.2971625
  -3.7390332 ]]
pred: 3
MEMORY:
[[0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 2. 0. 0.]
 [0. 0. 0. 0. 2. 2. 0.]
 [2. 0. 1. 0. 1. 2. 0.]
 [1. 1. 2. 2. 2. 1. 0.]
 [1. 1. 2. 1. 2. 2. 1.]]
ACTION: 2 REWARD: 0.5
Qs:
[[-2.7621117  -4.9854803  -0.7804378  -0.10281233 -0.3818409  -5.8395243
  -2.7948217 ]]
pred: 3
MEMORY:
[[ 0.  0.  0.  0.  1.  0.  0.]
 [ 0.  0.  0.  0. -1. -1.  0.]
 [ 0.  0.  

 [ 0.  0. -1.  0.  0. -1.  0.]]
ACTION: 0 REWARD: -0.75
Qs:
[[-1.9101537 -2.6519136 -1.5447562 -1.533393  -1.7229924 -2.393
  -3.017819 ]]
pred: 3
MEMORY:
[[0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 2. 0.]
 [2. 0. 1. 0. 0. 1. 0.]
 [1. 0. 2. 0. 0. 2. 0.]]
ACTION: 2 REWARD: 0.75
Qs:
[[-5.441505  -6.0646653 -4.85792   -4.5217266 -4.107489  -4.8781967
  -5.955554 ]]
pred: 4
MEMORY:
[[0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [2. 0. 1. 0. 0. 2. 0.]
 [2. 0. 1. 0. 0. 1. 0.]
 [1. 0. 2. 0. 0. 2. 0.]]
ACTION: 1 REWARD: 0.0
Qs:
[[-3.3108156 -5.650074  -2.707076  -2.9142413 -3.6791952 -4.8490767
  -4.367883 ]]
pred: 2
MEMORY:
[[0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 2. 0.]
 [2. 0. 1. 0. 0. 2. 0.]
 [2. 0. 1. 0. 0. 1. 0.]
 [1. 1. 2. 0. 0. 2. 0.]]
ACTION: 3 REWARD: -1.0
Qs:
[[-3.6342535 -6.320126  -2.6423712 -2.8131416 -4.4208474 -4.861465
  -4.470011 ]]
pred: 2
MEMORY:
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 

[[0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 2. 0. 0. 0. 0.]
 [2. 0. 2. 0. 0. 0. 2.]
 [2. 1. 1. 2. 1. 1. 1.]]
ACTION: 6 REWARD: -5.75
Qs:
[[-7.0546947 -6.175652  -1.6872554 -0.0410264 -2.439316  -8.412851
  -6.259477 ]]
pred: 3
MEMORY:
[[0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 2. 0. 0. 0. 1.]
 [2. 2. 2. 0. 0. 0. 2.]
 [2. 1. 1. 2. 1. 1. 1.]]
ACTION: 4 REWARD: -5.25
Qs:
[[-6.761906   -6.8419137  -0.84371847 -0.1804276  -2.9952652  -7.4044447
  -5.827518  ]]
pred: 3
MEMORY:
[[0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 2. 2. 0. 0. 0. 1.]
 [2. 2. 2. 0. 1. 0. 2.]
 [2. 1. 1. 2. 1. 1. 1.]]
ACTION: 3 REWARD: -3.75
Qs:
[[-3.9699078 -4.189881   1.0605245 -1.1084841 -3.2901201 -5.3117623
  -4.85499  ]]
pred: 2
MEMORY:
[[0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 2.]
 [0. 2. 2. 0. 0. 0. 1.]
 [2. 2. 2. 1. 1. 0. 2.]
 [2. 1. 1. 2. 1. 1. 1.]]
ACTION: 2 REWARD: -1.25
Qs:
[[-2.7

In [70]:
agent.memory

[(array([[1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1.]]),
  1,
  1.0,
  array([[0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0.],
         [0., 1., 0., 0., 0., 0., 0.]]),
  'Keep Playing!'),
 (array([[ 1.,  1.,  1.,  1.,  1.,  1.,  1.],
         [ 1.,  1.,  1.,  1.,  1.,  1.,  1.],
         [ 1.,  1.,  1.,  1.,  1.,  1.,  1.],
         [ 1.,  1.,  1.,  1.,  1.,  1.,  1.],
         [ 1., -1.,  1.,  1.,  1.,  1.,  1.],
         [ 1.,  1.,  1.,  1.,  1.,  1.,  1.]]),
  0,
  0.25,
  array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.],
         [ 0.,  0.,  0.,  0.,  0.,  0.,  0.],
         [ 0.,  0.,  0.,  0.,  0.,  0.,  0.],
         [ 0.,  0.,  0.,  0.,  0.,  0.,  0.],
         [ 0., -1.,  

In [None]:
agent.losses

In [None]:
        for el in batch:
            if self.player==1: #replace 2's with -1s
                el[0][el[0]>1]=-1
                el[3][el[3]>1]=-1
            if self.player==2: #replace 1s with -1s. replace 2's with 1s
                el[0][(el[0] < 2) & (el[0] > 0)] = -1
                el[0][el[0]>1]=1                
                el[3][(el[3] < 2) & (el[3] > 0)] = -1
                el[3][el[3]>1]=1                
        
        states = []
        targets_f = []

In [None]:
states = []
targets_f = []
batch = random.sample(agent.memory,agent.batch_size)
for state,action, reward, state_prime,status in batch:
    if status != 'Keep Playing!':
        target = reward
    elif status == 'Keep Playing!':
        if action not in gameplay.Get_Legal_Moves(state): #if not a legal move make target the negative reward
            target = reward
        else:
             target = (reward + agent.gamma * np.argmax(agent.model_predict(state_prime))) #idk why they have first element.

    target_f = agent.model_predict(state_prime)
    print(target_f[0][action])
    targets_f.append(target_f[0][action])
    states.append(state)
    #print('STATE:',state)
    #print('ACTION:',action)
    #print('target:',target)
    #print('reward:',reward)

            
states=np.expand_dims(np.array(states), axis=3) #reshaping to train model
targets_f=np.array(targets_f)


In [None]:
keras.models.save_model(agent.model,'mymodel_'+str(episode)+'.h5')

In [None]:
agent.memory

In [None]:
from tensorflow.keras.models import Sequential, save_model, load_model
keras.models.save_model(agent.model,'mymodel.h5')

In [None]:
agent.memory[-1][2]

In [None]:
agent.memory[1][0]

In [None]:
gameplay.BOARD

In [None]:
if status != 'Keep Playing!': #if we are in a terminal state restart game
    self.gameplay.reset()
    turn = 0

state = self.gameplay.BOARD.copy()
actions = self.gameplay.Get_Legal_Moves(self.gameplay.BOARD)
action = np.random.choice(actions,1)[0]
#take random actions. record their states,actions, rewards, next states, and status

    #player1
if turn % 2 ==0:
    self.gameplay.Add_Piece(1,action,self.gameplay.BOARD)
        #turn +=1

    #player2
elif turn % 2 ==1:
    self.gameplay.Add_Piece(2,action,self.gameplay.BOARD)
        #turn +=1


status = self.gameplay.Check_Goal(self.gameplay.BOARD)
if status !='Keep Playing!':
    state_prime = np.zeros((self.gameplay.ROWS,self.gameplay.COLUMNS))
    state_prime[state_prime < 1] = 1
else:
    state_prime = self.gameplay.BOARD.copy() #gamestate
reward = self.gameplay.get_reward(self.player,state)

if turn % 2 ==0 and self.player==1:
    memory.append((state,action,reward,state_prime,status)) #memorize this
if turn % 2 ==1 and self.player==2:
    memory.append((state,action,reward,state_prime,status)) #memorize this
turn +=1


In [None]:

keras.models.save_model(
agent1.model,'mymodel.h5')

In [None]:
#saving model and using it for predictions
from tensorflow.keras.models import Sequential, save_model, load_model

keras.models.save_model(
agent.model,'mymodel.h5')

loaded_model = load_model('mymodel.h5')
state=np.expand_dims(np.expand_dims(gameplay.BOARD, axis=3),axis=0)
loaded_model.predict(state)

In [None]:
state=np.expand_dims(np.expand_dims(gameplay.BOARD, axis=3),axis=0)
loaded_model.predict(state)

In [None]:
import keras
from keras import layers
from keras import models

model=models.Sequential()
model.add(layers.Conv2D(16, (3,3), activation='relu', input_shape=INPUT_SHAPE))
model.add(layers.MaxPooling2D((2,2)))

model.add(layers.Flatten())
model.add(layers.Dense(32,activation='relu'))
model.add(layers.Dense(action_size,activation='softmax'))

model.compile(optimizer='adam',
             loss='categorical_crossentropy',
             metrics=['accuracy'])
model.summary()