In [None]:
from Connect4 import Connect4
from Robots import Robots
from GamePlay import GamePlay

import numpy as np
import random

import datetime


import keras
from keras import layers
from keras import models
from tensorflow.keras import optimizers
from tensorflow.keras.models import Sequential, save_model, load_model
from tensorflow.keras import losses



In [None]:
# Deep Q-learning agent for Connect 4 (the environment itself lives in GamePlay).
class DeepQLAgent():
    """Deep Q-learning agent that chooses a Connect 4 column with a small conv net.

    Two identically shaped networks are kept: ``model`` is trained on every
    call to :meth:`learn`, while ``model2`` supplies the bootstrap values and is
    re-synchronised with ``model`` once the update counter ``c`` reaches 50.
    The networks are always fed boards in which the agent's own pieces are
    encoded as ``1`` (see :meth:`p2_swap`).
    """

    def __init__(self, player=1, gameplay=None, memory=None):
        """Set hyper-parameters and build both networks.

        Args:
            player: Which side (1 or 2) this agent learns for.
            gameplay: Game-rules object providing ``ROWS``, ``COLUMNS``,
                ``Get_Legal_Moves``, ``Add_Piece``, ``Check_Goal`` and
                ``get_reward``. A fresh ``GamePlay()`` is created when omitted.
            memory: Optional iterable of
                ``(state, action, reward, state_prime, status)`` transitions
                used to pre-fill the replay buffer.
        """
        self.player = player

        # Create the default lazily: a ``GamePlay()`` default argument would be
        # evaluated once at definition time and shared by every agent.
        self.gameplay = GamePlay() if gameplay is None else gameplay
        self.ROWS = self.gameplay.ROWS
        self.COLUMNS = self.gameplay.COLUMNS

        self.batch_size = 200
        self.lr = .01
        self.INPUT_SHAPE = (self.ROWS, self.COLUMNS, 1)
        self.action_size = self.COLUMNS
        self.gamma = .9
        self.epsilon = .6  # starting explore probability
        self.explore_stop = .01  # floor for the explore probability
        self.decay_rate = 0.99995

        self.memory_size = 10000
        self.model = self._build_model()
        self.model2 = self._build_model()
        self.c = 0  # learn() calls since model2 was last synchronised

        self.losses = []
        self.avgrewards = []
        # Copy so the caller's container is never mutated by memorize().
        self.memory = [] if memory is None else list(memory)

    def _build_model(self):
        """Return a compiled network mapping a board to one value per column.

        Architecture: 64-filter 4x4 convolution + ReLU, flatten, two 64-unit
        ReLU dense layers and a linear output of size ``action_size``.
        Trained with Adam (``self.lr``) on mean squared error.
        """
        model = models.Sequential()
        model.add(layers.Conv2D(64, (4, 4), input_shape=self.INPUT_SHAPE))
        model.add(layers.Activation('relu'))

        model.add(layers.Flatten())
        model.add(layers.Dense(64, activation='relu'))
        model.add(layers.Dense(64, activation='relu'))
        model.add(layers.Dense(self.action_size, activation='linear'))

        # ``lr`` is a deprecated alias; ``learning_rate`` is the supported name.
        optimizer = keras.optimizers.Adam(learning_rate=self.lr)
        model.compile(loss=losses.MeanSquaredError(), optimizer=optimizer)

        return model

    def memorize(self, state, action, reward, state_prime, done):
        """Store one transition in the replay buffer.

        When the buffer already holds ``memory_size`` entries, a randomly
        chosen old entry is evicted first (deliberately not the oldest one).
        ``done`` is stored as given; :meth:`learn` compares it against the
        status string ``'Keep Playing!'``.
        """
        if len(self.memory) >= self.memory_size:
            self.memory.pop(random.randrange(len(self.memory)))
        self.memory.append((state, action, reward, state_prime, done))

    def load(self, name):
        """Load weights for the trained network ``model`` from file ``name``."""
        self.model.load_weights(name)

    def save(self, name):
        """Save the weights of the trained network ``model`` to file ``name``."""
        self.model.save_weights(name)

    @staticmethod
    def _as_network_input(boards):
        """Stack 2-D boards into a ``(batch, rows, columns, 1)`` array."""
        return np.expand_dims(np.asarray(boards), axis=-1)

    def model_predict(self, state):
        """Return ``model``'s predictions for one board, shape ``(1, action_size)``."""
        return self.model.predict(self._as_network_input([state]), verbose=0)

    def model2_predict(self, state):
        """Return ``model2``'s predictions for one board, shape ``(1, action_size)``."""
        return self.model2.predict(self._as_network_input([state]), verbose=0)

    def make_move(self, state, p):
        """Pick a column for player ``p`` with an epsilon-greedy policy.

        With probability ``epsilon`` a uniformly random *legal* column is
        returned; otherwise the column with the highest predicted value is
        returned. The greedy choice is not filtered for legality.
        """
        if p == 2:
            # The network always sees the mover's pieces encoded as 1.
            state = self.p2_swap(state)
        if np.random.rand() <= self.epsilon:
            # Explore: choose one of the legal columns themselves, not a
            # position in the list of legal columns.
            legal_actions = self.gameplay.Get_Legal_Moves(state)
            return random.choice(list(legal_actions))
        # Exploit.
        qvals = self.model_predict(state)
        return np.argmax(qvals)

    def p2_swap(self, b):
        """Return an integer copy of board ``b`` with the 1s and 2s exchanged.

        Cells holding anything other than 1 or 2 come out as 0.
        """
        board = np.asarray(b)
        swapped = np.zeros(board.shape, dtype=int)
        swapped[board == 1] = 2
        swapped[board == 2] = 1
        return swapped

    def _apply_lookahead(self, state, qs):
        """Overwrite entries of ``qs`` for columns that end the game at once.

        For each column a piece 1 is dropped on a copy of ``state``; if that
        result is terminal the entry becomes the game's reward for it.
        Otherwise a piece 2 is dropped instead and, if *that* is terminal, the
        entry is set to 20 so that blocking the opponent is favoured.

        Assumes ``Add_Piece`` returns the resulting board, as the original
        code relied on -- TODO confirm against GamePlay.
        NOTE(review): for ``player == 2`` the board has been swapped but
        ``get_reward`` is still asked about ``self.player``; verify.
        """
        for a in range(len(qs)):
            own_result = self.gameplay.Add_Piece(1, a, state.copy())
            if self.gameplay.Check_Goal(own_result) != 'Keep Playing!':
                qs[a] = self.gameplay.get_reward(self.player, own_result)
                continue
            rival_result = self.gameplay.Add_Piece(2, a, state.copy())
            if self.gameplay.Check_Goal(rival_result) != 'Keep Playing!':
                qs[a] = 20

    def learn(self, batch_size):
        """Run one training step of ``model`` on a random replay batch.

        Does nothing until the buffer holds at least ``batch_size``
        transitions. Otherwise: synchronises ``model2`` when due, builds the
        regression targets, fits ``model`` once, records the loss in
        ``self.losses`` and decays ``epsilon`` towards ``explore_stop``.
        """
        if len(self.memory) < batch_size:
            return
        if self.c >= 50:
            self.c = 0
            # Periodically copy the trained weights into the target network.
            self.model2.set_weights(self.model.get_weights())

        batch = random.sample(self.memory, batch_size)

        # Swap 2s and 1s when training as player 2 so the network sees the
        # same encoding it gets in make_move().
        states = [
            self.p2_swap(transition[0]) if self.player == 2
            else np.asarray(transition[0])
            for transition in batch
        ]
        # NOTE(review): state_prime is passed to model2 unswapped, exactly as
        # before; confirm whether it should be swapped for player 2 as well.
        next_states = [np.asarray(transition[3]) for transition in batch]

        # One forward pass per network instead of one per sample.
        X = self._as_network_input(states)
        y = np.array(self.model.predict(X, verbose=0))
        next_values = self.model2.predict(
            self._as_network_input(next_states), verbose=0)

        for i, (_, action, reward, _, status) in enumerate(batch):
            if status != 'Keep Playing!':
                # Terminal transition: the value is just the reward.
                target = reward
            else:
                # Reward plus the discounted best *value* of the next state
                # (np.max, not the index of the best action).
                target = reward + self.gamma * np.max(next_values[i])

            y[i][action] = target
            self._apply_lookahead(states[i], y[i])

        history = self.model.fit(X, y, verbose=0)
        self.c = self.c + 1
        self.losses.append(history.history['loss'][0])
        self.epsilon = max(self.explore_stop, self.epsilon * self.decay_rate)



In [None]:
# Scratch cell: draws a random integer from {0, 1, 2}; the result is not stored.
random.randrange(3)   
              

In [None]:
def prettyprint(mem,q=False):
    """Print one replay-memory transition in a readable layout.

    Args:
        mem: Indexable transition whose items 0-4 are
            ``(state, action, reward, state_prime, status)``.
        q: When truthy, also print the Q-values that the module-level
            ``agent`` currently predicts for ``state``, rounded to 4 decimals.

    Returns:
        None; everything is written to standard output.
    """
    rule = '-------------------------------------------------'
    state, action, reward, state_prime, status = (mem[i] for i in range(5))

    print(rule)
    for row in state:
        print(row)
    if q:
        # Relies on the notebook-level `agent`; only needed when q is set.
        qs = np.round(agent.model_predict(state)[0], 4)
        print('Q values:')
        print(qs)

    print(rule)
    print('ACTION:', action, 'REWARD:', reward, 'STATUS:', status)
    print(rule)
    for row in state_prime:
        print(row)
    print(rule)

#print(prettyprint(memory[0]))

In [None]:
# Helper: classify the outcome of a move and look up its reward.
def get_status_reward(state,state_prime,action,p):
    """Return ``(status, reward)`` for a move that turned ``state`` into ``state_prime``.

    When ``p`` is 1 and ``action`` is not among the legal moves of ``state``,
    the result is ``('Illegal Move', -15)``. In every other case the status
    comes from ``gameplay.Check_Goal(state_prime)`` and the reward from
    ``gameplay.get_reward(1, state_prime)``, i.e. always from player 1's side.
    Uses the module-level ``gameplay`` object.
    """
    broke_rules = p == 1 and action not in gameplay.Get_Legal_Moves(state)
    if broke_rules:
        return 'Illegal Move', -15

    outcome = gameplay.Check_Goal(state_prime)
    payoff = gameplay.get_reward(1, state_prime)
    return outcome, payoff

In [None]:
# Create the agent with its default settings (player 1) and restore the
# weights of its trained network from 'weights.h5'. Only `model` is loaded
# here; `model2` keeps its initial weights until learn() next syncs it.
agent=DeepQLAgent()
agent.load('weights.h5')

In [None]:
# Training loop: the agent plays as player 1 against a randomly chosen opponent
# policy, stores one transition per turn pair and learns after every episode.
# Reuses the `agent` created in an earlier cell.
gameplay = GamePlay()

from Robots import Robots
bot = Robots(depth=2)


EPISODES = 50000
episode = 0
wins = 0
rewardsum = 0

memory = []

# Player 2's candidate policies (the agent itself, a random bot, minimax).
# They are callables so that only the policy actually drawn is evaluated;
# previously all three moves were computed every turn and two thrown away.
opponent_policies = (
    lambda board: agent.make_move(board, 2),
    bot.Rando_bot,
    lambda board: bot.MiniMaxAlphaBeta_bot(board, 2),
)

while episode < EPISODES:

    state = gameplay.BOARD.copy()  # board before player 1 moves

    # Player 1 (the agent) moves, but only on a non-terminal board.
    if gameplay.Check_Goal(gameplay.BOARD) == 'Keep Playing!':
        action = agent.make_move(state, 1)
        gameplay.Add_Piece(1, action, gameplay.BOARD)
        state_prime = gameplay.BOARD.copy()
        status, reward = get_status_reward(state, state_prime, action, 1)

    # Player 2 replies only if player 1's move did not end the game.
    if status == 'Keep Playing!':
        action2 = random.choice(opponent_policies)(state_prime)
        gameplay.Add_Piece(2, action2, gameplay.BOARD)
        state_prime = gameplay.BOARD.copy()
        status, reward = get_status_reward(state, state_prime, action, 2)

    # The stored transition spans both moves: state -> board after the reply.
    agent.memorize(state, action, reward, state_prime, status)
    rewardsum = rewardsum + reward

    if status == 'Keep Playing!':
        continue

    # ---- end of episode ----
    if status == 'Player 1 wins!':
        wins = wins + 1
    gameplay.reset()
    episode = episode + 1
    agent.learn(agent.batch_size)

    if episode % 100 == 0:
        # Average over the last 100 recorded losses, matching the label
        # (the old code averaged every loss since the start of training).
        recent_losses = agent.losses[-100:]
        avg_loss = sum(recent_losses) / len(recent_losses) if recent_losses else 0.0
        print('--------------------------------------------------------')
        print('SUMMARY:')
        print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
        print('EPISODE: ', episode)
        print('SUM OF REWARDS:', rewardsum)
        print('Win Rate:', wins / 100)
        print('Avg loss last 100:', round(avg_loss, 2), 'epsilon:', round(agent.epsilon, 3))
        wins = 0
        rewardsum = 0
    if episode % 1000 == 0:
        # Checkpoint the trained network every 1000 episodes.
        keras.models.save_model(agent.model, 'mymodel_' + str(episode) + '.h5')
        
        
#for mem in memory:
#    print(prettyprint(mem,True))

In [None]:
# Print every stored transition along with the agent's current Q-values for it.
# (Check transition[4] != 'Keep Playing!' to show only terminal transitions.)
for transition in agent.memory:
    prettyprint(transition, True)

In [None]:
# Save the agent's trained network to 'mymodel_<episode>.h5', using the current
# value of the notebook-level `episode` counter in the file name.
keras.models.save_model(agent.model,'mymodel_'+str(episode)+'.h5')