In [1]:
import numpy as np
import pickle
import random

In [2]:

class Config:
    """Tunable settings shared by the State and Player objects in this notebook."""

    def __init__(self):
        # --- run mode -------------------------------------------------------
        # When true, the main cell runs self-play training before the human match.
        self.train = True
        # When truthy, State.make_action / State.play2 skip their console output.
        self.stats_mode = 0
        # 'computer' or 'human'; any other value makes State.play2 pick at random.
        self.start_player = 'computer'
        # File the main cell passes to Player.loadPolicy for the computer player.
        self.policy_p1 = 'policy_p1'

        # --- board and training length --------------------------------------
        # Side length of the square board.
        self.grid_size = 3
        # Number of self-play games State.play runs.
        self.epochs = 50000

        # --- learning parameters (read by Player) ---------------------------
        # Step size of the value update in Player.feedbackReward.
        self.lr = 0.2
        # Probability that Player.chooseAction takes a random move.
        self.exp_rate = 0.0
        # Multiplier applied to the reward in Player.feedbackReward.
        self.decay_rate = 0.9

        # --- rewards (read by State.giveReward) -----------------------------
        # Reward p1 gets for winning; the losing p2 gets 1 - win_reward_p1.
        self.win_reward_p1 = 1
        # Reward p2 gets for winning; the losing p1 gets 1 - win_reward_p2.
        # NOTE(review): with 0 here p1 is paid 1 when it loses - confirm intended.
        self.win_reward_p2 = 0

In [3]:

class State:
    """Board, turn bookkeeping and game loops for a square tic-tac-toe grid.

    Cells hold 1 for player 1 (shown as 'x'), -1 for player 2 (shown as 'o')
    and 0 when empty. ``play`` runs self-play training games between ``p1``
    and ``p2``; ``play2`` runs a single interactive match.
    """

    def __init__(self, p1, p2, config):
        """Copy the settings this class needs from ``config`` and start empty.

        Args:
            p1: player that moves with symbol 1.
            p2: player that moves with symbol -1.
            config: object exposing grid_size, epochs, stats_mode,
                win_reward_p1, win_reward_p2 and start_player.
        """
        self.grid_size = config.grid_size
        self.epochs = config.epochs
        self.stats_mode = config.stats_mode
        self.win_reward_p1 = config.win_reward_p1
        self.win_reward_p2 = config.win_reward_p2
        self.start_player = config.start_player
        self.grid = np.zeros((self.grid_size, self.grid_size))
        self.p1 = p1
        self.p2 = p2
        self.isEnd = False
        self.gridHash = None
        # p1 moves first unless play2 overrides this
        self.playerSymbol = 1

    def getHash(self):
        """Return (and cache in ``gridHash``) the string form of the flattened grid."""
        self.gridHash = str(self.grid.reshape(self.grid_size * self.grid_size))
        return self.gridHash

    def _line_owner(self, total):
        """Map a line sum to 1 / -1 when the line is fully owned, else None."""
        if total == self.grid_size:
            return 1
        if total == -self.grid_size:
            return -1
        return None

    def winner(self):
        """Evaluate the board and update ``isEnd``.

        Rows are checked first, then columns, then the two diagonals.

        Returns:
            1 if player 1 owns a full line, -1 if player 2 does, 0 when the
            board is full with no winner, None while the game is still open.
        """
        n = self.grid_size
        # rows
        for i in range(n):
            owner = self._line_owner(sum(self.grid[i, :]))
            if owner is not None:
                self.isEnd = True
                return owner
        # columns
        for i in range(n):
            owner = self._line_owner(sum(self.grid[:, i]))
            if owner is not None:
                self.isEnd = True
                return owner
        # diagonals
        diag_main = sum([self.grid[i, i] for i in range(n)])
        diag_anti = sum([self.grid[i, n - i - 1] for i in range(n)])
        if max(abs(diag_main), abs(diag_anti)) == n:
            self.isEnd = True
            if diag_main == n or diag_anti == n:
                return 1
            return -1

        # tie: no empty cell left
        if len(self.availablePositions()) == 0:
            self.isEnd = True
            return 0
        # game still running
        self.isEnd = False
        return None

    def availablePositions(self):
        """Return the empty cells as a list of (row, col) tuples in row-major order."""
        return [
            (i, j)
            for i in range(self.grid_size)
            for j in range(self.grid_size)
            if self.grid[i, j] == 0
        ]

    def updateState(self, position):
        """Place the current player's symbol at ``position`` and switch turns."""
        self.grid[position] = self.playerSymbol
        self.playerSymbol = -1 if self.playerSymbol == 1 else 1

    def giveReward(self):
        """Send end-of-game rewards to both players; call only when the game is over.

        The winner receives its configured win reward and the loser receives
        one minus that value. Any other result pays 0.1 to p1 and 0.5 to p2.
        """
        result = self.winner()
        if result == 1:
            self.p1.feedbackReward(self.win_reward_p1)
            self.p2.feedbackReward(1 - self.win_reward_p1)
        elif result == -1:
            # NOTE(review): when win_reward_p2 is 0 the losing p1 is paid 1
            # here - confirm that this asymmetry is intended.
            self.p1.feedbackReward(1 - self.win_reward_p2)
            self.p2.feedbackReward(self.win_reward_p2)
        else:
            self.p1.feedbackReward(0.1)
            self.p2.feedbackReward(0.5)

    def reset(self):
        """Clear the board and restore the initial turn state."""
        self.grid = np.zeros((self.grid_size, self.grid_size))
        self.gridHash = None
        self.isEnd = False
        self.playerSymbol = 1

    def _play_training_turn(self, player):
        """Let ``player`` move once during self-play; return True if the game ended.

        When the move ends the game, rewards are distributed and both players
        and the board are reset for the next game.
        """
        positions = self.availablePositions()
        action = player.chooseAction(positions, self.grid, self.playerSymbol)
        self.updateState(action)
        player.addState(self.getHash())

        if self.winner() is None:
            return False
        self.giveReward()
        self.p1.reset()
        self.p2.reset()
        self.reset()
        return True

    def play(self):
        """Run ``epochs`` self-play games, p1 always moving first.

        Prints a progress line every 1000 games.
        """
        for i in range(self.epochs):
            if i % 1000 == 0:
                print("Epochs {}".format(i))
            while not self.isEnd:
                if self._play_training_turn(self.p1):
                    break
                if self._play_training_turn(self.p2):
                    break

    def make_action(self, player, win_status):
        """Play one move of an interactive match for ``player``.

        Args:
            player: object whose ``chooseAction`` picks the move.
            win_status: the ``winner()`` value that means this player won
                (1 for the computer, -1 for the human).

        Returns:
            (win, status): ``win`` is the ``winner()`` result after the move;
            ``status`` is 1 if the computer won, 2 if the human won, else 0.
            The board is reset when the move ended the game.
        """
        verbose = not self.stats_mode
        if verbose:
            if win_status == 1:
                print ('\nComputer made the move, current board status -->> ')
            else:
                print ('\nIt is your turn -->>')
        positions = self.availablePositions()
        player_action = player.chooseAction(positions, self.grid, self.playerSymbol)
        # apply the move and show the resulting board
        self.updateState(player_action)
        if verbose:
            self.showGrid()

        win = self.winner()
        status = 0
        if win is not None:
            if win == win_status:
                if verbose:
                    print ('\n', '-'*80, '\n')
                    print(player.name, " is the WINNER!!")
                    print ('\n', '-'*80)
                status = 1 if win_status == 1 else 2
            else:
                # the mover cannot complete the opponent's line, so this is a tie
                if verbose:
                    print ('\n', '-'*80, '\n')
                    print("\t\tIt is a TIE! Play again to win! ", win )
                    print ('\n', '-'*80)
            self.reset()

        return win, status

    def play2(self):
        """Play one interactive match between p1 (computer) and p2 (human).

        ``start_player`` of 'computer' or 'human' fixes who opens; any other
        value picks at random.

        Returns:
            (status, start_player) where status is 1 (computer won), 2 (human
            won) or 0 (tie) and start_player is 1 (computer) or 2 (human).
            Returns (0, 0) if the state was already ended on entry.
        """
        if self.start_player == 'computer':
            start_player = 1
        elif self.start_player == 'human':
            start_player = 2
        else:
            start_player = random.choice([1,2])

        verbose = not self.stats_mode
        if verbose:
            print ('Computer uses letter\t=> \'x\'')
            print ('You use the letter\t=> \'o\'')

        turn_order = [(self.p1, 1), (self.p2, -1)]
        if start_player == 1:
            self.playerSymbol = 1
            if verbose:
                print ('\nComputer started the match!')
        else:
            self.playerSymbol = -1
            turn_order.reverse()
            if verbose:
                print ('\nYou started the match!')

        while not self.isEnd:
            for player, win_status in turn_order:
                win, status = self.make_action(player, win_status)
                if win is not None:
                    return status, start_player
        return 0 , 0

    def showGrid(self):
        """Print the board: 'x' for p1, 'o' for p2, blank for empty, '?' otherwise."""
        # the divider depends only on the grid size, so build it once
        divider = (self.grid_size * '----') + '-'
        for i in range(0, self.grid_size):
            print(divider)
            out = '| '
            for j in range(0, self.grid_size):
                cell = self.grid[i, j]
                if cell == 1:
                    token = 'x'
                elif cell == -1:
                    token = 'o'
                elif cell == 0:
                    token = ' '
                else:
                    # never reuse the previous cell's token for an unknown value
                    token = '?'
                out += token + ' | '
            print(out)
        print(divider)
        

In [5]:
class Player:
    """Learning agent that scores board states in a value table.

    ``states_value`` maps a board hash (see ``getHash``) to a learned value;
    ``states`` records the hashes visited during the current game.
    """

    def __init__(self, name, config):
        """Store the name and copy grid_size, lr, exp_rate, decay_rate from ``config``."""
        self.name = name
        self.states = []  # hashes of the positions reached this game
        self.grid_size = config.grid_size
        self.lr = config.lr
        self.exp_rate = config.exp_rate
        self.decay_rate = config.decay_rate
        self.states_value = {}  # state hash -> value

    def getHash(self, grid):
        """Return the string form of ``grid`` flattened to grid_size * grid_size."""
        return str(grid.reshape(self.grid_size * self.grid_size))

    def chooseAction(self, positions, current_grid, symbol):
        """Pick a move from ``positions`` for the player using ``symbol``.

        With probability ``exp_rate`` a random position is returned.
        Otherwise each candidate board is looked up in ``states_value``
        (unknown boards count as 0) and the best one is chosen; on equal
        values the later entry of ``positions`` wins.

        Raises:
            ValueError: if ``positions`` is empty.
        """
        if len(positions) == 0:
            # previously surfaced as an incidental UnboundLocalError
            raise ValueError("no available positions to choose from")

        if np.random.uniform(0, 1) <= self.exp_rate:
            # explore: take a random action
            idx = np.random.choice(len(positions))
            return positions[idx]

        # exploit: evaluate the board each move would produce
        value_max = -999999
        action = None
        for p in positions:
            next_grid = current_grid.copy()
            next_grid[p] = symbol
            value = self.states_value.get(self.getHash(next_grid))
            if value is None:
                value = 0
            if value >= value_max:
                value_max = value
                action = p
        return action

    def addState(self, state):
        """Append a board hash to the list of states visited this game."""
        self.states.append(state)

    def feedbackReward(self, reward):
        """Propagate ``reward`` backwards through the states visited this game.

        Walking from the last state to the first, each value moves by ``lr``
        towards ``decay_rate * reward``; the updated value then becomes the
        reward used for the preceding state.
        """
        for st in reversed(self.states):
            current = self.states_value.get(st)
            if current is None:
                current = 0
            current += self.lr * (self.decay_rate * reward - current)
            self.states_value[st] = current
            reward = current

    def reset(self):
        """Forget the states recorded for the current game."""
        self.states = []

    def savePolicy(self):
        """Pickle ``states_value`` to the file 'policy_<name>' in the working directory."""
        # the context manager closes the file even if pickling fails
        with open('policy_' + str(self.name), 'wb') as fw:
            pickle.dump(self.states_value, fw)

    def loadPolicy(self, file):
        """Replace ``states_value`` with the table pickled in ``file``."""
        # SECURITY: pickle.load can execute arbitrary code; only load policy
        # files that come from a trusted source.
        with open(file, 'rb') as fr:
            self.states_value = pickle.load(fr)


class HumanPlayer:
    """Player whose moves are typed in as a row and a column."""

    def __init__(self, name):
        self.name = name

    def chooseAction(self, positions, current_grid=None, symbol=None):
        """Prompt until the user enters a (row, col) that is in ``positions``.

        ``current_grid`` and ``symbol`` are accepted for signature parity with
        ``Player.chooseAction`` and are not used. Entries that are not whole
        numbers are rejected with a message instead of aborting the match.
        """
        while True:
            try:
                row = int(input("Input your action grid row:"))
                col = int(input("Input your action grid col:"))
            except ValueError:
                print("Please enter whole numbers for row and col.")
                continue
            action = (row, col)
            if action in positions:
                return action
                
                

In [6]:
if __name__ == "__main__":
    config = Config()

    # --- self-play training: two learning agents on one shared board ---
    p1 = Player("p1", config=config)
    p2 = Player("p2", config=config)
    st = State(p1, p2, config=config)

    if config.train:
        print("Training started...")
        st.play()
        # each agent writes its value table to 'policy_<name>'
        p1.savePolicy()
        p2.savePolicy()
        print('\n\t Training Done ! \n')

    print("\nTesting...")

    # --- interactive match: saved policy versus a human at the keyboard ---
    p1 = Player("Computer", config=config)
    p1.loadPolicy(config.policy_p1)
    p2 = HumanPlayer("Human")

    st = State(p1, p2, config=config)
    st.play2()

Training started...
Epochs 0
Epochs 1000
Epochs 2000
Epochs 3000
Epochs 4000
Epochs 5000
Epochs 6000
Epochs 7000
Epochs 8000
Epochs 9000
Epochs 10000
Epochs 11000
Epochs 12000
Epochs 13000
Epochs 14000
Epochs 15000
Epochs 16000
Epochs 17000
Epochs 18000
Epochs 19000
Epochs 20000
Epochs 21000
Epochs 22000
Epochs 23000
Epochs 24000
Epochs 25000
Epochs 26000
Epochs 27000
Epochs 28000
Epochs 29000
Epochs 30000
Epochs 31000
Epochs 32000
Epochs 33000
Epochs 34000
Epochs 35000
Epochs 36000
Epochs 37000
Epochs 38000
Epochs 39000
Epochs 40000
Epochs 41000
Epochs 42000
Epochs 43000
Epochs 44000
Epochs 45000
Epochs 46000
Epochs 47000
Epochs 48000
Epochs 49000

	 Training Done ! 


Testing...
Computer uses letter	=> 'x'
You use the letter	=> 'o'

Computer started the match!

Computer made the move, current board status -->> 
-------------
|   |   |   | 
-------------
|   |   |   | 
-------------
|   |   | x | 
-------------

It is your turn -->>
Input your action grid row:1
Input your action grid 

In [6]:
# Play another interactive match on the State built in the previous cell.
# play2 returns (status, start_player), which the notebook shows as the cell result.
st.play2()

Computer uses letter	=> 'x'
You use the letter	=> 'o'

Computer started the match!

Computer made the move, current board status -->> 
-------------
|   |   |   | 
-------------
|   |   |   | 
-------------
|   |   | x | 
-------------

It is your turn -->>
Input your action grid row:1
Input your action grid col:1
-------------
|   |   |   | 
-------------
|   | o |   | 
-------------
|   |   | x | 
-------------

Computer made the move, current board status -->> 
-------------
|   |   |   | 
-------------
|   | o |   | 
-------------
|   | x | x | 
-------------

It is your turn -->>
Input your action grid row:2
Input your action grid col:0
-------------
|   |   |   | 
-------------
|   | o |   | 
-------------
| o | x | x | 
-------------

Computer made the move, current board status -->> 
-------------
|   |   |   | 
-------------
|   | o | x | 
-------------
| o | x | x | 
-------------

It is your turn -->>
Input your action grid row:0
Input your action grid col:2
-------------
| 

(2, 1)

In [7]:
# Rebuild the computer player from the policy file stored under 'policies/'
# and play one interactive match against a human.
p1 = Player("Computer", config=config)
p1.loadPolicy('policies/policy_p1')
p2 = HumanPlayer("Human")

# A fresh State starts from an empty board; play2 returns (status, start_player).
st = State(p1, p2, config=config)
st.play2()

Computer uses letter	=> 'x'
You use the letter	=> 'o'

Computer started the match!

Computer made the move, current board status -->> 
-------------
|   |   |   | 
-------------
|   | x |   | 
-------------
|   |   |   | 
-------------

It is your turn -->>
Input your action grid row:1
Input your action grid col:0
-------------
|   |   |   | 
-------------
| o | x |   | 
-------------
|   |   |   | 
-------------

Computer made the move, current board status -->> 
-------------
|   | x |   | 
-------------
| o | x |   | 
-------------
|   |   |   | 
-------------

It is your turn -->>
Input your action grid row:2
Input your action grid col:1
-------------
|   | x |   | 
-------------
| o | x |   | 
-------------
|   | o |   | 
-------------

Computer made the move, current board status -->> 
-------------
| x | x |   | 
-------------
| o | x |   | 
-------------
|   | o |   | 
-------------

It is your turn -->>
Input your action grid row:2
Input your action grid col:2
-------------
| 

(1, 1)

In [10]:
# Switch to a 4x4 board and play against the policy stored under 'policies_4x4/'.
# NOTE(review): this mutates the shared `config` in place, so every Player/State
# built from it after this cell also gets grid_size 4 - confirm that is wanted.
config.grid_size = 4
p1 = Player("Computer", config=config)
p1.loadPolicy('policies_4x4/policy_p1')
p2 = HumanPlayer("Human")
st = State(p1, p2, config=config)
st.play2()

Computer uses letter	=> 'x'
You use the letter	=> 'o'

Computer started the match!

Computer made the move, current board status -->> 
-----------------
|   |   |   |   | 
-----------------
|   |   |   |   | 
-----------------
|   |   |   |   | 
-----------------
|   |   |   | x | 
-----------------

It is your turn -->>
Input your action grid row:3
Input your action grid col:0
-----------------
|   |   |   |   | 
-----------------
|   |   |   |   | 
-----------------
|   |   |   |   | 
-----------------
| o |   |   | x | 
-----------------

Computer made the move, current board status -->> 
-----------------
|   |   |   |   | 
-----------------
|   |   |   |   | 
-----------------
|   |   |   |   | 
-----------------
| o |   | x | x | 
-----------------

It is your turn -->>
Input your action grid row:1
Input your action grid col:2
-----------------
|   |   |   |   | 
-----------------
|   |   | o |   | 
-----------------
|   |   |   |   | 
-----------------
| o |   | x | x | 
-------

(2, 1)