In [1]:
import numpy as np
import pickle

In [2]:
# credits
# got idea for using pickle from this medium article
# https://towardsdatascience.com/why-turn-into-a-pickle-b45163007dac
# I have utilized code from humanplayer class, play2, and board layout from jae duk seo's chapter 1 tic tac toe code 
# as I thought it was a really cool idea 
# https://github.com/JaeDukSeo/reinforcement-learning-an-introduction/blob/master/chapter01/TicTacToe.py
# planning on revising the code to fit more in line with what I want to do later

In [3]:
board_rows = 4
board_cols = 4

class State:
    def __init__(self, p1, p2):
        self.board = np.zeros((board_rows, board_cols))
        self.p1 = p1
        self.p2 = p2
        self.term = False
        self.boardHash = None
        self.playerSymbol = 1

    def getHash(self):
        self.boardHash = str(self.board.reshape(board_rows * board_cols))
        return self.boardHash

    def winner(self):
        for i in range(board_rows):
            if sum(self.board[i, :]) == 4:
                self.term = True
                return 1
            if sum(self.board[i, :]) == -4:
                self.term = True
                return -1
        for i in range(board_cols):
            if sum(self.board[:, i]) == 4:
                self.term = True
                return 1
            if sum(self.board[:, i]) == -4:
                self.term = True
                return -1      
        diagonal_sum1 = sum([self.board[i, i] for i in range(board_cols)])
        diagonal_sum2 = sum([self.board[i, board_cols - i - 1] for i in range(board_cols)])
        diagonal_sum = max(abs(diagonal_sum1), abs(diagonal_sum2))
        if diagonal_sum == 4:
            self.term = True
            if diagonal_sum1 == 4 or diagonal_sum2 == 4:
                return 1
            else:
                return -1
        if len(self.availPositions()) == 0:
            self.term = True
            return 0
        self.term = False
        return None

    def availPositions(self):
        positions = []
        for i in range(board_rows):
            for j in range(board_cols):
                if self.board[i, j] == 0:
                    positions.append((i, j)) 
        return positions

    def updateState(self, position):
        self.board[position] = self.playerSymbol
        self.playerSymbol = -1 if self.playerSymbol == 1 else 1
        
    def giveReward(self):
        result = self.winner()
        if result == 1:
            self.p1.feedReward(1)
            self.p2.feedReward(0)
        elif result == -1:
            self.p1.feedReward(0)
            self.p2.feedReward(1)
        else:
            self.p1.feedReward(0.5)
            self.p2.feedReward(0.5)

    def reset(self):
        self.board = np.zeros((board_rows, board_cols))
        self.boardHash = None
        self.term = False
        self.playerSymbol = 1

    def play(self, rounds=100):
        for i in range(rounds):
            if i % 1000 == 0:
                print("Rounds {}".format(i))
            while not self.term:
                positions = self.availPositions()
                p1_action = self.p1.chooseAction(positions, self.board, self.playerSymbol)
                self.updateState(p1_action)
                board_hash = self.getHash()
                self.p1.addState(board_hash)
                win = self.winner()
                if win is not None:
                    self.giveReward()
                    self.p1.reset()
                    self.p2.reset()
                    self.reset()
                    break
                else:
                    positions = self.availPositions()
                    p2_action = self.p2.chooseAction(positions, self.board, self.playerSymbol)
                    self.updateState(p2_action)
                    board_hash = self.getHash()
                    self.p2.addState(board_hash)
                    win = self.winner()
                    if win is not None:
                        self.giveReward()
                        self.p1.reset()
                        self.p2.reset()
                        self.reset()
                        break
    def playHuman(self):
        while not self.term:
            positions = self.availPositions()
            p1_action = self.p1.chooseAction(positions, self.board, self.playerSymbol)
            self.updateState(p1_action)
            self.showBoard()
            win = self.winner()
            if win is not None:
                if win == 1:
                    print(self.p1.name, "wins!")
                else:
                    print("tie!")
                self.reset()
                break
            else:
                positions = self.availPositions()
                p2_action = self.p2.chooseAction(positions)
                self.updateState(p2_action)
                self.showBoard()
                win = self.winner()
                if win is not None:
                    if win == -1:
                        print(self.p2.name, "wins!")
                    else:
                        print("tie!")
                    self.reset()
                    break
    def showBoard(self):
        for i in range(0, board_rows):
            print('-----------------')
            border = '| '
            for j in range(0, board_cols):
                if self.board[i, j] == 1:
                    token = 'x'
                if self.board[i, j] == -1:
                    token = 'o'
                if self.board[i, j] == 0:
                    token = ' '
                border += token + ' | '
            print(border)
        print('-----------------')


In [4]:
class Player:
    def __init__(self, name, exp_rate=0.3):
        self.name = name
        self.states = []
        self.alpha = 0.1
        self.exp_rate = exp_rate
        self.decay_gamma = 1.0
        self.states_value = {}

    def getHash(self, board):
        boardHash = str(board.reshape(board_cols * board_rows))
        return boardHash

    def chooseAction(self, positions, current_board, symbol):
        if np.random.uniform(0, 1) <= self.exp_rate:
            idx = np.random.choice(len(positions))
            action = positions[idx]
        else:
            value_max = -999
            for p in positions:
                next_board = current_board.copy()
                next_board[p] = symbol
                next_boardHash = self.getHash(next_board)
                value = 0 if self.states_value.get(next_boardHash) is None else self.states_value.get(next_boardHash)
                if value >= value_max:
                    value_max = value
                    action = p
        return action
    def addState(self, state):
        self.states.append(state)
    def feedReward(self, reward):
        for st in reversed(self.states):
            if self.states_value.get(st) is None:
                self.states_value[st] = 0
            self.states_value[st] += self.alpha * (self.decay_gamma * reward - self.states_value[st])
            reward = self.states_value[st]
    def reset(self):
        self.states = []
    def savePolicy(self):
        fw = open('policy_' + str(self.name), 'wb')
        pickle.dump(self.states_value, fw)
        fw.close()
    def loadPolicy(self, file):
        fr = open(file, 'rb')
        self.states_value = pickle.load(fr)
        fr.close()

In [None]:
class nstep(Player):
    def __init__(self, name, exp_rate=0.3):
        self.name = name
        self.states = []
        self.alpha = 0.1
        self.exp_rate = exp_rate
        self.decay_gamma = 1.0
        self.states_value = {}

    def getHash(self, board):
        boardHash = str(board.reshape(board_cols * board_rows))
        return boardHash

    def chooseAction(self, positions, current_board, symbol):
        if np.random.uniform(0, 1) <= self.exp_rate:
            idx = np.random.choice(len(positions))
            action = positions[idx]
        else:
            value_max = -999
            for p in positions:
                next_board = current_board.copy()
                next_board[p] = symbol
                next_boardHash = self.getHash(next_board)
                value = 0 if self.states_value.get(next_boardHash) is None else self.states_value.get(next_boardHash)
                if value >= value_max:
                    value_max = value
                    action = p
        return action
    def addState(self, state):
        self.states.append(state)
    def feedReward(self, reward):
        state = start_state
        states = [state]
        r = [0]
        time = 0
        ep = float('inf')
        while True:
            time += 1
            if time < ep:
                if np.random.binomial(1, 0.5) == 1:
                    next_state = state + 1
                else:
                    next_state = state - 1
                if next_state == 0:
                    reward = -1
                elif next_state == 20:
                    reward = 1
                else: 
                    reward = 0
                states.append(next_state)
                r.append(reward)
                if next_state in terminal:
                    ep = time
            update = time - n
            if update >= 0:
                G = 0.0
                for t in range(update + 1, min(ep, update + n) + 1):
                    G += pow(gamma, t - update - 1) * r[t]
                if update + n <= ep:
                    G += pow(gamma, n) * values[states[(update + n)]]
                state_update = states[update]
                if not state_update in terminal:
                    values[state_update] += alpha * (G - values[state_update])
            if update == ep - 1:
                break
            state = next_state
        for st in reversed(self.states):
            if self.states_value.get(st) is None:
                self.states_value[st] = 0
            self.states_value[st] += self.alpha * (self.decay_gamma * reward - self.states_value[st])
            reward = self.states_value[st]
    def reset(self):
        self.states = []
    def savePolicy(self):
        fw = open('policy_' + str(self.name), 'wb')
        pickle.dump(self.states_value, fw)
        fw.close()
    def loadPolicy(self, file):
        fr = open(file, 'rb')
        self.states_value = pickle.load(fr)
        fr.close()

In [5]:
class HumanPlayer:
    def __init__(self, name):
        self.name = name 
    def chooseAction(self, positions):
        while True:
            row = int(input("Input your action row:"))
            col = int(input("Input your action col:"))
            action = (row, col)
            if action in positions:
                return action
    def addState(self, state):
        pass

    def feedReward(self, reward):
        pass        
    def reset(self):
        pass

In [6]:
p1 = Player("p1")
p2 = Player("p2")
st = State(p1, p2)
print("training...")
st.play(50000)

training...
Rounds 0
Rounds 1000
Rounds 2000
Rounds 3000
Rounds 4000
Rounds 5000
Rounds 6000
Rounds 7000
Rounds 8000
Rounds 9000
Rounds 10000
Rounds 11000
Rounds 12000
Rounds 13000
Rounds 14000
Rounds 15000
Rounds 16000
Rounds 17000
Rounds 18000
Rounds 19000
Rounds 20000
Rounds 21000
Rounds 22000
Rounds 23000
Rounds 24000
Rounds 25000
Rounds 26000
Rounds 27000
Rounds 28000
Rounds 29000
Rounds 30000
Rounds 31000
Rounds 32000
Rounds 33000
Rounds 34000
Rounds 35000
Rounds 36000
Rounds 37000
Rounds 38000
Rounds 39000
Rounds 40000
Rounds 41000
Rounds 42000
Rounds 43000
Rounds 44000
Rounds 45000
Rounds 46000
Rounds 47000
Rounds 48000
Rounds 49000


In [7]:
p1.savePolicy()
p2.savePolicy()

In [11]:
p1 = Player("computer", exp_rate=0)
p1.loadPolicy("policy_p1")

human = HumanPlayer("human")

st = State(p1, human)
st.playHuman()

-----------------
|   |   |   |   | 
-----------------
|   |   | x |   | 
-----------------
|   |   |   |   | 
-----------------
|   |   |   |   | 
-----------------
Input your action row:1
Input your action col:0
-----------------
|   |   |   |   | 
-----------------
| o |   | x |   | 
-----------------
|   |   |   |   | 
-----------------
|   |   |   |   | 
-----------------
-----------------
|   |   | x |   | 
-----------------
| o |   | x |   | 
-----------------
|   |   |   |   | 
-----------------
|   |   |   |   | 
-----------------
Input your action row:2
Input your action col:2
-----------------
|   |   | x |   | 
-----------------
| o |   | x |   | 
-----------------
|   |   | o |   | 
-----------------
|   |   |   |   | 
-----------------
-----------------
|   |   | x |   | 
-----------------
| o |   | x |   | 
-----------------
|   |   | o |   | 
-----------------
|   |   | x |   | 
-----------------
Input your action row:3
Input your action col:1
-----------------
|   |   

In [12]:
p1 = nstep("computer", exp_rate=0)
p2.loadPolicy("policy_p2")

human = HumanPlayer("human")

st = State(p1, human)
st.playHuman()

-----------------
|   |   |   |   | 
-----------------
|   |   |   |   | 
-----------------
|   |   |   |   | 
-----------------
|   |   |   | x | 
-----------------
Input your action row:0
Input your action col:0
-----------------
| o |   |   |   | 
-----------------
|   |   |   |   | 
-----------------
|   |   |   |   | 
-----------------
|   |   |   | x | 
-----------------
-----------------
| o |   |   |   | 
-----------------
|   |   |   |   | 
-----------------
|   |   |   |   | 
-----------------
|   |   | x | x | 
-----------------
Input your action row:3
Input your action col:0
-----------------
| o |   |   |   | 
-----------------
|   |   |   |   | 
-----------------
|   |   |   |   | 
-----------------
| o |   | x | x | 
-----------------
-----------------
| o |   |   |   | 
-----------------
|   |   |   |   | 
-----------------
|   |   |   |   | 
-----------------
| o | x | x | x | 
-----------------
Input your action row:1
Input your action col:0
-----------------
| o |   