In [1]:
import random

In [2]:
class Player:
    """A human tic-tac-toe participant who types moves on the keyboard.

    Also serves as the base class for automated players, which override
    ``make_move`` and ``reward``.

    Attributes:
        name: display name used in printed messages.
        char: the mark this player places on the board (e.g. 'X' or 'O').
        player_type: free-form tag; the game loop prints a turn prompt when
            it equals 'human'.
    """

    def __init__(self, name, char, player_type='human'):
        self.name = name
        self.char = char
        self.player_type = player_type

    def make_move(self, board):
        """Prompt until the user enters an integer and return it.

        Only non-numeric input is rejected here; whether the index is on the
        board and free is left to the caller.

        Args:
            board: current board (unused by the human player).

        Returns:
            The integer the user typed.
        """
        while True:
            raw = input('Make your move: ')
            try:
                return int(raw)
            except ValueError:
                # A typo should not abort the whole game; ask again instead.
                print(f'{raw!r} is not a number, try again.')

    def new_game(self):
        """Announce which mark this player uses."""
        print(f'{self.name} is {self.char}')

    def winner(self):
        """Announce that this player has won."""
        print(f'{self.name} is winner')

    def reward(self, reward_value, board=None):
        """Report a reward handed out by the game loop.

        Args:
            reward_value: numeric reward for the last move.
            board: board after the move.  Accepted (and ignored) so that the
                two-argument call made by the game loop works for human
                players too; optional to keep one-argument calls valid.
        """
        print(f'{self.name} gets {reward_value}')

    def available_moves(self, board):
        """Return the indices of all empty cells (cells equal to ' ').

        Args:
            board: sequence of cell marks.

        Returns:
            List of integer indices, in ascending order.
        """
        return [index for index, cell in enumerate(board) if cell == ' ']

In [3]:
# Scratch check: a list of nine single-space strings, the same expression
# TicTacToe.__init__ uses for an empty board.
[' '] * 9
''

''

In [4]:
class TicTacToe:
    """One game of tic-tac-toe between two player objects.

    The board is a flat list of nine cells indexed row by row (0-2 top row,
    3-5 middle row, 6-8 bottom row); an empty cell holds a single space.

    Player objects must provide ``name``, ``char``, ``player_type`` and the
    methods ``new_game()``, ``winner()``, ``make_move(board)`` and
    ``reward(value, board)``.
    """

    EMPTY = ' '
    # Every triple of cell indices forming a row, a column or a diagonal.
    WINNING_LINES = (
        (0, 1, 2), (3, 4, 5), (6, 7, 8),
        (0, 3, 6), (1, 4, 7), (2, 5, 8),
        (0, 4, 8), (2, 4, 6),
    )

    def __init__(self, player1, player2):
        self.player1 = player1
        self.player2 = player2
        self.board = [self.EMPTY] * 9
        # The opening player is picked at random for every new game object.
        self.player1_turn = random.choice([True, False])

    def print_board(self):
        """Print the current board as a 3x3 grid."""
        b = self.board
        board = f'''
        {b[0]} | {b[1]} | {b[2]}
        ----------
        {b[3]} | {b[4]} | {b[5]}
        ----------
        {b[6]} | {b[7]} | {b[8]}
        ----------
        '''
        print(board)

    def check_winner(self, symbol):
        """Return True if ``symbol`` fills any complete row, column or diagonal."""
        b = self.board
        return any(symbol == b[i] == b[j] == b[k] for i, j, k in self.WINNING_LINES)

    def check_full(self):
        """Return True when no empty cell is left on the board."""
        return self.EMPTY not in self.board

    def _is_legal(self, move):
        """Return True if ``move`` is an on-board index of an empty cell."""
        return 0 <= move < len(self.board) and self.board[move] == self.EMPTY

    def play(self):
        """Run the game to completion, handing out rewards along the way.

        Rewards passed to ``reward(value, board)``:
            -0.25  illegal move (the same player is asked again)
            +0.25  legal move that does not end the game
            +5/-5  winner / loser
            +0.5   both players on a draw
        """
        self.player1.new_game()
        self.player2.new_game()

        while True:
            self.print_board()

            # Work out whose turn it is.
            if self.player1_turn:
                player, other_player = self.player1, self.player2
            else:
                player, other_player = self.player2, self.player1

            # Ask for a move.
            if player.player_type == 'human':
                print(f'{player.name} Turn')
            move = player.make_move(self.board)

            # Illegal move: penalise and let the same player try again.
            if not self._is_legal(move):
                print('Invalid move!!!!')
                player.reward(-0.25, self.board)
                continue

            self.board[move] = player.char

            if self.check_winner(player.char):
                player.winner()
                player.reward(5, self.board)
                other_player.reward(-5, self.board)
                self.print_board()
                break

            if self.check_full():
                print('Game ended as draw')
                # Both sides reached the draw, so both get the terminal
                # reward; otherwise the player who did not move last never
                # receives feedback for how the game ended.
                player.reward(0.5, self.board)
                other_player.reward(0.5, self.board)
                self.print_board()
                break

            # Game continues: hand the turn over and reward the legal move.
            self.player1_turn = not self.player1_turn
            player.reward(0.25, self.board)

In [5]:
class AI(Player):
    """Tabular Q-learning player with an epsilon-greedy move policy.

    States are board snapshots stored as tuples and actions are cell
    indices; ``q_table`` maps ``(state, action)`` to the current estimate.

    Args:
        name: display name.
        char: mark placed on the board.
        epsilon: probability of choosing a random move (exploration).
        alpha: learning rate.
        gamma: discount factor.
        epsilon_decay: factor applied to ``epsilon`` after each exploratory
            move.
        initial_q: value assigned to a (state, action) pair the first time
            it is looked up.
    """

    def __init__(self, name, char, epsilon=1, alpha=0.01, gamma=0.9,
                 epsilon_decay=0.9999, initial_q=5):
        super().__init__(name, char, 'AI')
        self.epsilon = epsilon  # epsilon greedy
        self.alpha = alpha  # Learning Rate
        self.gamma = gamma  # Discount Factor
        self.epsilon_decay = epsilon_decay
        self.initial_q = initial_q
        self.q_table = {}
        # State/action of the most recent move; None until a move is made.
        self.prev_board = None
        self.prev_action = None

    def get_Q(self, state, action):
        """Return the Q-value for (state, action), creating it on first use.

        ``state`` is converted to a tuple so that a list board can be passed
        directly (a list inside the key would be unhashable).
        """
        return self.q_table.setdefault((tuple(state), action), self.initial_q)

    def q_learn(self, state, action, reward, new_state):
        '''
        Apply one Q-learning update:

        new_q_value = prev_q_value + alpha * (reward + gamma * max_q - prev_q_value)

        where max_q is the best Q-value over the moves available in
        new_state, or 0 when new_state has no moves left.
        '''
        state = tuple(state)
        prev_q_value = self.get_Q(state, action)
        # The bootstrap term must look at the moves open in the successor
        # state, not in the state the action was taken from.
        next_actions = self.available_moves(new_state)
        max_q = max(
            (self.get_Q(new_state, next_action) for next_action in next_actions),
            default=0,
        )
        td_error = reward + self.gamma * max_q - prev_q_value
        # Move the estimate toward the target (plus sign); subtracting the
        # error would push it away.
        self.q_table[(state, action)] = prev_q_value + self.alpha * td_error

    def reward(self, reward_value, board):
        """Learn from the reward received for the most recent move.

        Does nothing if no move has been made yet, since there is no
        (state, action) pair to credit.
        """
        if self.prev_action is None:
            return
        self.q_learn(self.prev_board,
                     self.prev_action,
                     reward_value,
                     tuple(board))

    def make_move(self, board):
        """Choose a cell epsilon-greedily and remember the state/action pair.

        Args:
            board: current board; must contain at least one empty cell.

        Returns:
            Index of the chosen cell.
        """
        # Save the state the decision is made in, for the later update.
        state = tuple(board)
        self.prev_board = state
        available_actions = self.available_moves(board)

        if random.random() < self.epsilon:
            # Exploration: random move, then explore slightly less next time.
            action_taken = random.choice(available_actions)
            self.epsilon *= self.epsilon_decay
        else:
            # Exploitation: highest Q-value; ties go to the lowest index.
            action_taken = max(available_actions,
                               key=lambda candidate: self.get_Q(state, candidate))

        # Recorded on both branches so reward() always credits this move.
        self.prev_action = action_taken
        return action_taken

In [6]:
# Both sides are Q-learning agents. The same two objects are passed to every
# game in the training loop, so each keeps the q_table it has built so far.
p1 = AI('P-one', 'X')
p2 = AI('P-two', 'O')

In [7]:
# Self-play: five episodes between p1 and p2, each on a new TicTacToe object.
for i in range(5):
    print(f'\nEpisode: {1+i}')  # episodes are reported 1-based
    game = TicTacToe(p1, p2)  # new game: empty board, random first mover
    game.play()


Episode: 1
P-one is X
P-two is O

          |   |  
        ----------
          |   |  
        ----------
          |   |  
        ----------
        

          | O |  
        ----------
          |   |  
        ----------
          |   |  
        ----------
        

          | O | X
        ----------
          |   |  
        ----------
          |   |  
        ----------
        

          | O | X
        ----------
          |   |  
        ----------
          | O |  
        ----------
        

          | O | X
        ----------
          | X |  
        ----------
          | O |  
        ----------
        

          | O | X
        ----------
          | X |  
        ----------
          | O | O
        ----------
        

          | O | X
        ----------
          | X | X
        ----------
          | O | O
        ----------
        
P-two is winner

          | O | X
        ----------
          | X | X
        ----------
        O | O | O
        --