<a href="https://colab.research.google.com/github/veda0696-stack/DL-assignment/blob/main/TicTacToe.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# -*- coding: utf-8 -*-
"""
Optimized Tic Tac Toe using Reinforcement Learning
"""

import numpy as np
import pickle

BOARD_ROWS = 3
BOARD_COLS = 3


class State:
    """Tic-tac-toe game environment shared by two players.

    The board is a ``BOARD_ROWS x BOARD_COLS`` numpy array that stores ``1``
    for player 1's marks, ``-1`` for player 2's marks and ``0`` for empty
    cells. ``playerSymbol`` holds the mark that the next move will place.
    """

    def __init__(self, p1, p2):
        """Create an empty board for ``p1`` (moves first) and ``p2``."""
        self.board = np.zeros((BOARD_ROWS, BOARD_COLS))
        self.p1 = p1
        self.p2 = p2
        self.isEnd = False
        self.boardHash = None
        self.playerSymbol = 1   # Player 1 starts

    def getHash(self):
        """Return (and cache in ``boardHash``) a string key for the board."""
        self.boardHash = str(self.board.reshape(BOARD_ROWS * BOARD_COLS))
        return self.boardHash

    def winner(self):
        """Evaluate the board and update ``isEnd`` accordingly.

        Returns:
            ``1`` if player 1 completed a line, ``-1`` if player 2 did,
            ``0`` if the board is full without a winner, and ``None`` while
            the game is still in progress.
        """
        # A complete row holds BOARD_COLS equal marks, a complete column
        # holds BOARD_ROWS of them; rows are checked before columns.
        for line_sums, needed in ((self.board.sum(axis=1), BOARD_COLS),
                                  (self.board.sum(axis=0), BOARD_ROWS)):
            for total in line_sums:
                if abs(total) == needed:
                    self.isEnd = True
                    return int(np.sign(total))

        # Diagonals are only defined for a square board.
        if BOARD_ROWS == BOARD_COLS:
            diag1 = sum(self.board[i, i] for i in range(BOARD_ROWS))
            diag2 = sum(self.board[i, BOARD_COLS - i - 1]
                        for i in range(BOARD_ROWS))
            for total in (diag1, diag2):
                if abs(total) == BOARD_ROWS:
                    self.isEnd = True
                    return int(np.sign(total))

        if not self.availablePositions():
            self.isEnd = True
            return 0

        self.isEnd = False
        return None

    def availablePositions(self):
        """Return the ``(row, col)`` tuples of all empty cells, row by row."""
        return [(i, j) for i in range(BOARD_ROWS)
                for j in range(BOARD_COLS)
                if self.board[i, j] == 0]

    def updateState(self, position):
        """Place the current player's mark at ``position`` and switch turns."""
        self.board[position] = self.playerSymbol
        self.playerSymbol *= -1

    def giveReward(self):
        """Feed end-of-game rewards to both players.

        The winner receives 1 and the loser 0. For any other result (a
        draw) both players receive 0.3.
        """
        result = self.winner()
        if result == 1:
            self.p1.feedReward(1)
            self.p2.feedReward(0)
        elif result == -1:
            self.p1.feedReward(0)
            self.p2.feedReward(1)
        else:
            self.p1.feedReward(0.3)
            self.p2.feedReward(0.3)

    def reset(self):
        """Clear the board so that a new game can start with player 1."""
        self.board = np.zeros((BOARD_ROWS, BOARD_COLS))
        self.boardHash = None
        self.isEnd = False
        self.playerSymbol = 1

    def play(self, rounds=10000):
        """Train ``p1`` and ``p2`` against each other for ``rounds`` games."""
        for _ in range(rounds):
            self._playTrainingGame()

    def _playTrainingGame(self):
        """Play one self-play game, hand out rewards and reset everything."""
        # A finished game left over from an earlier call would otherwise
        # make training a silent no-op.
        if self.isEnd:
            self.reset()

        players = (self.p1, self.p2)
        turn = 0
        while True:
            player = players[turn % 2]
            positions = self.availablePositions()
            action = player.chooseAction(positions, self.board, self.playerSymbol)
            self.updateState(action)
            player.addState(self.getHash())

            if self.winner() is not None:
                self.giveReward()
                self.p1.reset()
                self.p2.reset()
                self.reset()
                return
            turn += 1

    def playHuman(self):
        """Play one interactive game: ``p1`` is the computer, ``p2`` the human.

        The final board is left in place after the game. If the previous
        game on this state has already ended, the board is reset first so
        the method can be called repeatedly ("play again").
        """
        if self.isEnd:
            self.reset()

        while not self.isEnd:
            positions = self.availablePositions()
            action = self.p1.chooseAction(positions, self.board, self.playerSymbol)
            self.updateState(action)
            self.showBoard()

            result = self.winner()
            if result is not None:
                print("Computer wins!" if result == 1 else "Draw!")
                break

            positions = self.availablePositions()
            action = self.p2.chooseAction(positions)
            self.updateState(action)
            self.showBoard()

            result = self.winner()
            if result is not None:
                print("Human wins!" if result == -1 else "Draw!")
                break

    def showBoard(self):
        """Print the board, drawing 1 as ``X`` and -1 as ``O``."""
        marks = {1: "X", -1: "O"}
        separator = "-" * (4 * BOARD_COLS + 1)
        for i in range(BOARD_ROWS):
            print(separator)
            for j in range(BOARD_COLS):
                print("| " + marks.get(int(self.board[i, j]), " ") + " ", end="")
            print("|")
        print(separator)


class Player:
    """Learning agent that keeps a value estimate for every visited board.

    ``states_value`` maps a board hash string to its estimated value.
    ``states`` lists the hashes visited during the current game, in order.
    """

    def __init__(self, name, exp_rate=0.3, lr=0.2, gamma=0.9):
        """Create a player.

        Args:
            name: Label used to build the policy file name in ``savePolicy``.
            exp_rate: Probability of choosing a random move instead of the
                best-valued one.
            lr: Learning rate applied in ``feedReward``.
            gamma: Discount factor applied in ``feedReward``.
        """
        self.name = name
        self.states = []
        self.lr = lr
        self.exp_rate = exp_rate
        self.gamma = gamma
        self.states_value = {}

    def getHash(self, board):
        """Return the string key for ``board`` (its flattened array repr)."""
        return str(board.reshape(-1))

    def chooseAction(self, positions, board, symbol):
        """Pick one of ``positions`` for placing ``symbol`` on ``board``.

        With probability ``exp_rate`` a random position is returned.
        Otherwise the position whose resulting board has the highest stored
        value is returned; unseen boards count as 0 and ties go to the
        last candidate.

        Raises:
            ValueError: If ``positions`` is empty.
        """
        if len(positions) == 0:
            raise ValueError("no available positions to choose from")

        if np.random.rand() < self.exp_rate:
            return positions[np.random.choice(len(positions))]

        best_value = float("-inf")
        action = None
        for p in positions:
            next_board = board.copy()
            next_board[p] = symbol
            value = self.states_value.get(self.getHash(next_board), 0)
            if value >= best_value:
                best_value = value
                action = p
        return action

    def addState(self, state):
        """Record the board hash ``state`` as visited in the current game."""
        self.states.append(state)

    def feedReward(self, reward):
        """Propagate ``reward`` backwards through this game's visited states.

        Each state moves towards ``gamma`` times the value that follows it
        (the final reward for the last state), scaled by ``lr``.
        """
        for st in reversed(self.states):
            self.states_value.setdefault(st, 0)
            self.states_value[st] += self.lr * (self.gamma * reward - self.states_value[st])
            reward = self.states_value[st]

    def reset(self):
        """Forget the states visited in the current game."""
        self.states = []

    def savePolicy(self):
        """Pickle ``states_value`` to the file ``policy_<name>``."""
        with open("policy_" + self.name, "wb") as f:
            pickle.dump(self.states_value, f)

    def loadPolicy(self, file):
        """Replace ``states_value`` with the table pickled in ``file``.

        SECURITY: unpickling can execute arbitrary code, so only load
        policy files that come from a trusted source.
        """
        with open(file, "rb") as f:
            self.states_value = pickle.load(f)


class HumanPlayer:
    """Player whose moves are typed in on the console."""

    def __init__(self, name):
        """Store the display ``name`` of the human player."""
        self.name = name

    def chooseAction(self, positions, board=None, symbol=None):
        """Prompt until the user enters a cell contained in ``positions``.

        Args:
            positions: Collection of ``(row, col)`` tuples that may be chosen.
            board: Unused; accepted so the call signature matches
                ``Player.chooseAction``.
            symbol: Unused; accepted for the same reason as ``board``.

        Returns:
            The chosen ``(row, col)`` tuple.
        """
        while True:
            try:
                row = int(input("Enter row (0-2): "))
                col = int(input("Enter col (0-2): "))
            except ValueError:
                # Non-numeric input must not abort the game; ask again.
                print("Please enter whole numbers.")
                continue
            if (row, col) in positions:
                return (row, col)
            print("That cell is not available; choose an empty one.")


# ================= MAIN =================
if __name__ == "__main__":
    # Phase 1: let two learning agents train against each other, then
    # persist what the first one learned.
    p1, p2 = Player("AI"), Player("AI2")
    st = State(p1, p2)

    print("Training AI...")
    st.play(rounds=30000)
    p1.savePolicy()

    # Phase 2: a non-exploring computer player loads the saved policy and
    # faces a human on a fresh game state.
    p1 = Player("Computer", exp_rate=0)
    p1.loadPolicy("policy_AI")
    p2 = HumanPlayer("Human")
    st = State(p1, p2)

    # Keep calling playHuman for as long as the user answers 'y'.
    again = 'y'
    while again == 'y':
        st.playHuman()
        again = input("Play again? (y/n): ").lower()


Training AI...
-------------
|   | X |   |
-------------
|   |   |   |
-------------
|   |   |   |
-------------
Enter row (0-2): 1
Enter col (0-2): 2
-------------
|   | X |   |
-------------
|   |   | O |
-------------
|   |   |   |
-------------
-------------
|   | X | X |
-------------
|   |   | O |
-------------
|   |   |   |
-------------
Enter row (0-2): 0
Enter col (0-2): 2
Enter row (0-2): 1
Enter col (0-2): 0
-------------
|   | X | X |
-------------
| O |   | O |
-------------
|   |   |   |
-------------
-------------
| X | X | X |
-------------
| O |   | O |
-------------
|   |   |   |
-------------
Computer wins!
Play again? (y/n): n
