In [None]:
import numpy as np
import random

# Define the Tic-Tac-Toe environment
class TicTacToe:
    """Minimal Tic-Tac-Toe environment on a flat, 9-cell board.

    Cells are indexed 0-8 row by row. Each cell holds ``'X'``, ``'O'`` or
    ``' '`` (empty). ``'X'`` always moves first.
    """

    def __init__(self):
        # Index triples that make up the three rows, three columns and two
        # diagonals of the board.
        self.winning_combinations = [(0, 1, 2), (3, 4, 5), (6, 7, 8),
                                     (0, 3, 6), (1, 4, 7), (2, 5, 8),
                                     (0, 4, 8), (2, 4, 6)]
        self.reset()

    def reset(self):
        """Clear the board and give the first turn back to ``'X'``."""
        self.board = [' ' for _ in range(9)]  # Empty board
        self.current_player = 'X'

    def is_valid_move(self, move):
        """Return True if ``move`` is an on-board index of an empty cell.

        Out-of-range indices (including negative ones, which would otherwise
        wrap around to the end of the list) are reported as invalid instead of
        raising ``IndexError`` or silently addressing another cell.
        """
        return 0 <= move < len(self.board) and self.board[move] == ' '

    def make_move(self, move):
        """Place the current player's mark on ``move`` and switch turns.

        No validation is performed here; callers are expected to check
        ``is_valid_move(move)`` first.
        """
        self.board[move] = self.current_player
        self.current_player = 'O' if self.current_player == 'X' else 'X'

    def check_winner(self):
        """Return ``'X'`` or ``'O'`` for a win, ``'Draw'`` for a full board
        with no winner, or ``None`` while the game is still in progress."""
        for a, b, c in self.winning_combinations:
            mark = self.board[a]
            if mark != ' ' and mark == self.board[b] == self.board[c]:
                return mark
        if ' ' not in self.board:
            return 'Draw'
        return None

    def print_board(self):
        """Print the board as three rows separated by horizontal rules."""
        for i in range(0, 9, 3):
            print(' | '.join(self.board[i:i+3]))
            if i < 6:
                print('--+---+--')

# Q-learning parameters
learning_rate = 0.1      # step size applied to the temporal-difference error
discount_factor = 0.9    # weight given to the estimated value of the next state
exploration_prob = 0.1   # probability of picking a random valid move while training
episodes = 10000         # number of training games played by the loop below

# Initialize the Q-table
# Maps a board string (9 characters, one per cell) to a dict of
# {action index: learned value}; missing entries are read as 0.0.
Q = {}

# Helper function to get the Q-value for a state-action pair
def get_q_value(state, action):
    """Look up the learned value of taking ``action`` in ``state``.

    Returns 0.0 when either the state or the action has no entry in the
    module-level ``Q`` table yet.
    """
    action_values = Q.get(state)
    if action_values is None:
        return 0.0
    return action_values.get(action, 0.0)

# Q-learning algorithm
# Self-play training. A single table stores values from X's point of view
# (+1 X wins, -1 O wins, 0.1 draw), so X picks the highest-valued move and
# O, as the adversary, picks the lowest-valued one.
for episode in range(episodes):
    env = TicTacToe()
    state = ''.join(env.board)  # Convert the board to a string for the state
    done = False

    while not done:
        valid_actions = [i for i in range(9) if env.is_valid_move(i)]
        # X maximises the X-centric value; O minimises it.
        pick = max if env.current_player == 'X' else min

        if random.uniform(0, 1) < exploration_prob:
            action = random.choice(valid_actions)
        else:
            action = pick(valid_actions, key=lambda a: get_q_value(state, a))

        # valid_actions only contains empty cells, so the move is always legal.
        env.make_move(action)
        next_state = ''.join(env.board)

        reward = 0
        winner = env.check_winner()
        if winner == 'X':
            reward = 1
        elif winner == 'O':
            reward = -1
        elif winner == 'Draw':
            reward = 0.1

        done = winner is not None

        # Value of the position reached. A finished game has no future, and
        # otherwise only the cells that are still empty *after* the move may
        # be considered, evaluated from the point of view of whoever moves next.
        if done:
            future_value = 0.0
        else:
            next_actions = [i for i in range(9) if env.is_valid_move(i)]
            next_pick = max if env.current_player == 'X' else min
            future_value = next_pick(get_q_value(next_state, a) for a in next_actions)

        # Q-value update
        old_value = get_q_value(state, action)
        Q.setdefault(state, {})[action] = old_value + learning_rate * (
            reward + discount_factor * future_value - old_value
        )

        state = next_state

# Play a game using the trained Q-values
def play_game_with_trained_agent():
    """Play one interactive game: the trained agent is X, the human is O.

    X's move is the valid action with the highest learned Q-value for the
    current board. O's move is read from standard input as a cell index 0-8.
    Input that is not an integer, is outside 0-8, or names an occupied cell is
    rejected with a message and the same player is asked again. The board is
    printed after every accepted move and the result is announced at the end.
    """
    env = TicTacToe()
    state = ''.join(env.board)
    env.print_board()

    while True:
        if env.current_player == 'X':
            valid_actions = [i for i in range(9) if env.is_valid_move(i)]
            action = max(valid_actions, key=lambda a: get_q_value(state, a))
        else:
            try:
                action = int(input("Enter your move (0-8): "))
            except ValueError:
                # Non-numeric input: treat it like any other illegal move.
                action = None

        # Range-check before touching the board so that 9+ cannot raise
        # IndexError and negative numbers cannot wrap around to another cell.
        if action is None or not 0 <= action <= 8 or not env.is_valid_move(action):
            print("Invalid move. Try again.")
            continue

        env.make_move(action)
        state = ''.join(env.board)

        print()
        print()
        env.print_board()

        winner = env.check_winner()
        if winner:
            if winner == 'Draw':
                print("It's a draw!")
            else:
                print(f"{winner} wins!")
            break

# Start an interactive game right away; this waits for moves typed on stdin.
play_game_with_trained_agent()


  |   |  
--+---+--
  |   |  
--+---+--
  |   |  


X |   |  
--+---+--
  |   |  
--+---+--
  |   |  
Enter your move (0-8): 8


X |   |  
--+---+--
  |   |  
--+---+--
  |   | O


X | X |  
--+---+--
  |   |  
--+---+--
  |   | O
Enter your move (0-8): 2


X | X | O
--+---+--
  |   |  
--+---+--
  |   | O


X | X | O
--+---+--
X |   |  
--+---+--
  |   | O
Enter your move (0-8): 4


X | X | O
--+---+--
X | O |  
--+---+--
  |   | O


X | X | O
--+---+--
X | O | X
--+---+--
  |   | O
Enter your move (0-8): 1
Invalid move. Try again.
Enter your move (0-8): 7


X | X | O
--+---+--
X | O | X
--+---+--
  | O | O


X | X | O
--+---+--
X | O | X
--+---+--
X | O | O
X wins!
