In [2]:
import numpy as np
import random

# Tic-tac-toe environment
class TicTacToe:
    """Minimal 3x3 tic-tac-toe board with alternating 'X' / 'O' turns.

    The board is a flat list of nine cells indexed 0-8 in row-major order.
    An empty cell holds ' '; 'X' always moves first.
    """

    # Every triple of cell indices that forms a line on the board.
    WINNING_COMBINATIONS = (
        (0, 1, 2), (3, 4, 5), (6, 7, 8),  # rows
        (0, 3, 6), (1, 4, 7), (2, 5, 8),  # columns
        (0, 4, 8), (2, 4, 6),  # diagonals
    )

    def __init__(self):
        self.board = [' ' for _ in range(9)]
        self.current_player = 'X'

    def reset(self):
        """Clear the board, give the turn to 'X', and return the new state."""
        self.board = [' ' for _ in range(9)]
        self.current_player = 'X'
        return self.get_state()

    def get_state(self):
        """Return the board as a 9-character string (hashable Q-table key)."""
        return ''.join(self.board)

    def is_valid_move(self, action):
        """Return True if ``action`` is an on-board index of an empty cell.

        Out-of-range indices are reported as invalid rather than raising
        IndexError. Negative indices are rejected too: Python's negative
        indexing would otherwise make -1 silently address cell 8.
        """
        return 0 <= action < len(self.board) and self.board[action] == ' '

    def make_move(self, action):
        """Place the current player's mark on ``action`` and switch turns.

        Returns:
            True if the move was applied; False if it was invalid, in which
            case the board and the current player are left unchanged.
        """
        if not self.is_valid_move(action):
            return False
        self.board[action] = self.current_player
        self.current_player = 'O' if self.current_player == 'X' else 'X'
        return True

    def check_winner(self):
        """Return 'X' or 'O' for a completed line, 'Tie' if the board is
        full with no line, or None while the game is still in progress."""
        for a, b, c in self.WINNING_COMBINATIONS:
            if self.board[a] == self.board[b] == self.board[c] != ' ':
                return self.board[a]
        if ' ' not in self.board:
            return 'Tie'
        return None

# Q-learning agent
class QLearningAgent:
    """Tabular epsilon-greedy Q-learning agent.

    Q-values are stored in ``q_table``, a dict keyed by ``(state, action)``.
    Entries that have never been written are treated as 0.

    Args:
        alpha: Learning rate applied to each temporal-difference error.
        epsilon: Probability of picking a uniformly random action.
        discount: Discount factor applied to the bootstrapped next value.
    """

    def __init__(self, alpha=0.1, epsilon=0.1, discount=0.95):
        self.q_table = {}
        self.alpha = alpha
        self.epsilon = epsilon
        self.discount = discount

    def get_action(self, state, available_actions):
        """Pick an action from ``available_actions`` epsilon-greedily.

        With probability ``epsilon`` a random action is returned; otherwise
        one of the highest-valued actions is returned, with ties broken at
        random.
        """
        if random.random() < self.epsilon:
            return random.choice(available_actions)

        q_values = [self.q_table.get((state, action), 0) for action in available_actions]
        max_q = max(q_values)
        best_actions = [action for action, q in zip(available_actions, q_values) if q == max_q]
        return random.choice(best_actions)

    def update(self, state, action, reward, next_state, next_actions=None):
        """Apply one Q-learning update for ``(state, action)``.

        Args:
            state: State in which ``action`` was taken.
            action: Action that was taken.
            reward: Reward observed for the transition.
            next_state: State reached after the transition.
            next_actions: Optional iterable of actions that are legal in
                ``next_state``. When given, only these actions are used for
                the bootstrap maximum; an empty iterable marks a terminal
                state and yields a bootstrap value of 0. When omitted, all
                indices 0-8 are considered, as before.
        """
        if next_actions is None:
            # Legacy behaviour: illegal actions never get a table entry, so
            # their default 0 floors the maximum at 0.
            next_actions = range(9)

        current_q = self.q_table.get((state, action), 0)
        next_q = max(
            (self.q_table.get((next_state, a), 0) for a in next_actions),
            default=0,
        )
        new_q = current_q + self.alpha * (reward + self.discount * next_q - current_q)
        self.q_table[(state, action)] = new_q

# Training function
def train(episodes=100000):
    """Train a Q-learning agent playing 'X' against a uniformly random 'O'.

    One learning step spans the agent's move plus the opponent's reply. The
    update is applied only after the opponent has answered, so that:

    * a win by 'O' is actually seen and penalised with -1, and
    * ``next_state`` is a position in which the agent moves next (or a
      finished game), i.e. a state whose Q-values the agent really learns.

    Rewards: +1 for an 'X' win, -1 for an 'O' win, 0.5 for a tie, 0 while
    the game is still running.

    Args:
        episodes: Number of self-contained games to play.

    Returns:
        The trained QLearningAgent.
    """
    env = TicTacToe()
    agent = QLearningAgent()
    outcome_rewards = {'X': 1, 'O': -1, 'Tie': 0.5}

    for _ in range(episodes):
        state = env.reset()
        done = False

        while not done:
            available_actions = [i for i in range(9) if env.is_valid_move(i)]
            action = agent.get_action(state, available_actions)
            env.make_move(action)
            winner = env.check_winner()

            if winner is None:
                # The opponent replies at random. At least one cell is free
                # here, otherwise check_winner() would have reported 'Tie'.
                opponent_action = random.choice([i for i in range(9) if env.is_valid_move(i)])
                env.make_move(opponent_action)
                winner = env.check_winner()

            next_state = env.get_state()
            reward = outcome_rewards.get(winner, 0)
            done = winner is not None

            # Finished positions are never acted in, so they have no table
            # entries and contribute a bootstrap value of 0.
            agent.update(state, action, reward, next_state)
            state = next_state

    return agent

def _read_human_move(env):
    """Prompt until the human enters the index of a free cell, then return it."""
    while True:
        raw = input("Enter your move (0-8): ")
        try:
            action = int(raw)
        except ValueError:
            print("Please enter a whole number from 0 to 8.")
            continue
        # Checked here explicitly: a negative index would wrap around the
        # board list and an index above 8 would raise IndexError.
        if not 0 <= action <= 8:
            print("Move must be between 0 and 8.")
            continue
        if not env.is_valid_move(action):
            print("That cell is already taken.")
            continue
        return action


# Play a game with the trained agent
def play_game(agent):
    """Play one interactive game: the agent is 'X', the human is 'O'.

    The board is printed after every move as three lists (one per row), and
    the result is announced when the game ends. Invalid human input is
    rejected with a message and the prompt is repeated.

    Args:
        agent: Object exposing ``get_action(state, available_actions)``.
    """
    env = TicTacToe()
    state = env.reset()
    done = False

    while not done:
        if env.current_player == 'X':
            available_actions = [i for i in range(9) if env.is_valid_move(i)]
            action = agent.get_action(state, available_actions)
        else:
            action = _read_human_move(env)

        env.make_move(action)
        print(env.board[0:3])
        print(env.board[3:6])
        print(env.board[6:9])
        print()

        winner = env.check_winner()
        if winner:
            done = True
            if winner == 'Tie':
                print("It's a tie!")
            else:
                print(f"{winner} wins!")

        state = env.get_state()

# Main entry point
if __name__ == "__main__":
    # Train with the default number of episodes, then play one interactive
    # game against the resulting agent.
    trained_agent = train()
    play_game(trained_agent)

[' ', ' ', ' ']
[' ', ' ', 'X']
[' ', ' ', ' ']

[' ', ' ', 'O']
[' ', ' ', 'X']
[' ', ' ', ' ']

[' ', 'X', 'O']
[' ', ' ', 'X']
[' ', ' ', ' ']

['O', 'X', 'O']
[' ', ' ', 'X']
[' ', ' ', ' ']

['O', 'X', 'O']
['X', ' ', 'X']
[' ', ' ', ' ']

['O', 'X', 'O']
['X', ' ', 'X']
[' ', ' ', 'O']

['O', 'X', 'O']
['X', 'X', 'X']
[' ', ' ', 'O']

X wins!
