Environment – Tic-Tac-Toe

In [4]:
# Step 1: Environment
import random

class TicTacToeEnv:
    def __init__(self, seed=42):
        self.rng = random.Random(seed)
        self.reset()

    def reset(self):
        self.board = [0]*9
        self.current_player = 1
        self.done = False
        self.winner = 0
        return tuple(self.board)

    def state(self):
        return tuple(self.board)

    def available_actions(self):
        return [i for i,v in enumerate(self.board) if v==0]

    @staticmethod
    def _lines():
        return [
            (0,1,2),(3,4,5),(6,7,8),
            (0,3,6),(1,4,7),(2,5,8),
            (0,4,8),(2,4,6)
        ]

    def _check_winner(self):
        for a,b,c in self._lines():
            s = self.board[a] + self.board[b] + self.board[c]
            if s == 3: return 1
            if s == -3: return -1
        if 0 not in self.board: return 0  # draw
        return None  # ongoing

    def step(self, action, player):
        if self.done: raise ValueError("Game over")
        if self.board[action] != 0:
            self.done = True
            self.winner = -player
            reward = -1 if player==1 else +1
            return self.state(), reward, True, {"illegal": True}

        self.board[action] = player
        status = self._check_winner()
        if status is not None:
            self.done = True
            self.winner = status
            reward = 1 if status==1 else (-1 if status==-1 else 0.2)
            return self.state(), reward, True, {}
        return self.state(), 0, False, {}

    def render(self):
        symbols = {1:'X', -1:'O', 0:'.'}
        for r in range(3):
            print(' '.join(symbols[self.board[r*3+c]] for c in range(3)))
        print()


✅ Test Environment:

In [5]:
env = TicTacToeEnv()
env.render()
state, reward, done, info = env.step(0,1)
env.render()


. . .
. . .
. . .

X . .
. . .
. . .



Opponents

In [6]:
# Step 2: Opponents
import random

def opponent_random(env):
    return random.choice(env.available_actions())

def opponent_rule_based(env):
    board = env.board
    avail = env.available_actions()

    def would_win(action, mark):
        tmp = board.copy()
        tmp[action] = mark
        for a,b,c in TicTacToeEnv._lines():
            if tmp[a]+tmp[b]+tmp[c]==3*mark: return True
        return False

    # 1) Win if possible
    for a in avail:
        if would_win(a, -1): return a
    # 2) Block X
    for a in avail:
        if would_win(a, 1): return a
    # 3) Choose center/corners/sides
    for p in [4,0,2,6,8,1,3,5,7]:
        if p in avail: return p
    return random.choice(avail)


✅ Test Opponent:

In [7]:
env = TicTacToeEnv()
print(opponent_random(env))
print(opponent_rule_based(env))


7
4


Step 4: Training Loop

In [11]:
def play_episode(env, agent, opponent_fn=opponent_random, agent_starts=True):
    s = env.reset()
    env.current_player = 1 if agent_starts else -1
    done = False
    while not done:
        if env.current_player == 1:
            legal = env.available_actions()
            a = agent.select_action(s, legal)
            s_next, r, done, _ = env.step(a, 1)
            legal_next = env.available_actions() if not done else []
            agent.update(s, a, r, s_next, legal_next, done)
            s = s_next
            env.current_player = -1
        else:
            a_op = opponent_fn(env)
            s, r, done, _ = env.step(a_op, -1)
            env.current_player = 1
    return env.winner

def train(agent, episodes=5000, eval_every=500):
    env = TicTacToeEnv()
    log = []
    for ep in range(1, episodes+1):
        agent_starts = (ep%2==0)
        opp = opponent_random if ep%4 !=0 else opponent_rule_based
        winner = play_episode(env, agent, opponent_fn=opp, agent_starts=agent_starts)
        agent.decay_epsilon()
        if ep % eval_every == 0:
            log.append((ep, winner, agent.epsilon))
            print(f"Episode {ep} | Last winner: {winner} | ε={agent.epsilon:.3f}")
    return log


✅ Test Agent:

In [12]:
agent = QAgent()
log = train(agent, episodes=3000, eval_every=500)


Episode 500 | Last winner: -1 | ε=0.779
Episode 1000 | Last winner: -1 | ε=0.606
Episode 1500 | Last winner: -1 | ε=0.472
Episode 2000 | Last winner: 0 | ε=0.368
Episode 2500 | Last winner: -1 | ε=0.286
Episode 3000 | Last winner: -1 | ε=0.223


Step 5: Evaluation

In [19]:
# Step 5: Evaluation
def evaluate(agent, n_games=500, opponent_fn=opponent_rule_based):
    env = TicTacToeEnv()
    results = {1:0, -1:0, 0:0}
    for i in range(n_games):
        s = env.reset()
        env.current_player = 1 if i%2==0 else -1
        done = False
        while not done:
            if env.current_player == 1:
                legal = env.available_actions()
                if legal:
                    q_vals = [agent.Q[(s,a)] for a in legal]
                    a = legal[int(np.argmax(q_vals))]
                    s, r, done, _ = env.step(a, 1)
                env.current_player = -1
            else:
                a_op = opponent_fn(env)
                s, r, done, _ = env.step(a_op, -1)
                env.current_player = 1
        results[env.winner] += 1
    return results

# Evaluate
res_rand = evaluate(agent, 500, opponent_random)
res_rule = evaluate(agent, 500, opponent_rule_based)
print("Vs Random:", res_rand)
print("Vs Rule-Based:", res_rule)


Vs Random: {1: 369, -1: 111, 0: 20}
Vs Rule-Based: {1: 0, -1: 500, 0: 0}


Demo Game

In [20]:
# Step 6: Demo
env_demo = TicTacToeEnv()
s = env_demo.reset()
env_demo.current_player = 1
env_demo.render()

while not env_demo.done:
    if env_demo.current_player == 1:
        legal = env_demo.available_actions()
        if legal:
            q_vals = [agent.Q[(s,a)] for a in legal]
            a = legal[int(np.argmax(q_vals))]
            s, r, done, _ = env_demo.step(a, 1)
        env_demo.render()
        env_demo.current_player = -1
    else:
        a_op = opponent_rule_based(env_demo)
        s, r, done, _ = env_demo.step(a_op, -1)
        env_demo.render()
        env_demo.current_player = 1

print("Winner:", {1:"Agent", -1:"Opponent", 0:"Draw"}[env_demo.winner])


. . .
. . .
. . .

X . .
. . .
. . .

X . .
. O .
. . .

X X .
. O .
. . .

X X O
. O .
. . .

X X O
X O .
. . .

X X O
X O .
O . .

Winner: Opponent
