In [1]:
import numpy as np
from numpy import random

In [None]:
class QLearningAgent:
    """Tabular Q-learning agent with an epsilon-greedy behaviour policy.

    Attributes:
        n_state: number of rows of the Q-table (discrete states).
        n_action: number of columns of the Q-table (discrete actions).
        alpha: learning rate used in the Q-value update.
        gamma: discount factor applied to the bootstrapped next-state value.
        qtable: array of shape (n_state, n_action), initialised uniformly
            at random in [-1, 1).
    """

    def __init__(self, n_state=3**9, n_action=9, alpha=0.5, gamma=0.99):
        self.n_state = n_state
        self.n_action = n_action
        self.alpha = alpha
        self.gamma = gamma
        self.qtable = np.random.uniform(low=-1, high=1, size=(self.n_state, self.n_action))

    def update_qtable(self, state, action, reward, next_state):
        """Apply one Q-learning update for the transition (state, action, reward, next_state).

        The target is ``reward + gamma * max_a Q[next_state, a]``; the entry
        ``Q[state, action]`` is moved towards it by a fraction ``alpha``.
        """
        td_target = reward + self.gamma * self.qtable[next_state].max()
        self.qtable[state, action] += self.alpha * (td_target - self.qtable[state, action])

    def get_greedy_action(self, state):
        """Return the index of the action with the largest Q-value in ``state``."""
        return self.qtable[state].argmax()

    def get_action(self, state, episode):
        """Choose an action epsilon-greedily.

        Epsilon decays with the episode number as ``0.7 / (episode + 1)``.
        With probability epsilon a uniformly random action in
        ``[0, n_action)`` is returned; otherwise the greedy action is.
        """
        epsilon = 0.7 * (1/(episode+1))
        if epsilon <= np.random.uniform(0, 1):
            action = self.get_greedy_action(state)
        else:
            # Explore: sample a real action index. Returning a sentinel such as -1
            # would silently index the last column / last board cell instead.
            action = np.random.randint(self.n_action)
        return action


done（`step` が返す状態コード）の意味

0: 空きマスがあり，勝敗もついていない（続行）

1: 勝敗がついている

2: 空きマスがなく，勝敗がついていない（引き分け）

-1: すでに埋まっているマスを選択（違反）

In [40]:
class tic_tac_toe:
    """Minimal tic-tac-toe board stored as a flat array of 9 cells.

    Cell values: 0 = empty, 1 = drawn as "O" (placed on odd-numbered moves),
    2 = drawn as "X" (placed on even-numbered moves). ``idx`` counts the
    marks placed so far.

    Status codes returned by ``step`` and ``judge``:
        0  -- free cells remain and nobody has won (game continues)
        1  -- the player who moved last has three in a row
        2  -- the board is full and nobody has won
        -1 -- (``step`` only) the chosen cell was already occupied
    """

    # Index triples of every winning line: three rows, three columns, two diagonals.
    _WIN_PATTERNS = [[0,1,2],[3,4,5],[6,7,8],[0,3,6],[1,4,7],[2,5,8],[0,4,8],[2,4,6]]

    def __init__(self):
        self.idx = 0
        self.board = np.zeros(9, dtype=int)

    def reset(self, done=1):
        """Clear the board and the move counter unless ``done`` is 0.

        ``done`` defaults to a non-zero value so that a bare ``reset()``
        always resets; passing 0 (game still running) leaves the state as is.
        """
        if done != 0:
            self.idx = 0
            self.board[:] = 0

    def _last_mark(self):
        """Return the mark (1 or 2) belonging to move number ``self.idx``."""
        return 2 - self.idx % 2

    def draw(self):
        """Print the move counter and the 3x3 board to stdout."""
        print(f"idx: {self.idx}")
        for i in range(3):
            print()
            for j in range(3):
                if self.board[3*i+j] == 1:
                    print(" O ", end="")
                elif self.board[3*i+j] == 2:
                    print(" X ", end="")
                else:
                    print(" * ", end="")
            print()

    def step(self, pos):
        """Place the next mark at ``pos`` and return the resulting status code.

        Returns -1 without changing any state when the cell is already occupied.
        """
        if self.board[pos] != 0:
            return -1

        self.idx += 1
        self.board[pos] = self._last_mark()
        return self.judge()

    def random(self):
        """Place the next mark on a uniformly random empty cell.

        Uses numpy's global RNG. ``np.random.choice`` raises ``ValueError``
        when there is no empty cell left, so call this only while the game
        is still running.
        """
        # np.random is the same module the notebook imports as ``random``; spelling it
        # out avoids confusion with this method's own name.
        pos = np.random.choice(np.where(self.board == 0)[0])
        self.idx += 1
        self.board[pos] = self._last_mark()

    def judge(self):
        """Return the status code (0, 1 or 2) for the current position.

        Only the mark of the player who moved last is tested for a win, so
        this is meant to be called after every move.
        """
        mark = self._last_mark()
        for patt in self._WIN_PATTERNS:
            if np.all(self.board[patt] == mark):
                return 1
        return 2 if self.idx == 9 else 0


In [41]:
# Shared environment and learner instances used by the cells below.
game = tic_tac_toe()
agent = QLearningAgent()  # default arguments: 3**9 states, 9 actions


In [50]:
# Demo: both sides play uniformly random moves; the board is printed after every move.
game.reset(-1)  # any non-zero argument clears the board
game.draw()
for _ in range(9):  # a game has at most nine moves
    game.random()
    game.draw()
    # judge() returns 0 while the game is running, 1 on a win, 2 on a full board.
    if game.judge() != 0:
        print("tic-tac-toe end")
        break


idx: 0

 *  *  * 

 *  *  * 

 *  *  * 
idx: 1

 *  *  * 

 *  *  * 

 *  O  * 
idx: 2

 *  *  * 

 *  *  * 

 *  O  X 
idx: 3

 *  *  O 

 *  *  * 

 *  O  X 
idx: 4

 X  *  O 

 *  *  * 

 *  O  X 
idx: 5

 X  *  O 

 *  *  * 

 O  O  X 
idx: 6

 X  *  O 

 X  *  * 

 O  O  X 
idx: 7

 X  O  O 

 X  *  * 

 O  O  X 
idx: 8

 X  O  O 

 X  *  X 

 O  O  X 
idx: 9

 X  O  O 

 X  O  X 

 O  O  X 
tic-tac-toe end


In [None]:
# Settings
num_episode = 1200  # number of training episodes
penalty = 10  # penalty applied when the agent picks an already-occupied cell

# Log
episode_rewards = []
max_steps = game.board.size  # upper bound on agent moves per episode (one per cell)

# Each cell holds 0, 1 or 2, so the board is read as a base-3 number to obtain
# a Q-table row index in [0, 3**9).
state_weights = 3 ** np.arange(game.board.size)

for episode in range(num_episode):
    game.reset(1)  # reset() only clears the board for a non-zero argument
    state = int(game.board @ state_weights)
    episode_reward = 0

    for t in range(max_steps):
        action = agent.get_action(state, episode)  # choose an action
        done = game.step(action)  # step() returns only the status code

        if done == -1:
            # Illegal move: the board is unchanged and the episode ends with a penalty.
            reward = -penalty
        elif done == 1:
            # The agent completed a line.
            reward = 1
        else:
            reward = 0
            if done == 0:
                # Game still running: the opponent answers with a random move.
                game.random()
                done = game.judge()
                if done == 1:
                    # The opponent completed a line.
                    reward = -1

        next_state = int(game.board @ state_weights)
        episode_reward += reward
        # NOTE(review): update_qtable always bootstraps from next_state, including on
        # terminal transitions, where that row still holds its random initial values.
        agent.update_qtable(state, action, reward, next_state)  # update the Q-table
        state = next_state
        if done:
            break

    episode_rewards.append(episode_reward)
    if episode % 50 == 0:
        print("Episode %d finished | Episode reward %f" % (episode, episode_reward))
