In [1]:
import numpy as np
import matplotlib.pyplot as plt
import ipdb

np.random.seed(0)

In [340]:
def board_to_int(board):
    x = 0
    for i in range(9):
        x *= 3
        x += board.flat[8 - i]
    return x

def int_to_board(x):
    board = np.zeros((3, 3), 'int')
    for i in range(9):
        board.flat[i] = x % 3
        x //= 3
    return board

In [341]:
def test_coding_encoding(tests_cnt):
    for i in range(tests_cnt):
        d = np.random.randint(3, size=(3, 3))
        x = board_to_int(d)
        d2 = int_to_board(x)
        if not ((d2 == d).all()):
            print('err')
            return

test_coding_encoding(0)

In [342]:
def calc_allowed_actions(state):
    board = int_to_board(state)
    return np.flatnonzero(board == 0)

In [431]:
states_count = 3 ** 9
allowed_actions = list(map(calc_allowed_actions, range(states_count)))
base_states = np.repeat(-1, states_count)
transforms = np.empty((states_count), dtype=tuple)

In [432]:
def find_similar_states(base_state):
    board = int_to_board(base_state)
    for i in range(4):
        state = board_to_int(board)
        base_states[state] = base_state
        transforms[state] = np.array((i, 0))
        board_reversed = np.fliplr(board)
        reversed_state = board_to_int(board_reversed)
        transforms[reversed_state] = np.array((i, 1))
        base_states[reversed_state] = base_state
        board = np.rot90(board)
        
        
for state in range(states_count):
    if base_states[state] == -1:
        find_similar_states(state)

In [407]:
a = np.flatnonzero(base_states == 98)
for st in a:
    b = int_to_board(st)
    print(transforms[st])


[0 0]
[0 1]
[3 1]
[1 0]
[3 0]
[2 1]
[1 1]
[2 0]


In [394]:
class Judge:
    def __init__(self):
        self.DRAW = 0
        self.PLAYING = 3
        self.results = list(map(self.calc_game_status, range(states_count)))

    def get_board_lines(self, state):
        board = int_to_board(state)
        board_rotated = np.rot90(board)
        diag1 = np.diag(board)
        diag2 = np.diag(board_rotated)
        return np.vstack([board, board_rotated, diag1, diag2])
        
    def line_winner(self, line):
        for player in range(1, 3):
            if (line == player).all():
                return player
        return 0

    def calc_game_status(self, state):
        all_lines = self.get_board_lines(state)
        for line in all_lines:
            if self.line_winner(line):
                return self.line_winner(line)
        return self.PLAYING if len(allowed_actions[state]) else self.DRAW
    
    def get_result(self, state):
        return self.results[state]

In [395]:
judge = Judge()

In [354]:
class AgentSARSA:
    def __init__(self, epsilon=0.01, gamma=0.5, alpha=0.1):
        self.epsilon = epsilon
        self.gamma = gamma
        self.alpha = alpha
        self.q = np.zeros((states_count, 9))
        self.prev_state = 0
        self.prev_action = 0
        
    def choose_action(self, state):
        q = self.q[state].take(allowed_actions[state])
        indices_of_best = np.flatnonzero(q == q.max())
        best_actions = allowed_actions[state].take(indices_of_best)
        good_choice = np.random.choice(best_actions)
        rand_choice = np.random.choice(allowed_actions[state])
        return np.random.choice([good_choice, rand_choice], p=[1 - self.epsilon, self.epsilon])
        
    def get_action(self, state):
        action = self.choose_action(state)
        if state:
            self.q[self.prev_state][self.prev_action] += self.alpha * (self.gamma * self.q[state][action]
                                                                       - self.q[self.prev_state][self.prev_action])
        self.prev_state = state
        self.prev_action = action
        return action
    
    def put_reward(self, reward):
        self.q[self.prev_state][self.prev_action] += self.alpha * (reward
                                                              - self.q[self.prev_state][self.prev_action])

In [546]:
class Simulator:
    def __init__(self, judge, agents):
        self.agents = agents
        self.judge = judge
        
    def play_games(self, games_cnt):
        for i in range(games_cnt):
            progress = round((i + 1) / games_cnt * 100, 1)
            print('\r' * 10 + str(progress) + '%', end='')
            state = 0
            player = 0
            while self.judge.get_result(state) == 3:
                action = self.agents[player].get_action(state)
                state += (player + 1) * 3 ** action
                player = (player + 1) % 2
                state = base_states[state]
                
            winner = self.judge.get_result(state)
            if winner:
                self.agents[winner - 1].put_reward(100)
                self.agents[1 - (winner - 1)].put_reward(-100)
        print()

In [550]:
SARSA_agents = AgentSARSA(alpha=0.05), AgentSARSA(alpha=0.05)

sim = Simulator(judge, SARSA_agents)

In [551]:
import time
cl = time.clock()
sim.play_games(10000)
cl = time.clock() - cl
print('Duration: ', cl)

100.0%
Duration:  5.899312999999893


In [560]:
class PlayingInterface:
    def __init__(self, agents, role=1):
        self.judge = judge
        if role == 1:
            self.agent = agents[1]
        else:
            self.agent = agents[0]
        self.agent.epsilon = 0
        self.agent_role = 3 - role
        self.role = role        
        
    def start(self):
        transform = []
        res = 3
        state = 0
        if self.role == 2:
            action = self.agent.get_action(state)
            state += self.agent_role * 3 ** action
        self.print_board(state)
        
        while res == 3:
            y, x = list(map(int, input().split()))
            action = y * 3 + x
            state += self.role * 3 ** action
            res = self.judge.get_result(state)
            
            transform = transforms[state]
            state = base_states[state]
            
            if res == 3:
                action = self.agent.get_action(state)
                state += self.agent_role * 3 ** action
                res = self.judge.get_result(state)
            
            board = int_to_board(state)
            board = np.rot90(board, transform[0])
            if transform[1]:
                board = np.fliplr(board)
            state = board_to_int(board)
            
            self.print_board(state)

        if res == self.role:
            print("You won")
        elif res == self.agent_role:
            print("You lost")
        elif res == 0:
            print("Draw")
        
        print('Restart? (y/n)')
        ans = input()
        if ans == 'y':
            self.start()
        
    def print_board(self, state):
        b = int_to_board(state)
        print(b)

In [561]:
p = PlayingInterface(SARSA_agents)

In [None]:
p.start()

[[0 0 0]
 [0 0 0]
 [0 0 0]]
1 1
[[0 0 2]
 [0 1 0]
 [0 0 0]]
0 1
[[0 1 2]
 [0 1 0]
 [0 2 0]]
2 2
[[2 1 2]
 [0 1 0]
 [0 2 1]]
1 2
[[2 1 2]
 [2 1 1]
 [0 2 1]]
2 0
[[2 2 1]
 [1 1 2]
 [2 1 1]]
Draw
Restart? (y/n)


In [555]:
b = np.zeros((3, 3), 'int')
b[1][1] = 1
x = base_states[board_to_int(b)]
print(int_to_board(x))
SARSA_agents[1].q[x]

[[0 0 0]
 [0 1 0]
 [0 0 0]]


array([-1.22527061, -0.7500491 , -0.86946113, -1.06738796,  0.        ,
       -0.83554497, -0.04547683, -0.81485557, -0.72190697])

In [559]:
SARSA_agents[1].epsilon=0
sim.play_games(10000)
SARSA_agents[1].q[x]

100.0%


array([-1.22527061, -0.7500491 , -0.86946113, -1.06738796,  0.        ,
       -0.83554497,  0.00744915, -0.81485557, -0.72190697])

# Q-learning

In [525]:
class AgentQ(AgentSARSA):
    def get_action(self, state):
        action = self.choose_action(state)
        q_max = self.q[state].take(allowed_actions[state]).max()
        
        self.q[self.prev_state][self.prev_action] += self.alpha * (self.gamma * q_max
                                                                   - self.q[self.prev_state][self.prev_action])
        self.prev_state = state
        self.prev_action = action
        return action

In [527]:
Q_agents = [AgentQ(alpha=0.05), AgentQ(alpha=0.05)]

sim2 = Simulator(judge, Q_agents)
sim2.play_games(100000)
print(1)

100.0%
1


In [528]:
p = PlayingInterface(Q_agents[1])

In [529]:
p.start()

[[0 0 0]
 [0 0 0]
 [0 0 0]]
1 1
[[0 0 2]
 [0 1 0]
 [0 0 0]]
0 1
[[0 1 2]
 [0 1 0]
 [0 2 0]]
2 2
[[2 1 2]
 [0 1 0]
 [0 2 1]]
1 0
[[2 1 2]
 [1 1 2]
 [0 2 1]]
2 0
Draw
