In [18]:
import numpy as np
import copy
import time
from utils import encode_state

In [19]:
class Node:
    """One node of the Monte-Carlo search tree.

    Every node owns its own copy of the game, is evaluated by the network as
    soon as it is constructed, and keeps running statistics that are updated
    by ``backpropogate``.

    Attributes:
        model: callable returning ``(policy_batch, value_batch)`` for a batch
            of encoded states.
        game: game object for the position represented by this node.
        value: value handed to ``game.updateBoard`` when a move is played
            from this node (negated for every child).
        player: player whose view is used in ``prepareBoardForPlayer`` when
            the position is evaluated.
        parent: parent ``Node`` or ``None`` for the root.
        action: action that led from ``parent`` to this node (``None`` for
            the root).
        childs: list of expanded child nodes.
        S: sum of all values backed up through this node.
        N: number of back-ups that passed through this node.
        V: network value estimate for this node.
        P: network policy output for this node, indexed by action.
    """

    def __init__(self, model, game, value, player, parent, action):
        self.model = model
        self.game = game
        self.value = value
        self.player = player

        self.parent = parent
        self.action = action
        self.childs = []

        self.S = 0
        self.N = 0
        self.V = 0
        self.P = []

        # Nodes are evaluated eagerly so P and V are always available.
        self.evaluate()

    def evaluate(self):
        """Run the network on this node's position and store ``P`` and ``V``."""
        # NOTE(review): `player` is passed unchanged to every child (see
        # expand), so all positions are evaluated from the same player's
        # view - confirm this is the intended value convention.
        state = self.game.prepareBoardForPlayer(self.player)
        encoded_state = encode_state(state)
        policy, value = self.model(np.array([encoded_state]))
        # Strip the batch dimension: P becomes 1-D, V becomes a scalar.
        self.P, self.V = np.array(policy)[0], np.array(value)[0, 0]

    def expand(self):
        """Create one child per possible action and back up each child's value."""
        for action in self.game.getPossibleActions():
            game = copy.deepcopy(self.game)
            game.updateBoard(action, self.value)
            child = Node(self.model, game, self.value * -1, self.player, self, action)
            self.childs.append(child)
            child.backpropogate(child.V)

    def backpropogate(self, value):
        """Add ``value`` to ``S`` and increment ``N`` on this node and all ancestors."""
        node = self
        while node is not None:
            node.S += value
            node.N += 1
            node = node.parent

    def select_child(self, C):
        """Return the child with the highest UCB score, or ``None`` for a leaf."""
        if not self.childs:
            return None
        scores = [child.UCB(C) for child in self.childs]
        return self.childs[np.argmax(scores)]

    def UCB(self, C):
        """Return the PUCT score of this node as seen from its parent.

        The exploitation term is the mean backed-up value ``S / N`` (0 when
        unvisited). The exploration term is weighted by the prior the
        *parent's* policy assigns to ``self.action``; this node's own policy
        describes moves available after ``self.action`` and is therefore not
        the right prior for choosing between siblings.

        Must only be called on nodes that have a parent.
        """
        exploitation = self.S / self.N if self.N > 0 else 0
        prior = self.parent.P[self.action]
        exploration = prior * C * np.sqrt(self.parent.N) / (1 + self.N)
        return exploitation + exploration

    def get_child_by_action(self, action):
        """Return the first child reached via ``action``, or ``None`` if absent."""
        for child in self.childs:
            if child.action == action:
                return child
        return None

class MCTS:
    """Monte-Carlo tree search driver built on top of ``Node``.

    Args:
        model: network passed on to every ``Node``.
        C: exploration constant forwarded to ``Node.select_child``.
        thinking_amount: search budget; seconds when ``stop_mode`` is
            ``"time"``, otherwise the number of expansions.
        stop_mode: ``"time"`` for a wall-clock budget; any other value
            selects the expansion-count budget.
    """

    def __init__(self, model, C, thinking_amount, stop_mode = "time"):
        self.model = model
        self.C = C
        self.thinking_amount = thinking_amount
        self.stop_mode = stop_mode

        # Search bookkeeping, reset at the start of every think() call.
        # Initialised here so should_think() works on a fresh instance.
        self.start_time = time.time()
        self.expanded = 0

    def think(self, game, value):
        """Search from ``game`` and return the visit distribution over actions.

        Args:
            game: current game object; it is deep-copied, never modified.
            value: value of the player to move, used both as the root's move
                value and as its evaluation perspective.

        Returns:
            A list with one entry per policy output of the root. Entry ``a``
            is ``child.N / root.N`` for the root child reached by action
            ``a``, or ``0`` when the root has no such child.
        """
        root = Node(self.model, copy.deepcopy(game), value, value, None, None)
        self.start_time = time.time()
        self.expanded = 0
        while self.should_think():
            self.select(root).expand()
            self.expanded += 1

        # One entry per action known to the network instead of a fixed 7.
        p = []
        for action in range(len(root.P)):
            child = root.get_child_by_action(action)
            p.append(0 if child is None else child.N / root.N)
        return p

    def should_think(self):
        """Return ``True`` while the search budget has not been used up."""
        if self.stop_mode == "time":
            return time.time() - self.start_time < self.thinking_amount
        return self.expanded < self.thinking_amount

    def select(self, node):
        """Follow the best-scoring children from ``node`` down to a leaf."""
        selected = node.select_child(self.C)
        while selected is not None:
            node = selected
            selected = node.select_child(self.C)
        return node

In [20]:
from game import gamerules

In [21]:
class MCTSPlayer(gamerules.Player):
    """Player that chooses its move from the result of an MCTS search."""

    def __init__(self, name, mcts):
        """Store the searcher ``mcts`` and register ``name`` with the base class."""
        super().__init__(name)
        self.mcts = mcts

    def getAction(self, board, value):
        """Return the index of the largest entry in ``mcts.think(board, value)``."""
        visit_distribution = self.mcts.think(board, value)
        return np.argmax(visit_distribution)

    def newGame(self, new_opponent):
        """Do nothing; this player keeps no per-game state."""
        return None

In [22]:
from utils import play_game, test_games
from classes.player import RNGPlayer

In [23]:
# mcts = MCTS(0.8, 1)
# p1 = MCTSPlayer("Custom", mcts)
# p2 = RNGPlayer()

In [24]:
# test_games(p1, p2, 100)

In [25]:
import tensorflow as tf
from tensorflow.keras import layers, models
import numpy as np
from tensorflow import keras

In [26]:
def build_network():
    """Build the (uncompiled) two-headed policy/value network.

    The model takes a ``(6, 7, 3)`` input, applies three 3x3 same-padded ReLU
    convolutions (32, 64, 64 filters), flattens, and applies two ReLU dense
    layers (128, 64 units). It has two outputs:

    * ``"policy"``: 7-way softmax.
    * ``"value"``: single linear unit.
    """
    board_input = keras.Input((6, 7, 3))

    features = board_input
    for filters in (32, 64, 64):
        features = keras.layers.Conv2D(
            filters, (3, 3), activation="relu", padding="same"
        )(features)
    features = keras.layers.Flatten()(features)
    for units in (128, 64):
        features = keras.layers.Dense(units, activation="relu")(features)

    policy_head = keras.layers.Dense(7, activation="softmax", name="policy")(features)
    value_head = keras.layers.Dense(1, name="value")(features)
    return keras.Model(inputs=board_input, outputs=[policy_head, value_head])

# Create the untrained network and store it as the initial checkpoint.
model = build_network()

# Per-head losses and metrics, keyed by the output layer names.
head_losses = {"policy": "categorical_crossentropy", "value": "mean_squared_error"}
head_metrics = {"policy": "accuracy", "value": "mse"}
model.compile(optimizer="adam", loss=head_losses, metrics=head_metrics)

model.summary()
model.save("models/model_init.h5")

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 6, 7, 3)]    0           []                               
                                                                                                  
 conv2d_3 (Conv2D)              (None, 6, 7, 32)     896         ['input_2[0][0]']                
                                                                                                  
 conv2d_4 (Conv2D)              (None, 6, 7, 64)     18496       ['conv2d_3[0][0]']               
                                                                                                  
 conv2d_5 (Conv2D)              (None, 6, 7, 64)     36928       ['conv2d_4[0][0]']               
                                                                                            

In [27]:
# Sanity check: feed a freshly created board through the network and show the
# raw policy output (still a batched tensor at this point).
board = gamerules.Board()
encoded_state = encode_state(board.board)
# Alternative way to add the batch dimension:
# expanded = np.expand_dims(encoded_state, axis=0)
pol, val = model(np.array([encoded_state]))  # wrap in a list to form a batch of one
print(pol)

tf.Tensor(
[[0.15262103 0.1334935  0.12522776 0.15412737 0.1386852  0.14764065
  0.14820446]], shape=(1, 7), dtype=float32)


In [28]:
# Same forward pass as before, but with the batch dimension stripped so the
# policy is a 1-D array and the value a scalar.
policy_batch, value_batch = model(np.array([encoded_state]))
pol = np.array(policy_batch)[0]
val = np.array(value_batch)[0, 0]
print(pol)
print(val)


[0.15262103 0.1334935  0.12522776 0.15412737 0.1386852  0.14764065
 0.14820446]
-0.059420027


In [29]:
# Scratch check: stacking three 2x2 planes along a new last axis gives a
# (2, 2, 3) array.
a, b, c = (np.zeros((2, 2)) for _ in range(3))
x = np.stack((a, b, c), axis=2)
print(x)

[[[0. 0. 0.]
  [0. 0. 0.]]

 [[0. 0. 0.]
  [0. 0. 0.]]]


In [43]:
class ReplayBuffer:
    """Append-only store of training samples.

    Every sample is a 3-tuple ``(x, y_policy, y_value)``; the retrieve
    methods return the samples as three parallel NumPy arrays.
    """

    def __init__(self, states = None):
        """Create a buffer.

        Args:
            states: optional list of existing samples. The list is used as
                is (not copied). When omitted, every buffer gets its own
                fresh list; a mutable ``[]`` default would be shared by all
                instances created without arguments.
        """
        self.states = [] if states is None else states

    def add(self, state):
        """Append one ``(x, y_policy, y_value)`` sample."""
        self.states.append(state)

    @staticmethod
    def _to_arrays(samples):
        """Split a non-empty sample sequence into ``(X, Y_p, Y_v)`` arrays."""
        if len(samples) == 0:
            raise ValueError("cannot retrieve samples from an empty selection")
        X, Y_p, Y_v = zip(*samples)
        return np.array(X), np.array(Y_p), np.array(Y_v)

    def retrieve_all(self):
        """Return every stored sample as ``(X, Y_p, Y_v)``.

        Raises:
            ValueError: if the buffer is empty.
        """
        return self._to_arrays(self.states)

    def retrieve_last(self, last_count):
        """Return the ``last_count`` most recent samples as ``(X, Y_p, Y_v)``.

        When fewer than ``last_count`` samples are stored, all of them are
        returned.

        Raises:
            ValueError: if the selection is empty (empty buffer or
                ``last_count <= 0``).
        """
        total = len(self.states)
        # Clamp at 0 only; clamping to total - 1 would force a one-sample
        # result for last_count <= 0 and yield -1 on an empty buffer.
        start = min(total, max(0, total - last_count))
        return self._to_arrays(self.states[start:])

In [51]:
import copy
from tensorflow.keras.models import load_model
import pickle

In [77]:
# --- Self-play training configuration ---------------------------------------
# ITERATION_COUNT = 100
START_ITERATION = 0
END_ITERATION = 5
ITERATIONS = range(START_ITERATION, END_ITERATION)
GAMES_PER_ITERATION = 10
EPOCH_COUNT = 100
BATCH_SIZE = 250
LOAD_MODEL = True
MODEL_PATH = "models_initial/model_50.h5"
LOAD_BUFFER = True
BUFFER_PATH = "buffer.pkl"
LAST_COUNT = 1500  # number of most recent samples used for each fit

if LOAD_MODEL:
    model = load_model(MODEL_PATH)

if LOAD_BUFFER:
    # SECURITY: pickle.load can execute arbitrary code; only load buffer files
    # that were written by this notebook.
    with open(BUFFER_PATH, 'rb') as f:
        buffer = pickle.load(f)
else:
    # Pass an explicit fresh list so the new buffer never shares storage.
    buffer = ReplayBuffer([])

# The buffer chosen above is kept as is. Re-creating it here would silently
# discard the samples loaded when LOAD_BUFFER is True.
mcts = MCTS(model, 1, 4)

for iteration in ITERATIONS:
    print(f"iteration: {iteration + 1}")

    # --- Self-play: generate GAMES_PER_ITERATION games ----------------------
    for game in range(GAMES_PER_ITERATION):
        print(f"game: {game + 1}")
        board = gamerules.Board()
        boards = []
        states = []
        p_vals = []
        turn = 1
        # Defined up front so it exists even if the loop body never runs.
        gameWon = False

        while len(board.getPossibleActions()) > 0:
            # Sample the move from the search's visit distribution.
            p = mcts.think(board, turn)
            p_vals.append(p)
            action = np.random.choice(7, p=p)

            # Record the position as seen by the player about to move.
            state = board.prepareBoardForPlayer(turn)
            state = encode_state(state)
            states.append(state)
            boards.append(copy.deepcopy(board))

            board.updateBoard(action, turn)

            gameWon = board.checkVictory(action, turn)
            if gameWon:
                break
            turn *= -1

        # `turn` still holds the player who made the last move, i.e. the
        # winner when the game was won; 0 marks a game without a winner.
        res = turn if gameWon else 0
        turn = 1
        for i in range(len(states)):
            state = states[i]
            p = p_vals[i]
            # Value target from the mover's view: +1 if the player to move in
            # this position went on to win, -1 if they lost, 0 otherwise.
            v = turn * res
            turn *= -1

            buffer.add((state, p, v))

    # --- Training: fit on the most recent samples, then checkpoint ----------
    X, Y_p, Y_v = buffer.retrieve_last(LAST_COUNT)
    history = model.fit(
        X, {"policy": Y_p, "value": Y_v},
        epochs=EPOCH_COUNT,
        batch_size=BATCH_SIZE
    )
    model.save(f"models/model_{iteration+1}.h5")

OSError: No file or directory found at models_initial/model50_.h5

In [73]:
from utils import test_games

# Evaluation: play the trained checkpoint against a random player.
# The commented-out lines set up the initial model as an alternative opponent.
# model0 = load_model("models_prob_bad/model_init.h5")
model1 = load_model("models/model_50.h5")

# mcts0 = MCTS(model0, 2, 1, "time")
mcts1 = MCTS(model1, 2, 4, "time")  # C = 2, 4 seconds of search per move

# p1 = MCTSPlayer("Initial model", mcts0)
p1 = RNGPlayer()
p2 = MCTSPlayer("After iteration model", mcts1)

test_games(p1, p2, 10)

game pair 0, result: (1; 1)
game pair 1, result: (-1; 1)
game pair 2, result: (0; 1)
game pair 3, result: (-1; 1)
game pair 4, result: (0; 1)
p1_wins: 1: (1; 0)
p2_wins: 7: (5; 2)
draws: 2


In [74]:
# Number of samples currently held in the replay buffer.
print(len(buffer.states))

15113


In [75]:
import pickle
# Persist the replay buffer so a later session can reload it with pickle.load.
with open("buffer.pkl", "wb") as f:
    pickle.dump(buffer, f)