In [3]:
from gym_tictactoe.env import TicTacToeEnv, agent_by_mark, next_mark
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from myrl.buffers import ReplayBuffer
from myrl.utils import ExperimentWriter
import copy
# from myrl.value_functions import 

env = TicTacToeEnv()
obs = env.reset()

import random
import math
random.random()

0.4683776676063258

In [59]:
class RolloutPolicy:
    def __init__(self, env):
        self.env = env
    def act(self, obs, render=False):
        return env.action_space.sample()
    def rollout(self, obs, model, render=False):
        d = False
        rsum = 0
        while not d:
            obs, r, d, _ = model.step(obs, self.act(obs))
            rsum += r
        return rsum

class Model:
    def __init__(self, env):
        self.env = env
    def step(self, obs, action):
        self._set_env(obs)
        return self.env.step(action)
    def available_actions(self, obs):
        self._set_env(obs)
        return self.env.available_actions()
    def not_available_actions(self, obs):
        return self._list_cut(list(range(self.get_num_actions())), self.available_actions(obs))
    def _list_cut(self, l1, l2):
        toret = []
        for a1 in l1:
            if a1 not in l2:
                toret.append(a1)
        return toret
    def _set_env(self, obs):

        self.env.board = list(obs[0])
        self.env.mark  = obs[1] 
        self.done = False
    def get_num_actions(self):
        return self.env.action_space.n

class TreePolicy:
    def __init__(self):
        pass
    def act(self, obs, available_actions):
        import random
        return random.choice(available_actions)
    def get_action_probs(self, obs, available_actions):
        return [1/len(available_actions) for i in range(len(available_actions))]

class Backbone(nn.Module):
    def __init__(self, net_arch, middle_activation=F.relu, last_activation=F.relu):
        super().__init__()
        self.middle_activation = middle_activation
        self.last_activation = last_activation
        self.layers = nn.ModuleList([nn.Linear(a, b) for a, b in zip(net_arch[:-1], net_arch[1:])])
    def forward(self, h):
        h = h.view(h.shape[0], -1)
        for lay in self.layers[:-1]:
            h = self.middle_activation(lay(h))
        h = self.layers[-1](h)
        h = self.last_activation(h)
        return h

class ValueFunction(nn.Module):
    def __init__(self, net_arch, backbone):
        super().__init__()
        self.backbone = backbone
        self.layers = nn.ModuleList([nn.Linear(a, b) for a, b in zip(net_arch[:-1], net_arch[1:])])
    def forward(self, h):
        h = torch.tensor(h, dtype=torch.float)
        h = h.view(h.shape[0], -1)
        h = self.backbone(h)
        for lay in self.layers[:-1]:
            h = F.relu(lay(h))
        h = self.layers[-1](h)
        return h

class NNTreePolicy(nn.Module):
    def __init__(self, net_arch, backbone, temperature=1):
        super().__init__()
        self.backbone = backbone
        self.temperature = temperature
        self.layers = nn.ModuleList([nn.Linear(a, b) for a, b in zip(net_arch[:-1], net_arch[1:])])
    def forward(self, x, not_available_actions=None):
        h = torch.tensor(x, dtype=torch.float)
        h = h.view(h.shape[0], -1)
        h = self.backbone(h)
        for lay in self.layers[:-1]:
            h = F.tanh(lay(h))
        h = self.layers[-1](h)/self.temperature
        if not_available_actions is not None and len(not_available_actions)>0:
            not_available_actions = torch.tensor(not_available_actions)
            h[0, not_available_actions] = float('-inf')
        h = torch.softmax(h, dim=1)
        return h
    def act(self, obs, not_available_actions):
        obs = self.obs2testorobs(obs)
        h = self.forward(obs, not_available_actions=not_available_actions)
        action = np.random.choice(range(len(h[0])), p=h.detach().squeeze(0).numpy())
        return action      
    def get_action_probs(self, obs, available_actions=None):
        obs = self.obs2testorobs(obs)
        h = self.forward(obs)
        return h.tolist()[0]
    def obs2testorobs(self, obs):
        l2 = [1] if obs[1]=='O' else [-1]
        obs = torch.tensor([list(obs[0])+l2])
        obs[obs==2] = -1
        return obs

class CNNTreePolicy(NNTreePolicy):
    def __init__(self, net_arch, backbone, temperature=1):
        pass

rollout_policy = RolloutPolicy(env)
model = Model(TicTacToeEnv())
backbone = Backbone([10, 64])
value_function = ValueFunction([64, 32, 1], backbone=backbone)
tree_policy = NNTreePolicy([64, 64, 9], backbone=backbone)

In [7]:
def eval_winrate(totest, bench, env, n_games=100):
    wins = 0
    draws = 0
    rsum = 0
    for igame in range(n_games):
        done, reward = False, 0
        obs = env.reset()
        curr_policy = totest if igame<=n_games//2 else bench
        rew2count = 1 if igame<=n_games//2 else -1
        while not done:
            action = curr_policy.act(obs, model.not_available_actions(obs))
            obs, r, done, _ = env.step(action)
            rsum += r
            curr_policy = totest if curr_policy==bench else bench
            wins += 1 if r==rew2count else 0
        draws += 1 if r==0 else 0
    winrate = wins/n_games
    drawrate = draws/n_games
    return winrate, drawrate, (1-winrate-drawrate)

In [81]:
eval_winrate(tree_policy, rollout_policy, TicTacToeEnv(), n_games=1000)

(0.77, 0.032, 0.19799999999999998)

In [87]:
tree_policy.temperature = 0.01

In [25]:
def flip_sign(tensor_obs, idx):
    signs = [1, -1]
    return tensor_obs*signs[idx]
def flip_board(tensor_obs, idx):
    ops = [[], [0], [1], [0, 1]]
    return torch.flip(tensor_obs, ops[idx])
def rotate_board(tensor_obs, idx):
    none = lambda x:x
    rot  = lambda x:torch.rot90(x, 1, [0, 1])
    ops = [none, rot]
    return ops[idx](tensor_obs)
def symetric_add2rbuff(rbuff, list_with_boards, rew_sign):
    added_boards = set()
    for tensor_obs, monte_probs, rew_sign in list_with_boards:
        for sign in range(2):
            for flip in range(4):
                for rotate in range(2):
                    side_len = 3#int(math.sqrt(tensor_obs.shape[0]*tensor_obs.shape[1]))
                    # print(side_len)
                    board_obs = tensor_obs.view(1, -1)
                    board_obs, player = board_obs[:, :-1], board_obs[:, -1].unsqueeze(-1)
                    board_obs = board_obs.reshape(side_len, side_len)
                    sign_boardt = flip_sign(board_obs, sign)
                    player = flip_sign(player, sign)
                    flip_boardt = flip_board(sign_boardt, flip)
                    board = rotate_board(flip_boardt, rotate)
                    board = board.reshape(1, -1)
                    board = torch.cat((board, player), dim=-1)
                    board = board.view(1, 1, -1)


                    board_monte = monte_probs.reshape(side_len, side_len)
                    # sign_montet = flip_sign(board_monte, sign)
                    flip_montet = flip_board(board_monte, flip)
                    monte = rotate_board(flip_montet, rotate)
                    monte = monte.reshape(1, 1, -1)

                    if board not in added_boards:
                        added_boards.add(board)
                        raw_reward = torch.tensor([[[r]]]).float()
                        flip_rew_sign = 1 if sign==0 else -1
                        reward_now = raw_reward*flip_rew_sign*rew_sign
                        rbuff.add(board, monte, reward_now)

In [116]:
def UCB(root_node, policy, model, cpucb=150):
    available_actions = model.available_actions(root_node.obs)
    probs = policy.get_action_probs(root_node.obs)
    minscore, a_minscore = float('inf'), -1
    for action in available_actions:
        if action in root_node.action2child.keys():
            child = root_node.action2child[action]
            q =  child.get_q()
            u = -probs[action]/(1+child.n)
        else:
            q = 0
            u = float('-inf')
        score = q + cpucb*u
        if score < minscore:
            minscore = score
            a_minscore = action
    return a_minscore

def MCTS2(root_node, max_depth, n_times, policy, model, cpucb=150):
    current_node  = root_node
    current_depth = 0
    n_times_done  = 0

    while n_times_done != n_times:
        if current_depth == max_depth or current_node.done:
            reward = current_node.rollout(rollout_policy, model)
            if not current_node.done:
                reward2 = value_function(tree_policy.obs2testorobs(current_node.obs)).item()
                print(reward, reward2, "reward "*3)
            current_node.backpropagate(reward, gamma=0.99)
            current_node = root_node
            n_times_done += 1
            current_depth = 0
            model.env.done = False
        else:
            action = UCB(current_node, policy, model, cpucb)
            if action in current_node.action2child:
                current_node = current_node.action2child[action]
            else:
                current_node = current_node.create_child(Node2, model, action)
            current_depth += 1

    visits = []
    for a in range(model.get_num_actions()):
        if a in root_node.action2child:
            visits.append(root_node.action2child[a].get_q())
        else:
            visits.append(float('inf'))
    return visits

In [109]:
class Node2:
    def __init__(self, obs, reward, change_child_rew_sign=True, reward_sign=1, done=False, parent=None):
        self.n = 0
        self.cumulative_reward = 0
        self.parent = parent
        self.action2child = {}
        self.obs = obs
        self.done = done
        self.reward = reward
        self.reward_sign = reward_sign
        self.change_child_rew_sign = change_child_rew_sign
        
    def get_q(self):
        return self.cumulative_reward/(self.n)

    def backpropagate(self, r, gamma=1):
        self.n += 1
        self.cumulative_reward += r*gamma*self.reward_sign
        if self.parent is not None:
            self.parent.backpropagate(r, gamma=gamma)

    def create_child(self, ChildType, model, action):
        obs, reward, done, info = model.step(self.obs, action)
        reward_sign= -self.reward_sign if self.change_child_rew_sign else self.reward_sign
        child = ChildType(obs, reward, done=done, reward_sign=reward_sign, parent=self, change_child_rew_sign=self.change_child_rew_sign)
        self.action2child[action] = child
        return child  

    def rollout(self, rollout_policy, model, render=False):
        if self.done:
            return self.reward
        return rollout_policy.rollout(self.obs, model, render=render)

In [118]:
import numpy as np
test_env = TicTacToeEnv()
obs = test_env.reset()
done = False
test_env.render()

random_policy = TreePolicy()

while not done:
    rew_sign = 1 if test_env.mark==test_env.start_mark else -1
    root = Node2(obs, 0, reward_sign=rew_sign)
    # print(tree_policy(tree_policy.obs2testorobs(obs)))
    # print(tree_policy(tree_policy.obs2testorobs(obs), model.not_available_actions(obs)))
    # print(tree_policy.act(obs, model.not_available_actions(obs)))
    besta = tree_policy.act(obs, model.not_available_actions(obs))
    dic = MCTS2(root, 1, 100, tree_policy, model, 100)
    dic = np.array(dic)
    tdic = torch.tensor([-dic])
    print("move probs= ", torch.softmax(tdic*10, dim=1).view(3, 3))
    move = np.argmin(dic)
    print(move, besta, test_env.mark)
    # print(dic.reshape(3, 3))
    obs, r, done, _ = test_env.step(besta)
    test_env.render()
    print(" ")
    print(" ")

69948387146 reward reward reward 
-1 -0.04248036444187164 reward reward reward 
1 -0.04248036444187164 reward reward reward 
-1 0.1199137270450592 reward reward reward 
0 0.1199137270450592 reward reward reward 
-1 0.1199137270450592 reward reward reward 
1 0.1199137270450592 reward reward reward 
1 0.1199137270450592 reward reward reward 
1 0.1199137270450592 reward reward reward 
1 0.1199137270450592 reward reward reward 
1 0.1199137270450592 reward reward reward 
1 0.1199137270450592 reward reward reward 
-1 0.1199137270450592 reward reward reward 
1 0.1199137270450592 reward reward reward 
1 0.1199137270450592 reward reward reward 
move probs=  tensor([[1.3170e-07, 9.9725e-01, 0.0000e+00],
        [1.2063e-04, 0.0000e+00, 0.0000e+00],
        [1.3170e-07, 2.6248e-03, 1.3170e-07]], dtype=torch.float64)
1 3 X
   | |X
  -----
  X|O|O
  -----
   | | 

 
 
0 -0.20908930897712708 reward reward reward 
-1 -0.20908930897712708 reward reward reward 
1 -0.20908930897712708 reward reward rewa

In [82]:
best_policy = copy.deepcopy(tree_policy)
wll = ExperimentWriter('tb/alpha_tictacte_zero_symetric_probs_withARENA____')

In [83]:
tree_policy.temperature = 1

In [86]:
test_env = TicTacToeEnv()
obs = test_env.reset()
done = False
test_env.render()
# rbuff = ReplayBuffer(nitems=3, max_len=50*9*8)
bsize = 128
# wll.new()
writer = wll.writer
opt = torch.optim.Adam(list(tree_policy.parameters())+list(value_function.parameters()), lr=1e-3)
import copy
best_tree_policy = copy.deepcopy(tree_policy)
best_opt = copy.deepcopy(opt)
best_vfunc = copy.deepcopy(value_function)
loss = 0

for game in range(10000):
    game_step = 0
    done = False
    game_buff = [] 
    while not done:
        game_step += 1
        rew_sign = 1 if test_env.mark==test_env.start_mark else -1
        root = Node2(obs, 0, reward_sign=rew_sign)
        dic = MCTS2(root, 10, 100, tree_policy, model, 100)
        dic = np.array(dic)

        tdic = torch.tensor([[-dic]])
        monte_probs = torch.softmax(tdic*10, dim=-1).detach()
        tensor_obs = tree_policy.obs2testorobs(obs).unsqueeze(0)
        game_buff.append([tensor_obs, monte_probs, rew_sign])
        
        move = np.argmin(dic)
        obs, r, done, _ = test_env.step(move)

    symetric_add2rbuff(rbuff, game_buff, rew_sign)

    if len(rbuff) > bsize:
        for opt_step in range(4):
            tensor_obs, monte_probs, game_finish = rbuff.get(bsize)
            policy_probs = tree_policy(tensor_obs)
            loss_policy = -(monte_probs*torch.log(policy_probs+1e-8)).mean()*2
            loss_value  = ((value_function(tensor_obs)-game_finish)**2).mean()
            loss = loss_policy + loss_value
            opt.zero_grad()
            loss.backward()
            opt.step()
        print("loss=", loss.item(), loss_policy.item(), loss_value.item())
        writer.add_scalar('loss/loss', loss.item(), game)
        writer.add_scalar('loss/policy', loss_policy.item(), game)
        writer.add_scalar('loss/vfunc', loss_value.item(), game)
    else:
        print(len(rbuff))
    if game % 7 == 0:
        tree_policy.temperature, best_tree_policy.temperature = 1, 1
        winrate, drawrate, loserate = eval_winrate(tree_policy, best_tree_policy, test_env, n_games=100)
        tree_policy.temperature, best_tree_policy.temperature = 0.1, 0.1
        print("ARENA!!! ", winrate, drawrate, loserate)
        if winrate > loserate:
            best_tree_policy = copy.deepcopy(tree_policy)
            best_opt = copy.deepcopy(opt)
            best_vfunc = copy.deepcopy(value_function)
            print("upgrade", winrate, drawrate, loserate)
            winrate, drawrate, _ = eval_winrate(tree_policy, rollout_policy, test_env, n_games=1000)
            print("winrate against random ", winrate, drawrate)
        else:
            tree_policy = copy.deepcopy(best_tree_policy)
            opt = copy.deepcopy(best_opt)
            value_function = copy.deepcopy(best_vfunc)

        
    winrate2, drawrate2, _ = eval_winrate(tree_policy, rollout_policy, test_env, n_games=100)
    writer.add_scalar('winrate/winrate', winrate2, game)
    writer.add_scalar('winrate/drawrate', drawrate2, game)
    print(game, "winrate=", winrate2, drawrate2)

    obs = test_env.reset()


65127023 1.434959888458252
509 winrate= 0.77 0.06
loss= 3.690540838937007 2.2853763587276563 1.4051644802093506
510 winrate= 0.81 0.01
loss= 3.478757577403076 2.0890408329801633 1.3897167444229126
ARENA!!!  0.32 0.25 0.42999999999999994
511 winrate= 0.77 0.06
loss= 3.6741971089923546 2.247821362265365 1.4263757467269897
512 winrate= 0.76 0.05
loss= 3.5669531080458636 2.1460446569655414 1.4209084510803223
513 winrate= 0.75 0.06
loss= 3.7000273124207257 2.267028178357005 1.4329991340637207
514 winrate= 0.82 0.01
loss= 3.5224562136011057 2.1017477957086497 1.420708417892456
515 winrate= 0.81 0.02
loss= 3.4167942751836637 2.0126036156606535 1.4041906595230103
516 winrate= 0.77 0.06
loss= 3.809126016724957 2.370811102021588 1.4383149147033691
517 winrate= 0.78 0.05
loss= 3.562494380377054 2.112750990293741 1.449743390083313
ARENA!!!  0.34 0.26 0.3999999999999999
518 winrate= 0.74 0.06
loss= 3.5895875863307234 2.1835323743098494 1.406055212020874
519 winrate= 0.79 0.04
loss= 3.50173450238928

KeyboardInterrupt: 

In [129]:
! mkdir run65
# torch.save(tree_policy.state_dict(), 'run65/tree.ph')
# torch.save(value_function.state_dict(), 'run65/value.ph')
# torch.save(opt.state_dict(), 'run65/opt.ph')

In [11]:
# tree_policy.load_state_dict(torch.load('run65/tree.ph'))
# value_function.load_state_dict(torch.load('run65/value.ph'))
# opt.load_state_dict(torch.load('run65/opt.ph'))

NameError: name 'opt' is not defined

In [107]:
test_env = TicTacToeEnv()
obs = test_env.reset()
done = False
test_env.render()

random_policy = TreePolicy()
human = 1

while not done:
    rew_sign = 1 if test_env.mark==test_env.start_mark else -1
    root = Node2(obs, 0, reward_sign=rew_sign)
    besta = tree_policy.act(obs, model.not_available_actions(obs))
    if human == rew_sign:
        besta = int(input("your play? "))
        print("your action was", besta)
    obs, r, done, _ = test_env.step(besta)
    test_env.render()
    print(" ")
    print(" ")

   | | 
  -----
   | | 
  -----
   | | 

your action was 5
   | | 
  -----
   | |O
  -----
   | | 

 
 
   | | 
  -----
   | |O
  -----
  X| | 

 
 


ValueError: invalid literal for int() with base 10: ''