In [1]:
from gym_tictactoe.env import TicTacToeEnv, agent_by_mark, next_mark
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from myrl.buffers import ReplayBuffer
from myrl.utils import ExperimentWriter
# from myrl.value_functions import 

env = TicTacToeEnv()
obs = env.reset()

In [2]:
env.action_space

Discrete(9)

In [148]:
class RolloutPolicy:
    def __init__(self, env):
        self.env = env
    def act(self, obs, render=False):
        return env.action_space.sample()
    def rollout(self, obs, model, render=False):
        d = False
        rsum = 0
        while not d:
            obs, r, d, _ = model.step(obs, self.act(obs))
            rsum += r
        return rsum

class Model:
    def __init__(self, env):
        self.env = env
    def step(self, obs, action):
        self._set_env(obs)
        return self.env.step(action)
    def available_actions(self, obs):
        self._set_env(obs)
        return self.env.available_actions()
    def not_available_actions(self, obs):
        return self._list_cut(list(range(self.get_num_actions())), self.available_actions(obs))
    def _list_cut(self, l1, l2):
        toret = []
        for a1 in l1:
            if a1 not in l2:
                toret.append(a1)
        return toret
    def _set_env(self, obs):
        self.env.board = list(obs[0])
        self.env.mark  = obs[1] 
        self.done = False
    def get_num_actions(self):
        return self.env.action_space.n

class TreePolicy:
    def __init__(self):
        pass
    def act(self, obs, available_actions):
        import random
        return random.choice(available_actions)
    def get_action_probs(self, obs, available_actions):
        return [1/len(available_actions) for i in range(len(available_actions))]

class Backbone(nn.Module):
    def __init__(self, net_arch, middle_activation=F.relu, last_activation=F.relu):
        super().__init__()
        self.middle_activation = middle_activation
        self.last_activation = last_activation
        self.layers = nn.ModuleList([nn.Linear(a, b) for a, b in zip(net_arch[:-1], net_arch[1:])])
    def forward(self, h):
        h = h.view(h.shape[0], -1)
        for lay in self.layers[:-1]:
            h = self.middle_activation(lay(h))
        h = self.layers[-1](h)
        h = self.last_activation(h)
        return h

class ValueFunction(nn.Module):
    def __init__(self, net_arch, backbone):
        super().__init__()
        self.backbone = backbone
        self.layers = nn.ModuleList([nn.Linear(a, b) for a, b in zip(net_arch[:-1], net_arch[1:])])
    def forward(self, h):
        h = torch.tensor(h, dtype=torch.float)
        h = h.view(h.shape[0], -1)
        h = self.backbone(h)
        for lay in self.layers[:-1]:
            h = F.relu(lay(h))
        h = self.layers[-1](h)
        return h

class NNTreePolicy(nn.Module):
    def __init__(self, net_arch, backbone, temperature=1):
        super().__init__()
        self.backbone = backbone
        self.temperature = temperature
        self.layers = nn.ModuleList([nn.Linear(a, b) for a, b in zip(net_arch[:-1], net_arch[1:])])
    def forward(self, x, not_available_actions=None):
        h = torch.tensor(x, dtype=torch.float)
        h = h.view(h.shape[0], -1)
        h = self.backbone(h)
        for lay in self.layers[:-1]:
            h = F.relu(lay(h))
        h = self.layers[-1](h)/self.temperature
        if not_available_actions is not None and len(not_available_actions)>0:
            not_available_actions = torch.tensor(not_available_actions)
            try:
                h[0, not_available_actions] = float('-inf')
            except:
                print(not_available_actions)
                print(h)
                raise
        h = torch.softmax(h, dim=1)
        return h
    def act(self, obs, not_available_actions):
        obs = self.obs2testorobs(obs)
        h = self.forward(obs, not_available_actions=not_available_actions)
        action = np.random.choice(range(len(h[0])), p=h.detach().squeeze(0).numpy())
        return action      
    def get_action_probs(self, obs, available_actions):
        obs = self.obs2testorobs(obs)
        h = self.forward(obs)
        return h.tolist()[0]
    def obs2testorobs(self, obs):
        l2 = [1] if obs[1]=='O' else [-1]
        obs = torch.tensor([list(obs[0])+l2])
        obs[obs==2] = -1
        return obs

rollout_policy = RolloutPolicy(env)
model = Model(TicTacToeEnv())
backbone = Backbone([10, 16])
value_function = ValueFunction([16, 4, 1], backbone=backbone)
tree_policy = NNTreePolicy([16, 9, 9], backbone=backbone)

In [149]:
class Node:
    def __init__(self, obs, reward, change_child_rew_sign=True, reward_sign=1, done=False, parent=None):
        self.n = 0
        self.cumulative_reward = 0#reward
        self.parent = parent
        self.action2child = {}
        self.nchildren = 0
        self.taken_actions = []
        self.obs = obs
        self.done = done
        self.reward = reward
        self.reward_sign = reward_sign
        self.change_child_rew_sign = change_child_rew_sign
        self.nzeros = 0
        self.nones = 0
        self.nmones = 0
    def get_q(self):
        return self.cumulative_reward/(self.n)

    def backpropagate(self, r, gamma=1):
        self.n += 1
        self.cumulative_reward += r*gamma*self.reward_sign
        self.nzeros += 1 if r==0 else 0
        self.nones += 1 if r*self.reward_sign==1  else 0
        self.nmones+= 1 if r*self.reward_sign==-1 else 0
        if not( -1 <= self.get_q() <= 1 ) and 0:
            print(self.__dict__)
        if self.parent is None:
            return 
        self.parent.backpropagate(r, gamma=gamma)
    def print_parents(self):
        if self.parent is None:
            return
        self.parent.print_parents()

    def create_child(self, ChildType, policy, model):
        action = policy.act(self.obs, self.taken_actions)
        self.taken_actions.append(action)
        obs, reward, done, info = model.step(self.obs, action)
        reward_sign= -self.reward_sign if self.change_child_rew_sign else self.reward_sign
        child = ChildType(obs, reward, done=done, reward_sign=reward_sign, parent=self, change_child_rew_sign=self.change_child_rew_sign)
        self.action2child[action] = child
        self.nchildren += 1
        return child, action

    def _list_cut(self, l1, l2):
        toret = []
        for a1 in l1:
            if a1 not in l2:
                toret.append(a1)
        return toret

    def rollout(self, rollout_policy, model, render=False):
        if self.done:
            return self.reward
        return rollout_policy.rollout(self.obs, model, render=render)

In [150]:
def UCB(root_node, policy, model, alpha=1):
    scores = [0 for i in range(model.get_num_actions())]
    all_actions = list(range(model.get_num_actions()))
    probs = policy.get_action_probs(root_node.obs, all_actions)
    minscore, a_minscore = float('inf'), -1
    for a in root_node.action2child.keys():
        child = root_node.action2child[a]
        u = -probs[a]/(1+child.n)
        q = child.get_q()
        score = q + alpha*u
        scores.append(score)
        if score < minscore:
            minscore = score
            a_minscore = a
    return scores, (a_minscore, minscore)

def MCTS(root_node, max_depth, n_times, policy, model, alpha_UCB=1):
    current_node  = root_node
    current_depth = 0
    n_times_done  = 0

    while n_times_done != n_times:
        if current_depth == max_depth or current_node.done:
            reward = current_node.rollout(rollout_policy, model)
            current_node.backpropagate(reward, gamma=0.99)
            current_node = root_node
            n_times_done += 1
            current_depth = 0
            model.env.done = False
        elif current_node.nchildren < len(model.available_actions(current_node.obs)):
            child, action = current_node.create_child(Node, policy, model) 
            current_node = child
            current_depth += 1
        else:
            scores, (a_minscore, minscore) = UCB(current_node, policy, model, alpha_UCB)
            current_node = current_node.action2child[a_minscore]
            current_depth += 1

    visits = []
    for a in range(model.get_num_actions()):
        if a in root_node.action2child:
            visits.append(root_node.action2child[a].get_q())
        else:
            visits.append(float('inf'))
    return visits

In [151]:
def eval_winrate(totest, bench, env, n_games=100):
    wins = 0
    draws = 0
    rsum = 0
    for igame in range(n_games):
        done, reward = False, 0
        obs = env.reset()
        curr_policy = totest if igame<=n_games//2 else bench
        rew2count = 1 if igame<=n_games//2 else -1
        while not done:
            action = curr_policy.act(obs, model.not_available_actions(obs))
            obs, r, done, _ = env.step(action)
            rsum += r
            curr_policy = totest if curr_policy==bench else bench
            wins += 1 if r==rew2count else 0
        draws += 1 if r==0 else 0
    winrate = wins/n_games
    drawrate = draws/n_games
    return winrate, drawrate, (1-winrate-drawrate)

In [152]:
import numpy as np
test_env = TicTacToeEnv()
obs = test_env.reset()
done = False
test_env.render()
# rbuff = ReplayBuffer(200)
random_policy = TreePolicy()

while not done:
    rew_sign = 1 if test_env.mark==test_env.start_mark else -1
    root = Node(obs, 0, reward_sign=rew_sign)
    dic = MCTS(root, 10, 100, tree_policy, model, 100)
    dic = np.array(dic)
    tdic = torch.tensor([-dic])
    print("move probs= ", torch.softmax(tdic, dim=1).view(3, 3))
    move = np.argmin(dic)
    print(move, test_env.mark)
    print(dic.reshape(3, 3))
    obs, r, done, _ = test_env.step(move)
    test_env.render()
    print(" ")
    print(" ")

   | | 
  -----
   | | 
  -----
   | | 

move probs=  tensor([[0.1352, 0.0666, 0.1341],
        [0.1137, 0.1088, 0.1242],
        [0.1242, 0.0795, 0.1137]], dtype=torch.float64)
0 O
[[-0.28285714  0.42428571 -0.275     ]
 [-0.11       -0.066      -0.198     ]
 [-0.198       0.2475     -0.11      ]]
  O| | 
  -----
   | | 
  -----
   | | 

 
 
move probs=  tensor([[0.1996, 0.0742, 0.1354],
        [0.0000, 0.1177, 0.1904],
        [0.1120, 0.0966, 0.0742]], dtype=torch.float64)
0 X
[[-0.33        0.66        0.05823529]
 [        inf  0.198      -0.28285714]
 [ 0.2475      0.396       0.66      ]]
  X| | 
  -----
   | | 
  -----
   | | 

 
 
move probs=  tensor([[0.0000, 0.1184, 0.1228],
        [0.1437, 0.1115, 0.1417],
        [0.1070, 0.1254, 0.1295]], dtype=torch.float64)
3 O
[[       inf 0.27       0.23294118]
 [0.07615385 0.33       0.09      ]
 [0.37125    0.21214286 0.18      ]]
  X| | 
  -----
  O| | 
  -----
   | | 

 
 
move probs=  tensor([[0.1692, 0.2131, 0.1501],
        [

In [None]:
best_policy = copy.deepcopy(tree_policy)
wll = ExperimentWriter('tb/alpha_tictacte_zero_valuef_')


In [58]:
root._list_cut([1, 2, 3, 5], [2, 3, 4])
model.available_actions(obs), root.taken_actions, obs

([1, 8], [6, 4], ((1, 0, 2, 2, 2, 1, 2, 1, 0), 'O'))

In [45]:
test_env = TicTacToeEnv()
obs = test_env.reset()
done = False
test_env.render()
# rbuff = ReplayBuffer(nitems=3, max_len=150)
bsize = 128
wll.new()
writer = wll.writer
# opt = torch.optim.Adam(list(tree_policy.parameters())+list(value_function.parameters()), lr=5e-3)
import copy
best_tree_policy = copy.deepcopy(tree_policy)
best_opt = copy.deepcopy(opt)
best_vfunc = copy.deepcopy(value_function)

for game in range(10000):
    game_step = 0
    done = False
    tmp_buff = []
    while not done:
        game_step += 1
        rew_sign = 1 if test_env.mark==test_env.start_mark else -1
        root = Node(obs, 0, reward_sign=rew_sign)
        dic = MCTS(root, 10, 100, tree_policy, model, 100)
        dic = np.array(dic)

        tdic = torch.tensor([[-dic]])
        monte_probs = torch.softmax(tdic, dim=-1).detach()
        tensor_obs = tree_policy.obs2testorobs(obs).unsqueeze(0)
        # rbuff.add(tensor_obs, monte_probs)
        tmp_buff.append([tensor_obs, monte_probs, rew_sign])
        
        move = np.argmin(dic)
        obs, r, done, _ = test_env.step(move)

    for elements in tmp_buff:
        # print(elements[0].shape, elements[1].shape, torch.tensor([[[r]]]).shape)
        rbuff.add(elements[0], elements[1], elements[2]*torch.tensor([[[r]]]).float())

    if len(rbuff) > bsize:
        for opt_step in range(4):
            tensor_obs, monte_probs, game_finish = rbuff.get(bsize)
            policy_probs = tree_policy(tensor_obs)
            loss_policy = -(monte_probs*torch.log(policy_probs+1e-8)).mean()
            loss_value  = ((value_function(tensor_obs)-game_finish)**2).mean()
            loss = loss_policy + loss_value
            opt.zero_grad()
            loss.backward()
            opt.step()
        writer.add_scalar('loss/loss', loss.item(), game)
        writer.add_scalar('loss/policy', loss_policy.item(), game)
        writer.add_scalar('loss/vfunc', loss_value.item(), game)
        # else:
        #     print(len(rbuff))
    # winrate, drawrate, loserate = eval_winrate(tree_policy, best_tree_policy, test_env, n_games=100)
    # if winrate > 0.55:
    #     best_tree_policy = copy.deepcopy(tree_policy)
    #     best_opt = copy.deepcopy(opt)
    #     best_vfunc = copy.deepcopy(value_function)
    #     print("upgrade", winrate, drawrate, loserate)
    # else:
    #     tree_policy = copy.deepcopy(best_tree_policy)
    #     opt = copy.deepcopy(best_opt)
    #     value_function = copy.deepcopy(best_vfunc)
        
    winrate, drawrate, _ = eval_winrate(tree_policy, rollout_policy, test_env, n_games=100)
    writer.add_scalar('winrate/winrate', winrate, game)
    writer.add_scalar('winrate/drawrate', drawrate, game)
    print(game, "winrate=", winrate, drawrate)

    obs = test_env.reset()


   | | 
  -----
   | | 
  -----
   | | 

0 winrate= 0.45 0.02
1 winrate= 0.58 0.01
2 winrate= 0.51 0.0
3 winrate= 0.51 0.01
4 winrate= 0.47 0.05
5 winrate= 0.55 0.01
6 winrate= 0.48 0.0
7 winrate= 0.55 0.02
8 winrate= 0.52 0.0
9 winrate= 0.46 0.01
10 winrate= 0.5 0.0
11 winrate= 0.39 0.01
12 winrate= 0.51 0.0
13 winrate= 0.53 0.0
14 winrate= 0.55 0.01
15 winrate= 0.48 0.0
16 winrate= 0.48 0.0
17 winrate= 0.48 0.02
18 winrate= 0.58 0.02
19 winrate= 0.46 0.0
20 winrate= 0.5 0.02
21 winrate= 0.56 0.01
22 winrate= 0.54 0.02
23 winrate= 0.57 0.01
24 winrate= 0.49 0.03
25 winrate= 0.55 0.01
26 winrate= 0.52 0.02
27 winrate= 0.54 0.02
28 winrate= 0.56 0.0
29 winrate= 0.53 0.01
30 winrate= 0.54 0.0
31 winrate= 0.5 0.0
32 winrate= 0.4 0.02
33 winrate= 0.5 0.01
34 winrate= 0.55 0.02
35 winrate= 0.5 0.0
36 winrate= 0.44 0.02
37 winrate= 0.53 0.0
38 winrate= 0.49 0.04
39 winrate= 0.51 0.01
40 winrate= 0.49 0.04
41 winrate= 0.47 0.01
42 winrate= 0.42 0.01
43 winrate= 0.41 0.02
44 winrate= 0.48 0.02

KeyboardInterrupt: 

In [37]:
rbuff.deqs[1]

deque([tensor([[0.0000, 0.1192, 0.1454, 0.1302, 0.0000, 0.1553, 0.1772, 0.1454, 0.1274]],
              dtype=torch.float64),
       tensor([[0.0000, 0.1542, 0.1768, 0.0627, 0.1909, 0.1364, 0.1687, 0.0000, 0.1104]],
              dtype=torch.float64),
       tensor([[0.1316, 0.1316, 0.2254, 0.0000, 0.3300, 0.0000, 0.0000, 0.1813, 0.0000]],
              dtype=torch.float64),
       tensor([[0.2119, 0.0000, 0.2006, 0.0000, 0.0000, 0.2557, 0.0000, 0.1292, 0.2026]],
              dtype=torch.float64),
       tensor([[0.0000, 0.0000, 0.0000, 0.2197, 0.0000, 0.2845, 0.0000, 0.1999, 0.2960]],
              dtype=torch.float64),
       tensor([[0.2037, 0.0000, 0.0000, 0.0000, 0.2988, 0.2249, 0.0000, 0.2726, 0.0000]],
              dtype=torch.float64),
       tensor([[0.0000, 0.2263, 0.1032, 0.0000, 0.0000, 0.0000, 0.1816, 0.4888, 0.0000]],
              dtype=torch.float64),
       tensor([[0.1116, 0.0000, 0.0680, 0.0000, 0.2296, 0.1618, 0.1430, 0.1430, 0.1430]],
              dtype=torch.fl

In [None]:
print(inspect.getsource(rbuff.add))

In [26]:
tree_policy

NNTreePolicy(
  (layers): ModuleList(
    (0): Linear(in_features=9, out_features=9, bias=True)
    (1): Linear(in_features=9, out_features=9, bias=True)
  )
)

In [None]:
0 0 1
0 0 0 
0 0 0