In [None]:
import numpy as np
from collections import defaultdict
import random

# =========================
# 1. Game setup (same as the original code)
# =========================
card_names = ['J', 'Q', 'K']
# A card is identified by its index into card_names: J=0, Q=1, K=2
cards = list(range(len(card_names)))
action_names = ['Check/Fold', 'Raise/Call']
# An action is an index into action_names: 0 = Check/Fold, 1 = Bet/Raise/Call
actions = list(range(len(action_names)))

class Node:
    """Statistics record for one entry of a search tree.

    The class only holds data; it is the code using it that increments the
    counters and links children.
    """

    def __init__(self):
        # N: visit counter, Q: running value estimate -- both start at zero
        self.N, self.Q = 0, 0
        # Maps an action to the Node reached by taking that action
        self.children = dict()

# One statistics table per player (trees1 for player 1, trees2 for player 2).
# Keys are (own card, tuple of actions so far); a missing key yields a fresh
# Node on first access.
trees1, trees2 = defaultdict(Node), defaultdict(Node)

def compute_reward(my_card, opp_card, history):
    """Return the payoff of a finished hand from the first player's seat.

    Args:
        my_card: card held by the player who acted first.
        opp_card: card held by the other player.
        history: sequence of actions (0 = check/fold, 1 = bet/raise/call).

    Returns:
        +1/-1 for a check-check showdown, +1 when the second player folds to
        a bet, -1 when the first player folds to a bet after checking, and
        +2/-2 for a showdown after a called bet. Any other history
        (unfinished or unrecognised) yields 0.
    """
    def showdown(stake):
        # The higher card wins the stake; evaluated only when a showdown occurs
        return stake if my_card > opp_card else -stake

    n_moves = len(history)

    if n_moves == 2:
        first, second = history
        if (first, second) == (0, 0):    # check -> check
            return showdown(1)
        if (first, second) == (1, 0):    # bet -> fold
            return 1
        if (first, second) == (1, 1):    # bet -> call
            return showdown(2)
        return 0

    if n_moves == 3:                     # check -> bet -> ?
        first, second, third = history
        if first != 0 or second != 1:
            return 0
        if third == 0:                   # first player folds
            return -1
        return showdown(2)               # first player calls

    return 0

def get_reward_for_player(player_id, p1_card, p2_card, history):
    """Return the payoff of `history` as seen by `player_id`.

    The payoff is computed once from player 1's seat via compute_reward();
    player 1 receives it unchanged and any other player_id receives its
    negation.
    """
    reward_from_p1_seat = compute_reward(p1_card, p2_card, history)
    if player_id == 1:
        return reward_from_p1_seat
    return -reward_from_p1_seat

def is_terminal(history):
    """Return True when `history` is a completed betting sequence.

    The completed sequences are check-check, bet-fold, bet-call,
    check-bet-fold and check-bet-call (0 = check/fold, 1 = bet/raise/call).

    Args:
        history: any sequence of actions (list, tuple, ...). It is normalised
            to a tuple before comparison, so the tuple histories used as
            state keys elsewhere in this file are recognised as well as
            lists.

    Returns:
        bool: True if the hand is over, False otherwise.
    """
    terminal_sequences = (
        (0, 0),
        (1, 0),
        (1, 1),
        (0, 1, 0),
        (0, 1, 1),
    )
    return tuple(history) in terminal_sequences

# =========================
# Simulation of a single hand (with optional perturbation noise)
# =========================
def _backup_visits(visits, reward):
    """Fold `reward` into every (node, action, child) triple one player visited."""
    for node, _action, child in visits:
        child.N += 1
        # Incremental mean: Q_new = Q_old + (reward - Q_old) / N
        child.Q += (reward - child.Q) / child.N
        node.N += 1


def simulate_both_mcts(noise_sigma=0.0):
    """Play one hand in which each player selects actions from its own tree.

    Player 1 reads and updates `trees1`, player 2 reads and updates `trees2`.
    At each decision the acting player picks uniformly at random while the
    node or any of its actions is still unvisited; otherwise it picks the
    action maximising ``Q + sqrt(Cp * ln(N + 1) / (n + 1))``. After the hand
    ends, every decision made by each player is backed up with that player's
    own payoff.

    Args:
        noise_sigma: standard deviation of zero-mean Gaussian noise added to
            each UCB score. Values <= 0 disable the noise.

    Returns:
        The payoff of the hand from player 1's point of view.
    """
    # np.random.choice returns NumPy scalars; int() keeps them out of the
    # state keys so printed histories read (0, 1) instead of
    # (np.int64(0), np.int64(1)). The number and order of random draws is
    # unchanged.
    p1_card = int(np.random.choice(cards))
    p2_card = int(np.random.choice([c for c in cards if c != p1_card]))

    history = []
    visits_p1 = []
    visits_p2 = []
    Cp = 2.0  # exploration constant (applied inside the square root)

    while not is_terminal(history):
        # Players alternate; player 1 acts on even-length histories
        current_player = 1 if len(history) % 2 == 0 else 2

        if current_player == 1:
            state = (p1_card, tuple(history))
            tree = trees1
            visits = visits_p1
        else:
            state = (p2_card, tuple(history))
            tree = trees2
            visits = visits_p2

        node = tree[state]

        # While the node, or any action below it, is still unvisited, choose
        # uniformly at random to encourage exploration
        needs_exploration = node.N == 0 or any(
            a not in node.children or node.children[a].N == 0 for a in actions
        )
        if needs_exploration:
            action = int(np.random.choice(actions))
        else:
            def calculate_score(a):
                child = node.children[a]
                ucb_score = child.Q + np.sqrt(np.log(node.N + 1) / (child.N + 1) * Cp)
                # Perturbation phase: add a fresh N(0, noise_sigma) draw per action
                if noise_sigma > 0.0:
                    return ucb_score + np.random.normal(0, noise_sigma)
                return ucb_score

            action = max(actions, key=calculate_score)

        # Remember the node, action and child so they can be updated at backup
        if action not in node.children:
            node.children[action] = Node()
        child_node = node.children[action]

        visits.append((node, action, child_node))

        history = history + [action]

    # Hand finished: payoff from player 1's point of view
    reward_p1 = compute_reward(p1_card, p2_card, history)

    # Backup: player 1 receives reward_p1, player 2 receives its negation
    _backup_visits(visits_p1, reward_p1)
    _backup_visits(visits_p2, -reward_p1)

    return reward_p1

# =========================
# Main loop (training in three phases)
# =========================
# Number of simulations in each phase
N1_OPTIMIZATION = 10000000    # phase 1: move towards a best response
N2_PERTURBATION = 1000000     # phase 2: add perturbation noise
N3_REOPTIMIZATION = 10000000  # phase 3: move towards a best response again
n_total_simulations = N1_OPTIMIZATION + N2_PERTURBATION + N3_REOPTIMIZATION

# Perturbation strength: standard deviation of the Gaussian noise added to
# the UCB scores during phase 2
SIGMA_PERTURBATION = 0.5

# Simulation counts at which the learned policies are printed
# (phase boundaries included)
checkpoints = [
    100000, 1000000,
    N1_OPTIMIZATION,                              # end of phase 1
    N1_OPTIMIZATION + 100000,                     # during phase 2
    N1_OPTIMIZATION + N2_PERTURBATION,            # end of phase 2
    N1_OPTIMIZATION + N2_PERTURBATION + 1000000,  # during phase 3
    n_total_simulations,                          # end of all simulations
]
# Membership is tested once per simulation, so use a set for O(1) lookups
_checkpoint_set = set(checkpoints)


def _phase_of(sim_count):
    """Return the training phase (1, 2 or 3) that simulation `sim_count` belongs to."""
    if sim_count <= N1_OPTIMIZATION:
        return 1
    if sim_count <= N1_OPTIMIZATION + N2_PERTURBATION:
        return 2
    return 3


def _print_policy(tree, suffix="", odd_history_only=False):
    """Print visit counts, visit shares and Q for every visited node of `tree`.

    Args:
        tree: mapping of (card, history) -> Node.
        suffix: text appended after the card name and after each action name.
        odd_history_only: when True, skip states whose history length is even.
    """
    for (my_card, history), node in tree.items():
        if odd_history_only and len(history) % 2 != 1:
            continue

        total_N = sum(child.N for child in node.children.values())
        # Skip nodes none of whose actions has been visited
        if total_N == 0:
            continue

        print(f"\nCard={card_names[my_card]}{suffix}, History={history}")
        for a, child in node.children.items():
            prob = child.N / total_N
            print(f"  Action {action_names[a]}{suffix}: N={child.N}, Estimated Prob={prob:.3f}, Q={child.Q:.3f}")


for i in range(n_total_simulations):

    current_sim = i + 1

    # Phases 1 and 3 use plain UCB; only phase 2 adds Gaussian noise
    phase = _phase_of(current_sim)
    noise_sigma = SIGMA_PERTURBATION if phase == 2 else 0.0

    # Run one simulated hand
    simulate_both_mcts(noise_sigma=noise_sigma)

    if current_sim in _checkpoint_set:
        print("=======================================")
        print(f"シミュレーション回数 N={current_sim} (Phase={phase}, Noise={noise_sigma})")
        print("=======================================")

        # Learned action distribution of the first player
        print("先手の学習済み行動分布")
        _print_policy(trees1)

        # Learned action distribution of the second player
        # (player 2 only decides at odd history lengths)
        print("\n後手の学習済み行動分布")
        _print_policy(trees2, suffix=" (P2)", odd_history_only=True)

シミュレーション回数 N=100000 (Phase=1, Noise=0.0)
先手の学習済み行動分布

Card=J, History=()
  Action Check/Fold: N=27417, Estimated Prob=0.819, Q=-1.001
  Action Raise/Call: N=6073, Estimated Prob=0.181, Q=-1.032

Card=J, History=(np.int64(0), np.int64(1))
  Action Check/Fold: N=18387, Estimated Prob=0.999, Q=-1.000
  Action Raise/Call: N=18, Estimated Prob=0.001, Q=-2.000

Card=Q, History=()
  Action Raise/Call: N=608, Estimated Prob=0.018, Q=-0.459
  Action Check/Fold: N=32651, Estimated Prob=0.982, Q=-0.283

Card=K, History=()
  Action Check/Fold: N=15902, Estimated Prob=0.478, Q=1.239
  Action Raise/Call: N=17349, Estimated Prob=0.522, Q=1.176

Card=K, History=(np.int64(0), np.int64(1))
  Action Check/Fold: N=3, Estimated Prob=0.001, Q=-1.000
  Action Raise/Call: N=3806, Estimated Prob=0.999, Q=2.000

Card=Q, History=(np.int64(0), np.int64(1))
  Action Check/Fold: N=10864, Estimated Prob=0.522, Q=-1.000
  Action Raise/Call: N=9932, Estimated Prob=0.478, Q=-1.002

後手の学習済み行動分布

Card=K (P2), History=(np