先手の学習済み行動分布 (各カード・状態ごと):

Card=Q, History=()
  Action Check: N=1142, Estimated Prob=0.698
  Action Bet: N=494, Estimated Prob=0.302

Card=J, History=()
  Action Bet: N=12, Estimated Prob=0.007
  Action Check: N=1681, Estimated Prob=0.993

Card=K, History=()
  Action Check: N=1, Estimated Prob=0.001
  Action Bet: N=1670, Estimated Prob=0.999


先手の学習済み行動分布 (カード・履歴ごと):

Card=J, History=()
  Action Raise: N=17, Estimated Prob=0.001
  Action Check: N=16648, Estimated Prob=0.999

Card=J, History=(np.int64(1), np.int64(1))
  Action Check: N=9, Estimated Prob=0.529
  Action Raise: N=8, Estimated Prob=0.471

Card=K, History=()
  Action Raise: N=16659, Estimated Prob=1.000
  Action Check: N=2, Estimated Prob=0.000

Card=K, History=(np.int64(1), np.int64(1))
  Action Check: N=6243, Estimated Prob=0.500
  Action Raise: N=6243, Estimated Prob=0.500

Card=Q, History=()
  Action Raise: N=16382, Estimated Prob=0.982
  Action Check: N=292, Estimated Prob=0.018

Card=Q, History=(np.int64(1), np.int64(1))
  Action Check: N=6147, Estimated Prob=0.500
  Action Raise: N=6147, Estimated Prob=0.500

Card=Q, History=(np.int64(0), np.int64(1))
  Action Check: N=76, Estimated Prob=0.349
  Action Raise: N=142, Estimated Prob=0.651

Card=J, History=(np.int64(0), np.int64(1))
  Action Check: N=16631, Estimated Prob=0.999
  Action Raise: N=17, Estimated Prob=0.001

In [None]:
import numpy as np
from collections import defaultdict

# =========================
# Game setup
# =========================
# Cards are encoded by rank: J=0, Q=1, K=2.
card_names = ['J', 'Q', 'K']
cards = list(range(len(card_names)))

# Both players use one binary action code: 0 = Check/Fold, 1 = Bet/Raise/Call.
action_names = ['Check/Fold', 'Raise/Call']
actions = list(range(len(action_names)))

# Fixed second-player policy tables, indexed by the second player's card.
# Each row is [P(action 0), P(action 1)].
nash_second_player_after_check = {
    0: [2/3, 1/3],
    1: [1.0, 0.0],
    2: [0.0, 1.0],
}
nash_second_player_after_raise = {
    0: [1.0, 0.0],
    1: [2/3, 1/3],
    2: [0.0, 1.0],
}

class Node:
    """Per-node search statistics.

    ``N`` counts visits, ``Q`` holds a value estimate, and ``children``
    maps an action to the ``Node`` that stores that action's statistics.
    """

    def __init__(self):
        # A fresh node starts with no visits, a zero value and no children.
        self.N, self.Q, self.children = 0, 0, dict()

# Registry of search nodes; simulate() keys it by (card, history tuple).
# Looking up a missing key creates a fresh Node for it.
trees = defaultdict(Node)

def compute_reward(my_card, opp_card, history):
    """Return the payoff for the holder of ``my_card`` given an action history.

    ``history`` is a sequence of action codes (0 or 1).  Recognised lines:

    * ``(0, 0)``    -> showdown for 1 (``+1`` if ``my_card > opp_card`` else ``-1``)
    * ``(1, 0)``    -> ``+1``
    * ``(1, 1)``    -> showdown for 2
    * ``(0, 1, 0)`` -> ``-1``
    * ``(0, 1, x)`` with ``x != 0`` -> showdown for 2

    Every other history (including the two-move ``(0, 1)``) yields ``0``.
    """
    n_moves = len(history)

    if n_moves == 2:
        first, second = history
        if first == 0 and second == 0:
            return 1 if my_card > opp_card else -1
        if first == 1 and second == 0:
            return 1
        if first == 1 and second == 1:
            return 2 if my_card > opp_card else -2
        return 0

    if n_moves == 3:
        first, second, third = history
        if not (first == 0 and second == 1):
            return 0
        if third == 0:
            return -1
        return 2 if my_card > opp_card else -2

    return 0

def _pick_action(node):
    """Choose an action at ``node``.

    While the node or any of its actions is still unvisited the choice is
    uniformly random; afterwards the action with the highest
    ``Q + sqrt(log(N + 1) / (n_a + 1))`` score is taken.
    """
    needs_exploration = node.N == 0 or any(
        a not in node.children or node.children[a].N == 0 for a in actions
    )
    if needs_exploration:
        return np.random.choice(actions)

    def score(a):
        stats = node.children[a]
        return stats.Q + np.sqrt(np.log(node.N + 1) / (stats.N + 1))

    return max(actions, key=score)


def _backup(node, action, reward):
    """Fold ``reward`` into the running mean of ``action`` under ``node``."""
    stats = node.children.get(action)
    if stats is None:
        stats = node.children[action] = Node()
    stats.N += 1
    stats.Q += (reward - stats.Q) / stats.N
    node.N += 1


def simulate(my_card, history):
    """Play one simulated hand for the holder of ``my_card`` and update ``trees``.

    The opponent card is drawn from the remaining cards.  When the first
    action is the opening move, the opponent answers from the fixed policy
    table that matches it (after-check or after-raise).  If the line is
    Check followed by an opponent Raise, a second decision is made at the
    ``(my_card, (0, 1))`` node.  The final reward is backed up into every
    decision node visited and returned.

    ``history`` must be a list; the callers in this file pass ``[]``.
    """
    opp_card = np.random.choice([c for c in cards if c != my_card])

    root = trees[(my_card, tuple(history))]
    opening = _pick_action(root)
    line = history + [opening]

    # Opponent reply to the opening move.
    if len(line) == 1:
        if line[0] == 0:
            reply_policy = nash_second_player_after_check[opp_card]
        else:
            reply_policy = nash_second_player_after_raise[opp_card]
        line.append(np.random.choice(actions, p=reply_policy))

    facing_raise = len(line) == 2 and line[0] == 0 and line[1] == 1
    if facing_raise:
        # Check -> opponent Raise: we still have to Fold (0) or Call (1).
        answer_node = trees[(my_card, tuple(line))]
        answer = _pick_action(answer_node)
        reward_total = compute_reward(my_card, opp_card, line + [answer])
        _backup(answer_node, answer, reward_total)
    else:
        reward_total = compute_reward(my_card, opp_card, line)

    _backup(root, opening, reward_total)
    return reward_total

# =========================
# Simulation run with interim reports
# =========================
n_simulations = 10000000
# 100, 1e3, 1e4, 1e5, then every million up to ten million.
checkpoints = [100, 1000, 10000, 100000] + [k * 1000000 for k in range(1, 11)]
rewards_all = []

# (headline, history key) for each reported first-player decision point:
# the opening move, and the reply after Check -> opponent Raise.
report_sections = (
    ("初期状態 Raise 頻度 (先手カード別)", ()),
    ("Check->Raise->Call 頻度 (先手カード別)", (0, 1)),
)

for i in range(n_simulations):
    my_card = np.random.choice(cards)
    r = simulate(my_card, [])
    rewards_all.append(r)

    if (i + 1) not in checkpoints:
        continue

    print(f"\n--- Simulations = {i+1} ---")

    # Share of visits that went to action 1 at each reported decision point.
    for headline, history_key in report_sections:
        print(headline)
        for c in cards:
            node = trees[(c, history_key)]
            total = sum(child.N for child in node.children.values())
            chosen = node.children[1].N if 1 in node.children else 0
            freq = chosen/total if total>0 else 0
            print(f"  {card_names[c]}: {freq:.3f}")

    # Mean reward over all hands simulated so far.
    expected_value = np.mean(rewards_all)
    print(f"初期状態期待値 (カード配布前): {expected_value:.3f}")



--- Simulations = 100 ---
初期状態 Raise 頻度 (先手カード別)
  J: 0.065
  Q: 0.032
  K: 0.579
Check->Raise->Call 頻度 (先手カード別)
  J: 0.125
  Q: 0.042
  K: 0.750
初期状態期待値 (カード配布前): -0.150

--- Simulations = 1000 ---
初期状態 Raise 頻度 (先手カード別)
  J: 0.012
  Q: 0.003
  K: 0.349
Check->Raise->Call 頻度 (先手カード別)
  J: 0.017
  Q: 0.017
  K: 0.977
初期状態期待値 (カード配布前): -0.100

--- Simulations = 10000 ---
初期状態 Raise 頻度 (先手カード別)
  J: 0.004
  Q: 0.001
  K: 0.553
Check->Raise->Call 頻度 (先手カード別)
  J: 0.004
  Q: 0.459
  K: 0.995
初期状態期待値 (カード配布前): -0.075

--- Simulations = 100000 ---
初期状態 Raise 頻度 (先手カード別)
  J: 0.169
  Q: 0.000
  K: 0.662
Check->Raise->Call 頻度 (先手カード別)
  J: 0.001
  Q: 0.293
  K: 0.999
初期状態期待値 (カード配布前): -0.058

--- Simulations = 1000000 ---
初期状態 Raise 頻度 (先手カード別)
  J: 0.517
  Q: 0.001
  K: 0.504
Check->Raise->Call 頻度 (先手カード別)
  J: 0.000
  Q: 0.047
  K: 1.000
初期状態期待値 (カード配布前): -0.055

--- Simulations = 2000000 ---
初期状態 Raise 頻度 (先手カード別)
  J: 0.675
  Q: 0.001
  K: 0.548
Check->Raise->Call 頻度 (先手カード別)
  J: 0.000
 

In [None]:
import numpy as np
from collections import defaultdict

# =========================
# 1. Game setup
# =========================
card_names = ['J', 'Q', 'K']
cards = [rank for rank, _ in enumerate(card_names)]       # J=0, Q=1, K=2
action_names = ['Check/Fold', 'Raise/Call']
actions = [code for code, _ in enumerate(action_names)]   # Check/Fold=0, Raise/Call=1

# =========================
# 2. Fixed second-player policy
# =========================
# Rows are indexed by the second player's card (J=0, Q=1, K=2) and hold
# [P(action 0), P(action 1)].

# Reply right after the first player checks.
nash_second_player_after_check = dict(enumerate([
    [2/3, 1/3],  # J: Check 2/3, Raise 1/3
    [1.0, 0.0],  # Q: always Check
    [0.0, 1.0],  # K: always Raise
]))

# Reply right after the first player raises.
nash_second_player_after_raise = dict(enumerate([
    [1.0, 0.0],  # J: always Fold
    [2/3, 1/3],  # Q: Fold 2/3, Call 1/3
    [0.0, 1.0],  # K: always Call
]))

# =========================
# 3. Search-tree node
# =========================
class Node:
    """Visit count ``N``, value estimate ``Q`` and per-action child nodes."""

    def __init__(self):
        self.N, self.Q = 0, 0
        # Maps an action code to the Node holding that action's statistics.
        self.children = dict()

# Registry of search nodes; simulate() keys it by (card, history tuple).
# Looking up a missing key creates a fresh Node for it.
trees = defaultdict(Node)

# =========================
# 4. Reward function
# =========================
def compute_reward(my_card, opp_card, history):
    """Return the payoff for the holder of ``my_card`` given an action history.

    Two-move histories:
      (0, 0) -> showdown for 1    (0, 1) -> -1
      (1, 0) -> +1                (1, 1) -> showdown for 2
    Three-move histories starting with (0, 1):
      third move 0 -> -1, otherwise showdown for 2.
    A showdown pays ``+stake`` when ``my_card > opp_card`` and ``-stake``
    otherwise.  Any other history yields 0.
    """
    moves = tuple(history)

    if len(moves) == 2:
        if moves == (0, 0):
            return 1 if my_card > opp_card else -1
        if moves == (0, 1):
            return -1
        if moves == (1, 0):
            return 1
        if moves == (1, 1):
            return 2 if my_card > opp_card else -2
        return 0

    if len(moves) == 3 and moves[:2] == (0, 1):
        if moves[2] == 0:
            return -1
        return 2 if my_card > opp_card else -2

    return 0

# =========================
# 5. POMCP simulation
# =========================
def _select_action(node):
    """Pick an action at ``node``: random until every action is tried, then UCB."""
    if node.N == 0 or any(
        a not in node.children or node.children[a].N == 0 for a in actions
    ):
        return np.random.choice(actions)
    return max(
        actions,
        key=lambda a: node.children[a].Q
        + np.sqrt(np.log(node.N + 1) / (node.children[a].N + 1)),
    )


def _update_node(node, action, reward):
    """Add one visit of ``action`` under ``node`` and update its running-mean value."""
    if action not in node.children:
        node.children[action] = Node()
    child = node.children[action]
    child.N += 1
    child.Q += (reward - child.Q) / child.N
    node.N += 1


def simulate(my_card, history):
    """Simulate one hand for the first player and update the search trees.

    Steps:
      1. Draw the opponent's card from the cards other than ``my_card``.
      2. Choose the first player's opening action at ``(my_card, history)``.
      3. Sample the opponent's reply from the fixed policy that matches the
         opening action: the after-check table for 0, the after-raise table
         for 1.
      4. On Check -> opponent Raise, choose the first player's Fold/Call at
         ``(my_card, (0, 1))``.
      5. Back the *final* reward of the hand up into every first-player
         decision node that was visited.

    Args:
        my_card: the first player's card code.
        history: list of earlier action codes; callers in this file pass ``[]``.

    Returns:
        The final reward of the simulated hand from the first player's view.
    """
    # Opponent's card
    opp_card = np.random.choice([c for c in cards if c != my_card])

    # Current decision node
    node = trees[(my_card, tuple(history))]

    # First player's action (UCB or random)
    action = _select_action(node)
    history_next = history + [action]

    # Opponent's reply.  The policy table has to match the opening action;
    # using the after-check table after a Raise would make the opponent
    # answer a bet with its betting frequencies instead of its calling ones.
    if len(history_next) == 1:
        if history_next[0] == 0:
            policy = nash_second_player_after_check[opp_card]
        else:
            policy = nash_second_player_after_raise[opp_card]
        history_next.append(np.random.choice(actions, p=policy))

    if len(history_next) == 2 and history_next[0] == 0 and history_next[1] == 1:
        # Check -> opponent Raise: the hand is not over, the first player
        # still has to Fold (0) or Call (1).
        next_node = trees[(my_card, tuple(history_next))]
        next_action = _select_action(next_node)
        reward = compute_reward(my_card, opp_card, history_next + [next_action])
        _update_node(next_node, next_action, reward)
    else:
        reward = compute_reward(my_card, opp_card, history_next)

    # The opening action is credited with the outcome the hand actually had,
    # not with an assumed fold, so its value reflects the follow-up decision.
    _update_node(node, action, reward)

    return reward

# =========================
# 6. Run the simulations
# =========================
n_simulations = 500000
for _ in range(n_simulations):
    # Deal the first player a random card and play one hand from the start.
    simulate(np.random.choice(cards), [])

# =========================
# 7. Show the learned action distribution
# =========================
print("先手の学習済み行動分布 (カード・履歴ごと):")
for (card, past_moves), node in trees.items():
    # Visit count per action at this decision point.
    visit_counts = {a: child.N for a, child in node.children.items()}
    total_N = sum(visit_counts.values())
    print(f"\nCard={card_names[card]}, History={past_moves}")
    for a, n_visits in visit_counts.items():
        prob = n_visits/total_N if total_N>0 else 0
        print(f"  Action {action_names[a]}: N={n_visits}, Estimated Prob={prob:.3f}")

# =========================
# 8. Show the first player's expected values
# =========================
print("\n先手の期待値 (カード・履歴ごと):")
for (card, past_moves), node in trees.items():
    total_N = sum(child.N for child in node.children.values())
    print(f"\nCard={card_names[card]}, History={past_moves}")
    for a, child in node.children.items():
        share = child.N/total_N
        print(f"  Action {action_names[a]}: Estimated Value={child.Q:.3f}, N={child.N}, Prob={share:.3f}")

    if total_N>0:
        # Visit-weighted average of the per-action value estimates.
        weighted = [(child.N/total_N)*child.Q for child in node.children.values()]
        expected_value = sum(weighted)
        print(f"  -> Expected Value of this state: {expected_value:.3f}")


先手の学習済み行動分布 (カード・履歴ごと):

Card=J, History=()
  Action Raise/Call: N=166888, Estimated Prob=0.999
  Action Check/Fold: N=88, Estimated Prob=0.001

Card=K, History=()
  Action Raise/Call: N=166545, Estimated Prob=1.000
  Action Check/Fold: N=43, Estimated Prob=0.000

Card=Q, History=()
  Action Check/Fold: N=148399, Estimated Prob=0.892
  Action Raise/Call: N=18037, Estimated Prob=0.108

Card=Q, History=(np.int64(0), np.int64(1))
  Action Check/Fold: N=89035, Estimated Prob=0.900
  Action Raise/Call: N=9849, Estimated Prob=0.100

Card=J, History=(0, np.int64(1))
  Action Check/Fold: N=41, Estimated Prob=0.953
  Action Raise/Call: N=2, Estimated Prob=0.047

Card=K, History=(0, np.int64(1))
  Action Check/Fold: N=1, Estimated Prob=0.125
  Action Raise/Call: N=7, Estimated Prob=0.875

先手の期待値 (カード・履歴ごと):

Card=J, History=()
  Action Raise/Call: Estimated Value=-0.500, N=166888, Prob=0.999
  Action Check/Fold: Estimated Value=-1.000, N=88, Prob=0.001
  -> Expected Value of this state: -0.500

