先手の学習済み行動分布 (各カード・状態ごと):

Card=Q, History=()
  Action Check: N=1142, Estimated Prob=0.698
  Action Bet: N=494, Estimated Prob=0.302

Card=J, History=()
  Action Bet: N=12, Estimated Prob=0.007
  Action Check: N=1681, Estimated Prob=0.993

Card=K, History=()
  Action Check: N=1, Estimated Prob=0.001
  Action Bet: N=1670, Estimated Prob=0.999


先手の学習済み行動分布 (カード・履歴ごと):

Card=J, History=()
  Action Raise: N=17, Estimated Prob=0.001
  Action Check: N=16648, Estimated Prob=0.999

Card=J, History=(np.int64(1), np.int64(1))
  Action Check: N=9, Estimated Prob=0.529
  Action Raise: N=8, Estimated Prob=0.471

Card=K, History=()
  Action Raise: N=16659, Estimated Prob=1.000
  Action Check: N=2, Estimated Prob=0.000

Card=K, History=(np.int64(1), np.int64(1))
  Action Check: N=6243, Estimated Prob=0.500
  Action Raise: N=6243, Estimated Prob=0.500

Card=Q, History=()
  Action Raise: N=16382, Estimated Prob=0.982
  Action Check: N=292, Estimated Prob=0.018

Card=Q, History=(np.int64(1), np.int64(1))
  Action Check: N=6147, Estimated Prob=0.500
  Action Raise: N=6147, Estimated Prob=0.500

Card=Q, History=(np.int64(0), np.int64(1))
  Action Check: N=76, Estimated Prob=0.349
  Action Raise: N=142, Estimated Prob=0.651

Card=J, History=(np.int64(0), np.int64(1))
  Action Check: N=16631, Estimated Prob=0.999
  Action Raise: N=17, Estimated 

In [7]:
import numpy as np
#辞書キーが存在しない場合でも，自動でデフォルト値を生成できる辞書を作るために使用
from collections import defaultdict

# =========================
# 1. ゲーム設定
# =========================
cards = [0,1,2]   # J=0, Q=1, K=2
card_names = ['J','Q','K']
actions = [0,1]   # Check/Fold=0, Bet/Raise/Call=1
action_names = ['Check/Fold','Raise/Call']

# =========================
# 2. 後手固定Nash方策
# =========================
# 状態: ('先手のアクション') -> カード -> 後手行動確率
# 先手Check直後
nash_second_player_after_check = {
    0: [2/3, 1/3],  # J: Check 2/3, Raise 1/3
    1: [1.0, 0.0],  # Q: Check 1, Raise 0
    2: [0.0, 1.0],  # K: Check 0, Raise 1
}

# 先手Raise直後
nash_second_player_after_raise = {
    0: [1.0, 0.0],    # J: Fold 1, Call 0
    1: [2/3, 1/3],    # Q: Fold 2/3, Call 1/3
    2: [0.0, 1.0],    # K: Fold 0, Call 1
}

# =========================
# 3. POMCP用ノード
# =========================
class Node:
    def __init__(self):
        self.N = 0
        self.Q = 0
        self.children = {}  # action -> Node

trees = defaultdict(Node)

# =========================
# 4. 報酬計算
# =========================
def compute_reward(my_card, opp_card, history):
    reward = 0
    if len(history)==2:
        h0,h1 = history
        if h0==0 and h1==0: #Check→Check
            reward = 1 if my_card>opp_card else -1
        elif h0==1 and h1==0: #Raise→Fold
            reward = 1
        elif h0==1 and h1==1: #Raise→Call
            reward = 2 if my_card>opp_card else -2
    elif len(history)==3: #Check→Raise→?
        h0,h1,h2 = history
        if h0==0 and h1==1:
            # 先手Check → 後手Raise → 先手アクション
            if h2==0:#先手アクションがFold
                reward = -1
            else:#先手アクションがCall
                reward = 2 if my_card>opp_card else -2
    return reward

# =========================
# 5. シミュレーション
# =========================

#POMCPといいつつ，探索が深くないので全探索できちゃったやつ
#POMCPのシュミレーション1回を行う関数
def simulate(my_card, history):
    # opp_card∊{0,1,2}(my_card以外)，opp_action∊{0,1}
    opp_card = np.random.choice([c for c in cards if c != my_card])
    #state=my_cardと履歴(history)の組み合わせ
    state = (my_card, tuple(history))
    #その状態に対応するノードを木構造treesから取得する
    node = trees[state]

    # 先手のアクション選択（既存ロジックそのまま）
    if node.N == 0 or any(a not in node.children or node.children[a].N == 0 for a in actions):
        action = np.random.choice(actions)
    else:
        action = max(actions, key=lambda a: node.children[a].Q + np.sqrt(np.log(node.N + 1) / (node.children[a].N + 1)))

    history_next = history + [action]

    # 後手の最初のアクションを、先手の直前アクションに応じて選ぶ（ここが修正点）
    if len(history_next) == 1:
        if history_next[0] == 0:  # 先手がCheckした後は，「先手がcheckした後」の確率で
            p = nash_second_player_after_check[opp_card]
        else:  # 先手がRaiseした後は，「先手がRaiseした後」の確率で
            p = nash_second_player_after_raise[opp_card]
        opp_action = np.random.choice(actions, p=p)
        history_next.append(opp_action)

    # ここで最終報酬を決めるが、Check->Raise の場合は「さらに先手の返し」があるため扱いを分ける
    reward_total = None

    # Check -> Raise -> (先手の返し) の場合
    if len(history_next) == 2 and history_next[0] == 0 and history_next[1] == 1:
        # 次の状態ノード（先手が返す局面）を取得／選択
        next_state = (my_card, tuple(history_next))
        next_node = trees[next_state]
        # この部分は先のバンディットによる探索と同じアルゴリズムで動く
        # 未探索な部分に対するランダムな探索アルゴリズム
        if next_node.N == 0 or any(a not in next_node.children or next_node.children[a].N == 0 for a in actions):
            next_action = np.random.choice(actions)
        # 探索済みの部分に対するUCBによる探索アルゴリズム(知識利用部分)
        else:
            next_action = max(actions, key=lambda a: next_node.children[a].Q + np.sqrt(np.log(next_node.N + 1) / (next_node.children[a].N + 1)))
        #history_finalはこれまでのアクションの履歴と次のアクションの合算になっており，history_next==2で条件分岐したのでhistory_finalの大きさは3である．
        history_final = history_next + [next_action]
        reward_total = compute_reward(my_card, opp_card, history_final)

        # next_node（先手の返しのノード）を更新
        # ここでは自分のカードとこれまでのアクション履歴を合わせた状況におけるノードに対する学習を行っている
        if next_action not in next_node.children:
            next_node.children[next_action] = Node()
        child2 = next_node.children[next_action]
        child2.N += 1
        child2.Q += (reward_total - child2.Q) / child2.N
        next_node.N += 1

    else:
        # 通常ケース（Raise->Fold/Call や Check->Check など）では、今の history_next で終了
        reward_total = compute_reward(my_card, opp_card, history_next)

    # ここで「最初のノード配下の子（action）」を最終報酬で更新
    # node.children[action]はその行動をとった後に遷移する子ノードで，子ノードはその行動をとった回数や平均報酬を保持する
    if action not in node.children:
        node.children[action] = Node()
    child = node.children[action]
    # その行動を選んだ回数を1増やす
    # その行動の平均報酬を逐次更新する
    # 親ノードの訪問回数も1増やす
    child.N += 1
    child.Q += (reward_total - child.Q) / child.N
    node.N += 1

    return reward_total


# =========================
# 6. POMCP 実行
# =========================
n_simulations = 500000
for _ in range(n_simulations):
    my_card = np.random.choice(cards)
    simulate(my_card, [])

# =========================
# 7. 結果表示
# =========================
print("先手の学習済み行動分布 (カード・履歴ごと):")
for state, node in trees.items():
    my_card, history = state
    total_N = sum(child.N for child in node.children.values())
    print(f"\nCard={card_names[my_card]}, History={history}")
    for a, child in node.children.items():
        prob = child.N/total_N if total_N>0 else 0
        print(f"  Action {action_names[a]}: N={child.N}, Estimated Prob={prob:.3f}")

# =========================
# 8. 期待値表示
# =========================
print("\n先手の期待値 (カード・履歴ごと):")
for state, node in trees.items():
    my_card, history = state
    total_N = sum(child.N for child in node.children.values())
    print(f"\nCard={card_names[my_card]}, History={history}")
    for a, child in node.children.items():
        # child.Q がその action からの平均報酬の推定値
        print(f"  Action {action_names[a]}: Estimated Value={child.Q:.3f}, N={child.N}, Prob={child.N/total_N:.3f}")

    # 状態全体の期待値 = すべての行動の期待値の平均
    if total_N > 0:
        expected_value = sum((child.N/total_N)*child.Q for child in node.children.values())
        print(f"  -> Expected Value of this state: {expected_value:.3f}")


先手の学習済み行動分布 (カード・履歴ごと):

Card=Q, History=()
  Action Check/Fold: N=166051, Estimated Prob=0.996
  Action Raise/Call: N=749, Estimated Prob=0.004

Card=Q, History=(0, np.int64(1))
  Action Check/Fold: N=108988, Estimated Prob=0.985
  Action Raise/Call: N=1679, Estimated Prob=0.015

Card=K, History=()
  Action Check/Fold: N=98858, Estimated Prob=0.593
  Action Raise/Call: N=67764, Estimated Prob=0.407

Card=K, History=(np.int64(0), np.int64(1))
  Action Raise/Call: N=16358, Estimated Prob=1.000
  Action Check/Fold: N=1, Estimated Prob=0.000

Card=J, History=()
  Action Raise/Call: N=48504, Estimated Prob=0.291
  Action Check/Fold: N=118074, Estimated Prob=0.709

Card=J, History=(np.int64(0), np.int64(1))
  Action Check/Fold: N=58940, Estimated Prob=1.000
  Action Raise/Call: N=10, Estimated Prob=0.000

先手の期待値 (カード・履歴ごと):

Card=Q, History=()
  Action Check/Fold: Estimated Value=-0.334, N=166051, Prob=0.996
  Action Raise/Call: Estimated Value=-0.454, N=749, Prob=0.004
  -> Expected Value 

In [None]:
import numpy as np
from collections import defaultdict

# =========================
# 1. ゲーム設定
# =========================
cards = [0,1,2]   # J=0, Q=1, K=2
card_names = ['J','Q','K']
actions = [0,1]   # Check/Fold=0, Raise/Call=1
action_names = ['Check/Fold','Raise/Call']

# =========================
# 2. 後手固定Nash方策
# =========================
# 先手Check直後
nash_second_player_after_check = {
    0: [2/3, 1/3],  # J: Check 2/3, Raise 1/3
    1: [1.0, 0.0],  # Q: Check 1, Raise 0
    2: [0.0, 1.0],  # K: Check 0, Raise 1
}

# 先手Raise直後
nash_second_player_after_raise = {
    0: [1.0, 0.0],    # J: Fold 1, Call 0
    1: [2/3, 1/3],    # Q: Fold 2/3, Call 1/3
    2: [0.0, 1.0],    # K: Fold 0, Call 1
}

# =========================
# 3. POMCP用ノード定義
# =========================
class Node:
    def __init__(self):
        self.N = 0
        self.Q = 0
        self.children = {}  # action -> Node

trees = defaultdict(Node)

# =========================
# 4. 報酬計算関数
# =========================
def compute_reward(my_card, opp_card, history):
    reward = 0
    if len(history)==2:
        h0,h1 = history
        if h0==0 and h1==0:
            reward = 1 if my_card>opp_card else -1
        elif h0==0 and h1==1:
            reward = -1
        elif h0==1 and h1==0:
            reward = 1
        elif h0==1 and h1==1:
            reward = 2 if my_card>opp_card else -2
    elif len(history)==3:
        h0,h1,h2 = history
        if h0==0 and h1==1:
            reward = -1 if h2==0 else (2 if my_card>opp_card else -2)
    return reward

# =========================
# 5. POMCPシミュレーション
# =========================
def simulate(my_card, history):
    # 後手のカード
    opp_card = np.random.choice([c for c in cards if c!=my_card])

    # 現在ノード
    state = (my_card, tuple(history))
    node = trees[state]

    # 先手アクション選択（UCB or ランダム）
    if node.N==0 or any(a not in node.children or node.children[a].N==0 for a in actions):
        action = np.random.choice(actions)
    else:
        action = max(actions, key=lambda a: node.children[a].Q + np.sqrt(np.log(node.N+1)/(node.children[a].N+1)))

    # 履歴更新
    history_next = history + [action]

    # 後手のアクション
    if len(history_next)==1:
        opp_action = np.random.choice(actions, p=nash_second_player_after_check[opp_card])
        history_next.append(opp_action)
    elif len(history_next)==2 and history_next[0]==1:
        opp_action = np.random.choice(actions, p=nash_second_player_after_raise[opp_card])
        history_next.append(opp_action)

    # 報酬計算
    reward = compute_reward(my_card, opp_card, history_next)

    # ノード更新
    if action not in node.children:
        node.children[action] = Node()
    child = node.children[action]
    child.N += 1
    child.Q += (reward - child.Q)/child.N
    node.N += 1

    # Check → 後手Raise → 先手アクションの更新
    if len(history_next)==2 and history_next[0]==0 and history_next[1]==1:
        next_state = (my_card, tuple(history_next))
        next_node = trees[next_state]
        if next_node.N==0 or any(a not in next_node.children or next_node.children[a].N==0 for a in actions):
            next_action = np.random.choice(actions)
        else:
            next_action = max(actions, key=lambda a: next_node.children[a].Q + np.sqrt(np.log(next_node.N+1)/(next_node.children[a].N+1)))
        history_final = history_next + [next_action]
        reward2 = compute_reward(my_card, opp_card, history_final)
        if next_action not in next_node.children:
            next_node.children[next_action] = Node()
        child2 = next_node.children[next_action]
        child2.N += 1
        child2.Q += (reward2 - child2.Q)/child2.N
        next_node.N += 1

    return reward

# =========================
# 6. シミュレーション実行
# =========================
n_simulations = 500000
for _ in range(n_simulations):
    my_card = np.random.choice(cards)
    simulate(my_card, [])

# =========================
# 7. 学習済み行動分布表示
# =========================
print("先手の学習済み行動分布 (カード・履歴ごと):")
for state, node in trees.items():
    my_card, history = state
    total_N = sum(child.N for child in node.children.values())
    print(f"\nCard={card_names[my_card]}, History={history}")
    for a, child in node.children.items():
        prob = child.N/total_N if total_N>0 else 0
        print(f"  Action {action_names[a]}: N={child.N}, Estimated Prob={prob:.3f}")

# =========================
# 8. 先手期待値表示
# =========================
print("\n先手の期待値 (カード・履歴ごと):")
for state, node in trees.items():
    my_card, history = state
    total_N = sum(child.N for child in node.children.values())
    print(f"\nCard={card_names[my_card]}, History={history}")
    for a, child in node.children.items():
        print(f"  Action {action_names[a]}: Estimated Value={child.Q:.3f}, N={child.N}, Prob={child.N/total_N:.3f}")

    if total_N>0:
        expected_value = sum((child.N/total_N)*child.Q for child in node.children.values())
        print(f"  -> Expected Value of this state: {expected_value:.3f}")


先手の学習済み行動分布 (カード・履歴ごと):

Card=J, History=()
  Action Raise/Call: N=166888, Estimated Prob=0.999
  Action Check/Fold: N=88, Estimated Prob=0.001

Card=K, History=()
  Action Raise/Call: N=166545, Estimated Prob=1.000
  Action Check/Fold: N=43, Estimated Prob=0.000

Card=Q, History=()
  Action Check/Fold: N=148399, Estimated Prob=0.892
  Action Raise/Call: N=18037, Estimated Prob=0.108

Card=Q, History=(np.int64(0), np.int64(1))
  Action Check/Fold: N=89035, Estimated Prob=0.900
  Action Raise/Call: N=9849, Estimated Prob=0.100

Card=J, History=(0, np.int64(1))
  Action Check/Fold: N=41, Estimated Prob=0.953
  Action Raise/Call: N=2, Estimated Prob=0.047

Card=K, History=(0, np.int64(1))
  Action Check/Fold: N=1, Estimated Prob=0.125
  Action Raise/Call: N=7, Estimated Prob=0.875

先手の期待値 (カード・履歴ごと):

Card=J, History=()
  Action Raise/Call: Estimated Value=-0.500, N=166888, Prob=0.999
  Action Check/Fold: Estimated Value=-1.000, N=88, Prob=0.001
  -> Expected Value of this state: -0.500

