# Solving Poker using CFR

In [1]:
%load_ext autoreload
%autoreload 2

In [143]:
import math
import os
import pickle
import random
from copy import copy, deepcopy

import numpy as np
import rlcard
from pprint import pprint
from rlcard import models
from rlcard.agents import RandomAgent
from rlcard.agents.best_response_agent import BRAgent
from rlcard.core import Card
from rlcard.utils import tournament
from tqdm.notebook import tqdm, trange

from env import LeducholdemEnv

In [134]:
HANDS = ["SJ", "SQ", "SK", "HJ", "HQ", "HK"]
HAND2IDX = {hand: i for i, hand in enumerate(HANDS)}


def get_payoff(env, hand, our_id):
    orig_hand = copy(env.game.players[1 - our_id].hand)
    env.game.players[1 - our_id].hand = Card(suit=hand[0], rank=hand[1])
    payoffs = env.game.get_payoffs()
    env.game.players[1 - our_id].hand = orig_hand
    return payoffs[our_id]


def best_response_value(env, agent, our_id, num_sims, reach_probs):
    if env.is_over():
        # Fix reach probabilities by removing impossible entries,
        # in which our card is the same as the opponent's or the
        # public one.
        info = env.get_perfect_information()
        our_hand = info["hand_cards"][our_id]
        our_hand_idx = HAND2IDX[our_hand]
        reach_probs[our_hand_idx] = 0.0
        ri = np.array(reach_probs)
        if info["public_card"]:
            public_card = info["public_card"]
            public_card_idx = HAND2IDX[public_card]
            reach_probs[public_card_idx] = 0.0

        if np.sum(reach_probs) == 0:
            return 0.0
        
        reach_probs /= np.sum(reach_probs)
        
        # Compute expected value.
        ev = 0.0
        for hand, reach_prob in zip(HANDS, reach_probs):
            ev += reach_prob * get_payoff(env, hand, our_id)
        return ev

    curr_player = env.get_player_id()
    state = env.get_state(curr_player)
    legal_actions = state["legal_actions"]

    if our_id == curr_player:
        values = np.zeros(env.action_num)
        for a in legal_actions:
            # First, we check if this action runs into a chance node.
            round_before = env.get_perfect_information()["current_round"]
            env.step(a)
            round_after = env.get_perfect_information()["current_round"]
            is_chance_node = round_before != round_after and round_after == 1
            env.step_back()

            # Then, we compute the expected value of our action.
            action_evs = []
            for _ in range(num_sims if is_chance_node else 1):
                env.step(a)
                v = best_response_value(env, agent, our_id, num_sims, reach_probs)
                action_evs.append(v)
                env.step_back()
            values[a] = np.mean(action_evs)
            return np.max(values)
    else:
        # Compute probability of taking action with each holding in the
        # opponent's infoset.
        probs = np.zeros((6, 4))
        for i, hand in enumerate(HANDS):
            alt_state = deepcopy(state)
            alt_state["raw_obs"]["hand"] = hand
            _, prob = agent.eval_step(alt_state)
            probs[i] = prob

        # Recursively compute expected value of state for player with our_id.
        values = np.zeros(env.action_num)
        for a in legal_actions:
            # First, we check if this action runs into a chance node.
            round_before = env.get_perfect_information()["current_round"]
            env.step(a)
            round_after = env.get_perfect_information()["current_round"]
            is_chance_node = round_before != round_after and round_after == 1
            env.step_back()

            # Then, we compute the expected value of our action.
            action_evs = []
            for _ in range(num_sims if is_chance_node else 1):
                env.step(a)
                v = best_response_value(env, agent, our_id, num_sims, (probs.T)[a] * reach_probs)
                action_evs.append(v)
                env.step_back()
            values[a] = np.mean(action_evs)
        _, p = agent.eval_step(state)
        ev = sum(p[a] * values[a] for a in legal_actions)
        return ev
        

def exploitability(env, agent, our_id=0, num_sims=100):
    values = []
    for _ in range(num_sims):
        env.reset()
        v = best_response_value(env, agent, our_id, num_sims, reach_probs=np.ones(6))
        values.append(v)
    return values

In [139]:
env = LeducholdemEnv(config={"allow_step_back": True, "allow_raw_data": True, "record_action": True})
values = exploitability(env, RandomAgent(4), our_id=0, num_sims=50)
print(f"Value: {np.mean(values)} +- {np.std(values)}")

Value: 1.1722185185185183 +- 0.22524038519952083


In [38]:
random_agent = RandomAgent(4)
br_agent = BRAgent(env, random_agent)
env.set_agents([br_agent, random_agent])

curr_player = env.get_player_id()
br_agent.value(curr_player, env.get_state(curr_player), 0)

-0.16666666666666666

In [28]:
env = LeducholdemEnv(config={"allow_step_back": True, "allow_raw_data": True, "record_action": True})
env.reset()

({'legal_actions': [0, 1, 2],
  'obs': array([0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0.]),
  'raw_obs': {'hand': 'SK',
   'public_card': None,
   'all_chips': [1, 2],
   'my_chips': 1,
   'legal_actions': ['call', 'raise', 'fold'],
   'current_player': 0},
  'raw_legal_actions': ['call', 'raise', 'fold'],
  'action_record': []},
 0)

In [44]:
env.get_state(curr_player)["raw_obs"]["hand"]

'SK'

In [55]:
env.get_state(0)["raw_obs"]["hand"]

'SK'

In [30]:
env.get_perfect_information()

{'chips': [1, 2],
 'public_card': None,
 'hand_cards': ['SK', 'HJ'],
 'current_round': 0,
 'current_player': 0,
 'legal_actions': ['call', 'raise', 'fold']}

-0.16666666666666666

In [None]:
exploitability(env, RandomAgent(4), )

In [10]:
def make_infoset_string(state):
    private_card = state["raw_obs"]["hand"]
    public_card = state["raw_obs"]["public_card"] or "Unknown"
    action_history = ":".join([record[1] for record in state["action_record"]])
    return f"{private_card}|{public_card}|{action_history}"

In [146]:
class PureCFRAgent:
    def __init__(self, env):
        self.env = env
        self.regrets = {}
        self.avg_strategy = {}
        self.strategy = {}
        self.use_raw = False
        self.iteration = 0
        self.next_traverser = 0
    
    def regret_matching(self, infoset, legal_actions):
        if infoset not in self.regrets or np.all(self.regrets[infoset] <= 0):
            action_probs = np.zeros(self.env.action_num)
            action_probs[legal_actions] = 1.0 / len(legal_actions)
            return action_probs
        pos_regrets = np.maximum(self.regrets[infoset], 0)
        return pos_regrets / np.sum(pos_regrets)

    def train_step(self):
        self.iteration += 1
        self.env.reset()
        self.pure_cfr()
        self.next_traverser = 1 - self.next_traverser
        for infoset in self.avg_strategy:
            self.strategy[infoset] = self.avg_strategy[infoset] / np.sum(self.avg_strategy[infoset])
        for infoset in self.regrets:
            self.regrets[infoset] = np.maximum(self.regrets[infoset], 0)
    
    def pure_cfr(self):
        player = self.next_traverser
        if self.env.is_over():
            payoffs = self.env.get_payoffs()
            return payoffs[player]

        # Get state information.
        curr_player = self.env.get_player_id()
        state = self.env.get_state(curr_player)
        infoset = make_infoset_string(state)
        legal_actions = state["legal_actions"]

        # Compute action probabilities proportional to positive regrets.
        action_probs = self.regret_matching(infoset, legal_actions)

        assert np.isclose(np.sum(action_probs), 1.0), (
            f"Sum of action probs must be approx. 1 but is {np.sum(action_probs)}. "
            f"Regrets: {self.regrets[infoset]}."
        )
        illegal_actions = ~np.in1d(range(self.env.action_num), legal_actions)
        assert np.all(action_probs[illegal_actions] == 0), (
            f"Probability of illegal actions must be 0. "
            f"Legal actions: {legal_actions}. "
            f"Action probs: {action_probs}."
        )

        # Sample random action.
        action = np.random.choice(range(self.env.action_num), p=action_probs)

        if curr_player == player:
            if infoset not in self.regrets:
                self.regrets[infoset] = np.zeros(self.env.action_num)

            # Compute action values.
            action_values = np.zeros(self.env.action_num)
            for a in legal_actions:
                self.env.step(a)
                action_values[a] = self.pure_cfr()
                self.env.step_back()

            # Update regrets.
            for a in legal_actions:
                self.regrets[infoset][a] += action_values[a] - action_values[action]

            return action_values[action]
        else:
            if infoset not in self.avg_strategy:
                self.avg_strategy[infoset] = np.zeros(env.action_num)

            # Update average strategy.
            self.avg_strategy[infoset][action] += self.iteration

            # Recurse.
            self.env.step(action)
            value = self.pure_cfr()
            self.env.step_back()
            return value

    def eval_step(self, state):
        infoset = make_infoset_string(state)
        if infoset not in self.strategy:
            legal_actions = state["legal_actions"]
            action_probs = np.zeros(env.action_num)
            action_probs[legal_actions] = 1.0
            action_probs /= len(legal_actions)
        else:
            action_probs = self.strategy[infoset]
        action = np.random.choice(range(self.env.action_num), p=action_probs)
        return action, action_probs

In [147]:
NUM_EPOCHS = 250000
EVAL_INTERVAL = 10000
EVAL_ITERS = 10000
SAVE_DIR = "./agents"
SAVE_FILE = "cfr_model.pkl"
SAVE_PATH = os.path.join(SAVE_DIR, SAVE_FILE)

os.makedirs(SAVE_DIR, exist_ok=True)

env = LeducholdemEnv(config={"allow_step_back": True, "allow_raw_data": True, "record_action": True})
eval_env_1 = LeducholdemEnv(config={"allow_step_back": False, "allow_raw_data": True, "record_action": True})
eval_env_2 = LeducholdemEnv(config={"allow_step_back": True, "allow_raw_data": True, "record_action": True})

agent = PureCFRAgent(env)

# eval_agent = models.load("leduc-holdem-cfr").agents[0]
eval_agent = models.load("leduc-holdem-nfsp").agents[0]
# eval_agent = RandomAgent(4)
eval_env_1.set_agents([agent, eval_agent])

for epoch in trange(1, NUM_EPOCHS + 1):
    agent.train_step()
    if epoch % EVAL_INTERVAL == 0:
        avg_reward = tournament(eval_env_1, EVAL_ITERS)[0]
        expl = exploitability(eval_env_2, agent, our_id=0, num_sims=50)
        print(f"Epoch: {epoch} \t Avg reward: {avg_reward} \t Exploitability: {np.mean(expl)} +- {np.std(expl)}")
        pickle.dump(agent, open(SAVE_PATH, "wb"))

INFO:tensorflow:Restoring parameters from /Users/Christopher/.pyenv/versions/3.7.9/lib/python3.7/site-packages/rlcard/models/pretrained/leduc_holdem_nfsp/model


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=250000.0), HTML(value='')))

Epoch: 10000 	 Avg reward: 0.309 	 Exploitability: 0.9473901417690601 +- 0.29251010090782487
Epoch: 20000 	 Avg reward: 0.37095 	 Exploitability: 0.9787824796071202 +- 0.3538245335588527
Epoch: 30000 	 Avg reward: 0.39585 	 Exploitability: 1.0071904886563905 +- 0.3388894660223854



KeyboardInterrupt: 

In [None]:
import rlcard
from rlcard import models
from rlcard.agents import LeducholdemHumanAgent as HumanAgent
from rlcard.utils import print_card

# Make environment
# Set 'record_action' to True because we need it to print results
env = rlcard.make('leduc-holdem', config={'record_action': True})
human_agent = HumanAgent(env.action_num)
env.set_agents([human_agent, agent])

print(">> Leduc Hold'em pre-trained model")

while True:
    print(">> Start a new game")

    trajectories, payoffs = env.run(is_training=False)
    # If the human does not take the final action, we need to
    # print other players action
    final_state = trajectories[0][-1][-2]
    action_record = final_state['action_record']
    state = final_state['raw_obs']
    _action_list = []
    for i in range(1, len(action_record)+1):
        if action_record[-i][0] == state['current_player']:
            break
        _action_list.insert(0, action_record[-i])
    for pair in _action_list:
        print('>> Player', pair[0], 'chooses', pair[1])

    # Let's take a look at what the agent card is
    print('===============     CFR Agent    ===============')
    print_card(env.get_perfect_information()['hand_cards'][1])

    print('===============     Result     ===============')
    if payoffs[0] > 0:
        print('You win {} chips!'.format(payoffs[0]))
    elif payoffs[0] == 0:
        print('It is a tie.')
    else:
        print('You lose {} chips!'.format(-payoffs[0]))
    print('')

    input("Press any key to continue...")

>> Leduc Hold'em pre-trained model
>> Start a new game

┌─────────┐
│░░░░░░░░░│
│░░░░░░░░░│
│░░░░░░░░░│
│░░░░░░░░░│
│░░░░░░░░░│
│░░░░░░░░░│
│░░░░░░░░░│
└─────────┘
┌─────────┐
│Q        │
│         │
│         │
│    ♥    │
│         │
│         │
│        Q│
└─────────┘
Yours:   +
Agent 1: ++
0: call, 1: raise, 2: fold

>> You choose action (integer): 0
>> Player 1 chooses raise

┌─────────┐
│░░░░░░░░░│
│░░░░░░░░░│
│░░░░░░░░░│
│░░░░░░░░░│
│░░░░░░░░░│
│░░░░░░░░░│
│░░░░░░░░░│
└─────────┘
┌─────────┐
│Q        │
│         │
│         │
│    ♥    │
│         │
│         │
│        Q│
└─────────┘
Yours:   ++
Agent 1: ++++
0: call, 1: raise, 2: fold

>> You choose action (integer): 0
>> Player 1 chooses check

┌─────────┐
│J        │
│         │
│         │
│    ♥    │
│         │
│         │
│        J│
└─────────┘
┌─────────┐
│Q        │
│         │
│         │
│    ♥    │
│         │
│         │
│        Q│
└─────────┘
Yours:   ++++
Agent 1: ++++
0: raise, 1: fold, 2: check

>> You choos

In [None]:
eval_env = rlcard.make("leduc-holdem", config={"seed": SEED, "allow_step_back": True})
eval_env.set_agents([models.load("leduc-holdem-cfr").agents[0], models.load("leduc-holdem-nfsp").agents[0]])
print(tournament(eval_env, 100000))

In [None]:
agent.regrets

In [None]:
env = rlcard.make(
    "leduc-holdem",
    config={
        "allow_step_back": True,
        "allow_raw_data": True,
    },
)
env.reset()
env.get_state(0)

In [None]:
env.get_perfect_information()

In [None]:
env.set_agents([RandomAgent(env.action_num), RandomAgent(env.action_num)])

In [None]:
env.reset()

In [None]:
env.get_state(0)

In [None]:
env.step(0)

In [None]:
def make_agent(player_id):
    return PureCFRAgent(player_id, strategy=avg_strategy)

exploitability(env, agent, 50)

In [None]:
def make_agent(player_id):
    return RandomAgent(action_num=env.action_num)

exploitability(env, make_agent, 25)

In [None]:
eval_env = rlcard.make("leduc-holdem", config={"seed": SEED})

In [None]:
cfr_agent = PureCFRAgent(player_id=0, strategy=avg_strategy)
random_agent = RandomAgent(env.action_num)

eval_env.set_agents([cfr_agent, random_agent])

for i in range(1000):
    eval_env.reset()
    trajectories, payoffs = eval_env.run(is_training=False)

In [None]:
len(cfr_agent.unseen_states)

In [None]:
avg_strategy

In [None]:
class MCCFRAgent:
    def __init__(self):
        pass
    
    def train(self, num_iter):
        pass
    
    def eval_step(self, state):
        pass