<a href="https://colab.research.google.com/github/zhenbangt/aa228_final_project/blob/main/MCCFR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Solving Poker using MCCFR

In [1]:
!ls

env.py	sample_data


In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
!pip install -q rlcard
!pip install -q rlcard[tensorflow]
!pip install -q tqdm

[K     |████████████████████████████████| 6.7MB 9.0MB/s 
[?25h  Building wheel for rlcard (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 110.5MB 36kB/s 
[K     |████████████████████████████████| 51kB 8.0MB/s 
[K     |████████████████████████████████| 3.8MB 52.4MB/s 
[K     |████████████████████████████████| 512kB 45.5MB/s 
[?25h  Building wheel for gast (setup.py) ... [?25l[?25hdone
[31mERROR: tensorflow-probability 0.11.0 has requirement gast>=0.3.2, but you'll have gast 0.2.2 which is incompatible.[0m


In [4]:
import math
import os
import pickle
import random
from copy import copy, deepcopy

import numpy as np
import rlcard
from pprint import pprint
from rlcard import models
from rlcard.agents import RandomAgent
from rlcard.agents.best_response_agent import BRAgent
from rlcard.core import Card
from rlcard.utils import tournament
from tqdm.notebook import tqdm, trange

from env import LeducholdemEnv

In [5]:
HANDS = ["SJ", "SQ", "SK", "HJ", "HQ", "HK"]
HAND2IDX = {hand: i for i, hand in enumerate(HANDS)}


def get_payoff(env, hand, our_id):
    orig_hand = copy(env.game.players[1 - our_id].hand)
    env.game.players[1 - our_id].hand = Card(suit=hand[0], rank=hand[1])
    payoffs = env.game.get_payoffs()
    env.game.players[1 - our_id].hand = orig_hand
    return payoffs[our_id]


def best_response_value(env, agent, our_id, num_sims, reach_probs):
    if env.is_over():
        # Fix reach probabilities by removing impossible entries,
        # in which our card is the same as the opponent's or the
        # public one.
        info = env.get_perfect_information()
        our_hand = info["hand_cards"][our_id]
        our_hand_idx = HAND2IDX[our_hand]
        reach_probs[our_hand_idx] = 0.0
        ri = np.array(reach_probs)
        if info["public_card"]:
            public_card = info["public_card"]
            public_card_idx = HAND2IDX[public_card]
            reach_probs[public_card_idx] = 0.0

        if np.sum(reach_probs) == 0:
            return 0.0
        
        reach_probs /= np.sum(reach_probs)
        
        # Compute expected value.
        ev = 0.0
        for hand, reach_prob in zip(HANDS, reach_probs):
            ev += reach_prob * get_payoff(env, hand, our_id)
        return ev

    curr_player = env.get_player_id()
    state = env.get_state(curr_player)
    legal_actions = state["legal_actions"]

    if our_id == curr_player:
        values = np.zeros(env.action_num)
        for a in legal_actions:
            # First, we check if this action runs into a chance node.
            round_before = env.get_perfect_information()["current_round"]
            env.step(a)
            round_after = env.get_perfect_information()["current_round"]
            is_chance_node = round_before != round_after and round_after == 1
            env.step_back()

            # Then, we compute the expected value of our action.
            action_evs = []
            for _ in range(num_sims if is_chance_node else 1):
                env.step(a)
                v = best_response_value(env, agent, our_id, num_sims, reach_probs)
                action_evs.append(v)
                env.step_back()
            values[a] = np.mean(action_evs)
            return np.max(values)
    else:
        # Compute probability of taking action with each holding in the
        # opponent's infoset.
        probs = np.zeros((6, 4))
        for i, hand in enumerate(HANDS):
            alt_state = deepcopy(state)
            alt_state["raw_obs"]["hand"] = hand
            _, prob = agent.eval_step(alt_state)
            probs[i] = prob

        # Recursively compute expected value of state for player with our_id.
        values = np.zeros(env.action_num)
        for a in legal_actions:
            # First, we check if this action runs into a chance node.
            round_before = env.get_perfect_information()["current_round"]
            env.step(a)
            round_after = env.get_perfect_information()["current_round"]
            is_chance_node = round_before != round_after and round_after == 1
            env.step_back()

            # Then, we compute the expected value of our action.
            action_evs = []
            for _ in range(num_sims if is_chance_node else 1):
                env.step(a)
                v = best_response_value(env, agent, our_id, num_sims, (probs.T)[a] * reach_probs)
                action_evs.append(v)
                env.step_back()
            values[a] = np.mean(action_evs)
        _, p = agent.eval_step(state)
        ev = sum(p[a] * values[a] for a in legal_actions)
        return ev
        

def exploitability(env, agent, our_id=0, num_sims=100):
    values = []
    for _ in range(num_sims):
        env.reset()
        v = best_response_value(env, agent, our_id, num_sims, reach_probs=np.ones(6))
        values.append(v)
    return values

In [8]:
def make_infoset_string(state):
    private_card = state["raw_obs"]["hand"]
    public_card = state["raw_obs"]["public_card"] or "Unknown"
    action_history = ":".join([record[1] for record in state["action_record"]])
    return f"{private_card}|{public_card}|{action_history}"

In [11]:
class MCCFR_ES_Agent:
    def __init__(self, env, _update_iterations=100):
        self.env = env
        self.regrets = {}
        self.avg_strategy = {}
        self.strategy = {}
        self.use_raw = False
        self.iteration = 0
        self.next_traverser = 0
        self._update_iterations = _update_iterations
    
    def regret_matching(self, infoset, legal_actions):
        if infoset not in self.regrets or np.all(self.regrets[infoset] <= 0):
            action_probs = np.zeros(self.env.action_num)
            action_probs[legal_actions] = 1.0 / len(legal_actions)
            return action_probs
        pos_regrets = np.maximum(self.regrets[infoset], 0)
        return pos_regrets / np.sum(pos_regrets)

    # def update_cumulative_profile(self):
    #     curr_player = self.env.get_player_id()
    #     if self.env.is_over():
    #         return
    #     state = self.env.get_state(curr_player)
    #     infoset = make_infoset_string(state)
    #     legal_actions = state["legal_actions"]

    #     if curr_player != self.next_traverser:
    #         for action in legal_actions:
    #             self.env.step(action)
    #             self.update_cumulative_profile()
    #             self.env.step_back()

    #     elif curr_player==self.next_traverser:
    #         action_probs = self.regret_matching(infoset, legal_actions)
    #         if infoset not in self.avg_strategy:
    #             self.avg_strategy[infoset] = np.zeros(env.action_num)
    #         for action in legal_actions:
    #             self.avg_strategy[infoset][action] += action_probs[action]
    #         action = np.random.choice(range(self.env.action_num), p=action_probs)
    #         self.env.step(action)
    #         self.update_cumulative_profile()
            
    def train_step(self):
        self.iteration += 1
        self.env.reset()
        # to make it fare also alternate player like purecfr
        self.walk_tree()

        # _player = self.next_traverser
        # for i in range(2):
        #   if self.iteration % self._update_iterations==0:
        #     self.next_traverser = i
        #     self.env.reset()
        #     self.update_cumulative_profile()
        # self.next_traverser = _player


        self.next_traverser = 1 - self.next_traverser
        for infoset in self.avg_strategy:
            self.strategy[infoset] = self.avg_strategy[infoset] / np.sum(self.avg_strategy[infoset])
        for infoset in self.regrets:
            self.regrets[infoset] = np.maximum(self.regrets[infoset], 0)
    
    def walk_tree(self):
        player = self.next_traverser
        if self.env.is_over():
            payoffs = self.env.get_payoffs()
            return payoffs[player]

        # Get state information.
        curr_player = self.env.get_player_id()
        state = self.env.get_state(curr_player)
        infoset = make_infoset_string(state)
        legal_actions = state["legal_actions"]
        # Compute action probabilities proportional to positive regrets.
        action_probs = self.regret_matching(infoset, legal_actions)

        ##########################################################
        ##########################################################
        # assert np.isclose(np.sum(action_probs), 1.0), (
        #     f"Sum of action probs must be approx. 1 but is {np.sum(action_probs)}. "
        #     f"Regrets: {self.regrets[infoset]}."
        # )
        # illegal_actions = ~np.in1d(range(self.env.action_num), legal_actions)
        # assert np.all(action_probs[illegal_actions] == 0), (
        #     f"Probability of illegal actions must be 0. "
        #     f"Legal actions: {legal_actions}. "
        #     f"Action probs: {action_probs}."
        # )
        ##########################################################
        ##########################################################

        if infoset not in self.regrets:
            self.regrets[infoset] = np.zeros(self.env.action_num)
        if infoset not in self.avg_strategy:
            self.avg_strategy[infoset] = np.zeros(env.action_num)

        if curr_player != player:
            for action in legal_actions:
                self.avg_strategy[infoset][action] += action_probs[action]
            # Sample action.
            action = np.random.choice(range(self.env.action_num), p=action_probs)
            env.step(action)
            return self.walk_tree()


        # Compute action values.
        action_values = np.zeros(self.env.action_num)
        for action in legal_actions:
            self.env.step(action)
            action_values[action] = self.walk_tree()
            self.env.step_back()

        expected_value = np.dot(action_values, action_probs)
        # Update regrets.
        for action in legal_actions:
            self.regrets[infoset][action] += action_values[action] - expected_value
            self.avg_strategy[infoset][action] += action_probs[action]
        return expected_value

    def eval_step(self, state):
        infoset = make_infoset_string(state)
        if infoset not in self.strategy:
            legal_actions = state["legal_actions"]
            action_probs = np.zeros(env.action_num)
            action_probs[legal_actions] = 1.0
            action_probs /= len(legal_actions)
        else:
            action_probs = self.strategy[infoset]
        action = np.random.choice(range(self.env.action_num), p=action_probs)
        return action, action_probs

In [13]:
NUM_EPOCHS = 25000
EVAL_INTERVAL = 1000
EVAL_ITERS = 1000
SAVE_DIR = "./agents"
SAVE_FILE = "mccfr_model.pkl"
SAVE_PATH = os.path.join(SAVE_DIR, SAVE_FILE)

os.makedirs(SAVE_DIR, exist_ok=True)

env = LeducholdemEnv(config={"allow_step_back": True, "allow_raw_data": True, "record_action": True})
eval_env_1 = LeducholdemEnv(config={"allow_step_back": True, "allow_raw_data": True, "record_action": True})
eval_env_2 = LeducholdemEnv(config={"allow_step_back": False, "allow_raw_data": True, "record_action": True})
eval_env_3 = LeducholdemEnv(config={"allow_step_back": False, "allow_raw_data": True, "record_action": True})
eval_env_4 = LeducholdemEnv(config={"allow_step_back": False, "allow_raw_data": True, "record_action": True})

agent = MCCFR_ES_Agent(env)
random_agent = RandomAgent(4)
nfsp_agent = models.load("leduc-holdem-nfsp").agents[0]
cfr_agent = models.load("leduc-holdem-cfr").agents[0]

eval_env_2.set_agents([agent, random_agent])
eval_env_3.set_agents([agent, nfsp_agent])
eval_env_4.set_agents([agent, cfr_agent])

random_rewards = []
nfsp_rewards = []
cfr_rewards = []
for epoch in trange(1, NUM_EPOCHS+1):
    agent.train_step()
    if epoch % EVAL_INTERVAL == 0:
        expl = exploitability(eval_env_1, agent, our_id=0, num_sims=50)

        vs_random = tournament(eval_env_2, EVAL_ITERS)[0]
        vs_nfsp = tournament(eval_env_3, EVAL_ITERS)[0]
        vs_cfr = tournament(eval_env_4, EVAL_ITERS)[0]
        
        random_rewards.append(vs_random)
        nfsp_rewards.append(nfsp_rewards)
        cfr_rewards.append(vs_cfr)

        print(f"Epoch: {epoch} \t vs random: {vs_random} \t vs nfsp: {vs_nfsp} \t vs cfr: {vs_cfr} \t exp: {np.mean(expl)} +- {np.std(expl)}")
        pickle.dump(agent, open(SAVE_PATH, "wb"))

INFO:tensorflow:Restoring parameters from /usr/local/lib/python3.6/dist-packages/rlcard/models/pretrained/leduc_holdem_nfsp/model


HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))

Epoch: 1000 	 vs random: 0.1905 	 vs nfsp: -0.944 	 vs cfr: -0.7315 	 exp: 1.3671587633797457 +- 0.305916779394931
Epoch: 2000 	 vs random: 0.305 	 vs nfsp: -0.7955 	 vs cfr: -0.54 	 exp: 1.484202335030032 +- 0.26811065387590716
Epoch: 3000 	 vs random: 0.4485 	 vs nfsp: -0.7015 	 vs cfr: -0.6425 	 exp: 1.4892328413696723 +- 0.28255344907887464
Epoch: 4000 	 vs random: 0.452 	 vs nfsp: -0.7485 	 vs cfr: -0.4965 	 exp: 1.5077179803288778 +- 0.2515654630381018
Epoch: 5000 	 vs random: 0.385 	 vs nfsp: -0.607 	 vs cfr: -0.4915 	 exp: 1.4810275344322588 +- 0.21826057973157298
Epoch: 6000 	 vs random: 0.267 	 vs nfsp: -0.638 	 vs cfr: -0.6045 	 exp: 1.5191662985121581 +- 0.1693935180404285
Epoch: 7000 	 vs random: 0.408 	 vs nfsp: -0.669 	 vs cfr: -0.515 	 exp: 1.5465019319304865 +- 0.21573901804389864
Epoch: 8000 	 vs random: 0.4295 	 vs nfsp: -0.4835 	 vs cfr: -0.559 	 exp: 1.5358425177416672 +- 0.17142714776517334
Epoch: 9000 	 vs random: 0.382 	 vs nfsp: -0.5445 	 vs cfr: -0.3875 	 exp:

In [None]:
import rlcard
from rlcard import models
from rlcard.agents import LeducholdemHumanAgent as HumanAgent
from rlcard.utils import print_card

# Make environment
# Set 'record_action' to True because we need it to print results
env = rlcard.make('leduc-holdem', config={'record_action': True})
human_agent = HumanAgent(env.action_num)
env.set_agents([human_agent, agent])

print(">> Leduc Hold'em pre-trained model")

while True:
    print(">> Start a new game")

    trajectories, payoffs = env.run(is_training=False)
    # If the human does not take the final action, we need to
    # print other players action
    final_state = trajectories[0][-1][-2]
    action_record = final_state['action_record']
    state = final_state['raw_obs']
    _action_list = []
    for i in range(1, len(action_record)+1):
        if action_record[-i][0] == state['current_player']:
            break
        _action_list.insert(0, action_record[-i])
    for pair in _action_list:
        print('>> Player', pair[0], 'chooses', pair[1])

    # Let's take a look at what the agent card is
    print('===============     CFR Agent    ===============')
    print_card(env.get_perfect_information()['hand_cards'][1])

    print('===============     Result     ===============')
    if payoffs[0] > 0:
        print('You win {} chips!'.format(payoffs[0]))
    elif payoffs[0] == 0:
        print('It is a tie.')
    else:
        print('You lose {} chips!'.format(-payoffs[0]))
    print('')

    input("Press any key to continue...")

>> Leduc Hold'em pre-trained model
>> Start a new game

┌─────────┐
│░░░░░░░░░│
│░░░░░░░░░│
│░░░░░░░░░│
│░░░░░░░░░│
│░░░░░░░░░│
│░░░░░░░░░│
│░░░░░░░░░│
└─────────┘
┌─────────┐
│J        │
│         │
│         │
│    ♠    │
│         │
│         │
│        J│
└─────────┘
Yours:   +
Agent 1: ++
0: call, 1: raise, 2: fold

>> You choose action (integer): 0
>> Player 1 chooses check

┌─────────┐
│Q        │
│         │
│         │
│    ♠    │
│         │
│         │
│        Q│
└─────────┘
┌─────────┐
│J        │
│         │
│         │
│    ♠    │
│         │
│         │
│        J│
└─────────┘
Yours:   ++
Agent 1: ++
0: raise, 1: fold, 2: check

>> You choose action (integer): 2
>> Player 1 chooses raise

┌─────────┐
│Q        │
│         │
│         │
│    ♠    │
│         │
│         │
│        Q│
└─────────┘
┌─────────┐
│J        │
│         │
│         │
│    ♠    │
│         │
│         │
│        J│
└─────────┘
Yours:   ++
Agent 1: ++++++
0: call, 1: raise, 2: fold

>> You choose 

KeyboardInterrupt: Interrupted by user

In [None]:
eval_env = rlcard.make("leduc-holdem", config={"seed": SEED, "allow_step_back": True})
eval_env.set_agents([models.load("leduc-holdem-cfr").agents[0], models.load("leduc-holdem-nfsp").agents[0]])
print(tournament(eval_env, 100000))

In [None]:
class MCCFRAgent:
    def __init__(self):
        pass
    
    def train(self, num_iter):
        pass
    
    def eval_step(self, state):
        pass