<a href="https://colab.research.google.com/github/zhenbangt/aa228_final_project/blob/main/colab-testing-mccfr.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



# <a href='https://github.com/datamllab/rlcard'> <center> <img src='https://miro.medium.com/max/1000/1*_9abDpNTM9Cbsd2HEXYm9Q.png' width=500 class='center' /></a> 

## **Training CFR on Leduc Hold'em**
To show how we can use `step` and `step_back` to traverse the game tree, we provide an example of solving Leduc Hold'em with CFR:

* First, we install RLCard and TensorFlow. To use the TensorFlow implementations of the example algorithms, we recommend installing the supported version of TensorFlow via `rlcard[tensorflow]`.

In [2]:
!pip install -q rlcard 
!pip install -q rlcard[tensorflow]

[K     |████████████████████████████████| 6.7MB 12.1MB/s 
[?25h  Building wheel for rlcard (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 110.5MB 54kB/s 
[K     |████████████████████████████████| 51kB 8.5MB/s 
[K     |████████████████████████████████| 512kB 53.7MB/s 
[K     |████████████████████████████████| 3.8MB 60.3MB/s 
[?25h  Building wheel for gast (setup.py) ... [?25l[?25hdone
[31mERROR: tensorflow-probability 0.11.0 has requirement gast>=0.3.2, but you'll have gast 0.2.2 which is incompatible.[0m


In [4]:
import numpy as np

import rlcard
from rlcard.agents import CFRAgent
from rlcard import models
from rlcard.utils import set_global_seed, tournament
from rlcard.utils import Logger
import collections

import os
import pickle

from rlcard.utils.utils import *

# Build two Leduc Hold'em environments that share the same seed:
#   * `env` is created with 'allow_step_back' switched on, which the agent
#     needs in order to call `step_back()` while walking the game tree;
#   * `eval_env` is a separate environment created without that option.
env = rlcard.make(
    'leduc-holdem',
    config=dict(seed=0, allow_step_back=True),
)
eval_env = rlcard.make('leduc-holdem', config=dict(seed=0))

In [19]:
class MCCFRAgent:
    """Sampling-based counterfactual-regret agent for an RLCard environment.

    During a traversal the player being updated enumerates every legal
    action, while every other player's action is sampled once from the
    current policy (this looks like external-sampling MCCFR -- confirm
    against the intended algorithm before relying on that name).

    Attributes:
        use_raw (bool): Flag set to False in the constructor for the
            RLCard env.
        env: The environment; must have been created with step-back enabled
            because the traversal calls ``env.step_back()``.
        model_path (str): Directory used by ``save`` / ``load``.
        policy (dict): infoset key -> current action probabilities.
        average_policy (dict): infoset key -> accumulated (unnormalised)
            action weights; normalised when read through ``action_probs``.
        regrets (dict): infoset key -> accumulated action regrets.
        iteration (int): Number of completed calls to ``train``.
    """

    # (file name, attribute name) pairs persisted by ``save`` / ``load``.
    _CHECKPOINT_FILES = (
        ("policy.pkl", "policy"),
        ("average_policy.pkl", "average_policy"),
        ("regrets.pkl", "regrets"),
        ("iteration.pkl", "iteration"),
    )

    def __init__(self, env, model_path="./cfr_model"):
        """Create an untrained agent.

        Args:
            env: Simulator from RLCard.
            model_path (str): Directory where checkpoints are written/read.
        """
        self.use_raw = False  # use_raw is indicator to the rlcard env
        self.env = env
        self.model_path = model_path

        # A policy is a dict state_str -> action probabilities
        self.policy = collections.defaultdict(list)
        # Plain dicts: every access below is guarded by an explicit
        # membership test. The previous ``defaultdict(np.array)`` factory
        # could never work, since ``np.array()`` needs an argument and so
        # raised TypeError on any missing key.
        self.average_policy = {}
        # Regret is a dict state_str -> action regrets
        self.regrets = {}
        self.iteration = 0

    def train(self):
        """Run one training iteration.

        Traverses the game tree once per player (resetting the environment
        before each traversal) to accumulate regrets, then recomputes the
        current policy from those regrets.
        """
        self.iteration += 1
        # Regrets are recorded as a side effect of the traversal.
        for player_id in range(self.env.player_num):
            self.env.reset()
            probs = np.ones(self.env.player_num)
            self.traverse_tree(probs, player_id)

        self.update_policy()

    def traverse_tree(self, probs, player_id):
        """Traverse the game tree, update the regrets.

        Args:
            probs (numpy.array): Per-player reach along the current path.
                Only the entry of the acting, updated player is ever
                multiplied here; entries for sampled players stay unchanged.
            player_id (int): The player whose regrets are being updated.

        Returns:
            The utilities for all players: the environment's payoffs at a
            terminal node, the sampled child's utilities at another player's
            node, or the policy-weighted expectation at the updated player's
            node.
        """
        if self.env.is_over():
            return self.env.get_payoffs()

        # Determine whose turn it is.
        current_player = self.env.get_player_id()
        obs, legal_actions = self.get_state(current_player)
        # Current policy restricted to the legal actions.
        action_probs = self.action_probs(obs, legal_actions, self.policy)

        if current_player != player_id:
            # Sample a single action for the other player. The global NumPy
            # RNG is used (as in ``eval_step``) so that seeding it makes the
            # traversal reproducible; a fresh ``default_rng()`` per node is
            # unseeded and ignores the global seed.
            action = np.random.choice(self.env.action_num, p=action_probs)
            self.env.step(action)
            utility = self.traverse_tree(probs, player_id)
            self.env.step_back()

            if obs not in self.average_policy:
                self.average_policy[obs] = np.zeros(self.env.action_num)
            # Weight the sampled action by the iteration number.
            self.average_policy[obs][action] += self.iteration
            return utility

        # The current player is the one being updated: try every legal action.
        action_utilities = {}
        state_utility = np.zeros(self.env.player_num)
        for action in legal_actions:
            action_prob = action_probs[action]
            new_probs = np.copy(probs)
            new_probs[current_player] *= action_prob

            # Visit the child state, then restore the environment.
            self.env.step(action)
            utility = self.traverse_tree(new_probs, player_id)
            self.env.step_back()

            # Expected state utility (vectorised over all players).
            state_utility += action_prob * utility
            # Remember the per-action utility vector (the "Q-value").
            action_utilities[action] = utility

        player_prob = probs[current_player]  # the player's own reach
        player_state_utility = state_utility[current_player]

        # First visit of this information set: allocate its tables.
        if obs not in self.regrets:
            self.regrets[obs] = np.zeros(self.env.action_num)
        if obs not in self.average_policy:
            self.average_policy[obs] = np.zeros(self.env.action_num)

        for action in legal_actions:
            action_prob = action_probs[action]
            # Regret is the plain advantage of the action over the state
            # value. The other players' entries of ``probs`` are never
            # changed during a traversal, so a reach weighting over them
            # would always be 1 and is omitted.
            regret = action_utilities[action][current_player] - player_state_utility
            self.regrets[obs][action] += regret
            self.average_policy[obs][action] += self.iteration * player_prob * action_prob
        return state_utility

    def update_policy(self):
        """Apply regret matching to every infoset that has regrets."""
        for obs in self.regrets:
            self.policy[obs] = self.regret_matching(obs)

    def regret_matching(self, obs):
        """Apply regret matching to an infoset.

        Args:
            obs: Infoset key present in ``self.regrets``.

        Returns:
            numpy.array: Probabilities proportional to the positive part of
            the accumulated regrets, or the uniform distribution over all
            ``env.action_num`` actions when no regret is positive.
        """
        positive_regret = np.maximum(self.regrets[obs], 0.0)
        positive_regret_sum = positive_regret.sum()
        if positive_regret_sum > 0:
            return positive_regret / positive_regret_sum
        return np.full(self.env.action_num, 1.0 / self.env.action_num)

    def action_probs(self, obs, legal_actions, policy):
        """Obtain the action probabilities of the current state.

        Args:
            obs: Infoset key (the bytes of the observation array).
            legal_actions (list): Indices of legal actions.
            policy (dict): The policy table to read from.

        Returns:
            The value returned by ``remove_illegal`` for the looked-up
            probabilities and ``legal_actions``.
        """
        if obs not in policy:
            # Unknown infoset: fall back to uniform. Note the fallback is
            # recorded in ``self.policy`` regardless of which table was
            # passed in.
            action_probs = np.full(self.env.action_num, 1.0 / self.env.action_num)
            self.policy[obs] = action_probs
        else:
            action_probs = policy[obs]
        # Prune illegal actions.
        return remove_illegal(action_probs, legal_actions)

    def eval_step(self, state):
        """Given a state, sample an action based on the average policy.

        Args:
            state (dict): Must provide ``state["obs"]`` (a NumPy array) and
                ``state["legal_actions"]``.

        Returns:
            (tuple) that contains:
                action (int): The sampled action
                probs (numpy.array): The probabilities it was sampled from
        """
        probs = self.action_probs(state["obs"].tobytes(), state["legal_actions"], self.average_policy)
        action = np.random.choice(len(probs), p=probs)
        return action, probs

    def get_state(self, player_id):
        """Get the infoset key and legal actions of a player.

        Args:
            player_id (int): The player id.

        Returns:
            (tuple) that contains:
                state (bytes): Raw bytes of the observation array
                legal_actions (list): Indices of legal actions
        """
        state = self.env.get_state(player_id)
        # ``tobytes`` yields the same bytes as the deprecated ``tostring``
        # (removed in NumPy 2.0), so existing keys stay valid.
        return state["obs"].tobytes(), state["legal_actions"]

    def save(self):
        """Save model: pickle each table into ``self.model_path``."""
        os.makedirs(self.model_path, exist_ok=True)

        for file_name, attribute in self._CHECKPOINT_FILES:
            # ``with`` closes the file even if pickling raises.
            with open(os.path.join(self.model_path, file_name), "wb") as checkpoint:
                pickle.dump(getattr(self, attribute), checkpoint)

    def load(self):
        """Load model from ``self.model_path``; no-op if it does not exist.

        NOTE: unpickling executes arbitrary code embedded in the file, so
        only load checkpoints from a trusted location.
        """
        if not os.path.exists(self.model_path):
            return

        for file_name, attribute in self._CHECKPOINT_FILES:
            with open(os.path.join(self.model_path, file_name), "rb") as checkpoint:
                setattr(self, attribute, pickle.load(checkpoint))

* Now we start to train CFR on Leduc Hold'em. The training logs and the learning curves are shown below.

In [None]:
# Training schedule: total number of training iterations, how often (in
# iterations) the agent is evaluated, and how many games each evaluation plays.
episode_num = 10000
evaluate_every = 100
evaluate_num = 10000
save_plot_every = 1000

# Directory that receives the logs and learning curves
log_dir = './experiments/leduc_holdem_cfr_result/'

# Seed the random number generators
set_global_seed(0)

# Create the agent on the training environment
agent = MCCFRAgent(env)
# agent.load()  # Uncomment to resume from a previously saved model

# The agent takes the first seat of the evaluation environment; the second
# seat is the first agent of the pre-trained NFSP model.
eval_env.set_agents([agent, models.load('leduc-holdem-nfsp').agents[0]])

# Logger that records performance and later plots the learning curve
logger = Logger(log_dir)

for episode in range(episode_num):
    agent.train()
    print('\rIteration {}'.format(episode), end='')

    # Only checkpoint and evaluate on every `evaluate_every`-th iteration.
    if episode % evaluate_every != 0:
        continue

    agent.save()  # Save model
    logger.log_performance(env.timestep, tournament(eval_env, evaluate_num)[0])

# Close the files held by the logger
logger.close_files()

# Draw the learning curve
logger.plot('CFR')

INFO:tensorflow:Restoring parameters from /usr/local/lib/python3.6/dist-packages/rlcard/models/pretrained/leduc_holdem_nfsp/model
Iteration 0
----------------------------------------
  timestep     |  103646
  reward       |  -0.83765
----------------------------------------
Iteration 100
----------------------------------------
  timestep     |  106730
  reward       |  0.2187
----------------------------------------
Iteration 200
----------------------------------------
  timestep     |  110051
  reward       |  0.3366
----------------------------------------
Iteration 300
----------------------------------------
  timestep     |  113510
  reward       |  0.36455
----------------------------------------
Iteration 400
----------------------------------------
  timestep     |  116949
  reward       |  0.3827
----------------------------------------
Iteration 500
----------------------------------------
  timestep     |  119973
  reward       |  0.36235
---------------------------------