In [1]:
%cd ..
%reload_ext autoreload
%autoreload 2

/Users/anantabhattarai/Documents/TUM/ADLR/tum-adlr-ws20-9


# Import Libraries

In [2]:
from alpha_one.game.information_set import InformationSetGenerator
import numpy as np
import pyspiel

from open_spiel.python.algorithms import mcts
from open_spiel.python.algorithms.alpha_zero import evaluator as evaluator_lib
from open_spiel.python.algorithms.alpha_zero import model as model_lib
from open_spiel.python.algorithms.mcts import SearchNode

from alpha_one.alg.imperfect_information import AlphaOneImperfectInformationMCTSEvaluator
from alpha_one.alg.mcts import ImperfectInformationMCTSBot

from alpha_one.utils.statemask import get_state_mask
from open_spiel.python.algorithms import get_all_states


from alpha_one.game import trajectory
from alpha_one.game import buffer

# Load game and define hyperparameters

In [3]:
# Search settings shared by the self-play and evaluation cells below.
uct_c = 3               # handed to both MCTS bot constructors as their uct_c argument
max_simulations = 100   # handed to both MCTS bot constructors as the simulation budget
policy_epsilon = 0.25   # first element of the dirichlet_noise tuple built in initialize_bot
policy_alpha = 1        # second element of the dirichlet_noise tuple built in initialize_bot
verbose = False         # when True, mcts_agent prints every search tree it builds

In [4]:
# state_to_value maps the string form of every state returned by
# get_all_states to an integer id. Ids start at 0 and follow the iteration
# order of the returned dictionary (e.g. the first state gets id 0, the next
# one id 1, and so on).

game_name = "kuhn_poker"
game = pyspiel.load_game(game_name)
states = get_all_states.get_all_states(game)

# enumerate() replaces the hand-maintained counter, and the comprehension
# keeps its loop variables out of the notebook namespace.
state_to_value = {
    str(game_state): state_id
    for state_id, game_state in enumerate(states.values())
}

# Initialize Neural Network Model

In [5]:
# Observation-node model: this is the network later trained on the
# observation-node replay buffer and handed to the evaluator as `obs_model`.
model_type = 'mlp'
nn_width, nn_depth = 50, 5
learning_rate, weight_decay = 0.001, 0.0001
model_saves_path = '../model_saves/kuhn_poker/obs_node_model'

# The third positional argument is len(state_to_value), i.e. one slot per
# indexed state (presumably the size of the policy head -- confirm against
# model_lib.Model.build_model).
obs_model = model_lib.Model.build_model(
    model_type,
    game.observation_tensor_shape(),
    len(state_to_value),
    nn_width=nn_width,
    nn_depth=nn_depth,
    weight_decay=weight_decay,
    learning_rate=learning_rate,
    path=model_saves_path,
)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [6]:
# Game-node model: this is the network later trained on the game-node replay
# buffer and handed to the evaluator as `game_model`.
# Note that this cell rebinds model_saves_path and the width/depth/optimizer
# settings that the observation-node cell also assigns (same values).
model_type = 'mlp'
nn_width, nn_depth = 50, 5
learning_rate, weight_decay = 0.001, 0.0001
model_saves_path = '../model_saves/kuhn_poker/game_node_model'

# The third positional argument is the number of distinct game actions
# (presumably the size of the policy head -- confirm against
# model_lib.Model.build_model).
game_model = model_lib.Model.build_model(
    model_type,
    game.observation_tensor_shape(),
    game.num_distinct_actions(),
    nn_width=nn_width,
    nn_depth=nn_depth,
    weight_decay=weight_decay,
    learning_rate=learning_rate,
    path=model_saves_path,
)

# Define Required Functions

In [7]:
# Evaluator passed to every ImperfectInformationMCTSBot created below (self-play
# and evaluation). It receives the state-id mapping and both networks.
evaluator = AlphaOneImperfectInformationMCTSEvaluator(state_to_value, obs_model, game_model)

In [8]:
# get policy and value at the observation node
def get_policy_value_obs_node(root, state_mask, index_track):
    """Convert the root's child visit counts into a policy over guessed states.

    Args:
        root: search node whose ``children`` each expose ``action`` (used as
            the child's slot in the compact policy) and ``explore_count``.
        state_mask: sequence with one entry per indexed state; only its
            length is used here, to size the masked policy.
        index_track: sequence where entry ``i`` is the slot of the masked
            policy that compact policy entry ``i`` belongs to (presumably the
            second value returned by ``get_state_mask`` -- confirm).

    Returns:
        Tuple ``(state_masked_policy, policy)``: the normalised visit counts
        scattered into a vector of length ``len(state_mask)`` (used when
        training the network), and the compact normalised visit counts.
    """
    # Visit counts are used instead of rewards because a reward-based policy
    # could contain negative entries.
    policy = np.zeros(len(root.children))
    for child in root.children:
        policy[child.action] = child.explore_count

    policy /= policy.sum()

    state_masked_policy = np.zeros(len(state_mask))

    # Write each compact entry into its own slot. Indexing with the whole
    # `index_track` sequence (as the previous version did) overwrote every
    # tracked slot on each iteration, leaving all of them equal to the last
    # policy entry.
    for i, masked_index in enumerate(index_track):
        state_masked_policy[masked_index] = policy[i]

    return state_masked_policy, policy

In [9]:
# get policy and value at the game node after guessing the state
def get_policy_value_game_node(root, guess_state, num_actions=None):
    """Convert the visit counts below the guessed state into an action policy.

    Args:
        root: search root; ``root.children[guess_state]`` is the node whose
            children each expose ``action`` and ``explore_count``.
        guess_state: index into ``root.children`` selecting the guessed state.
        num_actions: length of the returned policy vector. Defaults to
            ``game.num_distinct_actions()`` of the notebook-level ``game``.

    Returns:
        1-D array of length ``num_actions`` holding the visit counts of the
        guessed node's children, normalised to sum to 1.
    """
    if num_actions is None:
        # Use the game the notebook already loaded rather than reloading a
        # hard-coded "kuhn_poker" on every call; this keeps the vector size
        # in sync with whatever game the notebook is configured for.
        num_actions = game.num_distinct_actions()

    policy = np.zeros(num_actions)
    for child in root.children[guess_state].children:
        policy[child.action] = child.explore_count

    policy /= policy.sum()

    return policy

In [10]:
def ii_mcts_agent(information_set_generator, state_to_value, ii_mcts_bot):
    """Run one imperfect-information search and derive both policies from it.

    Args:
        information_set_generator: object passed to the bot's ``mcts_search``
            and asked for the current information set.
        state_to_value: mapping forwarded to ``get_state_mask``.
        ii_mcts_bot: bot whose ``mcts_search`` returns a pair; only the first
            element (the search root) is used.

    Returns:
        4-tuple ``(state_masked_policy, game_node_policy, guessed_state,
        state_mask)`` where ``guessed_state`` is the information-set entry
        with the largest observation-node policy value.
    """
    search_root, _unused = ii_mcts_bot.mcts_search(information_set_generator)

    candidates = information_set_generator.calculate_information_set()
    mask, positions = get_state_mask(state_to_value, candidates)

    masked_policy, compact_policy = get_policy_value_obs_node(search_root, mask, positions)

    # Commit to the most-visited candidate state and read the action policy
    # from the subtree below it.
    best = np.argmax(compact_policy)
    action_policy = get_policy_value_game_node(search_root, best)

    return masked_policy, action_policy, candidates[best], mask

In [11]:
def play_one_game():
    """Self-play one game with the IIG-MCTS bot and record training data.

    Reads the notebook-level ``game``, ``uct_c``, ``max_simulations``,
    ``evaluator`` and ``state_to_value``.

    Returns:
        Tuple ``(trajectory_obs, trajectory_game)``. Both are
        ``trajectory.GameTrajectory`` objects with one ``TrajectoryState`` per
        player decision and ``returns`` set to the terminal returns:
        ``trajectory_obs`` stores the acting player's observation, the state
        mask and the state-masked policy; ``trajectory_game`` stores the
        guessed state's observation tensor, the legal-action mask of the true
        state and the game-node policy.
    """
    # trajectory of the observation nodes for training NN
    trajectory_obs = trajectory.GameTrajectory()

    # trajectory of the game nodes for training NN
    trajectory_game = trajectory.GameTrajectory()

    state = game.new_initial_state()
    information_set_generator = InformationSetGenerator(game)

    ii_mcts_bot = ImperfectInformationMCTSBot(game,
                                              uct_c,
                                              max_simulations,
                                              evaluator,
                                              False,
                                              child_selection_fn=SearchNode.puct_value)

    def advance(chosen_action):
        # Keep the information-set generator in lock-step with the real state:
        # register the action, apply it, then register the new observation.
        information_set_generator.register_action(chosen_action)
        state.apply_action(chosen_action)
        information_set_generator.register_observation(state)

    while not state.is_terminal():
        current_player = state.current_player()

        if current_player < 0:
            if state.is_chance_node():
                # Sample chance outcomes with the probabilities the game
                # defines instead of uniformly over the legal actions; the
                # two only coincide when the chance distribution is uniform.
                outcomes, probabilities = zip(*state.chance_outcomes())
                action = np.random.choice(outcomes, p=probabilities)
            else:
                # Other negative player ids keep the previous behaviour.
                action = np.random.choice(state.legal_actions())
            advance(action)
            continue

        observations = information_set_generator._get_observation(state, current_player)

        state_masked_policy, game_node_policy, guess_state, state_mask = ii_mcts_agent(
            information_set_generator,
            state_to_value,
            ii_mcts_bot)
        action = np.argmax(game_node_policy)

        trajectory_obs.states.append(trajectory.TrajectoryState(observations,
                                                                current_player,
                                                                state_mask,
                                                                action,
                                                                state_masked_policy))

        trajectory_game.states.append(trajectory.TrajectoryState(guess_state.observation_tensor(),
                                                                 current_player,
                                                                 state.legal_actions_mask(),
                                                                 action,
                                                                 game_node_policy))

        advance(action)

    trajectory_obs.returns = state.returns()
    trajectory_game.returns = state.returns()
    return trajectory_obs, trajectory_game

In [12]:
def collect_trajectories(replay_buffer_obs, replay_buffer_game, num_games=30):
    """Play self-play games and append their samples to the replay buffers.

    Args:
        replay_buffer_obs: buffer extended with one ``model_lib.TrainInput``
            per observation-node trajectory state.
        replay_buffer_game: buffer extended with one ``model_lib.TrainInput``
            per game-node trajectory state.
        num_games: number of games to play; defaults to 30, the value that
            was previously hard-coded.

    Returns:
        None. Both buffers are modified in place through ``extend``.
    """
    for _ in range(num_games):
        trajectory_obs, trajectory_game = play_one_game()

        # Every sample of the game is labelled with the first player's return.
        p1_outcome = trajectory_obs.returns[0]

        replay_buffer_obs.extend(
            model_lib.TrainInput(s.observation, s.legals_mask, s.policy, p1_outcome)
            for s in trajectory_obs.states)

        replay_buffer_game.extend(
            model_lib.TrainInput(s.observation, s.legals_mask, s.policy, p1_outcome)
            for s in trajectory_game.states)

In [13]:
# One replay buffer per network: observation-node samples and game-node samples.
# NOTE(review): 100 is presumably the buffer capacity -- confirm against
# alpha_one.game.buffer.ReplayBuffer.
replay_buffer_obs = buffer.ReplayBuffer(100)
replay_buffer_game = buffer.ReplayBuffer(100)

# Train Neural Network

In [14]:
# Alternate between collecting self-play games and updating both networks.
# NOTE(review): the saved output below this cell lists iterations 0-4 only,
# although this loop runs 30 iterations -- the stored output may be stale.
for i in range(30):
    losses_obs = []
    losses_game = []

    # Adds the samples of a fresh batch of self-play games to both buffers.
    collect_trajectories(replay_buffer_obs, replay_buffer_game)

    # 100 // 20 == 5 update steps per iteration, each on 20 sampled entries
    # from each buffer (100 matches the value passed to ReplayBuffer above).
    for _ in range(100 // 20):
        data_obs = replay_buffer_obs.sample(20)
        data_game = replay_buffer_game.sample(20)


        losses_obs.append(obs_model.update(data_obs))
        losses_game.append(game_model.update(data_game))

    # Average the per-step losses; this relies on model_lib.Losses supporting
    # addition and division by a number.
    losses = sum(losses_obs, model_lib.Losses(0, 0, 0)) / len(losses_obs)

    print(f"Loss at iteration {i} of observation model: {losses}")

    # `losses` is reused for the game model's average.
    losses = sum(losses_game, model_lib.Losses(0, 0, 0)) / len(losses_game)

    print(f"Loss at iteration {i} of game model: {losses}")

    print("")

Loss at iteration 0 of observation model: Losses(total: 2.441, policy: 0.545, value: 1.878, l2: 0.018)
Loss at iteration 0 of game model: Losses(total: 2.557, policy: 0.687, value: 1.854, l2: 0.016)

Loss at iteration 1 of observation model: Losses(total: 2.069, policy: 0.592, value: 1.460, l2: 0.018)
Loss at iteration 1 of game model: Losses(total: 2.464, policy: 0.669, value: 1.779, l2: 0.015)

Loss at iteration 2 of observation model: Losses(total: 2.564, policy: 0.528, value: 2.019, l2: 0.017)
Loss at iteration 2 of game model: Losses(total: 2.573, policy: 0.659, value: 1.899, l2: 0.015)

Loss at iteration 3 of observation model: Losses(total: 2.633, policy: 0.531, value: 2.085, l2: 0.017)
Loss at iteration 3 of game model: Losses(total: 2.616, policy: 0.641, value: 1.961, l2: 0.014)

Loss at iteration 4 of observation model: Losses(total: 2.221, policy: 0.611, value: 1.593, l2: 0.016)
Loss at iteration 4 of game model: Losses(total: 2.489, policy: 0.617, value: 1.858, l2: 0.014)



# Evaluations

In [15]:
def initialize_bot(game, uct_c, max_simulations, policy_epsilon, policy_alpha):
    """Build an ``mcts.MCTSBot`` that evaluates positions with random rollouts.

    Args:
        game: game object forwarded to ``mcts.MCTSBot``.
        uct_c: forwarded to ``mcts.MCTSBot`` as its second positional argument.
        max_simulations: forwarded to ``mcts.MCTSBot`` as its third positional
            argument.
        policy_epsilon: first element of the Dirichlet-noise tuple, or None.
        policy_alpha: second element of the Dirichlet-noise tuple, or None.

    Returns:
        The configured bot. ``dirichlet_noise`` is None when either noise
        argument is None, otherwise ``(policy_epsilon, policy_alpha)``.
    """
    # Identity comparison is the correct None test: `== None` dispatches to
    # the operand's __eq__, which may be overloaded.
    if policy_epsilon is None or policy_alpha is None:
        noise = None
    else:
        noise = (policy_epsilon, policy_alpha)

    # Named distinctly so it does not shadow the notebook-level `evaluator`.
    rollout_evaluator = mcts.RandomRolloutEvaluator(n_rollouts=100)

    bot = mcts.MCTSBot(
          game,
          uct_c,
          max_simulations,
          rollout_evaluator,
          solve=False,
          dirichlet_noise=noise,
          child_selection_fn=mcts.SearchNode.puct_value,
          verbose=False)

    return bot

In [16]:
def mcts_agent(state, information_set_generator):
    """Search every state of the current information set and pool the visits.

    Each state of the acting player's information set is searched by its own
    freshly built MCTS bot. The visit counts of the root's children are then
    summed per action across all searches; callers pick the action with the
    largest total. (Accumulating each child's mean reward instead of its visit
    count is an alternative that is not used here.)

    Args:
        state: current game state; only ``current_player()`` is read.
        information_set_generator: provides ``calculate_information_set``.

    Returns:
        1-D array of length ``game.num_distinct_actions()`` with the summed
        visit counts (not normalised).
    """
    information_set = information_set_generator.calculate_information_set(
        state.current_player())
    visit_totals = np.zeros(game.num_distinct_actions())

    for candidate in information_set:
        search_bot = initialize_bot(game, uct_c, max_simulations, policy_epsilon, policy_alpha)
        search_root = search_bot.mcts_search(candidate)
        if verbose:
            # NOTE(review): print_game_tree is neither defined nor imported in
            # the visible notebook cells; enabling `verbose` would fail here
            # unless it is provided elsewhere.
            print_game_tree(search_root)
        for child in search_root.children:
            visit_totals[child.action] += child.explore_count

    return visit_totals

In [17]:
# Play 100 evaluation games and collect the terminal returns of each one.
# Player id 0 is the per-state MCTS baseline (mcts_agent); every other
# non-negative player id is played by the IIG-MCTS bot (ii_mcts_agent).
game_returns = []
for game_index in range(100):  # a named index avoids rebinding IPython's `_`
    actions = []  # human-readable log of the actions taken in this game
    state = game.new_initial_state()
    information_set_generator = InformationSetGenerator(game)

    ii_mcts_bot = ImperfectInformationMCTSBot(game,
                                              uct_c,
                                              max_simulations,
                                              evaluator,
                                              False,
                                              child_selection_fn=SearchNode.puct_value)

    while not state.is_terminal():
        current_player = state.current_player()

        if current_player < 0:
            # environment state
            if state.is_chance_node():
                # Sample chance outcomes with the probabilities the game
                # defines instead of uniformly over the legal actions.
                outcomes, probabilities = zip(*state.chance_outcomes())
                action = np.random.choice(outcomes, p=probabilities)
            else:
                # Other negative player ids keep the previous behaviour.
                action = np.random.choice(state.legal_actions())
        elif current_player == 0:
            # MCTS baseline: most-visited action summed over the information set
            policy = mcts_agent(state, information_set_generator)
            action = np.argmax(policy)
        else:
            # IIG-MCTS bot: greedy action under the guessed state's policy
            state_masked_policy, game_node_policy, guess_state, state_mask = ii_mcts_agent(
                information_set_generator,
                state_to_value,
                ii_mcts_bot)
            action = np.argmax(game_node_policy)

        # Bookkeeping shared by every branch: log the action, then keep the
        # information-set generator in lock-step with the real state.
        action_str = state.action_to_string(current_player, action)
        actions.append(action_str)

        information_set_generator.register_action(action)
        state.apply_action(action)
        information_set_generator.register_observation(state)

    game_returns.append(state.returns())

In [18]:
# Stack the per-game return lists into an array and average over games
# (axis 0), giving one mean return per player.
game_returns = np.array(game_returns)
print(f"Average return: {game_returns.mean(axis=0)}")

Average return: [ 0.08 -0.08]
