In [27]:
#@title ##### License { display-mode: "form" }
# Copyright 2019 DeepMind Technologies Ltd. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# OpenSpiel

* This Colab gets you started with the basics of OpenSpiel.
* OpenSpiel is a framework for reinforcement learning in games. The code is hosted [on github](https://github.com/deepmind/open_spiel/).
* There is an accompanying video tutorial that works through this colab. It will be linked here once it is live.
* There is also an [OpenSpiel paper](https://arxiv.org/abs/1908.09453) with more detail.

## Install

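The following command installs OpenSpiel via pip. It is commented out by default; uncomment it if OpenSpiel is not already installed (for example, when running in Colab).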

Only the required dependencies are installed. You may need other dependencies if you use some of the algorithms. There is a [complete list of packages and versions](https://github.com/deepmind/open_spiel/blob/master/open_spiel/scripts/python_extra_deps.sh) that we install for the CI tests; these can be installed as necessary.


In [28]:
# !pip install --upgrade open_spiel

# Part 1. OpenSpiel API Basics.

In [29]:
# Importing pyspiel and showing the list of supported games.
# The printed names are the short game names that pyspiel.load_game() is
# called with later in this notebook (e.g. "kuhn_poker", "breakthrough").
import pyspiel
print(pyspiel.registered_names())
pyspiel  # bare last expression: the notebook echoes the module's repr

['2048', 'add_noise', 'amazons', 'backgammon', 'bargaining', 'battleship', 'blackjack', 'blotto', 'breakthrough', 'bridge', 'bridge_uncontested_bidding', 'catch', 'checkers', 'chess', 'cliff_walking', 'clobber', 'coin_game', 'colored_trails', 'connect_four', 'coop_box_pushing', 'coop_to_1p', 'coordinated_mp', 'crazy_eights', 'cursor_go', 'dark_chess', 'dark_hex', 'dark_hex_ir', 'deep_sea', 'dots_and_boxes', 'dou_dizhu', 'efg_game', 'euchre', 'first_sealed_auction', 'gin_rummy', 'go', 'goofspiel', 'havannah', 'hearts', 'hex', 'kriegspiel', 'kuhn_poker', 'laser_tag', 'leduc_poker', 'lewis_signaling', 'liars_dice', 'liars_dice_ir', 'maedn', 'mancala', 'markov_soccer', 'matching_pennies_3p', 'matrix_bos', 'matrix_brps', 'matrix_cd', 'matrix_coordination', 'matrix_mp', 'matrix_pd', 'matrix_rps', 'matrix_rpsw', 'matrix_sh', 'matrix_shapleys_game', 'mfg_crowd_modelling', 'mfg_crowd_modelling_2d', 'mfg_dynamic_routing', 'mfg_garnet', 'misere', 'morpion_solitaire', 'negotiation', 'nfg_game', 'n

<module 'pyspiel' from '/home/boulux/projects/open_spiel/build/python/pyspiel.so'>

In [30]:
# Loading a game (with no/default parameters).
# The game is looked up by its short name; printing the game object shows
# that name followed by "()".
game = pyspiel.load_game("tic_tac_toe")
print(game)

tic_tac_toe()


In [31]:
# Some properties of the games.
print(game.num_players())            # number of players (2 for tic-tac-toe)
print(game.max_utility())            # largest return a player can receive
print(game.min_utility())            # smallest return a player can receive
print(game.num_distinct_actions())   # size of the action id space (9 here: one id per board cell)

2
1.0
-1.0
9


In [32]:
# Creating initial states.
# A state object holds one position of the game. The initial tic-tac-toe
# state prints as a 3x3 board in which '.' marks an empty cell.
state = game.new_initial_state()
print(state)

...
...
...


In [33]:
# Basic information about states.
print(state.current_player())   # id of the player to move (player 0 moves first)
print(state.is_terminal())      # whether the game is over
print(state.returns())          # list with one return per player
print(state.legal_actions())    # action ids that can be applied in this state

0
False
[0.0, 0.0]
[0, 1, 2, 3, 4, 5, 6, 7, 8]


In [34]:
# Playing the game: applying actions.
# Actions are board-cell ids counted row by row (0 = top-left, 8 = bottom-right);
# the players alternate, with x (player 0) moving first.
state = game.new_initial_state()
state.apply_action(1)            # x: top-middle
print(state)
print(state.current_player())    # o (player 1) is now to move
state.apply_action(2)            # o: top-right
state.apply_action(4)            # x: centre
state.apply_action(0)            # o: top-left
state.apply_action(7)            # x: bottom-middle, completing the middle column
print(state)
print(state.is_terminal())
print(state.player_return(0))   # win for x (player 0)
print(state.current_player())    # once the game is over this is a special negative id (-4 here)

.x.
...
...
1
oxo
.x.
.x.
True
1.0
-4


In [35]:
# Different game: Breakthrough with default parameters (number of rows and columns are both 8)
# The state prints as a labelled board (rows 8..1, columns a..h); 'b' and 'w'
# are presumably the black and white pieces and '.' is an empty square.
game = pyspiel.load_game("breakthrough")
state = game.new_initial_state()
print(state)

8bbbbbbbb
7bbbbbbbb
6........
5........
4........
3........
2wwwwwwww
1wwwwwwww
 abcdefgh



In [36]:
# Parameterized games: loading a 6x6 Breakthrough.
# Parameters are given inside the game string as comma-separated name=value pairs.
game = pyspiel.load_game("breakthrough(rows=6,columns=6)")
state = game.new_initial_state()
print(state)
# The legal action ids in a state are a sparse subset of the game's full
# action id space, whose size is reported by num_distinct_actions().
print(state.legal_actions())
print(game.num_distinct_actions())
# Translate every legal action id into its human-readable move string.
for action in state.legal_actions():
  print(f"{action} {state.action_to_string(action)}")

6bbbbbb
5bbbbbb
4......
3......
2wwwwww
1wwwwww
 abcdef

[74, 76, 84, 86, 88, 96, 98, 100, 108, 110, 112, 120, 122, 124, 132, 134]
432
74 a5a4
76 a5b4
84 b5a4
86 b5b4
88 b5c4
96 c5b4
98 c5c4
100 c5d4
108 d5c4
110 d5d4
112 d5e4
120 e5d4
122 e5e4
124 e5f4
132 f5e4
134 f5f4


# Part 2. Normal-form Games and Evolutionary Dynamics in OpenSpiel.

In [37]:
import pyspiel
# Build a 2x2 matrix game from two payoff matrices: the first holds the row
# player's utilities and the second the column player's (a zero-sum game here).
game = pyspiel.create_matrix_game([[1, -1], [-1, 1]], [[-1, 1], [1, -1]])
print(game)   # name not provided: uses a default
state = game.new_initial_state()
print(state)  # action names also not provided; defaults used

short_name()
Terminal? false
Row actions: row0 row1 
Col actions: col0 col1 
Utility matrix:
1,-1 -1,1 
-1,1 1,-1 



In [38]:
# Normal-form games are 1-step simultaneous-move games.
# Both players act at once, so no single player is "to move".
print(state.current_player())    # special player id (negative; -2 here)
print(state.legal_actions(0))    # query legal actions for each player
print(state.legal_actions(1))
print(state.is_terminal())


-2
[0, 1]
[0, 1]
False


In [39]:
# Applying a joint action (one action per player)
state.apply_actions([0, 0])    # both players pick their first action (row0, col0)
print(state.is_terminal())     # the game is over after the single joint action
print(state.returns())         # per-player payoffs: the (row0, col0) entry of the utility matrix

True
[1.0, -1.0]


In [40]:
# Evolutionary dynamics in Rock, Paper, Scissors
from open_spiel.python.egt import dynamics
from open_spiel.python.egt.utils import game_payoffs_array
import numpy as np

game = pyspiel.load_matrix_game("matrix_rps")   # load the Rock, Paper, Scissors matrix game
payoff_matrix = game_payoffs_array(game)        # convert any normal-form game to a numpy payoff matrix

# Single-population dynamics driven by the replicator rule.
dyn = dynamics.SinglePopulationDynamics(payoff_matrix, dynamics.replicator)
x = np.array([0.2, 0.2, 0.6])                   # population heavily-weighted toward scissors
# Evaluating the dynamics at x yields one value per strategy; the following
# cell uses it as the direction in which to move the population x.
dyn(x)

array([ 0.08, -0.08,  0.  ])

In [41]:
# Choose a step size and apply the dynamic
# Six explicit Euler steps x <- x + alpha * dyn(x), updating x in place; the
# population is printed after steps 1, 2 and 6.
alpha = 0.01
for step in range(1, 7):
  x += alpha * dyn(x)
  if step in (1, 2, 6):
    print(x)

[0.2008 0.1992 0.6   ]
[0.20160481 0.19840479 0.5999904 ]
[0.20487215 0.19527183 0.59985601]


# Part 3. Chance Nodes and Partially-Observable Games.

In [42]:
# Kuhn poker: simplified poker with a 3-card deck (https://en.wikipedia.org/wiki/Kuhn_poker)
import pyspiel
game = pyspiel.load_game("kuhn_poker")
print(game.num_distinct_actions())    # two player actions: pass and bet


2


In [43]:
# Chance nodes.
# The game starts with the cards being dealt, so the initial state belongs to
# the chance "player" rather than to player 0 or 1.
state = game.new_initial_state()
print(state.current_player())     # special chance player id
print(state.is_chance_node())
print(state.chance_outcomes())    # distribution over outcomes as a list of (outcome, probability) pairs

-1
True
[(0, 0.3333333333333333), (1, 0.3333333333333333), (2, 0.3333333333333333)]


In [44]:
# Applying chance node outcomes: same function as applying actions.
# Cards are dealt one at a time, so each player's card is a separate chance node.
state.apply_action(0)              # let's choose the first card (jack)
print(state.is_chance_node())      # still at a chance node (player 2's card).
print(state.chance_outcomes())     # jack no longer a possible outcome
state.apply_action(1)              # second player gets the queen
print(state.current_player())      # no longer chance node, time to play!

True
[(1, 0.5), (2, 0.5)]
0


In [45]:
# States vs. information states
# The world state prints both dealt cards, whereas the information state
# string of the player to move contains only that player's own card.
print(state)                              # ground/world state (all information open)
print(state.legal_actions())
for action in state.legal_actions():
  print(state.action_to_string(action))   # human-readable name of each legal action
print(state.information_state_string())   # only current player's information!

0 1
[0, 1]
Pass
Bet
0


In [46]:
# Take an action (pass / check), second player's turn.
# Information state tensor is vector of floats (often bits) representing the information state.
state.apply_action(0)                     # action 0 is "Pass"
print(state.current_player())
print(state.information_state_string())   # now contains second player's card and the public action sequence
print(state.information_state_tensor())   # the same information as a flat list of floats

1
1p
[0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [47]:
# Leduc poker is a larger game (6 cards, two suits), 3 actions: fold, check/call, raise.
game = pyspiel.load_game("leduc_poker")
print(game.num_distinct_actions())
state = game.new_initial_state()
print(state)              # initial state is a chance node (Player: -1); no cards dealt yet
state.apply_action(0)     # first player gets first jack
state.apply_action(1)     # second player gets second jack
print(state.current_player())
print(state.information_state_string())   # player 0's view: own private card, pot, money and betting history
print(state.information_state_tensor())   # the same information as a flat list of floats


3
Round: 1
Player: -1
Pot: 2
Money (p1 p2 ...): 99 99
Cards (public p1 p2 ...): -10000 -10000 -10000 
Round 1 sequence: 
Round 2 sequence: 

0
[Observer: 0][Private: 0][Round 1][Player: 0][Pot: 2][Money: 99 99][Round1: ][Round2: ]
[1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [48]:
# Let's check until the second round.
# The mask has one 0/1 entry per action id; action 0 is not legal in this state.
print(state.legal_actions_mask())    # Helper function for neural networks.
state.apply_action(1)     # check
state.apply_action(1)     # check
print(state)
print(state.chance_outcomes())   # public card (4 left in the deck)
state.apply_action(2)     # chance outcome (not a player action): card 2 becomes the public card
print(state.information_state_string())   # player 0's turn again.

[0, 1, 1]
Round: 2
Player: -1
Pot: 2
Money (p1 p2 ...): 99 99
Cards (public p1 p2 ...): -10000 0 1 
Round 1 sequence: Call, Call
Round 2 sequence: 

[(2, 0.25), (3, 0.25), (4, 0.25), (5, 0.25)]
[Observer: 0][Private: 0][Round 2][Player: 0][Pot: 2][Money: 99 99][Public: 2][Round1: 1 1][Round2: ]


# Part 4. Basic RL: Self-play Q-Learning in Tic-Tac-Toe.

In [49]:
# Let's do independent Q-learning in Tic-Tac-Toe, and play it against random.
# RL is based on python/examples/independent_tabular_qlearning.py
import numpy as np

from open_spiel.python import rl_environment
from open_spiel.python import rl_tools
from open_spiel.python.algorithms import tabular_qlearner

# Fix the random seed so that running the notebook from a fresh kernel
# reproduces the same training run and the same evaluation game.
# NOTE(review): this assumes the Q-learners and the random opponent sample
# from NumPy's global RNG -- confirm against tabular_qlearner / random_agent.
RL_SEED = 0
np.random.seed(RL_SEED)

# Create the environment
env = rl_environment.Environment("tic_tac_toe")
num_players = env.num_players
num_actions = env.action_spec()["num_actions"]

# Create the agents: one independent tabular Q-learner per player seat.
agents = [
    tabular_qlearner.QLearner(player_id=idx, num_actions=num_actions)
    for idx in range(num_players)
]

In [50]:
# Train the Q-learning agents in self-play.
num_train_episodes = 25000
report_every = 1000   # print a progress line every this many episodes
for cur_episode in range(num_train_episodes):
  if cur_episode % report_every == 0:
    print(f"Episodes: {cur_episode}")
  time_step = env.reset()
  while not time_step.last():
    # Only the agent whose turn it is gets stepped; its action advances the env.
    mover = agents[time_step.observations["current_player"]]
    time_step = env.step([mover.step(time_step).action])
  # Episode is over, step all agents with final info state.
  for agent in agents:
    agent.step(time_step)
print("Done!")

Episodes: 0
Episodes: 1000


Episodes: 2000
Episodes: 3000
Episodes: 4000
Episodes: 5000
Episodes: 6000
Episodes: 7000
Episodes: 8000
Episodes: 9000
Episodes: 10000
Episodes: 11000
Episodes: 12000
Episodes: 13000
Episodes: 14000
Episodes: 15000
Episodes: 16000
Episodes: 17000
Episodes: 18000
Episodes: 19000
Episodes: 20000
Episodes: 21000
Episodes: 22000
Episodes: 23000
Episodes: 24000
Done!


In [51]:
# Evaluate the Q-learning agent against a random agent.
from open_spiel.python.algorithms import random_agent

# Seat 0: the trained Q-learner. Seat 1: a random agent.
eval_agents = [
    agents[0],
    random_agent.RandomAgent(1, num_actions, "Entropy Master 2000"),
]

time_step = env.reset()
while not time_step.last():
  # Show the position before the move is made.
  print("")
  print(env.get_state)
  player_id = time_step.observations["current_player"]
  # Note the evaluation flag. A Q-learner will set epsilon=0 here.
  agent_output = eval_agents[player_id].step(time_step, is_evaluation=True)
  chosen = agent_output.action
  print(f"Agent {player_id} chooses {env.get_state.action_to_string(chosen)}")
  time_step = env.step([chosen])

# Final position and the rewards of the last time step (one per player).
print("")
print(env.get_state)
print(time_step.rewards)



...
...
...
Agent 0 chooses x(1,2)

...
..x
...
Agent 1 chooses o(0,2)

..o
..x
...
Agent 0 chooses x(1,1)

..o
.xx
...
Agent 1 chooses o(2,0)

..o
.xx
o..
Agent 0 chooses x(1,0)

..o
xxx
o..
[1.0, -1.0]
