In [0]:
# Create a Tic Tac Toe game where an AI uses Reinforcement Learning

In [0]:
import numpy as np
import random

class Agent:
  """Tic-tac-toe player that keeps a value estimate for every board state.

  The agent must be bound to an environment with attach_to_env() and given a
  symbol with assign_symbol() before it can act. Values live in self.V, a
  flat array indexed by the environment's state hash.
  """

  def __init__(self, alpha = 0.5, epsilon = 0.1, verbose = True):
    """Store the learning rate (alpha) and exploration rate (epsilon)."""
    self.state_history = []
    self.alpha = alpha
    self.verbose = verbose
    self.epsilon = epsilon
    self.latest_pos = ()
    # Populated later by attach_to_env() and assign_symbol().
    self.env = None
    self.sym = None
    self.V = None

  def attach_to_env(self, env):
    """Bind the agent to env and allocate one value slot per possible state."""
    self.env = env
    num_cells = self.env.board_dim[0] * self.env.board_dim[1]
    self.V = np.zeros(self.env.num_of_sym ** num_cells)

  def take_action(self, position):
    """Place this agent's symbol at position and remember it for undo_action()."""
    self.env.place_move(self.sym, position)
    self.latest_pos = position

  def undo_action(self):
    """Remove the move recorded by the last take_action(); no-op if there is none."""
    if not self.latest_pos:
      return
    self.env.delete_move(self.latest_pos)
    # Forget the position so a second undo cannot remove someone else's move.
    self.latest_pos = ()

  def random_action(self):
    """Play on a uniformly chosen empty cell."""
    chosen_cell = random.choice(self.env.get_empty_cells())
    self.take_action(chosen_cell)

  def strategic_action(self):
    """Epsilon-greedy move: explore with probability epsilon, otherwise exploit."""
    if random.random() < self.epsilon:
      self.random_action()
    else:
      best_cell = self.optimal_action()
      if best_cell is not None:
        self.take_action(best_cell)

  def optimal_action(self):
    """Return the empty cell whose resulting state has the highest value.

    Every candidate move is tried and immediately undone, so the board, the
    environment histories and latest_pos are left as they were. Ties go to
    the first cell found. Returns None when the board has no empty cell.
    """
    saved_pos = self.latest_pos
    best_cell = None
    best_value = None
    for cell in self.env.get_empty_cells():
      self.take_action(cell)
      value = self.V[self.env.get_state_in_hash()]
      self.undo_action()
      if best_value is None or value > best_value:
        best_cell, best_value = cell, value
    # Probing must not disturb the record of the last real move.
    self.latest_pos = saved_pos
    return best_cell

  def assign_symbol(self, sym):
    """Set the symbol this agent plays with."""
    self.sym = sym

  def reset_state_history(self):
    """Forget the states visited so far."""
    self.state_history = []

  # Create an array to map states to V value of the player
  def initialize_v(self):
    """Seed V from the environment's (state, winner, ended) table.

    Finished states are worth 1.0 when this agent won and 0.0 otherwise;
    unfinished states start at 0.5.
    """
    for state, winner, ended in self.env.swe_tuples:
      if ended:
        v = 1.0 if winner == self.sym else 0.0
      else:
        v = 0.5
      self.V[state] = v

  # Backtrack state history
  # V(prev_state) = V(prev_state) + alpha * (V(next_state) - V(prev_state))
  # Will be executed after the end of an episode
  def update_v(self):
    """Propagate the final reward backwards through the visited states."""
    target = self.reward()
    for prev in reversed(self.state_history):
      value = self.V[prev] + self.alpha * (target - self.V[prev])
      self.V[prev] = value
      # The updated value becomes the target for the state before it.
      target = value
    self.reset_state_history()

  # Checks the reward at the end of the game
  # Very important part in update_v()
  def reward(self):
    """Return 1 if the attached game has ended with this agent as winner, else 0."""
    if not self.env.has_ended():
      return 0
    if self.sym == self.env.get_winner():
      return 1
    else:
      return 0

  # Gets the state from the environment and update
  # the agent's state history.
  def update_state_history(self):
    """Append the environment's current state hash to the agent's history."""
    self.state_history.append(self.env.get_state_in_hash())
    
class Environment:
  """Tic-tac-toe board plus bookkeeping of moves and visited states.

  Cells of board_grid hold -1 for "X", 0 for an empty cell and 1 for "O".
  swe_tuples is a look-up table of (state hash, winner, ended) entries that
  permutate_states() fills in.
  """

  def __init__(self, name, verbose = True):
    """Create an empty 3x3 board with empty histories."""
    self.name = name
    self.board_dim = (3,3)
    self.board_grid =  np.zeros(self.board_dim)
    self.move_history = []
    self.state_history = []
    self.env_detail_history = []
    self.num_of_sym = 3
    self.player_sym = ["X", "O"]
    self.board_sym = ["X", "O", " "]
    self.coefficients = (-1,0,1)
    self.swe_tuples = ()
    self.verbose = verbose

  # The order is important
  # First player in the tuple is assigned to "X".
  def register_players(self, players):
    """Attach every player to this environment and hand out symbols in order."""
    for i, player in enumerate(players):
      player.attach_to_env(self)
      player.assign_symbol(self.player_sym[i])

  # Gets the winner by returning "X" or "O". Returns "None" otherwise.
  def get_winner(self):
    """Return "X" or "O" if exactly one of them has a full line, else the string "None"."""
    line_sums = []
    line_sums.extend(self.board_grid.sum(0))  # sum of each column
    line_sums.extend(self.board_grid.sum(1))  # sum of each row
    line_sums.append(sum(self.board_grid.diagonal()))  # top-left to bottom-right diagonal
    line_sums.append(sum(np.fliplr(self.board_grid).diagonal()))  # top-right to bottom-left diagonal

    o_wins = 3.0 in line_sums
    x_wins = -3.0 in line_sums
    # A board where both symbols have a line is reported as having no winner.
    if o_wins and not x_wins:
      return "O"
    if x_wins and not o_wins:
      return "X"
    return "None"

  def num_to_sym(self, num):
    """Translate a cell value (-1, 0, 1) into its symbol; "invalid" otherwise."""
    switcher = {
        -1:"X",
        0: " ",
        1: "O"
    }
    return switcher.get(num, "invalid")

  def sym_to_num(self, sym):
    """Translate a symbol ("X", " ", "O") into its cell value; "invalid" otherwise."""
    switcher = {
        "X":-1,
        " ":0 ,
        "O":1
    }
    return switcher.get(sym, "invalid")

  def place_move(self, sym, position):
    """Put sym on the empty cell at position and record the move.

    Nothing happens when the cell is already occupied. Each successful move
    appends [state, winner, ended, sym, row, col] to env_detail_history, the
    state hash to state_history and [sym, row, col] to move_history.

    Raises:
      ValueError: if sym is not a known board symbol (the board is untouched).
    """
    i,j = position
    if self.board_grid[i,j] != 0:
      return

    number = self.sym_to_num(sym)
    if number == "invalid":
      raise ValueError("unknown symbol: %r" % (sym,))
    self.board_grid[i,j] = number

    swe = self.get_swe_tuple()
    if swe is None:
      # No look-up table (or no entry): derive the same triple directly so
      # the board and the histories never get out of step.
      swe = (self.get_state_in_hash(), self.get_winner(), self.has_ended())

    env_detail = list(swe)
    env_detail.append(sym)  # append, so a symbol is never split into characters
    env_detail.extend(position)
    self.env_detail_history.append(env_detail)

    self.state_history.append(env_detail[0])
    self.move_history.append(env_detail[3:6])

  # Will undo the effects of place_move() and removes the record in move_history
  # Needs to be used with precaution and should only be used after place_move()
  def delete_move(self, position):
    """Clear the cell at position and drop the newest entry of every history.

    Nothing happens when the cell is already empty.
    """
    i,j = position
    if self.board_grid[i,j] != 0:
      self.board_grid[i,j] = 0
      for history in (self.move_history, self.state_history, self.env_detail_history):
        if history:
          history.pop()

  def check_cell_if_empty(self, position):
    """Return True when the cell at position holds no symbol."""
    i,j = position
    return bool(self.board_grid[i,j] == 0)

  def get_empty_cells(self):
    """Return the (row, col) positions of all empty cells in row-major order."""
    return [(i,j)
            for i in range(self.board_dim[0])
            for j in range(self.board_dim[1])
            if self.check_cell_if_empty((i,j))]

  def print_move_history(self):
    """Print every recorded move with its index."""
    print("Move History")
    for i, move in enumerate(self.move_history):
      print(str(i) + ":" + str(move))

  def print_state_history(self):
    """Print every recorded state hash with its index."""
    print("State History")
    for i, state in enumerate(self.state_history):
      print(str(i) + ":" + str(state))

  def print_env_detail_history(self):
    """Print every recorded environment detail row with its index."""
    print("Environment Detail History")
    for i, detail in enumerate(self.env_detail_history):
      print(str(i) + ":" + str(detail))

  def draw_board_in_sym(self):
    """Print the board using "X", "O" and blanks."""
    print("---------")
    for i in range(0,self.board_dim[0]):
      line_buff = ""
      for j in range(0,self.board_dim[1]):
        char_buff = "|" + self.num_to_sym(self.board_grid[i,j]) + "|"
        line_buff += char_buff
      print(line_buff)
      print("---------")

  def draw_board_in_num(self):
    """Print the raw numeric board."""
    print(self.board_grid)

  def has_ended(self):
    """Return True when the board is full or somebody has won."""
    if len(self.get_empty_cells()) == 0:
      return True
    return self.get_winner() != "None"

  # Creates a hash from a given state.
  # Based on converting the table into a decimal converted from base 3.
  def get_state_in_hash(self):
    """Return the board encoded as a plain int.

    Each cell becomes one base-num_of_sym digit (-1 -> 0, 0 -> 1, 1 -> 2);
    the first cell in row-major order is the least significant digit.
    """
    digits = np.ravel(self.board_grid + 1).astype(int)
    weights = self.num_of_sym ** np.arange(self.board_dim[0]*self.board_dim[1])
    return int(np.sum(digits * weights))

  def get_swe_tuple(self):
    """Return the (state, winner, ended) entry for the current board, or None.

    Only the hash field of each entry is compared; the winner and ended
    fields must not take part in the match because False == 0 and True == 1.
    """
    state = self.get_state_in_hash()
    for entry in self.swe_tuples:
      if entry[0] == state:
        return entry
    return None

  # Permutation of states in a recursive function
  def recur_permutate_states(self, i=0, j=0):
    """Enumerate every way of filling the board from cell (i, j) onwards.

    Overwrites board_grid while it runs and returns a list of
    (state hash, winner, ended) tuples, one per completed assignment.
    """
    results = []
    last_row = self.board_dim[0]-1
    last_col = self.board_dim[1]-1

    for coeff in self.coefficients:
      self.board_grid[i,j] = coeff
      if j == last_col:
        if i == last_row:
          state_in_hash = self.get_state_in_hash()
          winner = self.get_winner()
          ended = self.has_ended()
          results.append((state_in_hash, winner, ended))
        else:
          results += self.recur_permutate_states(i+1, 0)
      else:
        results += self.recur_permutate_states(i, j+1)
    return results

  # Permutate all state to create a look-up array and is
  # saved to a instance variable
  def permutate_states(self):
    """Fill swe_tuples with every board configuration and clear the board."""
    self.swe_tuples = self.recur_permutate_states()
    self.board_grid.fill(0.0)

    if self.verbose:
      print("Permutating all states/moves...")
      print(self.swe_tuples)
      
      
# Loops until the game is over
def play_game(env, players):
  """Play one game of random moves between players on env.

  Symbols are handed out in a shuffled order, while turns follow the order
  of players. The final board is drawn when the game ends inside the loop,
  and the environment's detail history is printed afterwards.

  Side effects: sets env.verbose to False and rebuilds the environment's
  state look-up table.
  """
  env.verbose = False
  env.register_players(random.sample(players, len(players)))
  env.permutate_states()

  for count, player in enumerate(players):
    player.initialize_v() # initialize V values on each agent
    print("Player %s is %s" % (count, player.sym))

  while not env.has_ended():
    for player in players:
      player.random_action()
      if env.has_ended():
        env.draw_board_in_sym()
        break

  env.print_env_detail_history()
 
    
  
  
    
    
          
        
            
                       
       

In [307]:
# Create two agents and one environment, then play a single game.
# The variable names do not fix the symbols: play_game() shuffles the players
# before registering them, so either agent may end up as "X" or "O".
pO = Agent()
pX = Agent()

env = Environment("TTT")

# Manual smoke checks of the individual Environment/Agent methods, kept
# disabled for reference.
#env.register_players((pX, pO))

#pO.take_action((0,0))
#pX.take_action((0,1))
#pX.take_action((0,2))
#pX.take_action((1,0))
#pO.take_action((1,1))
#pX.take_action((1,2))
#pX.take_action((2,0))
#pX.take_action((2,1))
#pO.take_action((2,2))

#print("Illustration of the board")
#env.draw_board_in_sym()
#print("Move list:")
#env.print_move_history()
#print("Output from get_winner()")
#print(env.get_winner())
##print(env.check_cell_if_empty((1,0)))
#print("Output from get_empty_cells()")
#print(env.get_empty_cells())
#print("Output from get_state_in_hash()")
#print(env.get_state_in_hash())
#print("Output from has_ended()")
#print(env.has_ended())
#print("Output from reward()")
#print(pO.reward())
#print(len(env.recur_permutate_states()))
#env.permutate_states()

#pO.initialize_v()
#pO.update_state_history()

#print(random.sample(("p1","p2"),2))

# Run one full game; turn order follows the tuple order given here.
play_game(env,(pX,pO))

Player 0 is O
Player 1 is X
---------
|O||X|| |
---------
|O||O||O|
---------
| ||X||X|
---------
Environment Detail History
0:[10084, 'None', False, 'O', 1, 2]
1:[10081, 'None', False, 'X', 0, 1]
2:[10108, 'None', False, 'O', 1, 0]
3:[7921, 'None', False, 'X', 2, 1]
4:[7922, 'None', False, 'O', 0, 0]
5:[1361, 'None', False, 'X', 2, 2]
6:[1442, 'O', True, 'O', 1, 1]
