In [0]:
# Create a Tic Tac Toe game where an AI uses Reinforcement Learning
# CPU

In [0]:
import numpy as np
import random

class Agent:
  """Tic-tac-toe player that learns a value for every board state.

  The agent keeps a table ``V`` indexed by the environment's state hash and
  chooses moves epsilon-greedily from it.  It has to be attached to an
  environment (``attach_to_env``) and given a symbol (``assign_symbol``)
  before it can act.
  """

  def __init__(self, alpha = 0.5, epsilon = 0.1, verbose = True):
    """Store the learning rate ``alpha`` and the exploration rate ``epsilon``."""
    self.state_history = []
    self.alpha = alpha
    self.verbose = verbose
    self.epsilon = epsilon
    self.latest_pos = ()
    self.attached = False

  def attach_to_env(self, env):
    """Bind the agent to ``env`` and allocate its value table.

    Only the first call has an effect; later calls keep the existing
    environment and the existing ``V``.
    """
    if not self.attached:
      self.env = env
      # One entry per possible board: num_of_sym ** (rows * cols).
      num_states = env.num_of_sym ** (env.board_dim[0] * env.board_dim[1])
      self.V = np.zeros(num_states)
    self.attached = True

  def take_action(self, position):
    """Place this agent's symbol at ``position`` and remember the position."""
    self.env.place_move(self.sym, position)
    self.latest_pos = position

  def undo_action(self):
    """Remove the move stored in ``latest_pos`` from the environment."""
    self.env.delete_move(self.latest_pos)

  def random_action(self):
    """Play on a uniformly chosen empty cell."""
    chosen_cell = random.choice(self.env.get_empty_cells())
    self.take_action(chosen_cell)

  def strategic_action(self):
    """Epsilon-greedy move: random with probability ``epsilon``, else greedy."""
    if random.random() < self.epsilon:
      self.random_action()
    else:
      self.optimal_action()

  def optimal_action(self):
    """Play the empty cell whose resulting state has the highest value.

    Each candidate move is tried on the real board, its resulting state is
    looked up, and the move is undone again.  Ties are resolved in favour of
    the last candidate examined.  Nothing happens when no cell is empty.
    """
    empty_cells = self.env.get_empty_cells()
    if not empty_cells:
      # Nothing to play; avoids unpacking the empty placeholder position.
      return
    best_value = -1
    chosen_cell = ()
    for cell in empty_cells:
      self.take_action(cell)
      state, winner, ended = self.env.get_swe_tuple()
      self.undo_action()
      value = self.V[state]
      if value >= best_value:
        best_value = value
        chosen_cell = cell
    self.take_action(chosen_cell)

  def assign_symbol(self, sym):
    """Set the symbol this agent plays with."""
    self.sym = sym

  def reset_state_history(self):
    """Forget the states recorded for the last episode."""
    self.state_history = []

  # Create an array to map states to V value of the player
  def initialize_v(self):
    """Fill ``V`` from the environment's (state, winner, ended) table.

    Finished states are worth 1.0 when this agent's symbol won and 0.0
    otherwise; unfinished states start at 0.5.
    """
    for state, winner, ended in self.env.swe_tuples:
      if ended:
        v = 1.0 if winner == self.sym else 0.0
      else:
        v = 0.5
      self.V[state] = v

  # Backtrack state history
  # Vprev_state = Vprev_state + alpha * (Vnext_state - Vprev_state)
  # Where Vnext_state is the current state
  # Will be executed after the end of an episode
  def update_v(self):
    """Propagate the episode's reward backwards through the visited states."""
    self.update_state_history()
    target = self.reward()
    for prev in reversed(self.state_history):
      value = self.V[prev] + self.alpha * (target - self.V[prev])
      self.V[prev] = value
      # The freshly updated value is the target for the preceding state.
      target = value
    self.reset_state_history()

  # Checks the reward at the end of the game
  # Very important part in update_v()
  def reward(self):
    """Return 1 when the attached environment ended with this agent winning, else 0."""
    # Use the environment this agent is attached to, not a module-level name.
    if not self.env.has_ended():
      return 0
    return 1 if self.sym == self.env.get_winner() else 0

  # Gets the state from the environment and update
  # the agent's state history.
  def update_state_history(self):
    """Copy the environment's state history into the agent."""
    # Copy so the agent never shares (or later mutates) the environment's list.
    self.state_history = list(self.env.state_history)
    
class Environment:
  """Tic-tac-toe board together with its move, state and detail histories.

  Cells of ``board_grid`` hold -1 for "X", 0 for an empty cell and 1 for "O".
  A board is identified by a base-3 hash (see ``get_state_in_hash``), and
  ``swe_tuples`` holds a ``(state, winner, ended)`` tuple per possible board
  once ``permutate_states`` has been called.
  """

  def __init__(self, name, verbose = True):
    self.name = name
    self.board_dim = (3,3)
    self.board_grid = np.zeros(self.board_dim)
    self.move_history = []
    self.state_history = []
    self.env_detail_history = []
    self.num_of_sym = 3
    self.player_sym = ["X", "O"]
    self.board_sym = ["X", "O", " "]
    self.coefficients = (-1,0,1)
    self.swe_tuples = ()
    # state hash -> (state, winner, ended); filled by permutate_states()
    self._swe_lookup = {}
    self.verbose = verbose

  # The order is important
  # First player in the tuple is assigned to "X".
  def register_players(self, players):
    """Attach every player to this environment and hand out symbols in order."""
    for i, player in enumerate(players):
      player.attach_to_env(self)
      player.assign_symbol(self.player_sym[i])

  def reset(self):
    """Empty the board and drop all recorded histories."""
    self.board_grid.fill(0.0)
    self.move_history = []
    self.state_history = []
    self.env_detail_history = []

  # Gets the winner by returning "X" or "O". Returns "None" otherwise.
  def get_winner(self):
    """Return "X" or "O" when exactly one of them owns a full line, else "None"."""
    sums = []
    sums.extend(self.board_grid.sum(0))  # sum of each column
    sums.extend(self.board_grid.sum(1))  # sum of each row
    sums.append(sum(self.board_grid.diagonal()))  # top-left to bottom-right diagonal
    sums.append(sum(np.fliplr(self.board_grid).diagonal()))  # bottom-left to top-right diagonal

    # A line filled with one symbol sums to +/- its length (the board is square).
    line_total = float(self.board_dim[0])
    winners = []
    if line_total in sums:
      winners.append("O")
    if -line_total in sums:
      winners.append("X")

    # Boards on which both symbols own a line have no single winner.
    return winners[0] if len(winners) == 1 else "None"

  def num_to_sym(self, num):
    """Translate a cell value into its symbol ("invalid" for unknown values)."""
    switcher = {
        -1:"X",
        0: " ",
        1: "O"
    }
    return switcher.get(num, "invalid")

  def sym_to_num(self, sym):
    """Translate a symbol into its cell value ("invalid" for unknown symbols)."""
    switcher = {
        "X":-1,
        " ":0 ,
        "O":1
    }
    return switcher.get(sym, "invalid")

  def place_move(self, sym, position):
    """Put ``sym`` on ``position`` when that cell is empty and record the move.

    Occupied cells are left untouched and nothing is recorded.

    Raises:
      ValueError: when ``sym`` is not a known board symbol.
    """
    i,j = position
    if self.board_grid[i,j] != 0:
      return

    value = self.sym_to_num(sym)
    if isinstance(value, str):
      # sym_to_num signals an unknown symbol with the string "invalid".
      raise ValueError("unknown symbol: {!r}".format(sym))
    self.board_grid[i,j] = value

    # Detail layout: [state, winner, ended, sym, i, j]
    env_detail = []
    env_detail.extend(self.get_swe_tuple())
    env_detail.extend(sym)
    env_detail.extend(position)
    self.env_detail_history.append(env_detail)

    self.state_history.append(env_detail[0])
    self.move_history.append(env_detail[3:6])

  # Will undo the effects of place_move() and removes the record in move_history
  # Needs to be used with precaution and should only be used after place_move()
  def delete_move(self, position):
    """Clear ``position`` and drop the most recent entry of every history."""
    i,j = position
    if self.board_grid[i,j] != 0:
      self.board_grid[i,j] = 0
      for history in (self.move_history, self.state_history, self.env_detail_history):
        if history:
          history.pop()

  def check_cell_if_empty(self, position):
    """Return True when the cell at ``position`` holds no symbol."""
    i,j = position
    return bool(self.board_grid[i,j] == 0)

  def get_empty_cells(self):
    """Return the (row, col) tuples of all empty cells in row-major order."""
    return [(i,j)
            for i in range(self.board_dim[0])
            for j in range(self.board_dim[1])
            if self.check_cell_if_empty((i,j))]

  def print_move_history(self):
    print("Move History")
    for i, move in enumerate(self.move_history):
      print(str(i) + ":" + str(move))

  def print_state_history(self):
    print("State History")
    for i, state in enumerate(self.state_history):
      print(str(i) + ":" + str(state))

  def print_env_detail_history(self):
    print("Environment Detail History")
    for i, detail in enumerate(self.env_detail_history):
      print(str(i) + ":" + str(detail))

  def print_env_winner(self):
    winner = self.get_winner()
    print("The winner is {}".format(winner))

  def draw_board_in_sym(self):
    """Print the board using "X", "O" and blanks."""
    print("---------")
    for i in range(self.board_dim[0]):
      cells = ("|" + self.num_to_sym(self.board_grid[i,j]) + "|"
               for j in range(self.board_dim[1]))
      print("".join(cells))
      print("---------")

  def draw_board_in_num(self):
    """Print the raw numeric board."""
    print(self.board_grid)

  def has_ended(self):
    """Return True when the board is full or somebody has won."""
    if not self.get_empty_cells():
      return True
    return self.get_winner() != "None"

  # Creates a hash from a given state.
  # Based on converting the table into a decimal converted from base 3.
  def get_state_in_hash(self):
    """Return the board as an integer, reading the cells as base-3 digits."""
    # Coefficients taken from the board grid with these conditions
    # -1 -> 0, 0 -> 1, 1 -> 2
    coeff = np.ravel(self.board_grid + 1)

    # Cell k (row-major order) is the digit weighted by num_of_sym ** k.
    power = np.arange(self.board_dim[0]*self.board_dim[1])
    base = np.full_like(coeff, self.num_of_sym)

    # hash taken from base3 to base10 conversion
    return np.sum((coeff*(base**power)).astype(int))

  def get_swe_tuple(self):
    """Return ``(state, winner, ended)`` for the current board.

    The tuple comes from the table built by ``permutate_states`` when the
    state is in it; otherwise it is computed directly from the board, so the
    method also works before the table exists.
    """
    state = self.get_state_in_hash()
    # Look up by the state field only, so the winner/ended fields of a tuple
    # can never be mistaken for a matching state.
    cached = self._swe_lookup.get(int(state))
    if cached is not None:
      return cached
    return (state, self.get_winner(), self.has_ended())

  # Permutation of states in a recurrent from function
  def recur_permutate_states(self, i=0, j=0):
    """Enumerate every board reachable by filling cells from (i, j) onwards.

    Overwrites ``board_grid`` while it runs.  Returns a list of
    ``(state, winner, ended)`` tuples.
    """
    results = []
    last_row = self.board_dim[0]-1
    last_col = self.board_dim[1]-1

    for coeff in self.coefficients:
      self.board_grid[i,j] = coeff
      if j == last_col:
        if i == last_row:
          # Every cell has a value: record this complete board.
          state_in_hash = self.get_state_in_hash()
          winner = self.get_winner()
          ended = self.has_ended()
          results.append((state_in_hash, winner, ended))
        else:
          results += self.recur_permutate_states(i+1, 0)
      else:
        results += self.recur_permutate_states(i, j+1)
    return results

  # Permutate all state to create a look-up array and is
  # saved to a instance variable
  def permutate_states(self):
    """Build the state table (once) and leave the board empty."""
    if not self.swe_tuples:
      # The enumeration does not depend on the current board, so the table
      # is built on the first call and reused afterwards.
      self.swe_tuples = self.recur_permutate_states()
      self._swe_lookup = {int(entry[0]): entry for entry in self.swe_tuples}
    self.board_grid.fill(0.0)

    if self.verbose:
      print("Permutating all states/moves...")
      print(self.swe_tuples)
      
class Human(Agent):
  """Agent whose moves are typed in on standard input."""

  def strategic_action(self):
    """Ask the person at the keyboard for the next move."""
    self.input_based_action()

  def input_based_action(self):
    """Read a row and a column from standard input and play that cell.

    The board is shown before every attempt.  Entries that are not whole
    numbers, lie outside the board, or point at an occupied cell are
    rejected with a message and asked for again, so the turn is never lost
    or crashed by a typo.
    """
    rows, cols = self.env.board_dim
    while True:
      self.env.draw_board_in_sym()
      try:
        posX = int(input())
        posY = int(input())
      except ValueError:
        print("Please enter whole numbers for the row and the column.")
        continue

      position = (posX,posY)
      if not (0 <= posX < rows and 0 <= posY < cols):
        print("Position {} is outside the board.".format(position))
        continue
      if not self.env.check_cell_if_empty(position):
        print("Cell {} is already taken.".format(position))
        continue
      break

    self.env.place_move(self.sym, position)
    self.latest_pos = position
    
  
      
# Loops until the game is over
def play_game(env, players, verbose = False, display_board = False):
  """Play one full game on ``env`` and let every player learn from it.

  Args:
    env: environment to play on; its histories are cleared before play.
    players: players in turn order; the first one is registered first.
    verbose: forwarded to ``env.verbose``; also prints each player's symbol.
    display_board: when true, print the final board and the winner.
  """
  env.verbose = verbose
  #env.register_players(random.sample(players, len(players)))
  env.register_players(players)
  env.permutate_states()

  # Every game starts from clean histories; otherwise update_v() would also
  # replay the states of all earlier games with this game's reward.
  env.move_history = []
  env.state_history = []
  env.env_detail_history = []

  for count, player in enumerate(players):
    # Initialize V only once per symbol: doing it before every game would
    # throw away everything the player has learned so far.
    if getattr(player, "_v_initialized_for", None) != player.sym:
      player.initialize_v()
      player._v_initialized_for = player.sym
    if verbose:
      print("Player %s is %s" % (count, player.sym))

  while not env.has_ended():
    for player in players:
      player.strategic_action()
      if env.has_ended():
        break

  if display_board:
    env.draw_board_in_sym()
    env.print_env_winner()

  for player in players:
    player.update_v()

    
def train_agents(env, players, num_loop = 1):
  """Play ``num_loop`` games between ``players`` and print their value tables.

  The game index is printed every fifth game as a progress indicator.
  """
  import sys

  for i in range(num_loop):
    play_game(env,players)
    if i % 5 == 0:
      print(i)

  # NumPy rejects NaN as a threshold; sys.maxsize prints arrays untruncated.
  np.set_printoptions(threshold=sys.maxsize)
  for player in players:
    print(player.V)
    

In [0]:
# Choose between "O" or "X"
# Two learning agents for self-play plus one keyboard-driven player.
pX, pO, pH = Agent(), Agent(), Human()

env = Environment("TTT")

# Train by self-play first, then play five displayed games of the trained
# first agent against the human (the first player of a tuple is registered first).
train_agents(env, (pX, pO), num_loop = 20)
for i in range(5):
  play_game(env, (pX, pH), display_board = True)


In [0]:
# Create a Tic Tac Toe game where an AI uses Reinforcement Learning
# GPU
!pip3 install torch torchvision

In [0]:
import numpy as np
import random
import tensorflow as tf
import torch 
import time


# Acts as an abstract base class
class Agent:
  """Base for anything that acts inside an environment."""

  def __init__(self):
    # No environment is known until attach() is called.
    self.attached = False

  def attach(self, env):
    """Remember ``env`` and mark this agent as attached."""
    self.attached = True
    self.env = env

  def do_action(self, action_info):
    """Hook for subclasses; the base implementation does nothing."""
    return None

class Environment:
  """Base for worlds that agents can be attached to."""

  def __init__(self):
    return None

  def attach_agents(self, agents):
    """Attach every agent whose ``attached`` flag equals False to this world."""
    for candidate in agents:
      already_attached = not (candidate.attached == False)
      if already_attached:
        continue
      candidate.attach(self)

  def visualize_world(self):
    """Hook for subclasses; the base implementation shows nothing."""
    return None
  
  
  
# Derivative Classes  
class AIplayer(Agent):
  """Player that puts its symbol on a randomly chosen empty cell."""

  def __init__(self, sym = " "):
    super().__init__()
    self.sym = sym
    self.action_info = []

  def do_action(self, action_info):
    """Play one random move and append ``(position, ended, sym)`` to ``action_info``.

    ``position`` is None and ``ended`` is True when no empty cell was left.
    """
    position = self.__random_position()
    board_was_full = position is None
    action_info.extend((position, board_was_full, self.sym))

  def __place_sym_in_grid(self, pos):
    # Write this player's numeric symbol into the environment's grid.
    row, col = pos[0], pos[1]
    self.env.grid_in_num[row][col] = self.__sym_to_num(self.sym)

  def __random_position(self):
    # Pick an empty cell at random, mark it, and return it (None if the grid is full).
    candidates = self.__get_empty_cells()
    if len(candidates) == 0:
      return None
    picked = random.choice(candidates)
    self.__place_sym_in_grid(picked)
    return picked

  def __get_empty_cells(self):
    # Index lists of every grid cell whose value equals 0.
    free_mask = self.env.grid_in_num == 0
    return list(free_mask.nonzero().tolist())

  def __sym_to_num(self, sym):
    # "X" -> -1, " " -> 0, "O" -> 1; anything else yields the string "invalid".
    return {"X": -1, " ": 0, "O": 1}.get(sym, "invalid")
  
class TTTgame(Environment):
  """Tic-tac-toe world backed by an integer torch grid.

  Grid cells hold -1 for "X", 0 for an empty cell and 1 for "O".
  """

  def __init__(self):
    super().__init__()
    self.__initialize_world()
    self.player_sym = ("X","O")

  def __initialize_world(self, dim = (3,3)):
    # Remember the grid size and start from an all-empty (all-zero) grid.
    self.grid_row = dim[0]
    self.grid_col = dim[1]
    self.grid_in_num = torch.zeros(dim[0],dim[1], dtype=torch.int)

  def get_winner(self):
    """Print the row/column sums and return the winner, if there is one.

    Returns:
      "X" or "O" when exactly one of them fills a whole row, column or
      diagonal; None otherwise.
    """
    sum_row = torch.sum(self.grid_in_num, 1)
    sum_col = torch.sum(self.grid_in_num, 0)
    print((sum_row,sum_col))

    main_diag = torch.sum(torch.diagonal(self.grid_in_num))
    anti_diag = torch.sum(torch.diagonal(torch.flip(self.grid_in_num, dims=[1])))
    line_totals = sum_row.tolist() + sum_col.tolist()
    line_totals += [main_diag.item(), anti_diag.item()]

    # A line owned by one symbol sums to +/- its length (the grid is square).
    full_line = self.grid_row
    winners = []
    if full_line in line_totals:
      winners.append("O")
    if -full_line in line_totals:
      winners.append("X")
    return winners[0] if len(winners) == 1 else None

  def visualize_world(self):
    """Print the grid using "X", "O" and blanks."""
    self.__draw_grid_in_sym()

  def __draw_grid_in_sym(self):
    print("---------")
    for i in range(0,self.grid_row):
      line_buff = ""
      for j in range(0,self.grid_col):
        char_buff = "|" + self.__num_to_sym(self.grid_in_num[i][j].item()) + "|"
        line_buff += char_buff
      print(line_buff)
      print("---------")

  def __num_to_sym(self, num):
    # -1 -> "X", 0 -> " ", 1 -> "O"; anything else yields "invalid".
    switcher = {
        -1:"X",
        0: " ",
        1: "O"
    }
    return switcher.get(num, "invalid")
 
  
def play_game(players, game):
  """Run one game in which ``players`` take turns on ``game``.

  Players receive their symbols from ``game.player_sym`` in order.  Play
  stops when a player reports that the game has ended or when
  ``game.get_winner()`` returns a winner (anything other than None).  The
  per-move info lists are printed as they occur and once more at the end.
  """
  game.attach_agents(players)
  game.visualize_world()

  # Assign players with symbols.
  # Designed only for 2 players.
  for i, player in enumerate(players):
    player.sym = game.player_sym[i]

  # Game proper
  info_list = []
  # Without players nobody could ever end the game, so skip the loop entirely.
  ended = len(players) == 0
  while not ended:
    for player in players:
      info = []
      player.do_action(info)
      game.visualize_world()
      info_list.append(info)
      print(info)
      winner = game.get_winner()
      # info[1] is the player's own "ended" flag; a reported winner also ends play.
      ended = bool(info[1]) or winner is not None
      if ended:
        break
  print(info_list)
      
      
   
        
      
  
    
    
# Start of the main function  
if __name__ == '__main__':
  # Two random players; play_game() hands out their symbols in tuple order.
  pX, pO = AIplayer(), AIplayer()
  #pH = TODO

  game = TTTgame()

  # Time one complete game and print the elapsed seconds.
  start = time.time()
  play_game((pX, pO), game)
  end = time.time()
  print(end - start)

  

---------
| || || |
---------
| || || |
---------
| || || |
---------
---------
| || ||X|
---------
| || || |
---------
| || || |
---------
[[0, 2], False, 'X']
(tensor([-1,  0,  0]), tensor([ 0,  0, -1]))
---------
|O|| ||X|
---------
| || || |
---------
| || || |
---------
[[0, 0], False, 'O']
(tensor([0, 0, 0]), tensor([ 1,  0, -1]))
---------
|O|| ||X|
---------
| ||X|| |
---------
| || || |
---------
[[1, 1], False, 'X']
(tensor([ 0, -1,  0]), tensor([ 1, -1, -1]))
---------
|O|| ||X|
---------
| ||X|| |
---------
| || ||O|
---------
[[2, 2], False, 'O']
(tensor([ 0, -1,  1]), tensor([ 1, -1,  0]))
---------
|O|| ||X|
---------
|X||X|| |
---------
| || ||O|
---------
[[1, 0], False, 'X']
(tensor([ 0, -2,  1]), tensor([ 0, -1,  0]))
---------
|O|| ||X|
---------
|X||X|| |
---------
|O|| ||O|
---------
[[2, 0], False, 'O']
(tensor([ 0, -2,  2]), tensor([ 1, -1,  0]))
---------
|O|| ||X|
---------
|X||X|| |
---------
|O||X||O|
---------
[[2, 1], False, 'X']
(tensor([ 0, -2,  1]), ten