Commit

Refactor
ahainaut committed Apr 19, 2020
1 parent b94cd65 commit 91afb1d
Showing 10 changed files with 261 additions and 68 deletions.
11 changes: 11 additions & 0 deletions games/abstract_game.py
@@ -87,6 +87,17 @@ def human_to_action(self):
            choice = input("Enter another action : ")
        return int(choice)

    @abstractmethod
    def expert_agent(self):
        """
        Hard-coded agent that MuZero faces to assess its progress in multiplayer games.
        It doesn't influence training.

        Returns:
            Action as an integer to take in the current game state.
        """
        pass

    @abstractmethod
    def action_to_string(self, action_number):
        """
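Since expert_agent is now part of the abstract interface, every game has to provide it (the single-player games below just stub it out with pass). A minimal sketch of a concrete implementation, assuming the class in games/abstract_game.py is named AbstractGame and exposes the legal_actions() method the bundled games define; the MyGame name is hypothetical, for illustration only:

```python
import numpy

from games.abstract_game import AbstractGame  # the class shown in the hunk above


class MyGame(AbstractGame):  # hypothetical game, for illustration only
    def expert_agent(self):
        # Weakest possible "expert": a uniformly random legal action.
        return int(numpy.random.choice(self.legal_actions()))
```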
14 changes: 14 additions & 0 deletions games/breakout.py
@@ -25,6 +25,10 @@ def __init__(self):
        self.players = [i for i in range(1)]  # List of players. You should only edit the length
        self.stacked_observations = 2  # Number of previous observations and previous actions to add to the current observation

        ### Evaluate
        self.muzero_player = 0  # Turn MuZero begins to play (0: MuZero plays first, 1: MuZero plays second)
        self.opponent = None  # Hard-coded agent that MuZero faces to assess its progress in multiplayer games. It doesn't influence training. None, "random" or "expert" if implemented in the Game class



        ### Self-Play
@@ -206,6 +210,16 @@ def human_to_action(self):
"""
pass

def expert_agent(self):
"""
Hard coded agent that MuZero faces to assess his progress in multiplayer games.
It doesn't influence training
Returns:
Action as an integer to take in the current game state
"""
pass

    def action_to_string(self, action_number):
        """
        Convert an action number to a string representing the action.
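The new opponent setting is what evaluation keys on. A minimal sketch of how such a flag might be dispatched when the evaluation opponent must move; select_opponent_action is a hypothetical helper, not part of this commit:

```python
import numpy


def select_opponent_action(game, opponent):
    """Hypothetical dispatcher: choose the evaluation opponent's move."""
    if opponent == "random":
        return int(numpy.random.choice(game.legal_actions()))
    if opponent == "expert":
        return game.expert_agent()  # per-game heuristic, see connect4/tictactoe below
    raise ValueError('opponent must be None, "random" or "expert"')
```

With opponent = None, as here for Breakout, MuZero simply plays alone and no such dispatch is needed.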
14 changes: 14 additions & 0 deletions games/cartpole.py
@@ -20,6 +20,10 @@ def __init__(self):
        self.players = [i for i in range(1)]  # List of players. You should only edit the length
        self.stacked_observations = 0  # Number of previous observations and previous actions to add to the current observation

        ### Evaluate
        self.muzero_player = 0  # Turn MuZero begins to play (0: MuZero plays first, 1: MuZero plays second)
        self.opponent = None  # Hard-coded agent that MuZero faces to assess its progress in multiplayer games. It doesn't influence training. None, "random" or "expert" if implemented in the Game class



        ### Self-Play
@@ -194,6 +198,16 @@ def human_to_action(self):
"""
pass

def expert_agent(self):
"""
Hard coded agent that MuZero faces to assess his progress in multiplayer games.
It doesn't influence training
Returns:
Action as an integer to take in the current game state
"""
pass

    def action_to_string(self, action_number):
        """
        Convert an action number to a string representing the action.
52 changes: 52 additions & 0 deletions games/connect4.py
@@ -20,6 +20,10 @@ def __init__(self):
        self.players = [i for i in range(2)]  # List of players. You should only edit the length
        self.stacked_observations = 0  # Number of previous observations and previous actions to add to the current observation

        ### Evaluate
        self.muzero_player = 0  # Turn MuZero begins to play (0: MuZero plays first, 1: MuZero plays second)
        self.opponent = "expert"  # Hard-coded agent that MuZero faces to assess its progress in multiplayer games. It doesn't influence training. None, "random" or "expert" if implemented in the Game class



        ### Self-Play
@@ -193,6 +197,16 @@ def human_to_action(self):
            choice = input("Enter another column : ")
        return int(choice)

    def expert_agent(self):
        """
        Hard-coded agent that MuZero faces to assess its progress in multiplayer games.
        It doesn't influence training.

        Returns:
            Action as an integer to take in the current game state.
        """
        return self.env.expert_action()

    def action_to_string(self, action_number):
        """
        Convert an action number to a string representing the action.
@@ -296,5 +310,43 @@ def is_finished(self):

        return False

    def expert_action(self):
        """
        Win if possible, otherwise block: scan every 4x4 sub-board for a line of
        three with a playable empty square; fall back to a random legal move.
        """
        board = self.board
        action = numpy.random.choice(self.legal_actions())
        for k in range(3):
            for l in range(4):
                sub_board = board[k:k+4, l:l+4]
                # Horizontal and vertical checks
                for i in range(4):
                    if abs(sum(sub_board[i, :])) == 3:
                        ind = numpy.where(sub_board[i, :] == 0)[0][0]
                        # Only playable if gravity has filled the column up to this row
                        if numpy.count_nonzero(board[:, ind+l]) == i+k:
                            action = ind + l
                            if self.player * sum(sub_board[i, :]) > 0:
                                return action

                    if abs(sum(sub_board[:, i])) == 3:
                        action = i + l
                        if self.player * sum(sub_board[:, i]) > 0:
                            return action
                # Diagonal checks
                diag = sub_board.diagonal()
                anti_diag = numpy.fliplr(sub_board).diagonal()
                if abs(sum(diag)) == 3:
                    ind = numpy.where(diag == 0)[0][0]
                    if numpy.count_nonzero(board[:, ind+l]) == ind+k:
                        action = ind + l
                        if self.player * sum(diag) > 0:
                            return action

                if abs(sum(anti_diag)) == 3:
                    ind = numpy.where(anti_diag == 0)[0][0]
                    if numpy.count_nonzero(board[:, 3-ind+l]) == ind+k:
                        action = 3 - ind + l
                        if self.player * sum(anti_diag) > 0:
                            return action

        return action

    def render(self):
        print(self.board[::-1])
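A quick way to sanity-check the heuristic is to hand it a position with an immediate win. A hypothetical smoke test, assuming the Connect4 environment's step() (not shown in this hunk) drops a piece in the given column for the side to move and then toggles self.player:

```python
env = Connect4()
for col in [0, 0, 1, 1, 2, 2]:  # player 1 builds 0-1-2 on the bottom row
    env.step(col)
# Player 1 to move with three in a row: the expert completes it at column 3.
assert env.expert_action() == 3
```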
14 changes: 14 additions & 0 deletions games/gomoku.py
@@ -21,6 +21,10 @@ def __init__(self):
        self.players = [i for i in range(2)]  # List of players. You should only edit the length
        self.stacked_observations = 0  # Number of previous observations and previous actions to add to the current observation

        ### Evaluate
        self.muzero_player = 0  # Turn MuZero begins to play (0: MuZero plays first, 1: MuZero plays second)
        self.opponent = "random"  # Hard-coded agent that MuZero faces to assess its progress in multiplayer games. It doesn't influence training. None, "random" or "expert" if implemented in the Game class



        ### Self-Play
@@ -194,6 +198,16 @@ def human_to_action(self):
        while not valid:
            valid, action = self.env.human_input_to_action()
        return action

    def expert_agent(self):
        """
        Hard-coded agent that MuZero faces to assess its progress in multiplayer games.
        It doesn't influence training.

        Returns:
            Action as an integer to take in the current game state.
        """
        pass

    def action_to_string(self, action):
        """
14 changes: 14 additions & 0 deletions games/lunarlander.py
@@ -20,6 +20,10 @@ def __init__(self):
        self.players = [i for i in range(1)]  # List of players. You should only edit the length
        self.stacked_observations = 0  # Number of previous observations and previous actions to add to the current observation

        ### Evaluate
        self.muzero_player = 0  # Turn MuZero begins to play (0: MuZero plays first, 1: MuZero plays second)
        self.opponent = None  # Hard-coded agent that MuZero faces to assess its progress in multiplayer games. It doesn't influence training. None, "random" or "expert" if implemented in the Game class



        ### Self-Play
@@ -194,6 +198,16 @@ def human_to_action(self):
"""
pass

def expert_agent(self):
"""
Hard coded agent that MuZero faces to assess his progress in multiplayer games.
It doesn't influence training
Returns:
Action as an integer to take in the current game state
"""
pass

    def action_to_string(self, action_number):
        """
        Convert an action number to a string representing the action.
48 changes: 48 additions & 0 deletions games/tictactoe.py
@@ -20,6 +20,10 @@ def __init__(self):
        self.players = [i for i in range(2)]  # List of players. You should only edit the length
        self.stacked_observations = 0  # Number of previous observations and previous actions to add to the current observation

        ### Evaluate
        self.muzero_player = 0  # Turn MuZero begins to play (0: MuZero plays first, 1: MuZero plays second)
        self.opponent = "expert"  # Hard-coded agent that MuZero faces to assess its progress in multiplayer games. It doesn't influence training. None, "random" or "expert" if implemented in the Game class



        ### Self-Play
@@ -217,6 +221,16 @@ def human_to_action(self):
print("Wrong input, try again")
return choice

def expert_agent(self):
"""
Hard coded agent that MuZero faces to assess his progress in multiplayer games.
It doesn't influence training
Returns:
Action as an integer to take in the current game state
"""
return self.env.expert_action()

    def action_to_string(self, action_number):
        """
        Convert an action number to a string representing the action.
@@ -300,6 +314,40 @@ def is_finished(self):
            return True

        return False

    def expert_action(self):
        """
        Win if possible, otherwise block: look for a line of two with an empty
        square; fall back to a random legal move.
        """
        board = self.board
        action = numpy.random.choice(self.legal_actions())
        # Horizontal and vertical checks
        for i in range(3):
            if abs(sum(board[i, :])) == 2:
                ind = numpy.where(board[i, :] == 0)[0][0]
                action = numpy.ravel_multi_index((numpy.array([i]), numpy.array([ind])), (3, 3))[0]
                if self.player * sum(board[i, :]) > 0:
                    return action

            if abs(sum(board[:, i])) == 2:
                ind = numpy.where(board[:, i] == 0)[0][0]
                action = numpy.ravel_multi_index((numpy.array([ind]), numpy.array([i])), (3, 3))[0]
                if self.player * sum(board[:, i]) > 0:
                    return action

        # Diagonal checks
        diag = board.diagonal()
        anti_diag = numpy.fliplr(board).diagonal()
        if abs(sum(diag)) == 2:
            ind = numpy.where(diag == 0)[0][0]
            action = numpy.ravel_multi_index((numpy.array([ind]), numpy.array([ind])), (3, 3))[0]
            if self.player * sum(diag) > 0:
                return action

        if abs(sum(anti_diag)) == 2:
            ind = numpy.where(anti_diag == 0)[0][0]
            action = numpy.ravel_multi_index((numpy.array([ind]), numpy.array([2 - ind])), (3, 3))[0]
            if self.player * sum(anti_diag) > 0:
                return action

        return action

    def render(self):
        print(self.board[::-1])
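The same kind of smoke test works here, under the assumption that TicTacToe's step() (not shown in this hunk) maps action a to cell (a // 3, a % 3), plays it for the side to move, and toggles self.player:

```python
env = TicTacToe()
for a in [0, 3, 1]:  # player 1 takes cells 0 and 1, player -1 takes cell 3
    env.step(a)
# Player -1 to move: the expert blocks player 1's top row at cell 2.
assert env.expert_action() == 2
```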
8 changes: 4 additions & 4 deletions muzero.py
@@ -120,13 +120,13 @@ def train(self):
"1.Total reward/3.Episode length", infos["episode_length"], counter,
)
writer.add_scalar(
"1.Total reward/4.Player 0 MuZero reward",
infos["player_0_reward"],
"1.Total reward/4.MuZero reward",
infos["muzero_reward"],
counter,
)
writer.add_scalar(
"1.Total reward/5.Player 1 Random reward",
infos["player_1_reward"],
"1.Total reward/5.Opponent reward",
infos["opponent_reward"],
counter,
)
writer.add_scalar(
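The renamed tags keep the TensorBoard charts meaningful whatever the opponent is. A minimal, self-contained sketch of the logging pattern used here, with illustrative values standing in for the real infos dict:

```python
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter()  # writes event files under ./runs by default
infos = {"muzero_reward": 12.0, "opponent_reward": -12.0}  # illustrative values
for counter in range(3):
    writer.add_scalar("1.Total reward/4.MuZero reward", infos["muzero_reward"], counter)
    writer.add_scalar("1.Total reward/5.Opponent reward", infos["opponent_reward"], counter)
writer.close()
```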