
Commit 18e9dde

fix bugs and make it work :) fat model, but it's okay on avg.
1 parent 3a2d4c4 commit 18e9dde

7 files changed: +87 / −77 lines


CMazeExperience.py

Lines changed: 0 additions & 8 deletions
@@ -13,13 +13,6 @@ def __init__(self, maxSize):
   def addEpisode(self, replay):
     score = sum(x[2] for x in replay)
     if score < self.minScore: return
-
-    # for i in range(len(replay)):
-    #   state, act, score, nextState = replay[i]
-    #   gamma = self.gamma
-    #   for j in range(i + 1, len(replay)):
-    #     score += gamma * replay[j][2]
-    #     gamma *= self.gamma
     self.episodes.append((replay, score))

     if self.sizeLimit < len(self.episodes):
@@ -57,7 +50,6 @@ def take_batch(self, batch_size):
         nextStateWeight = 1 if ind < len(episode) - 1 else 0
         batch.append((state, act, score, nextState, nextStateWeight))

-
     return (
       np.array([x[0] for x in batch]),
       np.array([x[1] for x in batch]),
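
Note: the `nextStateWeight` flag in `take_batch` is what marks terminal transitions, so the Bellman target built in train.py collapses to the raw reward at episode ends. A minimal numeric sketch of that masking (all values made up for illustration):

```python
import numpy as np

# Toy 3-step episode; the last transition is terminal (nextStateWeight = 0).
rewards         = np.array([1.0, -0.1, 1.0])
nextStateWeight = np.array([1.0, 1.0, 0.0])
maxNextQ        = np.array([0.5, 0.7, 0.9])  # stand-in for max_a Q(s', a)
gamma = 0.95

# With the mask, the terminal step's target is just its reward.
targets = rewards + gamma * maxNextQ * nextStateWeight
print(targets)  # [1.475 0.565 1.   ]
```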

Core/CMazeEnviroment.py

Lines changed: 3 additions & 4 deletions
@@ -77,9 +77,8 @@ def _takeShot(self):
     return (data[x - d:x + d + 1, y - d:y + d + 1] for data in (maze, fog, moves))

   def minimap(self):
-    #maze, fog, moves = self._takeShot()
-    maze, fog, moves = self.maze, self.fog, self.moves
-    return (maze * fog, moves)
+    maze, fog, moves = self._takeShot()
+    return ((maze * fog) - (1 - fog), moves)

   @property
   def state(self):
@@ -120,7 +119,7 @@ def invalidActions(self):

   def state2input(self):
     maze, moves = self.minimap()
-    state = np.dstack((maze, ))
+    state = np.dstack((maze, moves))
     return state

   @property
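
Note: the reworked `minimap` distinguishes unexplored cells from revealed ones by assigning −1 to anything still under fog, and `state2input` now stacks the `moves` history as a second input channel. A toy illustration of the fog encoding (arrays are invented; only the fog semantics follow from the formula):

```python
import numpy as np

maze = np.array([[1., 0.],
                 [0., 1.]])  # toy cell values
fog  = np.array([[1., 1.],
                 [0., 1.]])  # 1 = cell already revealed

visible = (maze * fog) - (1 - fog)
print(visible)
# [[ 1.  0.]
#  [-1.  1.]]  -> the unseen cell reads -1, distinct from a revealed 0-cell
```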

model.py

Lines changed: 18 additions & 24 deletions
@@ -1,5 +1,6 @@
 import tensorflow.keras as keras
 import tensorflow.keras.layers as layers
+import tensorflow as tf

 def convBlock(prev, sz, filters):
   conv_1 = layers.Convolution2D(filters, (sz, sz), padding="same", activation="relu")(prev)
@@ -14,28 +15,21 @@ def createModel(shape):
   res = convBlock(res, 3, filters=32)

   res = layers.Flatten()(res)
-
-  res = layers.Dense(16 ** 2, activation='relu')(res)
-  res = layers.Dropout(.2)(res)
-  res = layers.Dense(16 ** 2, activation='relu')(res)
-  res = layers.Dropout(.2)(res)
-  res = layers.Dense(16 ** 2, activation='relu')(res)
-  res = layers.Dropout(.2)(res)
-  res = layers.Dense(8 ** 2, activation='relu')(res)
-  res = layers.Dropout(.2)(res)
-  res = layers.Dense(8 ** 2, activation='relu')(res)
-  res = layers.Dropout(.2)(res)
-  res = layers.Dense(8 ** 2, activation='relu')(res)
-  res = layers.Dropout(.2)(res)
-  res = layers.Dense(4 ** 2, activation='relu')(res)
-  res = layers.Dropout(.2)(res)
-  res = layers.Dense(4 ** 2, activation='relu')(res)
-  res = layers.Dropout(.2)(res)
-  res = layers.Dense(4 ** 2, activation='relu')(res)
-  res = layers.Dropout(.2)(res)

-  res = layers.Dense(4, activation='linear')(res)
-  return keras.Model(
-    inputs=inputs,
-    outputs=res
-  )
+  # dueling dqn
+  valueBranch = layers.Dense(32, activation='relu')(res)
+  valueBranch = layers.Dense(32, activation='relu')(valueBranch)
+  valueBranch = layers.Dense(32, activation='relu')(valueBranch)
+  valueBranch = layers.Dense(1, activation='linear')(valueBranch)
+
+  actionsBranch = layers.Dense(128, activation='relu')(res)
+  actionsBranch = layers.Dense(64, activation='relu')(actionsBranch)
+  actionsBranch = layers.Dense(64, activation='relu')(actionsBranch)
+  actionsBranch = layers.Dense(64, activation='relu')(actionsBranch)
+  actionsBranch = layers.Dense(4, activation='linear')(actionsBranch)
+
+  res = layers.Lambda(
+    lambda x: x[1] + (x[0] - tf.reduce_mean(x[0], axis=-1, keepdims=True))
+  )([actionsBranch, valueBranch])
+
+  return keras.Model(inputs=inputs, outputs=res)
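
Note: the Lambda layer is the standard dueling-DQN aggregation, Q(s, a) = V(s) + (A(s, a) − mean_a A(s, a)); centering the advantages keeps the value/advantage split identifiable, since a constant could otherwise be shifted between the two branches without changing Q. A self-contained sketch of the same head (the feature size 128 is illustrative):

```python
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.layers as layers

features = keras.Input(shape=(128,))  # stand-in for the flattened conv features

value = layers.Dense(32, activation='relu')(features)
value = layers.Dense(1, activation='linear')(value)          # V(s): scalar per state

advantage = layers.Dense(64, activation='relu')(features)
advantage = layers.Dense(4, activation='linear')(advantage)  # A(s, a): one per action

# Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a)); broadcasting adds the
# (batch, 1) value to the mean-centered (batch, 4) advantages.
q = layers.Lambda(
  lambda x: x[1] + (x[0] - tf.reduce_mean(x[0], axis=-1, keepdims=True))
)([advantage, value])

keras.Model(inputs=features, outputs=q).summary()
```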

train.py

Lines changed: 24 additions & 25 deletions
@@ -39,83 +39,82 @@ def emulate(env, model, exploreRate, exploreDecay, steps, stopOnInvalid=False):
     probe = model.predict(np.array([state]))[0]
     if not stopOnInvalid:
       for i in env.invalidActions():
-        probe[i] = -1
+        probe[i] = -float('inf')
     act = np.argmax(probe)

     if stopOnInvalid and not (act in valid):
-      episodeReplay.append([state, act, -1, env.state2input()])
+      episodeReplay.append([state, act, -10, env.state2input()])
       break

     prevScore = env.score
     env.apply(MAZE_ACTIONS[act])
-    normedScore = 1 if 0 < (env.score - prevScore) else -.1
+    normedScore = 1 if 0 < (env.score - prevScore) else -0.1
     episodeReplay.append([state, act, normedScore, env.state2input()])

     done = env.done
-    exploreRate = max((.01, exploreRate * exploreDecay))
+    exploreRate = max((.001, exploreRate * exploreDecay))
   return episodeReplay

 if __name__ == "__main__":
-  sz = 32
+  sz = 64
   env = CMazeEnviroment(
     maze=(0.8 < np.random.rand(sz, sz)).astype(np.float32),
     pos=(0, 0),
     FOV=3,
     minimapSize=8
   )
-  memory = CMazeExperience(maxSize=100)
+  memory = CMazeExperience(maxSize=1000)
   done = False
-  batch_size = 64
-  playSteps = 64
+  batch_size = 256
+  playSteps = 96

-  bestModelScore = 0
+  bestModelScore = -float('inf')
   model = createModel(shape=env.input_size)
   model.compile(
     optimizer=Adam(lr=1e-3),
     loss='mean_squared_error'
   )
-  # model.load_weights('model.h5')
+  #model.load_weights('weights/best.h5')

   targetModel = createModel(shape=env.input_size)
-  np.set_printoptions(precision=3)
   # collect data
-  while len(memory) < 50:
+  while len(memory) < 100:
     env.respawn()
     episodeReplay = emulate(
       env, model,
-      exploreRate=0.9,
-      exploreDecay=0.9,
+      exploreRate=1,
+      exploreDecay=1,
       steps=playSteps,
       stopOnInvalid=False
     )
     #################
     if 1 < len(episodeReplay):
       memory.addEpisode(episodeReplay)
       print(len(memory), env.score)
-  memory.update()

-  train_episodes = 500
-  test_episodes = 10
-  exploreRate = 1
-  exploreDecayPerEpoch = .9
-  exploreDecay = .9
+  train_episodes = 100
+  test_episodes = 20
+  exploreRate = .5
+  exploreDecayPerEpoch = .95
+  exploreDecay = .95
   for epoch in range(5000):
     print('Epoch %d' % epoch)
     # train
     targetModel.set_weights(model.get_weights())
     lossSum = 0
     for n in range(train_episodes):
       states, actions, rewards, nextStates, nextReward = memory.take_batch(batch_size)
-      targets = targetModel.predict(nextStates)
-      targets[np.arange(len(targets)), actions] = rewards + np.max(targets, axis=1) * .9 * nextReward
+      nextScores = targetModel.predict(nextStates)
+      targets = targetModel.predict(states)
+      targets[np.arange(len(targets)), actions] = rewards + np.max(nextScores, axis=1) * .95 * nextReward

       lossSum += model.fit(
         states, targets,
         epochs=1,
         verbose=0
       ).history['loss'][0]
+
     print('Avg. train loss: %.4f' % (lossSum / train_episodes))
-    print(targets[0])

     # test
     print('Epoch %d testing' % epoch)
@@ -141,6 +140,6 @@ def emulate(env, model, exploreRate, exploreDecay, steps, stopOnInvalid=False):
       if bestModelScore < scoreSum:
         bestModelScore = scoreSum
         print('save best model')
-        model.save_weights('model.h5')
-        model.save_weights('latest.h5')
+        model.save_weights('weights/best.h5')
+        model.save_weights('weights/latest.h5')
       exploreRate *= exploreDecayPerEpoch
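
Note: the old loop computed `targets = targetModel.predict(nextStates)` and fit the model on `states` against those next-state Q-values, so every non-taken action was pulled toward the wrong state's predictions. The fix predicts Q-values for the current states and overwrites only the taken action's entry with the Bellman target from the frozen target network. A numeric sketch of the corrected update (all values invented):

```python
import numpy as np

actions    = np.array([2, 0])
rewards    = np.array([1.0, -0.1])
nextReward = np.array([1.0, 0.0])  # 0 disables bootstrapping at episode end

nextScores = np.array([[0.1, 0.4, 0.2, 0.0],   # stand-in for targetModel.predict(nextStates)
                       [0.3, 0.1, 0.0, 0.2]])
targets    = np.array([[0.5, 0.2, 0.1, 0.0],   # stand-in for targetModel.predict(states)
                       [0.0, 0.4, 0.3, 0.1]])

# Only the taken action's entry becomes the Bellman target; the remaining
# entries keep the target network's predictions, so the MSE loss barely
# disturbs the untaken actions.
targets[np.arange(len(targets)), actions] = rewards + np.max(nextScores, axis=1) * .95 * nextReward
print(targets[0, 2])  # 1.0 + 0.95 * 0.4 = 1.38
print(targets[1, 0])  # -0.1 (terminal transition: no bootstrap term)
```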

view_maze.py

Lines changed: 42 additions & 16 deletions
@@ -1,18 +1,13 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
-import sys
-import os
 import tensorflow as tf
+import os

-if 'COLAB_GPU' in os.environ:
-  # fix resolve modules
-  from os.path import dirname
-  sys.path.append(dirname(dirname(dirname(__file__))))
-else: # local GPU
-  gpus = tf.config.experimental.list_physical_devices('GPU')
-  tf.config.experimental.set_virtual_device_configuration(
-    gpus[0], [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1 * 1024)]
-  )
+# limit GPU usage
+gpus = tf.config.experimental.list_physical_devices('GPU')
+tf.config.experimental.set_virtual_device_configuration(
+  gpus[0], [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1 * 1024)]
+)

 from Core.CMazeEnviroment import CMazeEnviroment, MazeActions
 import numpy as np
@@ -44,6 +39,7 @@ class Colors:

 class App:
   MODES = ['manual', 'random', 'agent']
+  NETWORKS = ['best', 'latest']

   def __init__(self):
     self._running = True
@@ -52,6 +48,8 @@ def __init__(self):
     self._mode = 'manual'
     self._paused = True
     self._speed = 20
+    self._usedNetwork = self.NETWORKS[0]
+    return

   def _createMaze(self):
     self._maze = createMaze()
@@ -65,7 +63,18 @@ def on_init(self):
     pygame.display.set_caption('Deep maze')
     self._font = pygame.font.Font(pygame.font.get_default_font(), 16)
     self._running = True
-
+
+  def _createNewAgent(self):
+    filename = 'weights/%s.h5' % self._usedNetwork
+    if not os.path.exists(filename):
+      self._usedNetwork = self.NETWORKS[0]
+      filename = 'weights/%s.h5' % self._usedNetwork
+
+    self._agent = createModel(shape=self._maze.input_size)
+    self._agent.load_weights(filename)
+    self._paused = True
+    return
+
   def on_event(self, event):
     if event.type == G.QUIT:
       self._running = False
@@ -77,12 +86,21 @@ def on_event(self, event):
       self._paused = True

       if 'agent' == self._mode:
-        self._agent = createModel(shape=self._maze.input_size)
-        self._agent.load_weights('model.h5')
+        self._createNewAgent()

     if G.K_SPACE == event.key:
       self._paused = not self._paused

+    if 'agent' == self._mode:
+      if G.K_r == event.key:
+        self._createMaze()
+      if G.K_n == event.key:
+        self._createNewAgent()
+      if G.K_t == event.key:
+        network = next((i for i, x in enumerate(self.NETWORKS) if x == self._usedNetwork))
+        self._usedNetwork = self.NETWORKS[(network + 1) % len(self.NETWORKS)]
+        self._createNewAgent()
+
     if G.K_ESCAPE == event.key:
       self._running = False

@@ -121,7 +139,7 @@ def on_loop(self):
     if 'agent' == self._mode:
       probe = self._agent.predict(np.array([self._maze.state2input()]))[0]
       for i in self._maze.invalidActions():
-        probe[i] = -1
+        probe[i] = -float('inf')
       pred = np.argmax(probe)

       act = list(MazeActions)[pred]
@@ -196,12 +214,20 @@ def _renderInfo(self):
         False, Colors.BLUE
       ), (655, 35)
     )
+
+    if 'agent' == self._mode:
+      self._display_surf.blit(
+        self._font.render(
+          'Network: %s' % (self._usedNetwork),
+          False, Colors.BLUE
+        ), (655, 55)
+      )
     return

   def on_render(self):
     self._display_surf.fill(Colors.SILVER)
     self._renderMaze()
-    # self._renderMazeMinimap()
+    self._renderMazeMinimap()
     self._renderInfo()
     pygame.display.flip()
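
Note: both train.py and the viewer now mask invalid actions with -inf instead of -1 before taking argmax; with -1, a blocked action could still win whenever every legal Q-value was below -1. A short demonstration (toy Q-values):

```python
import numpy as np

probe = np.array([-0.5, -2.0, -3.0, -4.0])  # all Q-values negative (toy example)
invalid = [0]                                # action 0 is blocked

old = probe.copy()
old[invalid] = -1
print(np.argmax(old))  # 0 -> the blocked action still wins

new = probe.copy()
new[invalid] = -float('inf')
print(np.argmax(new))  # 1 -> the blocked action can never be selected
```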

weights/best.h5

5.85 MB
Binary file not shown.

weights/latest.h5

5.85 MB
Binary file not shown.
