Update tutorial1 to work with Python3
Signed-off-by: Víctor Mayoral Vilches <v.mayoralv@gmail.com>
vmayoral committed Jul 14, 2023
1 parent 6dca76d commit e1e97ff
Showing 3 changed files with 80 additions and 27 deletions.
8 changes: 4 additions & 4 deletions tutorial1/cellular.py
@@ -99,7 +99,7 @@ def __init__(self, cell=None, width=None, height=None, directions=8, filename=None):
         self.display = makeDisplay(self)
         self.directions = directions
         if filename is not None:
-            data = file(filename).readlines()
+            data = open(filename).readlines()
             if height is None:
                 height = len(data)
             if width is None:
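
A side note on the file() to open() replacement: neither the old nor the new line closes the file handle explicitly. If this were tightened further, a with-block would close it automatically; a hypothetical variant of the line above, not part of the commit:

    with open(filename) as f:   # closes the file even if an error occurs
        data = f.readlines()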
@@ -168,7 +168,7 @@ def load(self, f):
         if not hasattr(self.Cell, 'load'):
             return
         if isinstance(f, type('')):
-            f = file(f)
+            f = open(f)
         lines = f.readlines()
         lines = [x.rstrip() for x in lines]
         fh = len(lines)
@@ -188,7 +188,7 @@ def load(self, f):
         for j in range(fh):
             line = lines[j]
             for i in range(min(fw, len(line))):
-                self.grid[starty + j][startx + i].load(line[i])
+                self.grid[int(starty + j)][int(startx + i)].load(line[i])
 
     def update(self, fed=None, eaten=None):
         if hasattr(self.Cell, 'update'):
@@ -518,7 +518,7 @@ def redraw(self):
                 try:
                     self.screen.fill(c, (sx, sy, self.size, self.size))
                 except TypeError:
-                    print 'Error: invalid colour:', c
+                    print('Error: invalid colour:', c)
                 sx += self.size
                 odd = not odd
             sy += self.size
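
Three Python 3 incompatibilities drive the four hunks above: file() was removed in favor of open(), the print statement became a function, and / now always performs true division, so index arithmetic that produced an int under Python 2 produces a float under Python 3. A minimal sketch of the indexing failure that the int(...) wrappers guard against (the variables here are illustrative; starty and startx are presumably computed with / elsewhere in cellular.py):

    grid = [['a', 'b'], ['c', 'd']]
    starty = len(grid) / 2    # Python 2: 1 (an int); Python 3: 1.0 (a float)
    try:
        grid[starty]          # Python 3: TypeError, list indices must be integers
    except TypeError as err:
        print(err)
    print(grid[int(starty)])  # the commit's fix: coerce the index back to int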
22 changes: 14 additions & 8 deletions tutorial1/egoMouseLook.py
@@ -1,14 +1,18 @@
+import importlib
 import time
 import random
 import shelve
 
 import pdb
 
 import cellular
-reload(cellular)
+# reload(cellular)  # Python 2
+importlib.reload(cellular)
 import qlearn_mod_random as qlearn # to use the alternative exploration method
 #import qlearn # to use standard exploration method
-reload(qlearn)
+
+# reload(qlearn)  # Python 2
+importlib.reload(qlearn)  # Python 3
 
 directions = 8
 
@@ -103,7 +107,7 @@ def update(self):
 
         # Choose a new action and execute it
         state = self.calcState()
-        print(state)
+        # print(state)
         action = self.ai.chooseAction(state)
         self.lastState = state
         self.lastAction = action
@@ -139,7 +143,7 @@ def cellvalue(cell):
 epsilony = (0.1,0)
 epsilonm = (epsilony[1] - epsilony[0]) / (epsilonx[1] - epsilonx[0])
 
-endAge = world.age + 10000
+endAge = world.age + 100000
 
 while world.age < endAge:
     world.update()
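
For context: epsilonx and epsilony define a linear schedule for the exploration rate, used by the commented-out interpolation visible in the next hunk; epsilon falls from epsilony[0] to epsilony[1] as world.age moves from epsilonx[0] to epsilonx[1], with slope epsilonm. A self-contained sketch of that interpolation, with an assumed epsilonx since its definition lies outside this diff:

    epsilonx = (0, 100000)  # assumed age range; the real value is defined
                            # earlier in egoMouseLook.py, outside this diff
    epsilony = (0.1, 0)     # epsilon decays from 0.1 down to 0
    epsilonm = (epsilony[1] - epsilony[0]) / (epsilonx[1] - epsilonx[0])

    def epsilon_at(age):
        """Linear interpolation, clamped to the schedule endpoints."""
        age = min(max(age, epsilonx[0]), epsilonx[1])
        return epsilonm * (age - epsilonx[0]) + epsilony[0]

    print(epsilon_at(0))       # 0.1
    print(epsilon_at(50000))   # 0.05
    print(epsilon_at(100000))  # 0.0

Since the schedule stays commented out, epsilon keeps whatever value the QLearn constructor set.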
@@ -150,17 +154,19 @@ def cellvalue(cell):
             epsilonm*(world.age - epsilonx[0]) + epsilony[0])'''
 
     if world.age % 10000 == 0:
-        print "{:d}, e: {:0.2f}, W: {:d}, L: {:d}"\
-            .format(world.age, mouse.ai.epsilon, mouse.fed, mouse.eaten)
+        print("{:d}, e: {:0.2f}, W: {:d}, L: {:d}"
+              .format(world.age, mouse.ai.epsilon, mouse.fed, mouse.eaten))
         mouse.eaten = 0
         mouse.fed = 0
 
 world.display.activate(size=30)
 world.display.delay = 1
 
 while 1:
     world.update(mouse.fed, mouse.eaten)
-    print len(mouse.ai.q) # print the amount of state/action, reward
+    # print(len(mouse.ai.q))  # print the number of state/action/reward
+    #                         # entries stored
     import sys
     bytes = sys.getsizeof(mouse.ai.q)
-    print "Bytes: {:d} ({:d} KB)".format(bytes, bytes/1024)
+    # print("Bytes: {:d} ({:d} KB)".format(bytes, bytes/1024))
+    print("Bytes: {:d} ({:.2f} KB)".format(bytes, bytes / 1024))
77 changes: 62 additions & 15 deletions tutorial1/qlearn.py
@@ -1,39 +1,79 @@
+import math
 import random
 
+
 class QLearn:
-    def __init__(self, actions, epsilon=0.1, alpha=0.2, gamma=0.9):
+    """Q-Learning class. Implements the Q-Learning algorithm."""
+
+    def __init__(self,
+                 actions,
+                 epsilon=0.1,
+                 alpha=0.2,
+                 gamma=0.9):
"""Initialize an empty dictionary for Q-Values."""
# Q-Values are stored in a dictionary, with the state-action
         self.q = {}
 
-        self.epsilon = epsilon  # exploration constant
-        self.alpha = alpha      # discount constant
+        # Epsilon is the exploration factor. A higher epsilon
+        # encourages more exploration, risking more but potentially
+        # gaining more too.
+        self.epsilon = epsilon
+
+        # Alpha is the learning rate. If Alpha is high, then the
+        # learning is faster but may not converge. If Alpha is low,
+        # the learning is slower but convergence may be more stable.
+        self.alpha = alpha
+
+        # Gamma is the discount factor.
+        # It prioritizes present rewards over future ones.
         self.gamma = gamma
 
+        # Actions available in the environment
         self.actions = actions
 
     def getQ(self, state, action):
+        """Get the Q-value for a state-action pair.
+        Returns 0.0 if the pair is not yet stored in the dictionary.
+        """
         return self.q.get((state, action), 0.0)
         # return self.q.get((state, action), 1.0)
 
     def learnQ(self, state, action, reward, value):
-        '''
-        Q-learning:
-            Q(s, a) += alpha * (reward(s,a) + max(Q(s') - Q(s,a))
-        '''
+        """Updates the Q-value for a state-action pair.
+        The core Q-Learning update rule:
+            Q(s, a) += alpha * (reward(s,a) + max(Q(s')) - Q(s,a))
+        This function updates the Q-value for a state-action pair
+        based on the reward and maximum estimated future reward.
+        """
         oldv = self.q.get((state, action), None)
         if oldv is None:
+            # If no previous Q-Value exists, then initialize
+            # it with the current reward
             self.q[(state, action)] = reward
         else:
+            # Update the Q-Value with the weighted sum of the old
+            # value and the newly found value.
+            #
+            # Alpha determines how much importance we give to the
+            # new value compared to the old value.
             self.q[(state, action)] = oldv + self.alpha * (value - oldv)
 
     def chooseAction(self, state):
+        """Epsilon-Greedy approach for action selection."""
         if random.random() < self.epsilon:
+            # With probability epsilon, select a random action
             action = random.choice(self.actions)
         else:
+            # With probability 1-epsilon, select the action
+            # with the highest Q-value
             q = [self.getQ(state, a) for a in self.actions]
             maxQ = max(q)
             count = q.count(maxQ)
-            # In case there're several state-action max values
-            # we select a random one among them
+            # If there are multiple actions with the same Q-Value,
+            # then choose randomly among them
             if count > 1:
                 best = [i for i in range(len(self.actions)) if q[i] == maxQ]
                 i = random.choice(best)
@@ -44,13 +84,20 @@ def chooseAction(self, state):
         return action
 
     def learn(self, state1, action1, reward, state2):
+        """Update Q(state1, action1) from the reward and the best
+        Q-value of the next state."""
         maxqnew = max([self.getQ(state2, a) for a in self.actions])
-        self.learnQ(state1, action1, reward, reward + self.gamma*maxqnew)
+        # Learn the Q-Value based on the current reward and future
+        # expected rewards.
+        self.learnQ(state1, action1, reward, reward + self.gamma * maxqnew)
 
-import math
-def ff(f,n):
-    fs = "{:f}".format(f)
+
+# A utility function to format floating point numbers. Not
+# directly related to Q-learning.
+def ff(f, n):
+    """Format a floating point number to a string of width n."""
+    fs = '{:f}'.format(f)
     if len(fs) < n:
-        return ("{:"+n+"s}").format(fs)
+        return ('{:' + str(n) + 's}').format(fs)
     else:
         return fs[:n]