diff --git a/tutorial1/cellular.py b/tutorial1/cellular.py
index 8fe7837..10fd1c7 100755
--- a/tutorial1/cellular.py
+++ b/tutorial1/cellular.py
@@ -99,7 +99,7 @@ def __init__(self, cell=None, width=None, height=None, directions=8, filename=No
         self.display = makeDisplay(self)
         self.directions = directions
         if filename is not None:
-            data = file(filename).readlines()
+            data = open(filename).readlines()
             if height is None:
                 height = len(data)
             if width is None:
@@ -168,7 +168,7 @@ def load(self, f):
         if not hasattr(self.Cell, 'load'):
             return
         if isinstance(f, type('')):
-            f = file(f)
+            f = open(f)
         lines = f.readlines()
         lines = [x.rstrip() for x in lines]
         fh = len(lines)
@@ -188,7 +188,7 @@ def load(self, f):
         for j in range(fh):
            line = lines[j]
            for i in range(min(fw, len(line))):
-                self.grid[starty + j][startx + i].load(line[i])
+                self.grid[int(starty + j)][int(startx + i)].load(line[i])
 
    def update(self, fed=None, eaten=None):
        if hasattr(self.Cell, 'update'):
@@ -518,7 +518,7 @@ def redraw(self):
                try:
                    self.screen.fill(c, (sx, sy, self.size, self.size))
                except TypeError:
-                    print 'Error: invalid colour:', c
+                    print('Error: invalid colour:', c)
                sx += self.size
            odd = not odd
            sy += self.size
diff --git a/tutorial1/egoMouseLook.py b/tutorial1/egoMouseLook.py
index 9c162ac..cccfaee 100644
--- a/tutorial1/egoMouseLook.py
+++ b/tutorial1/egoMouseLook.py
@@ -1,3 +1,4 @@
+import importlib
 import time
 import random
 import shelve
@@ -5,10 +6,13 @@ import pdb
 
 import cellular
-reload(cellular)
+# reload(cellular)  # Python 2
+importlib.reload(cellular)
 import qlearn_mod_random as qlearn  # to use the alternative exploration method
 #import qlearn  # to use standard exploration method
-reload(qlearn)
+
+# reload(qlearn)  # Python 2
+importlib.reload(qlearn)  # Python 3
 
 directions = 8
 
@@ -103,7 +107,7 @@ def update(self):
 
         # Choose a new action and execute it
         state = self.calcState()
-        print(state)
+        # print(state)
         action = self.ai.chooseAction(state)
         self.lastState = state
         self.lastAction = action
@@ -139,7 +143,7 @@ def cellvalue(cell):
 epsilony = (0.1,0)
 epsilonm = (epsilony[1] - epsilony[0]) / (epsilonx[1] - epsilonx[0])
 
-endAge = world.age + 10000
+endAge = world.age + 100000
 
 while world.age < endAge:
     world.update()
@@ -150,17 +154,19 @@ def cellvalue(cell):
            epsilonm*(world.age - epsilonx[0]) + epsilony[0])'''
 
     if world.age % 10000 == 0:
-        print "{:d}, e: {:0.2f}, W: {:d}, L: {:d}"\
-            .format(world.age, mouse.ai.epsilon, mouse.fed, mouse.eaten)
+        print("{:d}, e: {:0.2f}, W: {:d}, L: {:d}"\
+            .format(world.age, mouse.ai.epsilon, mouse.fed, mouse.eaten))
         mouse.eaten = 0
         mouse.fed = 0
 
 world.display.activate(size=30)
 world.display.delay = 1
+
 while 1:
     world.update(mouse.fed, mouse.eaten)
-print len(mouse.ai.q) # print the amount of state/action, reward
+# print(len(mouse.ai.q))  # print the amount of state/action, reward
                          # elements stored
 import sys
 bytes = sys.getsizeof(mouse.ai.q)
-print "Bytes: {:d} ({:d} KB)".format(bytes, bytes/1024)
+# print("Bytes: {:d} ({:d} KB)".format(bytes, bytes/1024))
+print("Bytes: {:d} ({:.2f} KB)".format(bytes, bytes/1024))
diff --git a/tutorial1/qlearn.py b/tutorial1/qlearn.py
index 7426b1f..a83f83f 100755
--- a/tutorial1/qlearn.py
+++ b/tutorial1/qlearn.py
@@ -1,39 +1,79 @@
+import math
 import random
 
 
 class QLearn:
-    def __init__(self, actions, epsilon=0.1, alpha=0.2, gamma=0.9):
+    """Q-Learning class. Implements the Q-Learning algorithm."""
+
+    def __init__(self,
+                 actions,
+                 epsilon=0.1,
+                 alpha=0.2,
+                 gamma=0.9):
+        """Initialize an empty dictionary for Q-Values."""
+        # Q-Values are stored in a dictionary, with the state-action
+        # pair as the key.
         self.q = {}
-        self.epsilon = epsilon  # exploration constant
-        self.alpha = alpha  # discount constant
+
+        # Epsilon is the exploration factor. A higher epsilon
+        # encourages more exploration, risking more but potentially
+        # gaining more too.
+        self.epsilon = epsilon
+
+        # Alpha is the learning rate. If Alpha is high, then the
+        # learning is faster but may not converge. If Alpha is low,
+        # the learning is slower but convergence may be more stable.
+        self.alpha = alpha
+
+        # Gamma is the discount factor.
+        # A lower gamma prioritizes present rewards over future ones.
         self.gamma = gamma
+
+        # Actions available in the environment
         self.actions = actions
 
     def getQ(self, state, action):
+        """Get the Q-value for a state-action pair.
+
+        Return 0.0 if the state-action pair is not found
+        in the dictionary.
+        """
         return self.q.get((state, action), 0.0)
-        # return self.q.get((state, action), 1.0)
 
     def learnQ(self, state, action, reward, value):
-        '''
-        Q-learning:
-            Q(s, a) += alpha * (reward(s,a) + max(Q(s') - Q(s,a))
-        '''
+        """Update the Q-value for a state-action pair.
+
+        The core Q-Learning update rule:
+            Q(s, a) += alpha * (reward(s, a) + gamma * max(Q(s')) - Q(s, a))
+
+        This function updates the Q-value for a state-action pair
+        based on the reward and the maximum estimated future reward.
+        """
         oldv = self.q.get((state, action), None)
         if oldv is None:
+            # If no previous Q-Value exists, then initialize
+            # it with the current reward.
             self.q[(state, action)] = reward
         else:
+            # Update the Q-Value with the weighted sum of the old
+            # value and the newly found value.
+            #
+            # Alpha determines how much importance we give to the
+            # new value compared to the old value.
             self.q[(state, action)] = oldv + self.alpha * (value - oldv)
 
     def chooseAction(self, state):
+        """Epsilon-Greedy approach for action selection."""
         if random.random() < self.epsilon:
+            # With probability epsilon, we select a random action
             action = random.choice(self.actions)
         else:
+            # With probability 1-epsilon, we select the action
+            # with the highest Q-value
             q = [self.getQ(state, a) for a in self.actions]
             maxQ = max(q)
             count = q.count(maxQ)
-            # In case there're several state-action max values
-            # we select a random one among them
+            # If there are multiple actions with the same Q-Value,
+            # then choose randomly among them
             if count > 1:
                 best = [i for i in range(len(self.actions)) if q[i] == maxQ]
                 i = random.choice(best)
@@ -44,13 +84,20 @@ def chooseAction(self, state):
         return action
 
     def learn(self, state1, action1, reward, state2):
+        """Learn from a (state1, action1, reward, state2) transition."""
+        # Get the maximum Q-Value for the next state.
         maxqnew = max([self.getQ(state2, a) for a in self.actions])
-        self.learnQ(state1, action1, reward, reward + self.gamma*maxqnew)
-import math
-def ff(f,n):
-    fs = "{:f}".format(f)
+        # Learn the Q-Value based on the current reward and the
+        # expected future rewards.
+        self.learnQ(state1, action1, reward, reward + self.gamma * maxqnew)
+
+
+# A utility function to format floating point numbers. Not
+# directly related to Q-learning.
+def ff(f, n):
+    """Format a floating point number to a string of width n."""
+    fs = '{:f}'.format(f)
     if len(fs) < n:
-        return ("{:"+n+"s}").format(fs)
+        return ('{:' + str(n) + 's}').format(fs)
     else:
         return fs[:n]
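
For reference, the QLearn interface this patch documents (the constructor plus chooseAction, learn, and getQ) can be exercised on its own with a short sketch like the one below. The five-cell corridor environment is hypothetical and only illustrates the call pattern; the class itself is the one from tutorial1/qlearn.py above.

# Minimal usage sketch of the QLearn interface from tutorial1/qlearn.py.
# The corridor "environment" here is illustrative, not part of the tutorial.
import qlearn

actions = (-1, +1)                       # step left or step right
ai = qlearn.QLearn(actions, epsilon=0.1, alpha=0.2, gamma=0.9)

for episode in range(200):
    state = 0                            # start at the left end of the corridor
    while state < 4:                     # cell 4 is the goal
        action = ai.chooseAction(state)
        next_state = max(0, min(4, state + action))
        reward = 50 if next_state == 4 else -1
        ai.learn(state, action, reward, next_state)
        state = next_state

# After training, stepping right from cell 0 should have the higher Q-value.
print(ai.getQ(0, +1), ai.getQ(0, -1))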