Update tutorial1 to work with Python3
Signed-off-by: Víctor Mayoral Vilches <v.mayoralv@gmail.com>
vmayoral committed Jul 14, 2023
1 parent 6dca76d commit e1e97ff
Showing 3 changed files with 80 additions and 27 deletions.
8 changes: 4 additions & 4 deletions tutorial1/cellular.py
@@ -99,7 +99,7 @@ def __init__(self, cell=None, width=None, height=None, directions=8, filename=None):
         self.display = makeDisplay(self)
         self.directions = directions
         if filename is not None:
-            data = file(filename).readlines()
+            data = open(filename).readlines()
             if height is None:
                 height = len(data)
             if width is None:
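
A side note on the file() to open() replacement: neither the old nor the new line closes the file handle explicitly. If this were tightened further, a with-block would close it automatically; a hypothetical variant of the line above, not part of the commit:

    with open(filename) as f:   # closes the file even if an error occurs
        data = f.readlines()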
@@ -168,7 +168,7 @@ def load(self, f):
         if not hasattr(self.Cell, 'load'):
             return
         if isinstance(f, type('')):
-            f = file(f)
+            f = open(f)
         lines = f.readlines()
         lines = [x.rstrip() for x in lines]
         fh = len(lines)
@@ -188,7 +188,7 @@ def load(self, f):
         for j in range(fh):
             line = lines[j]
             for i in range(min(fw, len(line))):
-                self.grid[starty + j][startx + i].load(line[i])
+                self.grid[int(starty + j)][int(startx + i)].load(line[i])
 
     def update(self, fed=None, eaten=None):
         if hasattr(self.Cell, 'update'):
@@ -518,7 +518,7 @@ def redraw(self):
                 try:
                     self.screen.fill(c, (sx, sy, self.size, self.size))
                 except TypeError:
-                    print 'Error: invalid colour:', c
+                    print('Error: invalid colour:', c)
                 sx += self.size
                 odd = not odd
             sy += self.size
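
Three Python 3 incompatibilities drive the four hunks above: file() was removed in favor of open(), the print statement became a function, and / now always performs true division, so index arithmetic that produced an int under Python 2 produces a float under Python 3. A minimal sketch of the indexing failure that the int(...) wrappers guard against (the variables here are illustrative; starty and startx are presumably computed with / elsewhere in cellular.py):

    grid = [['a', 'b'], ['c', 'd']]
    starty = len(grid) / 2    # Python 2: 1 (an int); Python 3: 1.0 (a float)
    try:
        grid[starty]          # Python 3: TypeError, list indices must be integers
    except TypeError as err:
        print(err)
    print(grid[int(starty)])  # the commit's fix: coerce the index back to int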
22 changes: 14 additions & 8 deletions tutorial1/egoMouseLook.py
@@ -1,14 +1,18 @@
+import importlib
 import time
 import random
 import shelve
 
 import pdb
 
 import cellular
-reload(cellular)
+# reload(cellular)  # Python 2
+importlib.reload(cellular)
 import qlearn_mod_random as qlearn # to use the alternative exploration method
 #import qlearn # to use standard exploration method
-reload(qlearn)
+
+# reload(qlearn)  # Python 2
+importlib.reload(qlearn)  # Python 3
 
 directions = 8
 
@@ -103,7 +107,7 @@ def update(self):
 
         # Choose a new action and execute it
         state = self.calcState()
-        print(state)
+        # print(state)
         action = self.ai.chooseAction(state)
         self.lastState = state
         self.lastAction = action
@@ -139,7 +143,7 @@ def cellvalue(cell):
 epsilony = (0.1,0)
 epsilonm = (epsilony[1] - epsilony[0]) / (epsilonx[1] - epsilonx[0])
 
-endAge = world.age + 10000
+endAge = world.age + 100000
 
 while world.age < endAge:
     world.update()
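
For context: epsilonx and epsilony define a linear schedule for the exploration rate, used by the commented-out interpolation visible in the next hunk; epsilon falls from epsilony[0] to epsilony[1] as world.age moves from epsilonx[0] to epsilonx[1], with slope epsilonm. A self-contained sketch of that interpolation, with an assumed epsilonx since its definition lies outside this diff:

    epsilonx = (0, 100000)  # assumed age range; the real value is defined
                            # earlier in egoMouseLook.py, outside this diff
    epsilony = (0.1, 0)     # epsilon decays from 0.1 down to 0
    epsilonm = (epsilony[1] - epsilony[0]) / (epsilonx[1] - epsilonx[0])

    def epsilon_at(age):
        """Linear interpolation, clamped to the schedule endpoints."""
        age = min(max(age, epsilonx[0]), epsilonx[1])
        return epsilonm * (age - epsilonx[0]) + epsilony[0]

    print(epsilon_at(0))       # 0.1
    print(epsilon_at(50000))   # 0.05
    print(epsilon_at(100000))  # 0.0

Since the schedule stays commented out, epsilon keeps whatever value the QLearn constructor set.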
@@ -150,17 +154,19 @@ def cellvalue(cell):
             epsilonm*(world.age - epsilonx[0]) + epsilony[0])'''
 
     if world.age % 10000 == 0:
-        print "{:d}, e: {:0.2f}, W: {:d}, L: {:d}"\
-            .format(world.age, mouse.ai.epsilon, mouse.fed, mouse.eaten)
+        print("{:d}, e: {:0.2f}, W: {:d}, L: {:d}"
+              .format(world.age, mouse.ai.epsilon, mouse.fed, mouse.eaten))
         mouse.eaten = 0
         mouse.fed = 0
 
 world.display.activate(size=30)
 world.display.delay = 1
 
 while 1:
     world.update(mouse.fed, mouse.eaten)
-    print len(mouse.ai.q) # print the amount of state/action, reward
+    # print(len(mouse.ai.q))  # print the number of state/action/reward
+    #                         # entries stored
     import sys
     bytes = sys.getsizeof(mouse.ai.q)
-    print "Bytes: {:d} ({:d} KB)".format(bytes, bytes/1024)
+    # print("Bytes: {:d} ({:d} KB)".format(bytes, bytes/1024))
+    print("Bytes: {:d} ({:.2f} KB)".format(bytes, bytes / 1024))
77 changes: 62 additions & 15 deletions tutorial1/qlearn.py
@@ -1,39 +1,79 @@
+import math
 import random
 
+
 class QLearn:
-    def __init__(self, actions, epsilon=0.1, alpha=0.2, gamma=0.9):
+    """Q-Learning class. Implements the Q-Learning algorithm."""
+
+    def __init__(self,
+                 actions,
+                 epsilon=0.1,
+                 alpha=0.2,
+                 gamma=0.9):
"""Initialize an empty dictionary for Q-Values."""
# Q-Values are stored in a dictionary, with the state-action
         self.q = {}
 
-        self.epsilon = epsilon  # exploration constant
-        self.alpha = alpha      # discount constant
+        # Epsilon is the exploration factor. A higher epsilon
+        # encourages more exploration, risking more but potentially
+        # gaining more too.
+        self.epsilon = epsilon
+
+        # Alpha is the learning rate. If Alpha is high, then the
+        # learning is faster but may not converge. If Alpha is low,
+        # the learning is slower but convergence may be more stable.
+        self.alpha = alpha
+
+        # Gamma is the discount factor.
+        # It prioritizes present rewards over future ones.
         self.gamma = gamma
 
+        # Actions available in the environment
         self.actions = actions
 
     def getQ(self, state, action):
+        """Get the Q-value for a state-action pair.
+        Returns 0.0 if the pair is not yet stored in the dictionary.
+        """
         return self.q.get((state, action), 0.0)
         # return self.q.get((state, action), 1.0)
 
     def learnQ(self, state, action, reward, value):
-        '''
-        Q-learning:
-            Q(s, a) += alpha * (reward(s,a) + max(Q(s') - Q(s,a))
-        '''
+        """Updates the Q-value for a state-action pair.
+        The core Q-Learning update rule:
+            Q(s, a) += alpha * (reward(s,a) + max(Q(s')) - Q(s,a))
+        This function updates the Q-value for a state-action pair
+        based on the reward and maximum estimated future reward.
+        """
         oldv = self.q.get((state, action), None)
         if oldv is None:
+            # If no previous Q-Value exists, then initialize
+            # it with the current reward
             self.q[(state, action)] = reward
         else:
+            # Update the Q-Value with the weighted sum of the old
+            # value and the newly found value.
+            #
+            # Alpha determines how much importance we give to the
+            # new value compared to the old value.
             self.q[(state, action)] = oldv + self.alpha * (value - oldv)
 
     def chooseAction(self, state):
+        """Epsilon-Greedy approach for action selection."""
         if random.random() < self.epsilon:
+            # With probability epsilon, select a random action
             action = random.choice(self.actions)
         else:
+            # With probability 1-epsilon, select the action
+            # with the highest Q-value
             q = [self.getQ(state, a) for a in self.actions]
             maxQ = max(q)
             count = q.count(maxQ)
-            # In case there're several state-action max values
-            # we select a random one among them
+            # If there are multiple actions with the same Q-Value,
+            # then choose randomly among them
             if count > 1:
                 best = [i for i in range(len(self.actions)) if q[i] == maxQ]
                 i = random.choice(best)
@@ -44,13 +84,20 @@ def chooseAction(self, state):
         return action
 
     def learn(self, state1, action1, reward, state2):
+        """Update Q(state1, action1) from the reward and the best
+        Q-value of the next state."""
         maxqnew = max([self.getQ(state2, a) for a in self.actions])
-        self.learnQ(state1, action1, reward, reward + self.gamma*maxqnew)
+        # Learn the Q-Value based on the current reward and future
+        # expected rewards.
+        self.learnQ(state1, action1, reward, reward + self.gamma * maxqnew)
 
-import math
-def ff(f,n):
-    fs = "{:f}".format(f)
+
+# A utility function to format floating point numbers. Not
+# directly related to Q-learning.
+def ff(f, n):
+    """Format a floating point number to a string of width n."""
+    fs = '{:f}'.format(f)
     if len(fs) < n:
-        return ("{:"+n+"s}").format(fs)
+        return ('{:' + str(n) + 's}').format(fs)
     else:
         return fs[:n]