In [1]:
import sys
import re

def lex(characters, token_exprs):
    """Split ``characters`` into a list of ``(text, tag)`` tokens.

    Parameters
    ----------
    characters : str
        The source text to tokenize.
    token_exprs : sequence of (pattern, tag) pairs
        Regular-expression patterns tried in order at the current position;
        the first one that consumes at least one character wins.  A falsy
        ``tag`` (e.g. ``None``) means the matched text is consumed but no
        token is emitted.

    Returns
    -------
    list
        ``(text, tag)`` tuples in source order.

    Raises
    ------
    SystemExit
        With exit status 1, after writing an ``Illegal character`` line to
        ``sys.stderr``, when no pattern matches at the current position.
    """
    # Compile every pattern once up front instead of at each scan position.
    compiled_exprs = [(re.compile(pattern), tag) for pattern, tag in token_exprs]

    pos = 0
    tokens = []
    while pos < len(characters):
        match = None
        for regex, tag in compiled_exprs:
            candidate = regex.match(characters, pos)
            # A zero-width match would leave ``pos`` unchanged and spin
            # forever, so it is treated the same as "no match".
            if candidate and candidate.end(0) > pos:
                match = candidate
                if tag:
                    tokens.append((match.group(0), tag))
                break
        if match is None:
            # Terminate the message with a real newline.
            sys.stderr.write('Illegal character: %s\n' % characters[pos])
            sys.exit(1)
        pos = match.end(0)
    return tokens

In [2]:
# Tags paired with each matched text in the token expression table.
RESERVED, INT, ID = 'RESERVED', 'INT', 'ID'

In [3]:
# Ordered (pattern, tag) table consumed by ``lex``.  Patterns are tried top to
# bottom and the first match wins, so two-character operators are listed
# before their one-character prefixes (``<=`` before ``<``).  A tag of None
# means the matched text is consumed without emitting a token.
#
# Every keyword ends in ``\b`` so that an identifier which merely starts with
# a keyword (``done``, ``android``, ``iffy``) falls through to the ID rule
# instead of being split into a keyword plus a trailing identifier.
token_exprs = [
    (r'[ \n\t]+',              None),      # whitespace
    (r'#[^\n]*',               None),      # comment up to end of line
    (r':=',                    RESERVED),
    (r'\(',                    RESERVED),
    (r'\)',                    RESERVED),
    (r';',                     RESERVED),
    (r'\+',                    RESERVED),
    (r'-',                     RESERVED),
    (r'\*',                    RESERVED),
    (r'/',                     RESERVED),
    (r'<=',                    RESERVED),
    (r'<',                     RESERVED),
    (r'>=',                    RESERVED),
    (r'>',                     RESERVED),
    (r'=',                     RESERVED),
    (r'!=',                    RESERVED),
    (r'and\b',                 RESERVED),
    (r'or\b',                  RESERVED),
    (r'not\b',                 RESERVED),
    (r'if\b',                  RESERVED),
    (r'then\b',                RESERVED),
    (r'else\b',                RESERVED),
    (r'while\b',               RESERVED),
    (r'do\b',                  RESERVED),
    (r'end\b',                 RESERVED),
    (r'[0-9]+',                INT),
    (r'[A-Za-z][A-Za-z0-9_]*', ID),
]

In [4]:
def imp_lex(characters):
    """Tokenize ``characters`` with the module-level ``token_exprs`` table.

    Thin convenience wrapper around ``lex``; returns whatever ``lex`` returns.
    """
    token_stream = lex(characters, token_exprs)
    return token_stream

In [5]:
# Sample source text for the lexer demo: starting from n = 1570 and p = 1, it
# multiplies p by n while counting n down to 0.
program = '''
n := 1570;
p := 1;
while n > 0 do
  p := p * n;
  n := n - 1
end
'''

In [6]:
# Tokenize the sample program and show one (text, tag) pair per line.
tokens = imp_lex(program)
for token in tokens:
    print(token)

('n', 'ID')
(':=', 'RESERVED')
('1570', 'INT')
(';', 'RESERVED')
('p', 'ID')
(':=', 'RESERVED')
('1', 'INT')
(';', 'RESERVED')
('while', 'RESERVED')
('n', 'ID')
('>', 'RESERVED')
('0', 'INT')
('do', 'RESERVED')
('p', 'ID')
(':=', 'RESERVED')
('p', 'ID')
('*', 'RESERVED')
('n', 'ID')
(';', 'RESERVED')
('n', 'ID')
(':=', 'RESERVED')
('n', 'ID')
('-', 'RESERVED')
('1', 'INT')
('end', 'RESERVED')
