In [1]:
import sys
import re

def lex(characters, token_exprs):
    """Split ``characters`` into a list of ``(text, tag)`` tokens.

    Args:
        characters: Source text to tokenize.
        token_exprs: Iterable of ``(pattern, tag)`` pairs. At each position
            the patterns are tried in order and the first one that matches
            wins. When ``tag`` is falsy (e.g. ``None``) the matched text is
            consumed but no token is emitted.

    Returns:
        List of ``(text, tag)`` tuples in source order.

    Raises:
        SystemExit: with code 1, after writing ``Illegal character: <c>`` to
            stderr, when no pattern matches at the current position.
    """
    # Compile every pattern once instead of once per pattern per position.
    compiled = [(re.compile(pattern), tag) for pattern, tag in token_exprs]

    pos = 0
    tokens = []
    while pos < len(characters):
        match = None
        for regex, tag in compiled:
            candidate = regex.match(characters, pos)
            # A zero-width match would leave ``pos`` unchanged and spin
            # forever, so treat it as "no match" and try the next pattern.
            if candidate and candidate.end(0) > pos:
                match = candidate
                if tag:
                    tokens.append((match.group(0), tag))
                break
        if match is None:
            # '\n' (not '\\n'): terminate the message with a real newline.
            sys.stderr.write('Illegal character: %s\n' % characters[pos])
            sys.exit(1)
        pos = match.end(0)
    return tokens

In [2]:
# Token tags used in the lexer's pattern table; each constant's value is its own name.
RESERVED, INT, ID = 'RESERVED', 'INT', 'ID'

In [3]:
# Lexer table for the IMP language: (regex, tag) pairs that lex() tries in
# order at each position, taking the first match. A tag of None means the
# matched text is consumed without producing a token.
token_exprs = [
    # Whitespace and '#' comments (to end of line) produce no tokens.
    (r'[ \n\t]+',              None),
    (r'#[^\n]*',               None),
    # Punctuation and operators. Two-character operators are listed before
    # the one-character operators they start with ('<=' before '<', etc.).
    (r'\:=',                   RESERVED),
    (r'\(',                    RESERVED),
    (r'\)',                    RESERVED),
    (r';',                     RESERVED),
    (r'\+',                    RESERVED),
    (r'-',                     RESERVED),
    (r'\*',                    RESERVED),
    (r'/',                     RESERVED),
    (r'<=',                    RESERVED),
    (r'<',                     RESERVED),
    (r'>=',                    RESERVED),
    (r'>',                     RESERVED),
    (r'=',                     RESERVED),
    (r'!=',                    RESERVED),
    # Keywords end with \b so that an identifier which merely starts with a
    # keyword ("order", "done", "iffy", "end_1") is not split into a keyword
    # plus a leftover identifier; it falls through to the ID rule instead.
    (r'and\b',                 RESERVED),
    (r'or\b',                  RESERVED),
    (r'not\b',                 RESERVED),
    (r'if\b',                  RESERVED),
    (r'then\b',                RESERVED),
    (r'else\b',                RESERVED),
    (r'while\b',               RESERVED),
    (r'do\b',                  RESERVED),
    (r'end\b',                 RESERVED),
    # Literals and identifiers come last so keywords take priority.
    (r'[0-9]+',                INT),
    (r'[A-Za-z][A-Za-z0-9_]*', ID),
]

In [4]:
def imp_lex(characters):
    """Tokenize IMP source text with the notebook's ``token_exprs`` table.

    Returns whatever ``lex`` returns for ``characters`` and ``token_exprs``.
    """
    imp_tokens = lex(characters, token_exprs)
    return imp_tokens

In [5]:
# Sample IMP program: multiplies p by n while counting n down from 1570 to 0.
# The text starts and ends with a newline.
program = (
    '\n'
    'n := 1570;\n'
    'p := 1;\n'
    'while n > 0 do\n'
    '  p := p * n;\n'
    '  n := n - 1\n'
    'end\n'
)

In [6]:
# Tokenize the sample program and print one (text, tag) tuple per line.
tokens = imp_lex(program)
for token in tokens:
    print (token)

('n', 'ID')
(':=', 'RESERVED')
('1570', 'INT')
(';', 'RESERVED')
('p', 'ID')
(':=', 'RESERVED')
('1', 'INT')
(';', 'RESERVED')
('while', 'RESERVED')
('n', 'ID')
('>', 'RESERVED')
('0', 'INT')
('do', 'RESERVED')
('p', 'ID')
(':=', 'RESERVED')
('p', 'ID')
('*', 'RESERVED')
('n', 'ID')
(';', 'RESERVED')
('n', 'ID')
(':=', 'RESERVED')
('n', 'ID')
('-', 'RESERVED')
('1', 'INT')
('end', 'RESERVED')


In [None]:
# Echo the raw program text as this cell's output.
# NOTE(review): the output recorded under this cell shows an "n := 5" program
# wrapped in double quotes, which does not match the `program` string defined
# earlier in this notebook — presumably left over from an older run; re-run
# the notebook top to bottom to refresh it.
program

'\n"n := 5;\np := 1;\nwhile n > 0 do\n  p := p * n;\n  n := n - 1\nend"\n'

In [None]:
class Interpreter:
    """Character-level parser/evaluator for a tiny statement language.

    Accepted input:

    * statements separated by ``;`` (a trailing ``;`` is allowed);
    * an expression statement: an integer literal made of digits, or an
      expression wrapped in parentheses;
    * ``if <expr>: <statement>`` — the body is exactly one statement. The
      value of the ``if`` is the body's value when the condition is truthy,
      otherwise ``None`` (the body text is skipped without being parsed).

    Anything else — including binary operators and ``else`` — raises
    ``Exception('Invalid syntax')``.
    """

    def __init__(self, code):
        """Store the source text and position the cursor on its first character."""
        self.code = code
        self.pos = 0
        # Empty source has no first character; indexing it would raise
        # IndexError, so start in the end-of-input state instead.
        self.current_char = self.code[self.pos] if self.code else None

    def error(self):
        """Raise the generic syntax error used by every parse method."""
        raise Exception('Invalid syntax')

    def advance(self):
        """Move the cursor one character forward; ``current_char`` becomes None at end of input."""
        self.pos += 1
        if self.pos < len(self.code):
            self.current_char = self.code[self.pos]
        else:
            self.current_char = None

    def skip_whitespace(self):
        """Advance past any run of whitespace characters."""
        while self.current_char is not None and self.current_char.isspace():
            self.advance()

    def _expect(self, char):
        """Consume ``char`` if it is the current character, otherwise raise a syntax error."""
        if self.current_char != char:
            self.error()
        self.advance()

    def parse_number(self):
        """Consume a run of digits and return it as an int."""
        num = ''
        while self.current_char is not None and self.current_char.isdigit():
            num += self.current_char
            self.advance()
        return int(num)

    def parse_expr(self):
        """Parse an integer literal or a parenthesised expression and return its value."""
        self.skip_whitespace()

        if self.current_char is None:
            self.error()

        if self.current_char.isdigit():
            return self.parse_number()

        if self.current_char == '(':
            self.advance()
            result = self.parse_expr()
            # Whitespace is allowed after '(' (parse_expr skips it), so allow
            # it before ')' as well: "( 1 )" is as valid as "(1)".
            self.skip_whitespace()
            self._expect(')')
            return result

        self.error()

    def parse_if_statement(self):
        """Parse ``if <expr>: <statement>`` and return the body's value, or None.

        The statement's terminating ``;`` (if any) is left for
        ``parse_statements`` to consume, whichever branch is taken.
        """
        self.skip_whitespace()
        self._expect('i')
        self._expect('f')

        condition = self.parse_expr()

        self.skip_whitespace()
        self._expect(':')
        self.skip_whitespace()

        if condition:
            # Parse exactly one statement as the body. Parsing *all* remaining
            # statements here would swallow the rest of the program into the
            # body and disagree with the single-statement skip below.
            return self._parse_statement()

        # Condition is falsy: drop the body text, but leave the ';' in place
        # so the statement loop sees its separator and can continue with the
        # next statement.
        self.skip_until_end_of_statement(consume_terminator=False)
        return None

    def _parse_statement(self):
        """Parse one statement (an ``if`` or an expression) without its trailing ``;``."""
        if self.current_char == 'i':
            return self.parse_if_statement()
        return self.parse_expr()

    def parse_statements(self):
        """Parse ``;``-separated statements to end of input and return their values as a list."""
        statements = []
        # Skip leading whitespace before dispatching on the first character,
        # so an indented "if" is recognised and blank input yields [].
        self.skip_whitespace()
        while self.current_char is not None:
            statements.append(self._parse_statement())
            self.skip_whitespace()
            if self.current_char == ';':
                self.advance()
                self.skip_whitespace()
            elif self.current_char is not None:
                # Something other than ';' or end of input follows a statement.
                self.error()
        return statements

    def skip_until_end_of_statement(self, consume_terminator=True):
        """Advance to the next ``;`` (or end of input).

        Args:
            consume_terminator: When True (the default), also step past the
                ``;``. Pass False to stop on it.
        """
        while self.current_char is not None and self.current_char != ';':
            self.advance()
        if consume_terminator and self.current_char == ';':
            self.advance()

    def run(self):
        """Parse and evaluate the whole source; returns the list from ``parse_statements``."""
        statements = self.parse_statements()
        return statements


In [None]:
# Run the interpreter on a one-line if/else program and print the result.
# NOTE(review): the recorded run of this cell ends in "Exception: Invalid
# syntax". The input uses binary operators (+, ==) and an else clause, while
# parse_expr as defined in this notebook only accepts integer literals and
# parenthesised expressions.
interpreter = Interpreter("if 1 + 1 == 2: 4 + 5; else: 6 + 7;")
result = interpreter.run()
print(result)

Exception: Invalid syntax