In [90]:
import re

patterns = {
    "identifier":"([a-z] | [A-Z])([a-z] | [A-Z] | [0-9])*",
    "number": '[0-9]+',
    "symbol": '\+ | \- | \* | / | \( | \)',

}
whitespace = '\s'

class Scanner:
    def __init__(self):
        self.MARKS = {
            'identifier': 'IDENTIFIER',
            'number': 'NUMBER',
            'symbol':'SYMBOL'
        }
        self.symbols = ("+", "-", "*", "/", "(", ")")
        self.tokenTypes = ['identifier', 'number'] # exclude symbol
        self.patterns = {
            "identifier":"([a-z]|[A-Z])([a-z]|[A-Z]|[0-9])*",
            "number": '[0-9]+',
            "symbol": self._generateSymbolsPattern(self.symbols),
            "validChar": '[a-z]|[A-Z]|[0-9]|' + self._generateSymbolsPattern(self.symbols)
        }
        self.validCharPattern = '[a-z]|[A-Z]|[0-9]|\+|\-|\*|\/|\(|\)'
        self.whitespace = '\s'
        self.tokens = []
        self.currentToken = ''

    def scan(self, input):
        for ch in input:
            if self._isWhitespace(ch):
                self._parseToken()

            elif self._isSymbol(ch):
                self._parseToken()
                self.currentToken = ch
                self._parseSymbol()

            elif self._isValidChar(ch):
                self.currentToken += ch

            else:
                print('invalid char')
                ## @@@@@@ throw an exception
                break
        self._parseToken()
        return self.tokens
    def _parseSymbol(self):
        # short cut for parsing symbol
        self.tokens.append((self.currentToken, self.MARKS['symbol']))
        self.currentToken = ''
        

    def _parseToken(self):
        if not self.currentToken: return 
        longestMatchedToken = ''
        longestMatchedTokenType = ''
        for type in self.tokenTypes:
            matched = re.match(self.patterns[type], self.currentToken)
            if matched:
                longestMatchedToken = matched.group() if len(longestMatchedToken) < len(matched.group()) else longestMatchedToken
                longestMatchedTokenType = self.MARKS[type]
        print('longest', longestMatchedToken)
        if longestMatchedToken and longestMatchedTokenType:
            self.tokens.append((longestMatchedToken, longestMatchedTokenType))
            self.currentToken = self.currentToken[len(longestMatchedToken):]
            self._parseToken()
        else:
            # no token can be matched
            print('error')
                
        # @@@@@@ throw an error here. 
        # the token doesn't match any existing pattern, but it doesn't contain any illegal character, it needs to be reprocessed. 
        self.currentToken = ''

    def _isWhitespace(self, char):
        return re.fullmatch(self.whitespace, char)
    
    def _isSymbol(self, char):
        return re.fullmatch(self.patterns['symbol'], char)

    def _isValidChar(self, char):
        return re.fullmatch(self.validCharPattern,char)

    def _generateSymbolsPattern(self, symbols):
        return ''.join(["\\"+s+'|' for s in symbols])[:-1]


s = Scanner()
tests = ['234avd s1233asd23 sfd22*3 ']
for t in tests:
    print(s.scan(t))


type identifier
type number
234
longest 234
type identifier
avd
type number
longest avd
type identifier
s1233asd23
type number
longest s1233asd23
type identifier
sfd22
type number
longest sfd22
type identifier
type number
3
longest 3
[('234', 'NUMBER'), ('avd', 'IDENTIFIER'), ('s1233asd23', 'IDENTIFIER'), ('sfd22', 'IDENTIFIER'), ('*', 'SYMBOL'), ('3', 'NUMBER')]


In [86]:
import re
symbols = ("+", "-", "*", "/", "(", ")")
def _generateSymbolsPattern(symbols):
    return ''.join(["\\"+s+'|' for s in symbols])[:-1]
pattern = "([a-z]|[A-Z])([a-z]|[A-Z]|[0-9])*"
numberP = '[0-9]+'
print(pattern)
re.match(numberP,'234avds1233asd23').group()


([a-z]|[A-Z])([a-z]|[A-Z]|[0-9])*


'234'

In [71]:
c = {"a":1}
b = "a"
print(c[b])

1


In [50]:
max('abc','bc')

'bc'

In [92]:
import re

class Scanner:
    def __init__(self):
        self.MARKS = {
            'identifier': 'IDENTIFIER',
            'number': 'NUMBER',
            'symbol':'SYMBOL'
        }
        self.symbols = ("+", "-", "*", "/", "(", ")")
        self.tokenTypes = ['identifier', 'number'] # exclude symbol
        self.patterns = {
            "identifier":"([a-z]|[A-Z])([a-z]|[A-Z]|[0-9])*",
            "number": '[0-9]+',
            "symbol": self._generateSymbolsPattern(self.symbols),
            "validChar": '[a-z]|[A-Z]|[0-9]|' + self._generateSymbolsPattern(self.symbols)
        }
        self.validCharPattern = '[a-z]|[A-Z]|[0-9]|\+|\-|\*|\/|\(|\)'
        self.whitespace = '\s'
        self.tokens = []
        self.currentToken = ''

    def scan(self, input):
        for ch in input:
            if self._isWhitespace(ch):
                self._parseToken()

            elif self._isSymbol(ch):
                self._parseToken()
                self.currentToken = ch
                self._parseSymbol()

            elif self._isValidChar(ch):
                self.currentToken += ch

            else:
                print('invalid char')
                ## @@@@@@ throw an exception
                break
        self._parseToken()
        return self.tokens
    def _parseSymbol(self):
        # short cut for parsing symbol
        self.tokens.append((self.currentToken, self.MARKS['symbol']))
        self.currentToken = ''
        

    def _parseToken(self):
        if not self.currentToken: return 
        longestMatchedToken = ''
        longestMatchedTokenType = ''
        for type in self.tokenTypes:
            matched = re.match(self.patterns[type], self.currentToken)
            if matched:
                longestMatchedToken = matched.group() if len(longestMatchedToken) < len(matched.group()) else longestMatchedToken
                longestMatchedTokenType = self.MARKS[type]
        if longestMatchedToken and longestMatchedTokenType:
            self.tokens.append((longestMatchedToken, longestMatchedTokenType))
            self.currentToken = self.currentToken[len(longestMatchedToken):]
            self._parseToken()
        else:
            # no token can be matched
            print('error')
                
        # @@@@@@ throw an error here. 
        # the token doesn't match any existing pattern, but it doesn't contain any illegal character, it needs to be reprocessed. 
        self.currentToken = ''

    def _isWhitespace(self, char):
        return re.fullmatch(self.whitespace, char)
    
    def _isSymbol(self, char):
        return re.fullmatch(self.patterns['symbol'], char)

    def _isValidChar(self, char):
        return re.fullmatch(self.validCharPattern,char)

    def _generateSymbolsPattern(self, symbols):
        return ''.join(["\\"+s+'|' for s in symbols])[:-1]

def serializeToken(token):
    if not token: return ''
    return ':'.join(token)


print(serializeToken(('hello',"world")))





hello:world
