In [1]:
import re
from tokenize import tokenize, ENCODING, NEWLINE, ENDMARKER
from io import BytesIO
import json


In [2]:
KEYWORDS = ("and", "as", "assert", "await", "async", "break", "class", "continue", "def", "del", "elif", "else", "except", "exec", "finally", "for", "from",
            "global", "nonlocal", "if", "import", "in", "is", "lambda", "not", "or", "pass", "print", "raise", "return", "try", "while", "with", "yield", "range", "hex")

In [3]:
UPPER_KEYWORDS = [x.upper() for x in KEYWORDS]
print(UPPER_KEYWORDS)

['AND', 'AS', 'ASSERT', 'AWAIT', 'ASYNC', 'BREAK', 'CLASS', 'CONTINUE', 'DEF', 'DEL', 'ELIF', 'ELSE', 'EXCEPT', 'EXEC', 'FINALLY', 'FOR', 'FROM', 'GLOBAL', 'NONLOCAL', 'IF', 'IMPORT', 'IN', 'IS', 'LAMBDA', 'NOT', 'OR', 'PASS', 'PRINT', 'RAISE', 'RETURN', 'TRY', 'WHILE', 'WITH', 'YIELD', 'RANGE', 'HEX']


In [4]:
TOKENS = (
    (r'[a-zA-Z_]\w*', 'VAR'),
    (r'0', 'INT'),
    (r'[-+]?\d+[eE][-+]?\d+[jJ]', 'FLOAT_EXPONANT_COMPLEX'),
    (r'[-+]?\d+.\d?[eE][-+]?\d+[jJ]', 'FLOAT_EXPONANT_COMPLEX'),
    (r'[-+]?\d?.\d+[eE][-+]?\d+[jJ]', 'FLOAT_EXPONANT_COMPLEX'),
    (r'\d+[eE][-+]?\d*', 'FLOAT_EXPONANT'),
    (r'\d+\.\d*[eE][-+]?\d*', 'FLOAT_EXPONANT'),
    (r'\.\d+[eE][-+]?\d*', 'FLOAT_EXPONANT'),
    (r'\d*\.\d+[jJ]', 'COMPLEX'),
    (r'\d+\.[jJ]', 'COMPLEX'),
    (r'\d+[jJ]', 'COMPLEX'),
    (r'\d+\.', 'FLOAT'),
    (r'\d*[_\d]*\.[_\d]+[lL]?', 'FLOAT'),
    (r'\d+[_\d]+\.[_\d]*[lL]?', 'FLOAT'),
    (r'\.', 'DOT'),
    (r'[1-9]+[_\d]*[lL]', 'LONG'),
    (r'[1-9]+[_\d]*', 'INT'),
    (r'0[xX][\d_a-fA-F]+[lL]?', 'HEXA'),
    (r'(0[oO][0-7]+)|(0[0-7_]*)[lL]?', 'OCTA'),
    (r'0[bB][01_]+[lL]?', 'BINARY'),
    (r'\(', 'LEFT_PARENTHESIS'),
    (r'\)', 'RIGHT_PARENTHESIS'),
    (r':', 'COLON'),
    (r',', 'COMMA'),
    (r';', 'SEMICOLON'),
    (r'@', 'AT'),
    (r'\+', 'PLUS'),
    (r'-', 'MINUS'),
    (r'\*', 'STAR'),
    (r'/', 'SLASH'),
    (r'\|', 'VBAR'),
    (r'&', 'AMPER'),
    (r'@', 'AT'),
    (r'<', 'LESS'),
    (r'>', 'GREATER'),
    (r'=', 'EQUAL'),
    (r'%', 'PERCENT'),
    (r'\[', 'LEFT_SQUARE_BRACKET'),
    (r'\]', 'RIGHT_SQUARE_BRACKET'),
    (r'\{', 'LEFT_BRACKET'),
    (r'\}', 'RIGHT_BRACKET'),
    (r'`', 'BACKQUOTE'),
    (r'==', 'EQUAL_EQUAL'),
    (r'<>', 'NOT_EQUAL'),
    (r'!=', 'NOT_EQUAL'),
    (r'<=', 'LESS_EQUAL'),
    (r'>=', 'GREATER_EQUAL'),
    (r'~', 'TILDE'),
    (r'\^', 'CIRCUMFLEX'),
    (r'<<', 'LEFT_SHIFT'),
    (r'>>', 'RIGHT_SHIFT'),
    (r'\*\*', 'DOUBLE_STAR'),
    (r'\+=', 'PLUS_EQUAL'),
    (r'-=', 'MINUS_EQUAL'),
    (r'@=', 'AT_EQUAL'),
    (r'\*=', 'STAR_EQUAL'),
    (r'/=', 'SLASH_EQUAL'),
    (r'%=', 'PERCENT_EQUAL'),
    (r'&=', 'AMPER_EQUAL'),
    (r'\|=', 'VBAR_EQUAL'),
    (r'\^=', 'CIRCUMFLEX_EQUAL'),
    (r'<<=', 'LEFT_SHIFT_EQUAL'),
    (r'>>=', 'RIGHT_SHIFT_EQUAL'),
    (r'\.\.\.', 'ELLIPSIS'),
    (r'->', 'RIGHT_ARROW'),
    (r'\*\*=', 'DOUBLE_STAR_EQUAL'),
    (r'//', 'DOUBLE_SLASH'),
    (r'//=', 'DOUBLE_SLASH_EQUAL'),
    (r'\n', 'ENDL'),
    (r'\r\n', 'ENDL'),
    (r'#.*', 'COMMENT'),
    (r'(\s|\\\n|\\\r\n)+', 'SPACE'),
    (r'["\'](.|\n|\r)*["\']', 'STRING'),
    (r'[uU]["\'](.|\n|\r)*["\']', 'UNICODE_STRING'),
    (r'[fF]["\'](.|\n|\r)*["\']', 'INTERPOLATED_STRING'),
    (r'[rR]["\'](.|\n|\r)*["\']', 'RAW_STRING'),
    (r'[bB]["\'](.|\n|\r)*["\']', 'BINARY_STRING'),
    (r'[uU][rR]["\'](.|\n|\r)*["\']', 'UNICODE_RAW_STRING'),
    (r'[bB][rR]["\'](.|\n|\r)*["\']', 'BINARY_RAW_STRING'),
    (r'[fF][rR]["\'](.|\n|\r)*["\']', 'INTERPOLATED_RAW_STRING'),
    (r'[rR][fF]["\'](.|\n|\r)*["\']', 'INTERPOLATED_RAW_STRING'),
)

In [5]:
TOKENS = [(re.compile('^' + x[0] + '$'), x[1]) for x in TOKENS]

In [6]:
""" b = [(idx, item) for idx,item in enumerate(TOKENS)]
print(b) """

' b = [(idx, item) for idx,item in enumerate(TOKENS)]\nprint(b) '

In [7]:
def more_tokenize(sequence, print_function=False):
    return list(tokenize_generator(sequence))

In [8]:
def tokenize_current_keywords(print_function=False):
    if print_function is True:
        return [x for x in KEYWORDS if x != "print"]
    else:
        return KEYWORDS

In [9]:
def tokenize_generator(sequence):
    print(sequence)
    current_keywords = tokenize_current_keywords()
    for item in sequence:
        if item in current_keywords:
            yield [item.upper(), item]
            continue
        
        for candidate, token_name in TOKENS:
            if candidate.match(item):
                yield [token_name, item]
                break
        else:
            raise Exception(
                "Can't find a matching token for this item: '%s'" % item)
    # yield ('ENDMARKER', '')
    # yield

In [10]:

def call_moretokenizer(errorline):
    g = tokenize(BytesIO(errorline.encode('utf-8')).readline)
    tokenarray = []
    for toknum, tokval, _, _, _ in g:
        if toknum not in [ENCODING, NEWLINE, ENDMARKER] and tokval != '':
            tokenarray.append(tokval)
    more_tkn = more_tokenize(tokenarray)
    tkn_named_entity = []
    for tkn in more_tkn:
        tkn_named_entity.append(tkn[0])
        
    return tkn_named_entity


In [11]:
""" list2=[]
for i in range(len(TOKENS)):
    list2.append(i)
print(list2) """

' list2=[]\nfor i in range(len(TOKENS)):\n    list2.append(i)\nprint(list2) '

In [12]:
errorcode="""
def say_hello():
	print()
say_hello()
if 
for
"""

deneme=call_moretokenizer(errorcode)


print(deneme)


['\n', 'def', 'say_hello', '(', ')', ':', '\t', 'print', '(', ')', 'say_hello', '(', ')', 'if', 'for']
['ENDL', 'DEF', 'VAR', 'LEFT_PARENTHESIS', 'RIGHT_PARENTHESIS', 'COLON', 'SPACE', 'PRINT', 'LEFT_PARENTHESIS', 'RIGHT_PARENTHESIS', 'VAR', 'LEFT_PARENTHESIS', 'RIGHT_PARENTHESIS', 'IF', 'FOR']


In [13]:
list1=[]
for i in TOKENS:
    list1.append(i[1])
print(list1)

['VAR', 'INT', 'FLOAT_EXPONANT_COMPLEX', 'FLOAT_EXPONANT_COMPLEX', 'FLOAT_EXPONANT_COMPLEX', 'FLOAT_EXPONANT', 'FLOAT_EXPONANT', 'FLOAT_EXPONANT', 'COMPLEX', 'COMPLEX', 'COMPLEX', 'FLOAT', 'FLOAT', 'FLOAT', 'DOT', 'LONG', 'INT', 'HEXA', 'OCTA', 'BINARY', 'LEFT_PARENTHESIS', 'RIGHT_PARENTHESIS', 'COLON', 'COMMA', 'SEMICOLON', 'AT', 'PLUS', 'MINUS', 'STAR', 'SLASH', 'VBAR', 'AMPER', 'AT', 'LESS', 'GREATER', 'EQUAL', 'PERCENT', 'LEFT_SQUARE_BRACKET', 'RIGHT_SQUARE_BRACKET', 'LEFT_BRACKET', 'RIGHT_BRACKET', 'BACKQUOTE', 'EQUAL_EQUAL', 'NOT_EQUAL', 'NOT_EQUAL', 'LESS_EQUAL', 'GREATER_EQUAL', 'TILDE', 'CIRCUMFLEX', 'LEFT_SHIFT', 'RIGHT_SHIFT', 'DOUBLE_STAR', 'PLUS_EQUAL', 'MINUS_EQUAL', 'AT_EQUAL', 'STAR_EQUAL', 'SLASH_EQUAL', 'PERCENT_EQUAL', 'AMPER_EQUAL', 'VBAR_EQUAL', 'CIRCUMFLEX_EQUAL', 'LEFT_SHIFT_EQUAL', 'RIGHT_SHIFT_EQUAL', 'ELLIPSIS', 'RIGHT_ARROW', 'DOUBLE_STAR_EQUAL', 'DOUBLE_SLASH', 'DOUBLE_SLASH_EQUAL', 'ENDL', 'ENDL', 'COMMENT', 'SPACE', 'STRING', 'UNICODE_STRING', 'INTERPOLATE

In [14]:
keywords_tokens_list=list1+UPPER_KEYWORDS
print(keywords_tokens_list)

['VAR', 'INT', 'FLOAT_EXPONANT_COMPLEX', 'FLOAT_EXPONANT_COMPLEX', 'FLOAT_EXPONANT_COMPLEX', 'FLOAT_EXPONANT', 'FLOAT_EXPONANT', 'FLOAT_EXPONANT', 'COMPLEX', 'COMPLEX', 'COMPLEX', 'FLOAT', 'FLOAT', 'FLOAT', 'DOT', 'LONG', 'INT', 'HEXA', 'OCTA', 'BINARY', 'LEFT_PARENTHESIS', 'RIGHT_PARENTHESIS', 'COLON', 'COMMA', 'SEMICOLON', 'AT', 'PLUS', 'MINUS', 'STAR', 'SLASH', 'VBAR', 'AMPER', 'AT', 'LESS', 'GREATER', 'EQUAL', 'PERCENT', 'LEFT_SQUARE_BRACKET', 'RIGHT_SQUARE_BRACKET', 'LEFT_BRACKET', 'RIGHT_BRACKET', 'BACKQUOTE', 'EQUAL_EQUAL', 'NOT_EQUAL', 'NOT_EQUAL', 'LESS_EQUAL', 'GREATER_EQUAL', 'TILDE', 'CIRCUMFLEX', 'LEFT_SHIFT', 'RIGHT_SHIFT', 'DOUBLE_STAR', 'PLUS_EQUAL', 'MINUS_EQUAL', 'AT_EQUAL', 'STAR_EQUAL', 'SLASH_EQUAL', 'PERCENT_EQUAL', 'AMPER_EQUAL', 'VBAR_EQUAL', 'CIRCUMFLEX_EQUAL', 'LEFT_SHIFT_EQUAL', 'RIGHT_SHIFT_EQUAL', 'ELLIPSIS', 'RIGHT_ARROW', 'DOUBLE_STAR_EQUAL', 'DOUBLE_SLASH', 'DOUBLE_SLASH_EQUAL', 'ENDL', 'ENDL', 'COMMENT', 'SPACE', 'STRING', 'UNICODE_STRING', 'INTERPOLATE

In [15]:
for idx, x in enumerate(keywords_tokens_list):
    print(idx, x)

0 VAR
1 INT
2 FLOAT_EXPONANT_COMPLEX
3 FLOAT_EXPONANT_COMPLEX
4 FLOAT_EXPONANT_COMPLEX
5 FLOAT_EXPONANT
6 FLOAT_EXPONANT
7 FLOAT_EXPONANT
8 COMPLEX
9 COMPLEX
10 COMPLEX
11 FLOAT
12 FLOAT
13 FLOAT
14 DOT
15 LONG
16 INT
17 HEXA
18 OCTA
19 BINARY
20 LEFT_PARENTHESIS
21 RIGHT_PARENTHESIS
22 COLON
23 COMMA
24 SEMICOLON
25 AT
26 PLUS
27 MINUS
28 STAR
29 SLASH
30 VBAR
31 AMPER
32 AT
33 LESS
34 GREATER
35 EQUAL
36 PERCENT
37 LEFT_SQUARE_BRACKET
38 RIGHT_SQUARE_BRACKET
39 LEFT_BRACKET
40 RIGHT_BRACKET
41 BACKQUOTE
42 EQUAL_EQUAL
43 NOT_EQUAL
44 NOT_EQUAL
45 LESS_EQUAL
46 GREATER_EQUAL
47 TILDE
48 CIRCUMFLEX
49 LEFT_SHIFT
50 RIGHT_SHIFT
51 DOUBLE_STAR
52 PLUS_EQUAL
53 MINUS_EQUAL
54 AT_EQUAL
55 STAR_EQUAL
56 SLASH_EQUAL
57 PERCENT_EQUAL
58 AMPER_EQUAL
59 VBAR_EQUAL
60 CIRCUMFLEX_EQUAL
61 LEFT_SHIFT_EQUAL
62 RIGHT_SHIFT_EQUAL
63 ELLIPSIS
64 RIGHT_ARROW
65 DOUBLE_STAR_EQUAL
66 DOUBLE_SLASH
67 DOUBLE_SLASH_EQUAL
68 ENDL
69 ENDL
70 COMMENT
71 SPACE
72 STRING
73 UNICODE_STRING
74 INTERPOLATED_STRING


In [16]:
list_ind= []
for item in deneme:
    index=keywords_tokens_list.index(item)
    list_ind.append(index)
print(list_ind)

[68, 89, 0, 20, 21, 22, 71, 108, 20, 21, 0, 20, 21, 100, 96]


In [17]:
predict=5
token_name=TOKENS[predict]
print(token_name)

(re.compile('^\\d+[eE][-+]?\\d*$'), 'FLOAT_EXPONANT')
