scan.py
# ------------------------------------------------------------
# scan.py
#
# PLY-based tokenizer for a subset of C: identifiers, keywords,
# numeric/character/string constants, comments, and operators
# ------------------------------------------------------------
import ply.lex as lex
from ply.lex import TOKEN
from tokens import tokens, Tokens, literals, reserved
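# Regex fragments used to compose the token rules below; they mirror the
# character classes of the classic ANSI C lex specification (digits,
# identifier characters, hex digits, float exponents, numeric suffixes).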
D = r'[0-9]'
L = r'[a-zA-Z_]'
H = r'[a-fA-F0-9]'
E = r'[Ee][+-]?(' + D + ')+'
FS = r'(f|F|l|L)'
IS = r'(u|U|l|L)'
class C_Lexer(object):
# List of token names. This is always required
tokens = tokens
literals = literals
def __init__(self):
self.token_stack = []
self.next_token = None
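    # Comments (/* ... */ blocks and // line comments) are matched here and
    # consumed without producing a token.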
comment = r'(/\*([^*]|[\r\n]|(\*+([^*/]|[\r\n])))*\*+/)|(//.*)'
@TOKEN(comment)
def t_COMMENT(self, token):
        token.lexer.lineno += token.value.count('\n')
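    # Identifiers are looked up in the reserved-word table so that C keywords
    # are reported with their own token types rather than as IDENTIFIER.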
identifier = '(' + L + ')((' + L + ')|(' + D + '))*'
@TOKEN(identifier)
def t_IDENTIFIER(self, token):
token.type = reserved.get(token.value,'IDENTIFIER')
return token
    const_char = r"'(\\.|[^\\'])+'"
    @TOKEN(const_char)
    def t_CONST_CHAR(self, token):
        token.type = 'CONSTANT'
        # Strip the surrounding quotes; escape sequences are left as written
        token.value = token.value[1:-1]
        return token
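    # The numeric-constant rules below convert the lexeme to a Python value
    # and report everything under the single token type CONSTANT.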
    const_hex = '0[xX](' + H + ')+(' + IS + ')?'  # 0[xX]{H}+{IS}?
    @TOKEN(const_hex)
    def t_CONST_HEX(self, token):
        token.type = 'CONSTANT'
        # Drop any integer suffix (u/U/l/L) before converting
        token.value = int(token.value.rstrip('uUlL'), 16)
        return token
    const_float1 = '(' + D + ')+(' + E + ')(' + FS + ')?'  # {D}+{E}{FS}?
    @TOKEN(const_float1)
    def t_CONST_FLOAT1(self, token):
        token.type = 'CONSTANT'
        # Drop any float suffix (f/F/l/L) before converting
        token.value = float(token.value.rstrip('fFlL'))
        return token
    const_float2 = '(' + D + r')*\.(' + D + ')+(' + E + ')?(' + FS + ')?'  # {D}*"."{D}+({E})?{FS}?
    @TOKEN(const_float2)
    def t_CONST_FLOAT2(self, token):
        token.type = 'CONSTANT'
        token.value = float(token.value.rstrip('fFlL'))
        return token
    const_float3 = '(' + D + r')+\.(' + D + ')*(' + E + ')?(' + FS + ')?'  # {D}+"."{D}*({E})?{FS}?
    @TOKEN(const_float3)
    def t_CONST_FLOAT3(self, token):
        token.type = 'CONSTANT'
        token.value = float(token.value.rstrip('fFlL'))
        return token
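    # Decimal and octal integers share one rule: a leading zero selects
    # octal, anything else is read as base 10.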
    const_dec_oct = '(' + D + ')+(' + IS + ')?'
    @TOKEN(const_dec_oct)
    def t_CONST_DEC_OCT(self, token):
        token.type = 'CONSTANT'
        value = token.value.rstrip('uUlL')
        if len(value) > 1 and value[0] == '0':
            token.value = int(value, 8)
        else:
            token.value = int(value, 10)
        return token
string_literal = r'\"(\\.|[^\\"])*\"'
@TOKEN(string_literal)
def t_STRING_LITERAL(self, token):
token.type = 'STRING_LITERAL'
token.value = token.value[1:-1]
        return token
# Regular expression rules for simple tokens
t_RIGHT_ASSIGN = r'>>='
t_LEFT_ASSIGN = r'<<='
t_ADD_ASSIGN = r'\+='
t_SUB_ASSIGN = r'\-='
t_MUL_ASSIGN = r'\*='
t_DIV_ASSIGN = r'\/='
t_MOD_ASSIGN = r'\%='
t_AND_ASSIGN = r'\&='
t_XOR_ASSIGN = r'\^='
t_OR_ASSIGN = r'\|='
t_RIGHT_OP = r'>>'
t_LEFT_OP = r'<<'
t_INC_OP = r'\+\+'
t_DEC_OP = r'\-\-'
t_PTR_OP = r'\->'
t_AND_OP = r'\&\&'
t_OR_OP = r'\|\|'
t_LE_OP = r'\<='
t_GE_OP = r'\>='
t_EQ_OP = r'=='
t_NE_OP = r'\!='
# Define a rule so we can track line numbers
def t_newline(self,t):
r'\n+'
t.lexer.lineno += len(t.value)
# A string containing ignored characters (spaces and tabs)
t_ignore = ' \t\v\f'
# Error handling rule
def t_error(self,t):
print "Illegal character '%s'" % t.value[0]
t.lexer.skip(1)
    # Build the lexer
    def build(self, **kwargs):
        self.lexer = lex.lex(object=self, **kwargs)

        def h(f):
            '''Wrap the original lexer.token() so the previously returned
            token is remembered on token_stack / next_token.'''
            def token(*args, **kwargs):
                t = f()
                self.token_stack.append(self.next_token)
                self.next_token = t
                return t
            return token

        self.lexer.token = h(self.lexer.token)
    # Feed the lexer a string and print every token it produces
    def test(self, data):
        self.lexer.input(data)
        while True:
            tok = self.lexer.token()
            if not tok:
                break
            print(tok)
# Build the lexer and try it out
if __name__ == '__main__':
    m = C_Lexer()
    m.build()           # Build the lexer
    lexer = m.lexer

    # Give the lexer some input from a sample C file
    with open('test.c', 'r') as f:
        data = f.read()

    # Tokenize
    while True:
        tok = lexer.token()
        if not tok:
            break  # No more input
        print(tok, tok.value)