-
Notifications
You must be signed in to change notification settings - Fork 19
/
scanner.py
129 lines (92 loc) · 2.56 KB
/
scanner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import ply.lex as lex
import raco.myrial.exceptions
# Reserved words, grouped by category.  Each group is a plain list so the
# groups can be concatenated into `reserved` and `tokens` below.
keywords = [
    'WHILE', 'DO', 'DEF', 'APPLY', 'CASE', 'WHEN', 'THEN', 'ELSE', 'END',
    'CONST', 'LOAD', 'DUMP', 'CSV', 'SCHEMA', 'UDA', 'TRUE', 'FALSE',
]

types = ['INT', 'STRING', 'FLOAT', 'BOOLEAN']

comprehension_keywords = ['SELECT', 'AS', 'EMIT', 'FROM', 'WHERE']

word_operators = ['AND', 'OR', 'NOT']

builtins = [
    'EMPTY', 'WORKER_ID', 'SCAN', 'COUNTALL', 'COUNT', 'STORE', 'DIFF',
    'CROSS', 'JOIN', 'UNIONALL', 'INTERSECT', 'DISTINCT', 'LIMIT', 'SINK',
]

# Identifiers with special meaning; matched case-insensitively by t_ID.
reserved = (
    keywords
    + types
    + comprehension_keywords
    + word_operators
    + builtins
)

# Token types; ply requires this exact variable name.
tokens = [
    # grouping and member access
    'LPAREN', 'RPAREN', 'LBRACKET', 'RBRACKET', 'DOT',
    # arithmetic
    'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'IDIVIDE', 'MOD',
    # comparison (the numbered variants are alternate spellings)
    'LT', 'GT', 'GE', 'GE2', 'LE', 'LE2', 'EQ', 'NE', 'NE2', 'NE3',
    # punctuation
    'COMMA', 'SEMI', 'EQUALS', 'COLON', 'DOLLAR',
    # identifiers and literals
    'ID', 'STRING_LITERAL', 'INTEGER_LITERAL', 'FLOAT_LITERAL',
    # braces
    'LBRACE', 'RBRACE',
] + reserved
# Regular expression rules for simple tokens.
# Each name is `t_` + the token type listed in `tokens`.

# Brackets
t_LPAREN   = r'\('
t_RPAREN   = r'\)'
t_LBRACKET = r'\['
t_RBRACKET = r'\]'
t_LBRACE   = r'\{'
t_RBRACE   = r'\}'

# Arithmetic operators
t_PLUS    = r'\+'
t_MINUS   = r'-'
t_TIMES   = r'\*'
t_DIVIDE  = r'/'
t_IDIVIDE = r'//'
t_MOD     = r'%'

# Comparison operators; the *2 / *3 names are alternate spellings,
# including the single-character Unicode forms.
t_LT  = r'<'
t_GT  = r'>'
t_LE  = r'<='
t_LE2 = u'≤'
t_GE  = r'>='
t_GE2 = u'≥'
t_EQ  = r'=='
t_NE  = r'!='
t_NE2 = r'<>'
t_NE3 = u'≠'

# Punctuation
t_DOT    = r'\.'
t_COMMA  = r','
t_SEMI   = r';'
t_EQUALS = r'='
t_COLON  = r':'
t_DOLLAR = r'\$'
# Regular expressions for non-trivial tokens
def t_ID(t):
    r'[a-zA-Z_][a-zA-Z_0-9]*'
    # NOTE: the string above is the token's regular expression, not prose
    # documentation; it must stay the first statement of the function.
    #
    # Classify an identifier-shaped match.  Reserved words are recognised
    # case-insensitively: both the token type and its value become the
    # upper-cased word.  Anything else is a plain ID whose value keeps the
    # spelling found in the input.
    canonical = t.value.upper()
    if canonical not in reserved:
        t.type = 'ID'
        return t
    t.type = canonical
    t.value = canonical
    return t
def t_FLOAT_LITERAL(t):
    r"""\d*\.\d+"""
    # The string above is the token regex (optional integer digits, a dot,
    # then at least one fractional digit); keep it as the first statement.
    #
    # Swap the matched text for its numeric value and hand the token back.
    matched_text = t.value
    t.value = float(matched_text)
    return t
def t_INTEGER_LITERAL(t):
    r'\d+'
    # The string above is the token regex (one or more decimal digits);
    # keep it as the first statement.
    #
    # Swap the matched digits for the corresponding int and hand the
    # token back.
    digits = t.value
    t.value = int(digits)
    return t
def t_STRING_LITERAL(t):
    r'"([^\\\n"]|\\.)*"'
    # The string above is the token regex (a double-quoted string with
    # backslash escapes and no raw newline); keep it as the first statement.
    #
    # Strip the surrounding quotes and expand backslash escapes, storing
    # the result in t.value before returning the token.
    body = t.value[1:-1]
    if hasattr(body, "decode"):
        # Python 2 text types: keep the original byte-oriented decoding.
        t.value = body.decode("string_escape")
    else:
        # Python 3 `str` has no .decode(), so the original call raised
        # AttributeError on every string literal.  Round-trip through
        # latin-1 instead: characters outside latin-1 are first turned into
        # \uXXXX escapes by `backslashreplace`, then `unicode_escape`
        # expands those together with the escapes the user wrote.
        # NOTE(review): a lone backslash immediately followed by a
        # non-latin-1 character is not preserved exactly by this round-trip.
        t.value = (body.encode("latin-1", "backslashreplace")
                       .decode("unicode_escape"))
    return t
def t_newline(t):
    r'\n+'
    # The string above is the rule's regex (a run of newlines); keep it as
    # the first statement.
    #
    # Advance the lexer's line counter by one per newline matched.  Nothing
    # is returned, so this rule produces no token.
    lines_consumed = len(t.value)
    t.lexer.lineno += lines_consumed
# C-style comments
def t_c_comment(t):
    r'/\*(.|\n)*?\*/'
    # The string above is the rule's regex (a non-greedy /* ... */ span that
    # may cross lines); keep it as the first statement.
    #
    # A block comment can contain newlines, so add them to the lexer's line
    # counter.  Nothing is returned, so this rule produces no token.
    embedded_newlines = t.value.count('\n')
    t.lexer.lineno += embedded_newlines
# database-style comments
def t_db_comment(t):
    r'--.*'
    # The string above is the rule's regex ("--" through the end of the
    # line); keep it as the first statement.
    #
    # There is nothing to record for a single-line comment: no token is
    # returned and the line counter is untouched (the regex does not
    # consume the newline).
    return None
# Characters that are always skipped between tokens:
# space, horizontal tab and vertical tab.
t_ignore = " \t\v"
# Error handling rule
def t_error(t):
    # Scanner error hook: wrap the offending token in the project's scan
    # exception and raise it.  This function never returns normally.
    scan_failure = raco.myrial.exceptions.MyrialScanException(t)
    raise scan_failure
# Module-level lexer instance, created once at import time.
# NOTE(review): per the PLY documentation, lex.lex() discovers `tokens` and
# the t_* rules by inspecting the calling module, so this call should stay
# at module top level, after every rule above has been defined.
lexer = lex.lex()