/
dfa_machine.go
153 lines (147 loc) · 3.82 KB
/
dfa_machine.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
package machines
// DFATrans represents a Deterministic Finite Automatons state transition table
type DFATrans [][256]int
// DFAAccepting represents maps from accepting DFA states to match identifiers.
// These both identify which states are accepting states and which matches they
// belong to from the AST.
type DFAAccepting map[int]int
type lineCol struct {
line, col int
}
// Compute the line and column of a particular index inside of a byte slice.
func mapLineCols(text []byte) []lineCol {
m := make([]lineCol, len(text))
line := 1
col := 0
for i := 0; i < len(text); i++ {
if text[i] == '\n' {
col = 0
line++
} else {
col++
}
m[i] = lineCol{line: line, col: col}
}
return m
}
// DFALexerEngine does the actual tokenization of the byte slice text using the
// DFA state machine. If the lexing process fails the Scanner will return
// an UnconsumedInput error.
func DFALexerEngine(startState, errorState int, trans DFATrans, accepting DFAAccepting, text []byte) Scanner {
lineCols := mapLineCols(text)
done := false
matchID := -1
matchTC := -1
var scan Scanner
scan = func(tc int) (int, *Match, error, Scanner) {
if done && tc == len(text) {
return tc, nil, nil, nil
}
startTC := tc
if tc < matchTC {
// we back-tracked so reset the last matchTC
matchTC = -1
} else if tc == matchTC {
// the caller did not reset the tc, we are where we left
} else if matchTC != -1 && tc > matchTC {
// we skipped text
matchTC = tc
}
state := startState
for ; tc < len(text) && state != errorState; tc++ {
if match, has := accepting[state]; has {
matchID = match
matchTC = tc
}
state = trans[state][text[tc]]
if state == errorState && matchID > -1 {
startLC := lineCols[startTC]
endLC := lineCols[matchTC-1]
match := &Match{
PC: matchID,
TC: startTC,
StartLine: startLC.line,
StartColumn: startLC.col,
EndLine: endLC.line,
EndColumn: endLC.col,
Bytes: text[startTC:matchTC],
}
if matchTC == startTC {
err := &EmptyMatchError{
MatchID: matchID,
TC: tc,
Line: startLC.line,
Column: startLC.col,
}
return startTC, nil, err, scan
}
matchID = -1
return matchTC, match, nil, scan
}
}
if match, has := accepting[state]; has {
matchID = match
matchTC = tc
}
if startTC <= len(text) && matchID > -1 && matchTC == startTC {
var startLC lineCol
if startTC < len(text) {
startLC = lineCols[startTC]
}
err := &EmptyMatchError{
MatchID: matchID,
TC: tc,
Line: startLC.line,
Column: startLC.col,
}
matchID = -1
return startTC, nil, err, scan
}
if startTC < len(text) && matchTC <= len(text) && matchID > -1 {
startLC := lineCols[startTC]
endLC := lineCols[matchTC-1]
match := &Match{
PC: matchID,
TC: startTC,
StartLine: startLC.line,
StartColumn: startLC.col,
EndLine: endLC.line,
EndColumn: endLC.col,
Bytes: text[startTC:matchTC],
}
matchID = -1
return matchTC, match, nil, scan
}
if matchTC != len(text) && startTC >= len(text) {
// the user has moved us farther than the text. Assume that was
// the intent and return EOF.
return tc, nil, nil, nil
} else if matchTC != len(text) {
done = true
if matchTC == -1 {
matchTC = 0
}
startLC := lineCols[startTC]
etc := tc
var endLC lineCol
if etc >= len(lineCols) {
endLC = lineCols[len(lineCols)-1]
} else {
endLC = lineCols[etc]
}
err := &UnconsumedInput{
StartTC: startTC,
FailTC: etc,
StartLine: startLC.line,
StartColumn: startLC.col,
FailLine: endLC.line,
FailColumn: endLC.col,
Text: text,
}
return tc, nil, err, scan
} else {
return tc, nil, nil, nil
}
}
return scan
}