-
Notifications
You must be signed in to change notification settings - Fork 0
/
lexer.go
179 lines (158 loc) · 4.73 KB
/
lexer.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
//go:generate re2c --lang go --no-generation-date comments.re -8 -o comments.go
//go:generate re2c --lang go --no-generation-date cpp.re -8 -o cpp.go
//go:generate re2c --lang go --no-generation-date csharp.re -8 -o csharp.go
//go:generate re2c --lang go --no-generation-date java.re -8 -o java.go
//go:generate re2c --lang go --no-generation-date js.re -8 -o js.go
//go:generate re2c --lang go --no-generation-date py.re -8 -o py.go
//go:generate re2c --lang go --no-generation-date go.re -8 -o go.go
//go:generate re2c --lang go --no-generation-date perl.re -8 -o perl.go
//go:generate re2c --lang go --no-generation-date txt.re -8 -o txt.go
//go:generate re2c --lang go --no-generation-date xml.re -8 -o xml.go
//go:generate re2c --lang go --no-generation-date commented_txt.re -8 -o commented_txt.go
package golexers
import (
"bytes"
"fmt"
"os"
"path/filepath"
"unicode/utf8"
"golang.org/x/text/encoding/unicode"
"golang.org/x/text/transform"
)
// LexFunc lexes the next token from input and returns its type.
type LexFunc func(input *Input) TokenType

// langMap maps a file-extension key to its lexing function. Lookups are
// done with filepath.Ext (see CanLex/NewLexer), so registered keys should
// include the leading dot.
var langMap = map[string]LexFunc{}
// Register associates lexFunc with every extension in exts.
func Register(exts []string, lexFunc LexFunc) {
	for i := range exts {
		langMap[exts[i]] = lexFunc
	}
}
// RegisterAlias makes alias resolve to the same lexing function as ext.
// If ext has no registered function, a warning is printed to stderr and
// the alias is not registered.
func RegisterAlias(alias string, ext string) {
	fn, ok := langMap[ext]
	if !ok {
		fmt.Fprintf(os.Stderr, "trying to register alias for unknown extension %s\n", ext)
		return
	}
	langMap[alias] = fn
}
// Lexer tokenises a single file using the lexing function registered
// for the file's extension.
type Lexer struct {
	input            *Input    // file contents plus scanning state
	lex_func         LexFunc   // language-specific lexing function for this file
	tokenType        TokenType // type of the current token
	tokenLine        int       // line number of the current token
	tokenStart       int       // byte offset where the current token starts
	tokenEnd         int       // byte offset one past the end of the current token
	pendingTokenType TokenType // token held back while an unmatched span is emitted; INVALID when none
}
// CanLex reports whether a lexing function is registered for the
// extension of filename.
func CanLex(filename string) bool {
	_, ok := langMap[filepath.Ext(filename)]
	return ok
}
// NewLexer returns a Lexer for filename over input, or nil when no lexing
// function is registered for the file's extension, a recognised UTF-16 BOM
// fails to transcode, or the input does not start with valid UTF-8.
//
// A UTF-8 BOM is stripped; a UTF-16 BOM (either endianness) causes the
// whole input to be transcoded to UTF-8.
func NewLexer(filename string, input []byte) *Lexer {
	if !CanLex(filename) {
		return nil
	}
	// Strip a UTF-8 BOM if present. (>= 3, not > 3: the BOM check only
	// reads indices 0-2, and a 3-byte BOM-only file must be handled too.)
	if len(input) >= 3 && input[0] == 0xef && input[1] == 0xbb && input[2] == 0xbf {
		input = input[3:]
	} else if len(input) >= 2 {
		// Convert from UTF16 to UTF8 if we recognise the BOM.
		var err error
		if input[0] == 0xff && input[1] == 0xfe {
			input, _, err = transform.Bytes(unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM).NewDecoder(), input[2:])
		} else if input[0] == 0xfe && input[1] == 0xff {
			input, _, err = transform.Bytes(unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM).NewDecoder(), input[2:])
		}
		if err != nil {
			return nil
		}
	}
	// Check the file at least starts with valid UTF-8 - in case the
	// extension is just misleading. Advance by exactly one rune per
	// iteration: the previous loop also executed i++ in the header, so it
	// skipped a byte per rune and validated only part of the prefix.
	lim := len(input)
	if lim > 512 {
		lim = 512
	}
	w := input
	for i := 0; i < lim; {
		r, n := utf8.DecodeRune(w)
		// n <= 1 distinguishes a genuine encoding error from a literal
		// U+FFFD replacement character in valid input.
		if r == utf8.RuneError && n <= 1 {
			return nil
		}
		w = w[n:]
		i += n
	}
	in := &Input{
		filename:        filename,
		file:            nil,
		data:            input,
		unmatched_start: -1,
		cursor:          0,
		marker:          0,
		token:           -1,
		limit:           len(input),
		line:            1,
		state:           STATE_NORMAL,
		eof:             false,
		bolcursor:       0,
	}
	lex_func := langMap[filepath.Ext(filename)]
	return &Lexer{input: in, lex_func: lex_func, pendingTokenType: INVALID}
}
// Lex advances to the next token and returns its type.
//
// Normally it runs the language-specific lex function. When that call
// leaves an unmatched byte span pending (input.unmatched_start >= 0), the
// unmatched span's token is returned first and the freshly lexed token is
// stashed in pendingTokenType, to be delivered on the next call.
func (lexer *Lexer) Lex() TokenType {
	if lexer.pendingTokenType != INVALID {
		// Deliver the token stashed by the previous call.
		lexer.tokenType = lexer.pendingTokenType
		lexer.pendingTokenType = INVALID
	} else {
		tt := lexer.lex_func(lexer.input)
		//fmt.Printf("returned %s\n", TypeString(tt))
		if lexer.input.unmatched_start >= 0 {
			//fmt.Printf("got token %s us %d %d\n", lexer.input.data[lexer.input.unmatched_start:lexer.input.token], lexer.input.unmatched_start, lexer.input.token)
			// Emit the unmatched span first; hold the just-lexed token
			// until the next call, and note the span's bounds
			// [unmatched_start, token).
			lexer.pendingTokenType = tt
			lexer.tokenLine = lexer.input.line
			lexer.tokenType = lexer.input.unmatched_token
			lexer.tokenStart = lexer.input.unmatched_start
			lexer.tokenEnd = lexer.input.token
			lexer.input.unmatched_start = -1
			return lexer.tokenType
		} else {
			lexer.tokenType = tt
		}
	}
	// Record the bounds and line of the token just produced.
	lexer.tokenStart = lexer.input.token
	lexer.tokenEnd = lexer.input.cursor
	lexer.tokenLine = lexer.input.line
	return lexer.tokenType
}
// Line returns the line number of the current token.
func (lexer *Lexer) Line() int {
	return lexer.tokenLine
}
// TokenType returns the type of the current token.
func (lexer *Lexer) TokenType() TokenType {
	return lexer.tokenType
}
// TokenPos returns the current token's start and (exclusive) end byte
// offsets within the input data.
func (lexer *Lexer) TokenPos() (int, int) {
	return lexer.tokenStart, lexer.tokenEnd
}
// Token returns the bytes of the current token. The slice aliases the
// lexer's input buffer; callers must not modify it.
func (lexer *Lexer) Token() []byte {
	return lexer.input.data[lexer.tokenStart:lexer.tokenEnd]
}
// LineText returns the text of the current line, from the beginning-of-line
// cursor up to (but not including) the next newline, or to the end of the
// data when no newline follows.
func (lexer *Lexer) LineText() []byte {
	data := lexer.input.data
	start := lexer.input.bolcursor
	if nl := bytes.IndexByte(data[start:], '\n'); nl >= 0 {
		return data[start : start+nl]
	}
	return data[start:]
}
/*
for {
l := lex(in)
//fmt.Printf("lex returns %d\n", l)
if l < 0 {
break
}
fmt.Printf("%d: next token %s %d \"%s\"\n", in.line, typeString(l), in.token, in.data[in.token:in.cursor])
}
}
*/