-
Notifications
You must be signed in to change notification settings - Fork 0
/
tokenizer.go
150 lines (136 loc) · 2.99 KB
/
tokenizer.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package lex
import (
"io"
"os"
"strings"
"text/scanner"
"unicode"
"cmd/asm/internal/flags"
"cmd/internal/objabi"
"cmd/internal/src"
)
// A Tokenizer is a simple wrapping of text/scanner.Scanner, configured
// for our purposes and made a TokenReader. It forms the lowest level,
// turning text from readers into tokens.
type Tokenizer struct {
	tok  ScanToken        // most recent token returned by Next
	s    *scanner.Scanner // underlying scanner doing the actual lexing
	base *src.PosBase     // position base for the file being scanned
	line int              // current line number; maintained by Next across newlines and comments
	file *os.File // If non-nil, file descriptor to close.
}
// NewTokenizer returns a Tokenizer reading tokens from r, reporting
// positions against the given file name. If file is non-nil, Close
// will close it when the Tokenizer is finished.
func NewTokenizer(name string, r io.Reader, file *os.File) *Tokenizer {
	var lex scanner.Scanner
	lex.Init(r)
	// Newline acts like a semicolon, so it must come through as a token;
	// tabs, carriage returns and spaces are plain whitespace.
	lex.Whitespace = 1<<'\t' | 1<<'\r' | 1<<' '
	// Comments are scanned rather than skipped so Next can count the
	// newlines they span.
	lex.Mode = scanner.ScanChars | scanner.ScanFloats | scanner.ScanIdents |
		scanner.ScanInts | scanner.ScanStrings | scanner.ScanComments
	lex.Position.Filename = name
	lex.IsIdentRune = isIdentRune
	absolute := objabi.AbsFile(objabi.WorkingDir(), name, *flags.TrimPath)
	return &Tokenizer{
		s:    &lex,
		base: src.NewFileBase(name, absolute),
		line: 1,
		file: file,
	}
}
// isIdentRune reports whether ch may appear at index i of an identifier.
// Letters and underscore are always legal; digits are legal anywhere but
// the leading position. Center dot (·) and division slash (∕) are also
// accepted as identifier characters.
func isIdentRune(ch rune, i int) bool {
	switch {
	case unicode.IsLetter(ch):
		return true
	case ch == '_': // Underscore; traditional.
		return true
	case ch == '\u00B7': // U+00B7 '·' middle dot: represents the period in runtime.exit.
		return true
	case ch == '\u2215': // U+2215 '∕' division slash: represents the slash in runtime/debug.setGCPercent.
		return true
	}
	// Digits are OK only after the first character.
	return i > 0 && unicode.IsDigit(ch)
}
// Text returns the original source spelling of the current token,
// mapping the synthetic two-character operator tokens back to the
// characters they were scanned from.
func (t *Tokenizer) Text() string {
	ops := [...]struct {
		tok  ScanToken
		text string
	}{
		{LSH, "<<"},
		{RSH, ">>"},
		{ARR, "->"},
		{ROT, "@>"},
	}
	for _, op := range ops {
		if op.tok == t.tok {
			return op.text
		}
	}
	return t.s.TokenText()
}
// File returns the name of the file being scanned, taken from the
// current position base.
func (t *Tokenizer) File() string {
	return t.base.Filename()
}
// Base returns the current position base used for this tokenizer's tokens.
func (t *Tokenizer) Base() *src.PosBase {
	return t.base
}
// SetBase replaces the tokenizer's position base (used, presumably, when
// a directive such as #line changes the reported position — TODO confirm
// against callers).
func (t *Tokenizer) SetBase(base *src.PosBase) {
	t.base = base
}
// Line returns the current line number, as counted by Next.
func (t *Tokenizer) Line() int {
	return t.line
}
// Col returns the column of the scanner's current position.
func (t *Tokenizer) Col() int {
	return t.s.Pos().Column
}
// Next scans and returns the next token. Comments are discarded (their
// newlines are still counted so line numbers stay accurate), newlines
// bump the line count, and the two-character operators <<, >>, -> and @>
// are fused into the single tokens LSH, RSH, ARR and ROT.
func (t *Tokenizer) Next() ScanToken {
	s := t.s
	// Discard comments, adding the newlines they contain to the line count.
	for {
		t.tok = ScanToken(s.Scan())
		if t.tok != scanner.Comment {
			break
		}
		t.line += strings.Count(s.TokenText(), "\n")
		// TODO: If we ever have //go: comments in assembly, will need to keep them here.
		// For now, just discard all comments.
	}
	// fuse consumes the upcoming rune and installs op as the current
	// token when the peeked rune matches want; otherwise it is a no-op.
	fuse := func(want rune, op ScanToken) bool {
		if s.Peek() != want {
			return false
		}
		s.Next()
		t.tok = op
		return true
	}
	switch t.tok {
	case '\n':
		t.line++
	case '-':
		if fuse('>', ARR) {
			return ARR
		}
	case '@':
		if fuse('>', ROT) {
			return ROT
		}
	case '<':
		if fuse('<', LSH) {
			return LSH
		}
	case '>':
		if fuse('>', RSH) {
			return RSH
		}
	}
	return t.tok
}
// Close closes the underlying file, if the Tokenizer owns one.
func (t *Tokenizer) Close() {
	if f := t.file; f != nil {
		// Best effort: Close returns nothing, so the error is dropped.
		f.Close()
	}
}