-
Notifications
You must be signed in to change notification settings - Fork 301
/
tokenizer.go
54 lines (46 loc) · 1.1 KB
/
tokenizer.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
package rty
import (
"io"
"unicode"
)
// Tokenizer breaks a string up by spaces.
//
// Ideally, we'd use the table-based algorithm defined in:
// http://www.unicode.org/reports/tr14/
// like this package does:
// https://godoc.org/github.com/gorilla/i18n/linebreak
// but I didn't find a good implementation of that algorithm in Go
// (the one above is half-implemented and doesn't work for
// the most basic things).
//
// This is a deliberately simplistic implementation that should have a
// similar interface to a "real" implementation.
type Tokenizer struct {
	// runes is the input string converted to a slice of runes.
	runes []rune
	// pos is the index into runes of the next rune to be consumed.
	pos int
}
// NewTokenizer returns a Tokenizer positioned at the start of s.
//
// The string is converted to runes up front, so later calls to Next
// index by rune rather than by byte.
func NewTokenizer(s string) *Tokenizer {
	t := new(Tokenizer) // pos starts at its zero value, 0
	t.runes = []rune(s)
	return t
}
// Next returns the next token and advances past it.
//
// A token is either a single whitespace rune (as reported by
// unicode.IsSpace) or a maximal run of consecutive non-whitespace runes.
// Consecutive whitespace runes are therefore returned one per call.
// The returned slice is a fresh copy and does not alias the tokenizer's
// internal buffer.
//
// Once the input is exhausted, Next returns a nil slice and io.EOF.
func (t *Tokenizer) Next() ([]rune, error) {
	start := t.pos
	if start >= len(t.runes) {
		return nil, io.EOF
	}

	// A token always contains at least the rune at start.
	end := start + 1

	// A whitespace rune is a token on its own; otherwise extend the token
	// up to (but not including) the next whitespace rune or end of input.
	if !unicode.IsSpace(t.runes[start]) {
		for end < len(t.runes) && !unicode.IsSpace(t.runes[end]) {
			end++
		}
	}
	t.pos = end

	token := make([]rune, end-start)
	copy(token, t.runes[start:end])
	return token, nil
}