-
-
Notifications
You must be signed in to change notification settings - Fork 176
/
token.go
118 lines (108 loc) · 3.26 KB
/
token.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
package parser
import (
"bufio"
"io"
"strings"
"unicode/utf8"
"github.com/go-errors/errors"
"github.com/spf13/viper"
)
// startBufSize is the size, in bytes, of the initial buffer handed to the
// scanner. It is equal to the unexported `startBufSize` from `bufio/scan.go`.
const startBufSize = 4096
// MaxScannerCapacity is the fallback upper bound, in bytes, for a single
// scanned statement. The bufio default of 64 * 1024 is not enough for certain
// lines containing e.g. geographical data. 256K ought to be enough for
// anybody...
var MaxScannerCapacity = 256 * 1024
// tokenizer holds the scanning state needed to split a stream of SQL into
// statements across successive ScanToken calls.
//
// State transition table for tokenizer:
//
// Ready -> Ready (default)
// Ready -> Error (on invalid syntax)
// Ready -> Done (on ;, emit token)
// Ready -> Done (on EOF, emit token)
//
// Ready -> Comment (on --)
// Comment -> Comment (default)
// Comment -> Ready (on \n)
//
// Ready -> Block (on /*)
// Block -> Block (on /*, +-depth)
// Block -> Ready (on */, depth 0)
//
// Ready -> Quote (on ')
// Quote -> Quote (on '', default)
// Quote -> Ready (on ')
//
// Ready -> Dollar (on $tag$)
// Dollar -> Dollar (default)
// Dollar -> Ready (on $tag$)
//
// Ready -> Escape (on \)
// Escape -> Ready (on next)
type tokenizer struct {
// state is the current state of the machine. ScanToken replaces it with the
// result of state.Next for every decoded rune; a nil result means "emit token".
state State
// last is the byte offset into the scanner buffer from which ScanToken
// resumes after it has requested more data. It is reset to 0 on every
// emitted token.
last int
}
// ScanToken is a bufio.SplitFunc that emits one token per SQL statement.
//
// It decodes data rune by rune, starting at the offset remembered from the
// previous call, and feeds every rune to the current state together with all
// bytes seen so far (up to and including that rune). When the state machine
// returns a nil state, the bytes consumed so far are emitted as a token and
// the tokenizer is reset to the ready state.
//
// If the buffer is exhausted without emitting, more data is requested by
// returning (0, nil, nil). At EOF, any remaining non-empty data is returned as
// the final, non-terminated token. The returned error is always nil.
func (t *tokenizer) ScanToken(data []byte, atEOF bool) (advance int, token []byte, err error) {
	// If we requested more data, resume from last position.
	for t.last < len(data) {
		// A multi-byte rune may be cut in half at the end of the buffer. Ask
		// for the rest instead of decoding the fragment as utf8.RuneError.
		// Invalid encodings count as "full", so this cannot stall on bad input.
		if !atEOF && !utf8.FullRune(data[t.last:]) {
			break
		}
		r, width := utf8.DecodeRune(data[t.last:])
		end := t.last + width
		t.state = t.state.Next(r, data[:end])
		// Emit token
		if t.state == nil {
			t.last = 0
			t.state = &ReadyState{}
			return end, data[:end], nil
		}
		// Advance by the width of the rune that was actually decoded, so the
		// continuation bytes of a multi-byte rune are never re-decoded and
		// reported to the state machine as spurious error runes.
		t.last = end
	}
	if !atEOF || len(data) == 0 {
		// Request more data or end the stream
		return 0, nil, nil
	}
	// We're at EOF. If we have a final, non-terminated token, return it.
	return len(data), data, nil
}
// Use bufio.Scanner to split a PostgreSQL string into multiple statements.
//
// The core problem is to figure out whether the current ; separator is inside
// an escaped string literal. PostgreSQL has multiple ways of opening a string
// literal, $$, ', --, /*, etc. We use a FSM to guarantee these states are
// entered exclusively. If not in one of the above escape states, the next ;
// token can be parsed as statement separator.
//
// Each statement is split as it is, without removing comments or white spaces.
func Split(sql io.Reader, transform ...func(string) string) (stats []string, err error) {
t := tokenizer{state: &ReadyState{}}
scanner := bufio.NewScanner(sql)
// Increase scanner capacity to support very long lines containing e.g. geodata
buf := make([]byte, startBufSize)
maxbuf := int(viper.GetSizeInBytes("SCANNER_BUFFER_SIZE"))
if maxbuf == 0 {
maxbuf = MaxScannerCapacity
}
scanner.Buffer(buf, maxbuf)
scanner.Split(t.ScanToken)
var token string
for scanner.Scan() {
token = scanner.Text()
trim := token
for _, apply := range transform {
trim = apply(trim)
}
if len(trim) > 0 {
stats = append(stats, trim)
}
}
err = scanner.Err()
if err != nil {
err = errors.Errorf("%w\nAfter statement %d: %s", err, len(stats), token)
}
if errors.Is(err, bufio.ErrTooLong) {
err = errors.Errorf("%w\nTry setting SUPABASE_SCANNER_BUFFER_SIZE=5MB (current size is %dKB)", err, maxbuf>>10)
}
return stats, err
}
// SplitAndTrim splits sql into statements with Split, then strips all trailing
// semicolons followed by any leading and trailing white space from each
// statement. Statements that are empty after trimming are omitted by Split.
func SplitAndTrim(sql io.Reader) (stats []string, err error) {
	dropSemicolons := func(stmt string) string {
		return strings.TrimRight(stmt, ";")
	}
	return Split(sql, dropSemicolons, strings.TrimSpace)
}