/
position.go
73 lines (65 loc) · 2.39 KB
/
position.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
package filesystem
import (
"unicode/utf16"
"unicode/utf8"
"github.com/apparentlymart/go-textseg/textseg"
"github.com/tomhjp/vault-ls/internal/source"
)
func ByteOffsetForPos(lines source.Lines, pos Pos) (int, error) {
if pos.Line+1 > len(lines) {
return 0, &InvalidPosErr{Pos: pos}
}
return byteOffsetForLSPColumn(lines[pos.Line], pos.Column), nil
}
// byteForLSPColumn takes an lsp.Position.Character value for the receving line
// and finds the byte offset of the start of the UTF-8 sequence that represents
// it in the overall source buffer. This is different than the byte returned
// by posForLSPColumn because it can return offsets that are partway through
// a grapheme cluster, while HCL positions always round to the nearest
// grapheme cluster.
//
// Note that even this can't produce an exact result; if the column index
// refers to the second unit of a UTF-16 surrogate pair then it is rounded
// down the first unit because UTF-8 sequences are not divisible in the same
// way.
func byteOffsetForLSPColumn(l source.Line, lspCol int) int {
if lspCol < 0 {
return l.Range().Start.Byte
}
// Normally ASCII-only lines could be short-circuited here
// but it's not as easy to tell whether a line is ASCII-only
// based on column/byte differences as we also scan newlines
// and a single line range technically spans 2 lines.
// If there are non-ASCII characters then we need to edge carefully
// along the line while counting UTF-16 code units in our UTF-8 buffer,
// since LSP columns are a count of UTF-16 units.
byteCt := 0
utf16Ct := 0
colIdx := 1
remain := l.Bytes()
for {
if len(remain) == 0 { // ran out of characters on the line, so given column is invalid
return l.Range().End.Byte
}
if utf16Ct >= lspCol { // we've found it
return l.Range().Start.Byte + byteCt
}
// Unlike our other conversion functions we're intentionally using
// individual UTF-8 sequences here rather than grapheme clusters because
// an LSP position might point into the middle of a grapheme cluster.
adv, chBytes, _ := textseg.ScanUTF8Sequences(remain, true)
remain = remain[adv:]
byteCt += adv
colIdx++
for len(chBytes) > 0 {
r, l := utf8.DecodeRune(chBytes)
chBytes = chBytes[l:]
c1, c2 := utf16.EncodeRune(r)
if c1 == 0xfffd && c2 == 0xfffd {
utf16Ct++ // codepoint fits in one 16-bit unit
} else {
utf16Ct += 2 // codepoint requires a surrogate pair
}
}
}
}