@@ -2,14 +2,24 @@ package UTF8
2
2
3
3
import (
4
4
"fmt"
5
- "unicode "
5
+ "math "
6
6
"unicode/utf16"
7
7
"unicode/utf8"
8
8
9
9
"github.com/aws/aws-cryptographic-material-providers-library/releases/go/smithy-dafny-standard-library/Wrappers"
10
10
"github.com/dafny-lang/DafnyRuntimeGo/v4/dafny"
11
11
)
12
12
13
+ // The following constants are copied from the Go utf16 lib and are used
14
+ // to check the validity of the utf16 surrogate pairs.
15
+ const (
16
+ // 0xd800-0xdc00 encodes the high 10 bits of a pair.
17
+ // 0xdc00-0xe000 encodes the low 10 bits of a pair.
18
+ surr1 = 0xd800
19
+ surr2 = 0xdc00
20
+ surr3 = 0xe000
21
+ )
22
+
13
23
//IMP: The below extern implementations are only compatible
14
24
//with unicode-char:false transpiled code.
15
25
@@ -19,46 +29,94 @@ import (
19
29
// we need to encode the result in compatible dafny utf16 string before returning
20
30
// the result.
21
31
func Decode (utf8EncodedDafnySeq dafny.Sequence ) Wrappers.Result {
22
- utf8EncodedByteArray := dafny .ToByteArray (utf8EncodedDafnySeq )
23
-
24
- utf16Encoded := utf16 .Encode ([]rune (string (utf8EncodedByteArray )))
25
- var dafnyCharArray []dafny.Char
26
- for _ , c := range utf16Encoded {
27
- if c == unicode .ReplacementChar {
28
- err := fmt .Errorf ("Encountered Not Allowed Replacement Character: %s " , unicode .ReplacementChar )
29
- return Wrappers .Companion_Result_ .Create_Failure_ (dafny .SeqOfString (err .Error ()))
30
- }
31
- dafnyCharArray = append (dafnyCharArray , dafny .Char (c ))
32
+ res , err := DecodeFromNativeGoByteArray (dafny .ToByteArray (utf8EncodedDafnySeq ))
33
+ if err != nil {
34
+ return Wrappers .Companion_Result_ .Create_Failure_ (dafny .SeqOfString (err .Error ()))
32
35
}
33
- return Wrappers .Companion_Result_ .Default (dafny .SeqOfChars (dafnyCharArray ... ))
36
+
37
+ return Wrappers .Companion_Result_ .Create_Success_ (res )
34
38
}
35
39
36
40
// Encode encodes utf16 encoded dafny char (rune) to utf-8 Go rune sequence.
37
41
// Anything we receive here is supposed to be utf16 encoded Go rune
38
42
// since this extern is for unicode-char:false.
39
43
func Encode (utf16EncodedDafnySeq dafny.Sequence ) Wrappers.Result {
40
- encodedUtf16 := utf16EncodedDafnySeqToUint16 (utf16EncodedDafnySeq )
41
- decodedUtf16 := utf16 .Decode (encodedUtf16 )
42
- var utf8EncodedBytes []byte
43
- for _ , r := range decodedUtf16 {
44
- if ! utf8 .ValidRune (r ) || r == unicode .ReplacementChar {
45
- return Wrappers .Companion_Result_ .Create_Failure_ (dafny .SeqOfString ("Failed to utf8 encode rune" ))
46
- }
47
- buf := make ([]byte , utf8 .RuneLen (r ))
48
- n := utf8 .EncodeRune (buf , r )
49
- utf8EncodedBytes = append (utf8EncodedBytes , buf [:n ]... )
44
+ utf8EncodedBytes , err := decodeUtf16 (utf16EncodedDafnySeq )
45
+ if err != nil {
46
+ return Wrappers .Companion_Result_ .Create_Failure_ (dafny .SeqOfString (err .Error ()))
50
47
}
51
48
return Wrappers .Companion_Result_ .Create_Success_ (dafny .SeqOfBytes (utf8EncodedBytes ))
52
49
}
53
50
54
- func utf16EncodedDafnySeqToUint16 (seq dafny.Sequence ) []uint16 {
55
- var r []uint16
51
+ // This method is to be called from the Type Conversion layer.
52
+ // We reuse the same method so that all conversions are consistent.
53
+ func DecodeFromNativeGoByteArray (utf8EncodedByteArray []byte ) (dafny.Sequence , error ) {
54
+ if ! utf8 .Valid (utf8EncodedByteArray ) {
55
+ return nil , fmt .Errorf ("invalid utf8 encoded sequence: %v" , utf8EncodedByteArray )
56
+ }
57
+ utf16Encoded := utf16 .Encode ([]rune (string (utf8EncodedByteArray )))
58
+ var dafnyCharArray []dafny.Char
59
+ for _ , c := range utf16Encoded {
60
+ dafnyCharArray = append (dafnyCharArray , dafny .Char (c ))
61
+ }
62
+ return dafny .SeqOfChars (dafnyCharArray ... ), nil
63
+ }
64
+
65
+ // decode appends to buf the Unicode code point sequence represented
66
+ // by the UTF-16 encoding seq, then encode the code point as utf8 and return the utf8 buffer
67
+ func decodeUtf16 (seq dafny.Sequence ) ([]byte , error ) {
68
+ utf8EncodedBytes := []byte {}
69
+
56
70
for i := dafny .Iterate (seq ); ; {
57
- val , ok := i ()
58
- if ! ok {
59
- return r
71
+ firstVal , firstValExists := i ()
72
+ if ! firstValExists {
73
+ // Iterator has finished, return the buffer
74
+ return utf8EncodedBytes , nil
60
75
} else {
61
- r = append (r , uint16 (val .(dafny.Char )))
76
+ var ar rune
77
+
78
+ // We should be able to rely on dafny that anything inside the seq is utf16 encoded
79
+ // with unicode-char: false. But given the Long Psi issue, it's better to be safe.
80
+ // First check if it's a dafny.Char type, then check if it's within the limits of uint16.
81
+ firstChar , firstValIsAChar := firstVal .(dafny.Char )
82
+ if ! firstValIsAChar || firstChar > math .MaxUint16 || firstChar < 0 {
83
+ return nil , fmt .Errorf ("invalid utf16 encoded sequence: %v" , seq )
84
+ }
85
+
86
+ // Downcast to uint16
87
+ switch r1 := uint16 (firstChar ); {
88
+
89
+ case r1 < surr1 , surr3 <= r1 :
90
+ // normal rune
91
+ ar = rune (r1 )
92
+
93
+ case utf16 .IsSurrogate (rune (r1 )):
94
+ // If firstVal is surrogate, then we need the secondVal to construct the pair
95
+ secondVal , ok := i ()
96
+
97
+ // Same sanity check as line 84
98
+ secondChar , secondValIsAChar := secondVal .(dafny.Char )
99
+ if ! ok || ! secondValIsAChar || secondChar > math .MaxUint16 || secondChar < 0 {
100
+ return nil , fmt .Errorf ("invalid utf16 encoded sequence: %v" , seq )
101
+ }
102
+
103
+ // Check if the secondVal is within the valid low surrogate range
104
+ if surr2 <= uint16 (secondChar ) && uint16 (secondChar ) < surr3 {
105
+ // valid surrogate sequence
106
+ ar = utf16 .DecodeRune (rune (r1 ), rune (uint16 (secondChar )))
107
+ } else {
108
+ return nil , fmt .Errorf ("invalid utf16 encoded sequence: %v" , seq )
109
+ }
110
+ default :
111
+ return nil , fmt .Errorf ("invalid utf16 encoded sequence: %v" , seq )
112
+ }
113
+
114
+ // Create the buffer (upto 4 bytes) to hold the utf8 rune
115
+ buf := make ([]byte , utf8 .RuneLen (ar ))
116
+ n := utf8 .EncodeRune (buf , ar )
117
+
118
+ // Append to the result
119
+ utf8EncodedBytes = append (utf8EncodedBytes , buf [:n ]... )
62
120
}
63
121
}
64
122
}
0 commit comments