-
Notifications
You must be signed in to change notification settings - Fork 15
/
Char.x
1419 lines (1289 loc) · 47.5 KB
/
Char.x
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
import collections.ConstOrdinalList;
import io.IllegalUTF;
import numbers.IntConvertible;
/**
* This is the implementation of a Unicode character, which represents a single Unicode _codepoint_.
*
* The programming term "char" or "character" is simultaneously both well-known and yet wildly
* ambiguous, and depending on the context (e.g. programming language and platform), it may indicate
* any of: a 7- or 8-bit character of any of a number of encodings (e.g. ASCII, ANSI, etc.), a
* 16-bit value (e.g. DBCS, UTF-16, etc.), a variable-length encoding (e.g. MBCS, UTF-8, UTF-16), or
* even a 32-bit value, which is typically a Unicode codepoint.
*
* Unicode codepoints are just unsigned integer values, in the range `0..0x10FFFF`. (That range is
* subject to change again in any new version of Unicode, of course.) For each value in that range,
* the codepoint is either defined by the Unicode standard, or is considered "Unassigned". There are
* a few special sections -- called "surrogates" -- within the codepoint range that are reserved for
* combining two 16-bit values to form a codepoint that is outside of the 16-bit range; this is a
* historic anachronism that dates back to the transition from when all Unicode codepoints were in
* the 16-bit range.
*
* This `Character` class represents each character as a Unicode codepoint, expressed as a 32-bit
* unsigned integer value. However, even with 32 bits of information, that is often not enough space
* to represent what an end user may perceive as a character. Specifically, when the user sees a
* "character" on a screen, that is a _glyph_, which is a graphical representation of a _grapheme_,
* which is a single human-recognizable "character" that could for example be selected on a screen
* using a mouse or other input method. A grapheme is usually a single Unicode character, but it can
* also be composed of a _sequence_ of multiple Unicode characters. This is common in certain
* human languages that have different ways of composing various elements and accents into a single
* grapheme, but it is also common with characters like emojis, which can combine up to a half dozen
* (!!!) or more separate emoji characters into a single emoji that is displayed to the user. In
* other words, an Ecstasy `Character` is usually a grapheme, but some graphemes require more than
* one Ecstasy `Character` in order to specify the entire set of details about the grapheme.
*
* The implementation of this class is dependent on information from the Unicode standard, which is
* published by the [unicode.org](http://unicode.org/) web site. When a new version of the Unicode
* standard is released, the organization releases a "Unicode Character Database" (UCD), which
* contains the information about each Unicode codepoint. Ecstasy includes extracted portions of
* that information in order to answer specific questions about a given character, such as "Is this
* character a lower-case letter?", and "What is the upper-case form of this character?" The
* information is encoded using [ConstOrdinalList], and consists of the following files in the
* Ecstasy module:
*
* * CharBlocks.dat - specifies the block name for each character
* * CharCats.dat - specifies the Category for each character
* * CharCCCs.dat - Combining Character Class value for each character
* * CharDecs.dat - specifies the decimal value for characters that denote a numeric value
* * CharLowers.dat - specifies the lower case character for characters that have a lower case form
* * CharNums.dat - specifies the numeric string for characters that denote a numeric value
* * CharTitles.dat - specifies the title case character for characters that have a title case form
* * CharUppers.dat - specifies the upper case character for characters that have an upper case form
*
* Note: Conversions to integer types are **not** `@Auto`, because a Char is not considered to be a
* number.
*/
const Char(UInt32 codepoint)
delegates IntConvertible-Object(codepoint)
implements Sequential
implements Destringable
default('\u0000') {
// ----- constructors --------------------------------------------------------------------------
// Validates the `codepoint` value: it must lie within the Unicode range and must not be a
// surrogate value.
assert() {
// upper bound of the Unicode codepoint range
assert:bounds codepoint <= 0x10FFFF as $"Character code-point ({codepoint}) out of Unicode range";
// the open interval (0xD7FF, 0xE000) is exactly the surrogate range 0xD800..0xDFFF
assert:bounds !(0xD7FF < codepoint < 0xE000) as
$|Character code-point ({codepoint}) is a Unicode surrogate value;\
| surrogate values are not valid Unicode characters
;
}
/**
 * Construct a character from the codepoint indicated by the passed UTF8 value.
 *
 * The array must contain exactly one UTF8-encoded codepoint: its size must match the length
 * announced by the first byte, and every subsequent byte must have the `10xxxxxx` form. The
 * decoded value is then passed to the primary constructor.
 *
 * @param utf8  the character, in UTF8 format
 *
 * @throws IllegalUTF  if the array size is not in the range `1..6`, if a single byte is not an
 *                     ASCII value, if the size disagrees with the first byte, or if a trailing
 *                     byte is malformed
 */
construct(Byte[] utf8) {
    Int    length = utf8.size;
    UInt32 codepoint;
    switch (length) {
    case 1:
        // ASCII value
        codepoint = utf8[0].toUInt32();
        if (codepoint > 0x7F) {
            throw new IllegalUTF($"Illegal ASCII code in 1-byte UTF8 format: {codepoint}");
        }
        break;

    case 2..6:
        // #1s  first byte  trailing  # trailing  bits  code-points
        // ---  ----------  --------  ----------  ----  -----------------------
        //  0   0xxxxxxx    n/a           0         7   U+0000    - U+007F     (ASCII)
        //  2   110xxxxx    10xxxxxx      1        11   U+0080    - U+07FF
        //  3   1110xxxx    10xxxxxx      2        16   U+0800    - U+FFFF
        //  4   11110xxx    10xxxxxx      3        21   U+10000   - U+1FFFFF
        //  5   111110xx    10xxxxxx      4        26   U+200000  - U+3FFFFFF
        //  6   1111110x    10xxxxxx      5        31   U+4000000 - U+7FFFFFFF
        codepoint = utf8[0].toUInt32();

        // find the first 0 bit of the leading byte; the complement is taken on a 32-bit value,
        // so it has to be masked back down to the low 8 bits, otherwise bits 8..31 (all 1 after
        // the complement) would always supply the left-most bit
        Int bits = ((~codepoint) & 0xFF).leftmostBit.trailingZeroCount;
        if (length != 7 - bits) {
            throw new IllegalUTF($"Expected UTF8 length of {7 - bits} bytes; actual length is {length} bytes");
        }

        // keep only the data bits of the leading byte (those below the first 0 bit)
        codepoint &= 0b11111 >>> 5 - bits;

        // each trailing byte contributes 6 more bits
        for (Int i : 1 ..< length) {
            Byte b = utf8[i];
            if ((b & 0b11000000) != 0b10000000) {
                throw new IllegalUTF("trailing unicode byte does not match 10xxxxxx");
            }
            codepoint = codepoint << 6 | (b & 0b00111111).toUInt32();
        }
        break;

    default:
        throw new IllegalUTF($"Illegal UTF8 encoding length: {length}");
    }

    construct Char(codepoint);
}
/**
 * Construct a character whose codepoint is given as a single `Byte`. This form is mostly of
 * use for codepoints in the ASCII range.
 *
 * @param codepoint  the codepoint for the character
 */
construct(Byte codepoint) {
    UInt32 widened = codepoint.toUInt32();
    construct Char(widened);
}
/**
 * Construct a character from the codepoint indicated by the passed `Int` value.
 *
 * @param n  the codepoint for the character
 */
construct(Int n) {
    // convert the argument itself; the `codepoint` property has not been assigned yet at this
    // point, so it must not be read here
    construct Char(n.toUInt32());
}
/**
 * Construct a character from a `String` that holds exactly one character.
 *
 * @param text  a string of size 1
 */
@Override
construct(String text) {
    assert:arg text.size == 1;
    Char only = text[0];
    construct Char(only.codepoint);
}
// ----- properties ----------------------------------------------------------------------------
/**
* The Unicode code-point for this character, held as an unsigned 32-bit integer.
*/
UInt32 codepoint;
// ----- Sequential ----------------------------------------------------------------------------
/**
 * @return True iff this character's codepoint is greater than zero
 * @return (conditional) the character whose codepoint is one less than this one's
 */
@Override
conditional Char prev() {
    if (codepoint > 0) {
        return True, new Char(codepoint - 1);
    }
    return False;
}
/**
 * @return True iff this character's codepoint is below `MaxValue`
 * @return (conditional) the character whose codepoint is one more than this one's
 */
@Override
conditional Char next() {
    if (codepoint < MaxValue) {
        return True, new Char(codepoint + 1);
    }
    return False;
}
/**
 * @param that  the character to measure the distance to
 *
 * @return the result of subtracting this character from `that`
 */
@Override
Int stepsTo(Char that) {
    return that - this;
}
/**
 * @param steps  the number of codepoints to advance by; it is converted to a `UInt32`
 *
 * @return the character that many codepoints after this one
 */
@Override
Char skip(Int steps) {
    UInt32 distance = steps.toUInt32();
    return this + distance;
}
// ----- operators ---------------------------------------------------------------------------
/**
 * @param n  the amount to add to this character's codepoint
 *
 * @return the character at the resulting codepoint
 */
@Op("+")
Char add(UInt32 n) {
    UInt32 shifted = codepoint + n;
    return new Char(shifted);
}
/**
 * @param ch  the character to append after this one
 *
 * @return a two-character `String` made of this character followed by `ch`
 */
@Op("+")
String add(Char ch) {
    StringBuffer pair = new StringBuffer(2);
    pair.add(this);
    pair.add(ch);
    return pair.toString();
}
/**
 * @param s  the string to append after this character
 *
 * @return a `String` made of this character followed by the contents of `s`
 */
@Op("+")
String add(String s) {
    StringBuffer joined = new StringBuffer(1 + s.size);
    joined.add(this);
    joined.addAll(s);
    return joined.toString();
}
/**
 * @param n  the amount to subtract from this character's codepoint
 *
 * @return the character at the resulting codepoint
 */
@Op("-")
Char sub(UInt32 n) {
    UInt32 shifted = codepoint - n;
    return new Char(shifted);
}
/**
 * @param ch  the character to subtract from this one
 *
 * @return this character's codepoint minus the codepoint of `ch`
 */
@Op("-")
UInt32 sub(Char ch) {
    UInt32 minuend    = this.codepoint;
    UInt32 subtrahend = ch.codepoint;
    return minuend - subtrahend;
}
/**
 * Repeat this character.
 *
 * @param n  the number of repetitions; must not be negative
 *
 * @return a `String` consisting of this character repeated `n` times
 */
@Op("*")
String dup(Int n) {
    if (n == 0) {
        return "";
    }

    assert n > 0;
    StringBuffer repeated = new StringBuffer(n);
    for (Int remaining = n; remaining > 0; --remaining) {
        repeated.add(this);
    }
    return repeated.toString();
}
// ----- conversions ---------------------------------------------------------------------------
/**
 * Convert this character directly to a `Byte`; the conversion exists because of ASCII. Any
 * codepoint above 0x7F fails the assertion, which is subtly different from [toUInt8], which
 * supports any value up to 0xFF.
 *
 * @return the codepoint as a `Byte`
 */
Byte toByte() {
    UInt32 cp = codepoint;
    assert cp <= 0x7F;
    return cp.toByte();
}
/**
 * Produce the UTF-8 encoding of this character as an immutable byte array. The format allows
 * between 1 and 6 bytes.
 *
 * Note: The current version 9 of Unicode limits code points to 0x10FFFF, which means that all
 * UTF-8 encoding will use between 1-4 bytes.
 *
 * @return the UTF-8 bytes of this character
 */
immutable Byte[] utf8() {
    Int    expected = calcUtf8Length();
    Byte[] encoded  = new Byte[expected];
    Int    written  = formatUtf8(encoded, 0);
    assert written == expected;
    return encoded.makeImmutable();
}
/**
 * @return the character as it would appear in source code as a character literal: wrapped in
 *         single quotes, and escaped if necessary
 */
String toSourceString() {
    // size hint for the buffer: one character, unless an escape sequence is required
    Int contentSize = 1;
    contentSize := isEscaped();

    StringBuffer literal = new StringBuffer(contentSize+2);
    literal.add('\'');
    appendEscaped(literal);
    literal.add('\'');
    return literal.toString();
}
// ----- helper methods ------------------------------------------------------------------------
/**
 * Determine if this character is considered to be white-space.
 *
 * Within the ASCII range, the white-space characters are:
 *
 *     U+0009   9  HT   Horizontal Tab
 *     U+000A  10  LF   Line Feed
 *     U+000B  11  VT   Vertical Tab
 *     U+000C  12  FF   Form Feed
 *     U+000D  13  CR   Carriage Return
 *     U+001A  26  SUB  End-of-File, or "control-Z"
 *     U+001C  28  FS   File Separator
 *     U+001D  29  GS   Group Separator
 *     U+001E  30  RS   Record Separator
 *     U+001F  31  US   Unit Separator
 *     U+0020  32  SP   Space
 *
 * @return True iff this character is considered to be an Ecstasy whitespace character
 */
Boolean isWhitespace() {
    if (codepoint <= 0x7F) {
        // ASCII fast path: only 0x09..0x20 can qualify, and within that span each codepoint
        // is looked up in a bit mask whose bit 0 corresponds to U+0009
        if (codepoint < 0x09 || codepoint > 0x20) {
            return False;
        }

        //                                  2       1               0
        //                                  0FEDCBA9876543210FEDCBA9
        return UInt64:1 << codepoint-9 & 0b111110100000000000011111 != 0;
    }

    return switch (codepoint) {
        case 0x0085:        // U+0085    133  NEL  Next Line
        case 0x00A0:        // U+00A0    160       Non-breaking space
        case 0x1680:        // U+1680   5760       Ogham Space Mark
        case 0x2000:        // U+2000   8192       En Quad
        case 0x2001:        // U+2001   8193       Em Quad
        case 0x2002:        // U+2002   8194       En Space
        case 0x2003:        // U+2003   8195       Em Space
        case 0x2004:        // U+2004   8196       Three-Per-Em Space
        case 0x2005:        // U+2005   8197       Four-Per-Em Space
        case 0x2006:        // U+2006   8198       Six-Per-Em Space
        case 0x2007:        // U+2007   8199       Figure Space
        case 0x2008:        // U+2008   8200       Punctuation Space
        case 0x2009:        // U+2009   8201       Thin Space
        case 0x200A:        // U+200A   8202       Hair Space
        case 0x2028:        // U+2028   8232  LS   Line Separator
        case 0x2029:        // U+2029   8233  PS   Paragraph Separator
        case 0x202F:        // U+202F   8239       Narrow No-Break Space
        case 0x205F:        // U+205F   8287       Medium Mathematical Space
        case 0x3000: True;  // U+3000  12288       Ideographic Space
        default    : False;
    };
}
/**
 * Determine if this character acts as a line terminator.
 *
 * @return True iff this character acts as an Ecstasy line terminator
 */
Boolean isLineTerminator() {
    return switch (codepoint) {
        case 0x0A..0x0D:    // U+000A..U+000D  LF, VT, FF, CR
        case 0x0085:        // U+0085    133  NEL  Next Line
        case 0x2028:        // U+2028   8232  LS   Line Separator
        case 0x2029: True;  // U+2029   8233  PS   Paragraph Separator
        default    : False;
    };
}
/**
 * @return the minimum number of bytes necessary to encode the character in UTF8 format
 */
Int calcUtf8Length() {
    if (codepoint <= 0x7f) {
        return 1;
    }

    // two bytes carry 11 bits of data; each additional byte extends the capacity by 5 bits
    Int byteCount = 2;
    for (UInt32 remaining = codepoint >> 11; remaining != 0; remaining >>= 5) {
        ++byteCount;
    }
    return byteCount;
}
/**
 * Encode this character into the passed byte array using the UTF8 format.
 *
 * @param bytes  the byte array to write the UTF8 bytes into
 * @param of     the offset into the byte array to write the first byte
 *
 * @return the number of bytes used to encode the character in UTF8 format
 *
 * @throws IllegalUTF  if the codepoint needs more than 31 bits
 */
Int formatUtf8(Byte[] bytes, Int of) {
    UInt32 cp = codepoint;
    if (cp <= 0x7F) {
        // ASCII - single byte 0xxxxxxx format
        bytes[of] = cp.toByte();
        return 1;
    }

    // otherwise the format is based on the number of significant bits:
    // bits  code-points              first byte  trailing  # trailing
    // ----  -----------------------  ----------  --------  ----------
    //  11   U+0080    - U+07FF       110xxxxx    10xxxxxx      1
    //  16   U+0800    - U+FFFF       1110xxxx    10xxxxxx      2
    //  21   U+10000   - U+1FFFFF     11110xxx    10xxxxxx      3
    //  26   U+200000  - U+3FFFFFF    111110xx    10xxxxxx      4
    //  31   U+4000000 - U+7FFFFFFF   1111110x    10xxxxxx      5
    Int  trailing;
    Byte marker;
    if (cp <= 0x7FF) {
        trailing = 1;
        marker   = 0b11000000;
    } else if (cp <= 0xFFFF) {
        trailing = 2;
        marker   = 0b11100000;
    } else if (cp <= 0x1FFFFF) {
        trailing = 3;
        marker   = 0b11110000;
    } else if (cp <= 0x3FFFFFF) {
        trailing = 4;
        marker   = 0b11111000;
    } else if (cp <= 0x7FFFFFFF) {
        trailing = 5;
        marker   = 0b11111100;
    } else {
        // TODO: cp.toHexString() would be a better output
        throw new IllegalUTF($"illegal codepoint: {cp}");
    }

    // the first byte holds the marker bits plus whatever data bits remain above the
    // 6 bits-per-byte carried by the trailing bytes
    bytes[of++] = marker | (cp >>> (trailing * 6)).toByte();

    // write out trailing bytes, most significant first; each has the same "10xxxxxx" format
    // with 6 bits of data
    Int length = trailing + 1;
    for (Int shift = (trailing - 1) * 6; shift >= 0; shift -= 6) {
        bytes[of++] = 0b10_000000 | ((cp >>> shift) & 0b00_111111).toByte();
    }
    return length;
}
/**
 * Determine if the character needs to be escaped in order to be displayed.
 *
 * The reported size is the full length of the escape sequence: 2 for the short forms such as
 * `\n`, and 6 for the `\uXXXX` form (a backslash, a `u`, and four hexadecimal digits).
 *
 * @return True iff the character should be escaped in order to be displayed
 * @return (conditional) the number of characters in the escape sequence
 */
conditional Int isEscaped() {
    return switch (codepoint) {
        case 0x00 :                         // null terminator
        case 0x08 :                         // backspace
        case 0x09 :                         // horizontal tab
        case 0x0A :                         // line feed
        case 0x0B :                         // vertical tab
        case 0x0C :                         // form feed
        case 0x0D :                         // carriage return
        case 0x1A :                         // EOF
        case 0x1B :                         // escape
        case 0x22 :                         // double quotes
        case 0x27 :                         // single quotes
        case 0x5C :                         // the escaping slash itself requires an explicit escape
        case 0x7F : (True, 2);              // DEL

        case 0x00..0x1F :                   // C0 control characters
        case 0x80..0x9F :                   // C1 control characters
        case 0x2028..0x2029 : (True, 6);    // line and paragraph separator

        default : False;
    };
}
/**
 * Append this character to the passed `Appender`, escaping it if necessary.
 *
 * Characters with a dedicated short escape are written as a backslash followed by one
 * character; the remaining C0 and C1 control characters and the line and paragraph
 * separators are written in the `\uXXXX` form; every other character is appended as-is.
 *
 * @param buf  the `Appender` to append to
 *
 * @return the `Appender`
 */
Appender<Char!> appendEscaped(Appender<Char!> buf) {
    return switch (codepoint) {
        case 0x00:
            // null terminator
            buf.add('\\')
               .add('0');

        case 0x08:
            // backspace
            buf.add('\\')
               .add('b');

        case 0x09:
            // horizontal tab
            buf.add('\\')
               .add('t');

        case 0x0A:
            // line feed
            buf.add('\\')
               .add('n');

        case 0x0B:
            // vertical tab
            buf.add('\\')
               .add('v');

        case 0x0C:
            // form feed
            buf.add('\\')
               .add('f');

        case 0x0D:
            // carriage return
            buf.add('\\')
               .add('r');

        case 0x1A:
            // EOF
            buf.add('\\')
               .add('z');

        case 0x1B:
            // escape
            buf.add('\\')
               .add('e');

        case 0x22:
            // double quotes
            buf.add('\\')
               .add('\"');

        case 0x27:
            // single quotes
            buf.add('\\')
               .add('\'');

        case 0x5C:
            // the escaping slash itself requires an explicit escape
            buf.add('\\')
               .add('\\');

        case 0x7F:
            // DEL
            buf.add('\\')
               .add('d');

        case 0x00..0x1F     :       // C0 control characters
        case 0x80..0x9F     :       // C1 control characters
        case 0x2028..0x2029 :       // line and paragraph separator
            // four hexadecimal digits, most significant first; each 4-bit group is isolated by
            // its mask and then shifted down by its own bit offset (12, 8, 4, 0)
            buf.add('\\')
               .add('u')
               .add(((codepoint & 0xF000) >>> 12).toHexit())
               .add(((codepoint & 0x0F00) >>>  8).toHexit())
               .add(((codepoint & 0x00F0) >>>  4).toHexit())
               .add(((codepoint & 0x000F) >>>  0).toHexit());

        default:
            buf.add(this);
    };
}
/**
 * @return the character as it would appear in source code, in single quotes and escaped as
 *         necessary
 */
String quoted() {
    if (Int len := isEscaped()) {
        // escaped form: size the buffer for the escape sequence plus the two quotes
        StringBuffer escaped = new StringBuffer(len + 2).add('\'');
        return appendEscaped(escaped).add('\'').toString();
    }

    // plain form: the character itself between two quotes
    return new StringBuffer(3).add('\'').add(this).add('\'').toString();
}
// ----- ASCII support -------------------------------------------------------------------------
/**
 * True iff the character is in the ASCII range, i.e. its codepoint does not exceed 0x7F.
 */
Boolean ascii.get() {
    return codepoint <= 0x7F;
}
/**
 * Determine if the character is an ASCII digit, one of the values '0'..'9'.
 *
 * @return True iff the character is an ASCII digit
 * @return (conditional) a value in the range `0..9`
 */
conditional UInt8 asciiDigit() {
    if ('0' <= this <= '9') {
        return True, (this - '0').toUInt8();
    }
    return False;
}
/**
 * Determine if the character is an ASCII hexit, one of the values `'0'..'9'`, `'A'..'F'`,
 * or `'a'..'f'`.
 *
 * @return True iff the character is an ASCII hexadecimal digit (a "hexit")
 * @return (conditional) a value in the range `0..15`
 */
conditional UInt8 asciiHexit() {
    if ('0' <= this <= '9') {
        return True, (this - '0').toUInt8();
    }
    if ('A' <= this <= 'F') {
        return True, 0xA + (this - 'A').toUInt8();
    }
    if ('a' <= this <= 'f') {
        return True, 0xa + (this - 'a').toUInt8();
    }
    return False;
}
/**
 * Determine if the character is an ASCII letter, one of the values 'A'..'Z' or 'a'..'z'.
 *
 * @return True iff the character is an ASCII letter
 * @return (conditional) this letter
 */
conditional Char asciiLetter() {
    if ('A' <= this <= 'Z' || 'a' <= this <= 'z') {
        return True, this;
    }
    return False;
}
/**
 * Determine if the character is an ASCII uppercase letter, one of the values 'A'..'Z'.
 *
 * @return True iff the character is an ASCII uppercase letter
 * @return (conditional) this uppercase letter
 */
conditional Char asciiUppercase() {
    return switch (this) {
        case 'A'..'Z': (True, this);
        default      : False;
    };
}
/**
 * Determine if the character is an ASCII lowercase letter, one of the values 'a'..'z'.
 *
 * @return True iff the character is an ASCII lowercase letter
 * @return (conditional) this lowercase letter
 */
conditional Char asciiLowercase() {
    return switch (this) {
        case 'a'..'z': (True, this);
        default      : False;
    };
}
// ----- numeric conversion support ------------------------------------------------------------
/**
 * Determine if the character represents a nibble value, i.e. is one of `'0'..'9'`,
 * `'A'..'F'`, or `'a'..'f'`.
 *
 * @return True iff the character represents a nibble value
 * @return (conditional) the corresponding Nibble
 */
conditional Nibble isNibble() {
    if ('0' <= this <= '9' || 'A' <= this <= 'F' || 'a' <= this <= 'f') {
        return True, Nibble.of(this);
    }
    return False;
}
// ----- surrogate pair support ----------------------------------------------------------------
/**
 * Test this character to determine if it is the first part of a surrogate pair.
 *
 * From [the Unicode FAQ](https://unicode.org/faq/utf_bom.html#utf8-4):
 *
 * > Surrogates are code points from two special ranges of Unicode values, reserved for use as
 * > the leading, and trailing values of paired code units in UTF-16. Leading, also called high,
 * > surrogates are from 0xD800 to 0xDBFF, and trailing, or low, surrogates are from 0xDC00 to
 * > 0xDFFF. They are called surrogates, since they do not represent characters directly, but
 * > only as a pair.
 *
 * @return True if this Char has a surrogate codepoint, and is a leading (first) value of a
 *         surrogate pair
 *
 * @throws IllegalUTF  if this Char has a surrogate codepoint, but is not a valid **leading**
 *                     value for a surrogate pair
 */
Boolean requiresTrailingSurrogate() {
    // for surrogates, the high ten bits (in the range 0x000–0x3FF) are encoded in the range
    // 0xD800–0xDBFF, and the low ten bits (in the range 0x000–0x3FF) are encoded in the range
    // 0xDC00–0xDFFF
    if (codepoint >= 0xD800 && codepoint < 0xDC00) {
        return True;
    }

    if (codepoint >= 0xDC00 && codepoint < 0xE000) {
        throw new IllegalUTF($"leading-surrogate required; trailing-surrogate found: {codepoint}");
    }

    return False;
}
/**
 * Combine this leading surrogate with a trailing surrogate to produce a character.
 *
 * From [the Unicode FAQ](https://unicode.org/faq/utf_bom.html#utf8-4):
 *
 * > There is a much simpler computation that does not try to follow the bit distribution table.
 *
 *     // constants
 *     const UTF32 LEAD_OFFSET      = 0xD800 - (0x10000 >> 10);
 *     const UTF32 SURROGATE_OFFSET = 0x10000 - (0xD800 << 10) - 0xDC00;
 *
 *     // computations
 *     UTF16 lead  = LEAD_OFFSET + (codepoint >> 10);
 *     UTF16 trail = 0xDC00 + (codepoint & 0x3FF);
 *
 *     UTF32 codepoint = (lead << 10) + trail + SURROGATE_OFFSET;
 *
 * And:
 *
 * > Finally, the reverse, where hi and lo are the high and low surrogate, and C the resulting
 * > character
 *
 *     UTF32 X = (hi & ((1 << 6) -1)) << 10 | lo & ((1 << 10) -1);
 *     UTF32 W = (hi >> 6) & ((1 << 5) - 1);
 *     UTF32 U = W + 1;
 *
 *     UTF32 C = U << 16 | X;
 *
 * This implementation keeps the offset positive (the arithmetic is unsigned), and therefore
 * subtracts it instead of adding it.
 *
 * @param trailing  the trailing portion of the surrogate pair
 *
 * @return the resulting Char that the surrogate pair represents
 *
 * @throws IllegalUTF  if `this` Char is not a leading surrogate, or the `trailing` Char is not a
 *                     trailing surrogate
 */
Char addTrailingSurrogate(Char trailing) {
    // the leading (high) surrogate must be in the range 0xD800..0xDBFF
    UInt32 hi = this.codepoint;
    if (hi < 0xD800 || hi >= 0xDC00) {
        throw new IllegalUTF($"illegal leading-surrogate: {hi}");
    }

    // the trailing (low) surrogate must be in the range 0xDC00..0xDFFF; both bounds are
    // checked against the trailing value itself
    UInt32 lo = trailing.codepoint;
    if (lo < 0xDC00 || lo >= 0xE000) {
        throw new IllegalUTF($"illegal trailing-surrogate: {lo}");
    }

    static UInt32 SURROGATE_OFFSET = (0xD800 << 10) + 0xDC00 - 0x10000;
    return new Char((hi << 10) + lo - SURROGATE_OFFSET);
}
// ----- Unicode support -----------------------------------------------------------------------
/**
 * True iff the codepoint is a defined Unicode character, i.e. its [category] is anything
 * other than `Unassigned`.
 */
Boolean unicode.get() {
    return category != Unassigned;
}
/**
 * The Unicode General Category of the character.
 *
 * This information is field 2 in the `UnicodeData.txt` data file from the Unicode Consortium.
 * From [https://www.unicode.org/reports/tr44/#General_Category_Values]:
 *
 * > This is a useful breakdown into various character types which can be used as a default
 * > categorization in implementations. For the property values, see
 * > [General Category Values](https://www.unicode.org/reports/tr44/#General_Category_Values).
 *
 * This information is stored in the binary file "CharCats.dat" in this package. For a codepoint
 * `n`, the n-th entry of the list built from that file is the ordinal of the `Category` enum
 * value for the character; a codepoint beyond the end of the list is `Unassigned`.
 */
Category category.get() {
    static List<Int> categoriesByCodepoint = new ConstOrdinalList(#./CharCats.dat);

    if (codepoint >= categoriesByCodepoint.size) {
        return Unassigned;
    }

    Int ordinal = categoriesByCodepoint[codepoint];
    return Category.values[ordinal];
}
/**
 * Unicode "General Categories".
 *
 * Each value carries its two-letter Unicode `code` and a short `description`. The first letter
 * of the code identifies the major class, which is exposed through the Boolean properties.
 */
enum Category(String code, String description) {
    UppercaseLetter     ("Lu", "An uppercase letter"),
    LowercaseLetter     ("Ll", "A lowercase letter"),
    TitlecaseLetter     ("Lt", "A digraphic character, with first part uppercase"),
    ModifierLetter      ("Lm", "A modifier letter"),
    OtherLetter         ("Lo", "Other letters, including syllables and ideographs"),
    NonspacingMark      ("Mn", "A nonspacing combining mark (zero advance width)"),
    SpacingMark         ("Mc", "A spacing combining mark (positive advance width)"),
    EnclosingMark       ("Me", "An enclosing combining mark"),
    DecimalNumber       ("Nd", "A decimal digit"),
    LetterNumber        ("Nl", "A letterlike numeric character"),
    OtherNumber         ("No", "A numeric character of other type"),
    ConnectorPunctuation("Pc", "A connecting punctuation mark, like a tie"),
    DashPunctuation     ("Pd", "A dash or hyphen punctuation mark"),
    OpenPunctuation     ("Ps", "An opening punctuation mark (of a pair)"),
    ClosePunctuation    ("Pe", "A closing punctuation mark (of a pair)"),
    InitialPunctuation  ("Pi", "An initial quotation mark"),
    FinalPunctuation    ("Pf", "A final quotation mark"),
    OtherPunctuation    ("Po", "A punctuation mark of other type"),
    MathSymbol          ("Sm", "A symbol of mathematical use"),
    CurrencySymbol      ("Sc", "A currency sign"),
    ModifierSymbol      ("Sk", "A non-letterlike modifier symbol"),
    OtherSymbol         ("So", "A symbol of other type"),
    SpaceSeparator      ("Zs", "A space character (of various non-zero widths)"),
    LineSeparator       ("Zl", "U+2028 LINE SEPARATOR only"),
    ParagraphSeparator  ("Zp", "U+2029 PARAGRAPH SEPARATOR only"),
    Control             ("Cc", "A C0 or C1 control code"),
    Format              ("Cf", "A format control character"),
    Surrogate           ("Cs", "A surrogate code point"),
    PrivateUse          ("Co", "A private-use character"),
    Unassigned          ("Cn", "A reserved unassigned code point or a noncharacter");

    // True for the codes "Lu", "Ll", and "Lt"
    Boolean casedLetter;
    // True when the code starts with 'L'
    Boolean letter;
    // True when the code starts with 'M'
    Boolean mark;
    // True when the code starts with 'N'
    Boolean number;
    // True when the code starts with 'P'
    Boolean punctuation;
    // True when the code starts with 'S'
    Boolean symbol;
    // True when the code starts with 'Z'
    Boolean separator;
    // True when the code starts with 'C'
    Boolean other;

    construct(String code, String description) {
        this.code        = code;
        this.description = description;

        // the major class is determined by the first letter of the two-letter code
        Char major = code[0];
        letter      = major == 'L';
        mark        = major == 'M';
        number      = major == 'N';
        punctuation = major == 'P';
        symbol      = major == 'S';
        separator   = major == 'Z';
        other       = major == 'C';

        casedLetter = letter && (code == "Lu" || code == "Ll" || code == "Lt");
    }
}
/**
 * The value in the range `0..9` that represents the decimal value of this character.
 *
 * > If the character has the property value Numeric_Type=Decimal, then the Numeric_Value of
 * > that digit is represented with an integer value (limited to the range 0..9) in fields 6, 7,
 * > and 8. Characters with the property value Numeric_Type=Decimal are restricted to digits
 * > which can be used in a decimal radix positional numeral system and which are encoded in the
 * > standard in a contiguous ascending range 0..9. See the discussion of decimal digits in
 * > Chapter 4, Character Properties in
 * > [Unicode](https://www.unicode.org/reports/tr41/tr41-26.html#Unicode).
 *
 * @return True iff the "CharDecs.dat" data holds a value below 10 for this codepoint
 * @return (conditional) that value
 */
conditional Int decimalValue() {
    static List<Int> decsByCodepoint = new ConstOrdinalList(#./CharDecs.dat);

    // codepoints beyond the end of the data have no decimal value
    if (codepoint >= decsByCodepoint.size) {
        return False;
    }

    // a stored value of 10 or more is not a decimal digit
    Int digit = decsByCodepoint[codepoint];
    if (digit >= 10) {
        return False;
    }
    return True, digit;
}
/**
* The numeric value of this character, represented as a `String`, and potentially represented
* using a fractional notation of an integer value followed by `/` followed by a second integer
* value.
*
* > If the character has the property value Numeric_Type=Numeric, then the Numeric_Value of
* > that character is represented with a positive or negative integer or rational number in
* > this field, and fields 6 and 7 are null. This includes fractions such as, for example,
* > "1/5" for U+2155 VULGAR FRACTION ONE FIFTH.
*/
String? numericValue.get() {
static List<Int> numsByCodepoint = new ConstOrdinalList(#./CharNums.dat);
if (codepoint >= numsByCodepoint.size) {
return Null;
}
static String[] numStrings =
[
"-1/2",
"0",
"1",
"1/10",
"1/12",
"1/16",
"1/160",
"1/2",
"1/20",
"1/3",
"1/32",
"1/320",
"1/4",
"1/40",
"1/5",
"1/6",
"1/64",
"1/7",
"1/8",
"1/80",
"1/9",
"10",
"10/12",
"100",
"1000",
"10000",
"100000",
"1000000",
"10000000",
"100000000",
"10000000000",
"1000000000000",
"11",
"11/12",
"11/2",
"12",
"13",
"13/2",
"14",
"15",
"15/2",
"16",
"17",
"17/2",
"18",
"19",
"2",
"2/12",
"2/3",
"2/5",
"20",
"200",
"2000",
"20000",
"200000",
"20000000",
"21",
"216000",
"22",
"23",
"24",
"25",
"26",
"27",
"28",
"29",
"3",
"3/12",
"3/16",
"3/2",
"3/20",
"3/4",
"3/5",
"3/64",
"3/8",
"3/80",
"30",
"300",
"3000",
"30000",
"300000",
"31",
"32",
"33",
"34",
"35",
"36",
"37",
"38",
"39",
"4",
"4/12",
"4/5",
"40",
"400",
"4000",
"40000",
"400000",
"41",
"42",
"43",
"432000",
"44",
"45",
"46",
"47",
"48",
"49",
"5",
"5/12",
"5/2",
"5/6",
"5/8",
"50",
"500",
"5000",
"50000",
"500000",
"6",
"6/12",
"60",
"600",
"6000",
"60000",
"600000",
"7",
"7/12",
"7/2",
"7/8",
"70",
"700",
"7000",
"70000",
"700000",
"8",
"8/12",
"80",
"800",
"8000",
"80000",