-
-
Notifications
You must be signed in to change notification settings - Fork 721
/
corepropsbuilder.cpp
1105 lines (955 loc) · 41.6 KB
/
corepropsbuilder.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
* Copyright (C) 1999-2016, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: corepropsbuilder.cpp (was store.c & props2.cpp)
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 1999dec11
* created by: Markus W. Scherer
*
* Store Unicode character properties efficiently for
* random access.
*/
#include <stdio.h>
#include "unicode/utypes.h"
#include "unicode/uchar.h"
#include "unicode/ucptrie.h"
#include "unicode/udata.h"
#include "unicode/umutablecptrie.h"
#include "unicode/uniset.h"
#include "unicode/unistr.h"
#include "unicode/usetiter.h"
#include "unicode/uscript.h"
#include "cmemory.h"
#include "cstring.h"
#include "genprops.h"
#include "propsvec.h"
#include "uassert.h"
#include "unewdata.h"
#include "uprops.h"
#include "utrie2.h"
#include "writesrc.h"
/* Unicode character properties file format ------------------------------------
The file format prepared and written here contains several data
structures that store indexes or data.
Before the data contents described below, there are the headers required by
the udata API for loading ICU data. Especially, a UDataInfo structure
precedes the actual data. It contains platform properties values and the
file format version.
The following is a description of format version 9.0 .
Data contents:
The contents are a parsed, binary form of several Unicode character
database files, most prominently UnicodeData.txt.
Any Unicode code point from 0 to 0x10ffff can be looked up to get
the properties, if any, for that code point. This means that the input
to the lookup are 21-bit unsigned integers, with not all of the
21-bit range used.
It is assumed that client code keeps a uint32_t pointer
to the beginning of the data:
const uint32_t *p32;
Formally, the file contains the following structures:
const int32_t indexes[16] with values i0..i15:
i0 indicates the length of the main trie.
i0..i3 all have the same value in format versions 4.0 and higher;
the related props32[] and exceptions[] and uchars[] were used in format version 3
i0 propsIndex; -- 32-bit unit index to the table of 32-bit properties words
i1 exceptionsIndex; -- 32-bit unit index to the table of 32-bit exception words
i2 exceptionsTopIndex; -- 32-bit unit index to the array of UChars for special mappings
i3 additionalTrieIndex; -- 32-bit unit index to the additional trie for more properties
i4 additionalVectorsIndex; -- 32-bit unit index to the table of properties vectors
i5 additionalVectorsColumns; -- number of 32-bit words per properties vector
i6 scriptExtensionsIndex; -- 32-bit unit index to the Script_Extensions data
i7 blockTrieIndex; -- 32-bit unit index to the Block property trie (format version 9+)
i8 reservedIndex8; -- top of the previous part of the data; i8 and i9 have the same values
i9 dataTopIndex; -- size of the data file (number of 32-bit units after the header)
i10 maxValues; -- maximum code values for vector word 0, see uprops.h (new in format version 3.1+)
i11 maxValues2; -- maximum code values for vector word 2, see uprops.h (new in format version 3.2)
i12 maxValuesOther; -- additional maximum values, see uprops.h (format version 9+)
i13..i15 reservedIndexes; -- reserved values; 0 for now
PT serialized properties trie, see utrie2.h (byte size: 4*(i0-16))
P, E, and U are not used (empty) in format versions 4 and above
P const uint32_t props32[i1-i0];
E const uint32_t exceptions[i2-i1];
U const char16_t uchars[2*(i3-i2)];
AT serialized trie for additional properties (byte size: 4*(i4-i3))
PV const uint32_t propsVectors[(i6-i4)/i5][i5]==uint32_t propsVectors[i6-i4];
SCX const uint16_t scriptExtensions[2*(i7-i6)];
SCX contains Script_Extensions lists and (Script code, Script_Extensions index) pairs.
A Script_Extensions list is a sequence of UScriptCode values in ascending order,
with the last code having bit 15 set for termination.
A (Script code, Script_Extensions index) pair is the main UScriptCode (Script value)
followed by the index of the Script_Extensions list.
If the propsVectors[] column 0 value indicates that there are Script_Extensions,
then the script-code-or-index bit fields are an index to either a list or a pair in SCX,
rather than the Script itself. The high bits in the UPROPS_SCRIPT_X_MASK fields
indicate whether the main Script value is Common or Inherited (and the index is to a list)
vs. another value (and the index is to a pair).
(See UPROPS_SCRIPT_X_WITH_COMMON etc. in uprops.h.)
blockTrie serialized CodePointTrie/UCPTrie for the Block property (format version 9+)
Indexed by (code point >> 4). Takes advantage of each Block having xxx0..xxxF boundaries.
Trie lookup and properties:
In order to condense the data for the 21-bit code space, several properties of
the Unicode code assignment are exploited:
- The code space is sparse.
- There are several ranges of tens of thousands of consecutive code points with the same properties.
- Characters and scripts are allocated in groups of 16 code points.
- Inside blocks for scripts the properties are often repetitive.
- The 21-bit space is not fully used for Unicode.
The lookup of properties for a given code point is done with a trie lookup,
using the UTrie2 implementation (UTrie before format version 7).
The trie lookup result is a 16-bit properties word.
With a given Unicode code point
UChar32 c;
and 0<=c<0x110000, the lookup is done like this:
uint16_t props;
props=UTRIE2_GET16(trie, c);
Each 16-bit properties word contains:
0.. 4 general category
5 reserved
6..15 numeric type and value (ntv)
Encoding of numeric type and value in the 10-bit ntv field:
ntv type value
0 U_NT_NONE 0
1..10 U_NT_DECIMAL 0..9
11..20 U_NT_DIGIT 0..9
21..0x3ff U_NT_NUMERIC see below
For U_NT_NUMERIC:
ntv value
21..0xaf integer 0..154
0xb0..0x1df fraction ((ntv>>4)-12) / ((ntv&0xf)+1) = -1..17 / 1..16
0x1e0..0x2ff large int ((ntv>>5)-14) * 10^((ntv&0x1f)+2) = (1..9)*(10^2..10^33)
(only one significant decimal digit)
0x300..0x323 base-60 (sexagesimal) integer (new in format version 7.1)
((ntv>>2)-0xbf) * 60^((ntv&3)+1) = (1..9)*(60^1..60^4)
0x324..0x34b fraction-20 (new in format version 7.3)
frac20 = ntv-0x324 = 0..0x17 -> 1|3|5|7 / 20|40|80|160|320|640
numerator: num = 2*(frac20&3)+1
denominator: den = 20<<(frac20>>2)
0x34c..0x35b fraction-32 (new in format version 7.6)
frac32 = ntv-0x34c = 0..15 -> 1|3|5|7 / 32|64|128|256
numerator: num = 2*(frac32&3)+1
denominator: den = 32<<(frac32>>2)
0x35c..0x3ff reserved
--- Additional properties (new in format version 2.1) ---
The second trie for additional properties (AT) is also a UTrie2 (UTrie before format version 7) with 16-bit data.
The data words consist of 32-bit unit indexes (not row indexes!) into the
table of unique properties vectors (PV).
Each vector contains a set of properties.
The width of a vector (number of uint32_t per row) may change
with the formatVersion, it is stored in i5.
Current properties: see icu/source/common/uprops.h
--- Changes in format version 3.1 ---
See i10 maxValues above, contains only UBLOCK_COUNT and USCRIPT_CODE_LIMIT.
--- Changes in format version 3.2 ---
- The tries use linear Latin-1 ranges.
- The additional properties bits store full properties XYZ instead
of partial Other_XYZ, so that changes in the derivation formulas
need not be tracked in runtime library code.
- Joining Type and Line Break are also stored completely, so that uprops.c
needs no runtime formulas for enumerated properties either.
- Store the case-sensitive flag in the main properties word.
- i10 also contains U_LB_COUNT and U_EA_COUNT.
- i11 contains maxValues2 for vector word 2.
--- Changes in format version 4 ---
The format changes between version 3 and 4 because the properties related to
case mappings and bidi/shaping are pulled out into separate files
for modularization.
In order to reduce the need for code changes, some of the previous data
structures are omitted, rather than rearranging everything.
(The change to format version 4 is for ICU 3.4. The last CVS revision of
genprops/store.c for format version 3.2 is 1.48.)
The main trie's data is significantly simplified:
- The trie's 16-bit data word is used directly instead of as an index
into props32[].
- The trie uses the default trie folding functions instead of custom ones.
- Numeric values are stored directly in the trie data word, with special
encodings.
- No more exception data (the data that needed it was pulled out, or, in the
case of numeric values, encoded differently).
- No more string data (pulled out - was for case mappings).
Also, some of the previously used properties vector bits are reserved again.
The indexes[] values for the omitted structures are still filled in
(indicating zero-length arrays) so that the swapper code remains unchanged.
--- Changes in format version 5 ---
Format version 5 became necessary because the bit field for script codes
overflowed. The changes are incompatible because
old code would have seen nonsensically low values for new, higher script codes.
Rearranged bit fields in the second trie (AT) and widened three (Script, Block,
Word_Break) by one bit each.
Modified bit fields in icu/source/common/uprops.h
--- Changes in format version 6 ---
Format version 6 became necessary because Unicode 5.2 adds fractions with
denominators 9, 10 and 16, and it was easier to redesign the encoding of numeric
types and values rather than add another variant to the previous format.
--- Changes in format version 7 ---
Unicode 6.0 adds Script_Extensions. For characters with script extensions data,
the script code bits are an index into the new Script_Extensions array rather
than a script code.
Change from UTrie to UTrie2.
--- Changes in format version 7.1 ---
Unicode 6.2 adds sexagesimal (base-60) numeric values:
cp;12432;na=CUNEIFORM NUMERIC SIGN SHAR2 TIMES GAL PLUS DISH;nv=216000
cp;12433;na=CUNEIFORM NUMERIC SIGN SHAR2 TIMES GAL PLUS MIN;nv=432000
The encoding of numeric values was extended to handle such values.
--- Changes in format version 7.2 ---
ICU 57 adds 4 Emoji properties to vector word 2.
https://unicode-org.atlassian.net/browse/ICU-11802
http://www.unicode.org/reports/tr51/#Emoji_Properties
--- Changes in format version 7.3 ---
ICU 58 adds fraction-20 numeric values for new Unicode 9 Malayalam fraction characters.
--- Changes in format version 7.4 ---
ICU 60 adds the Prepended_Concatenation_Mark property to vector word 1.
ICU 60 adds the Emoji_Component property to vector word 2, for emoji 5.
https://unicode-org.atlassian.net/browse/ICU-13062
http://www.unicode.org/reports/tr51/#Emoji_Properties
--- Changes in format version 7.5 ---
ICU 62 adds the Extended_Pictographic property to vector word 2, for emoji 11.
http://www.unicode.org/reports/tr51/#Emoji_Properties
--- Changes in format version 7.6 ---
ICU 64 adds fraction-32 numeric values for new Unicode 12 Tamil fraction characters.
--- Changes in format version 7.7 ---
ICU 66 adds two bits for the UScriptCode or Script_Extensions index in vector word 0.
The value is split across bits 21..20 & 7..0.
--- Changes in format version 7.8 ---
ICU 70 moves the emoji properties from uprops.icu to (new) uemoji.icu.
The 6 bits in vector word 2 that stored emoji properties are unused again.
--- Changes in format version 8.0 ---
ICU 75 uses the vector word 2 bits 31..26 for encoded Identifier_Type bit sets.
--- Changes in format version 9.0 (ICU 76) ---
Age major:minor version bit fields changed from 4:4 to 6:2 so that age=16.0 fits.
Block data moved from props vector 0 into its own new CodePointTrie.
Reserve 10 bits in the new indexes[UPROPS_MAX_VALUES_OTHER_INDEX] for the max Block value,
although the trie can hold 16-bit values.
Props vector 0 bits shuffled so that script and script extensions bits are contiguous.
----------------------------------------------------------------------------- */
U_NAMESPACE_USE
namespace {
/*
 * UDataInfo cf. udata.h
 * This is the UDataInfo structure that precedes the actual data in the output file.
 * formatVersion 9.0 corresponds to the file format described in the comment above.
 * dataVersion is only a placeholder here: setUnicodeVersion() overwrites it
 * with the version passed in, which is why this object is not const.
 */
UDataInfo dataInfo={
sizeof(UDataInfo),
0,
U_IS_BIG_ENDIAN,
U_CHARSET_FAMILY,
U_SIZEOF_UCHAR,
0,
{ 0x55, 0x50, 0x72, 0x6f }, /* dataFormat="UPro" */
{ 9, 0, 0, 0 }, /* formatVersion */
{ 16, 0, 0, 0 } /* dataVersion */
};
/**
 * PropsBuilder implementation for the core character properties data ("UPro" format).
 * setProps() collects per-range values into three structures:
 * - General_Category and the numeric type & value go into the main trie (pTrie),
 * - binary and enumerated properties, Age, Script/Script_Extensions and
 *   Identifier_Type go into the properties vectors (pv),
 * - Block goes into its own mutable code point trie (mutableBlockTrie).
 * build() then freezes/compacts and serializes these structures.
 */
class CorePropsBuilder : public PropsBuilder {
public:
// Opens pTrie, pv and mutableBlockTrie; failures are reported via errorCode and stderr.
CorePropsBuilder(UErrorCode &errorCode);
virtual ~CorePropsBuilder();
// Copies the given version into dataInfo.dataVersion.
virtual void setUnicodeVersion(const UVersionInfo version);
// Stores the properties listed in newValues for the range props.start..props.end.
virtual void setProps(const UniProps &, const UnicodeSet &newValues, UErrorCode &errorCode);
virtual void build(UErrorCode &errorCode);
virtual void writeCSourceFile(const char *path, UErrorCode &errorCode);
virtual void writeBinaryData(const char *path, UBool withCopyright, UErrorCode &errorCode);
private:
// Helper for setProps(): writes gc and the encoded numeric type & value into pTrie.
void setGcAndNumeric(const UniProps &, const UnicodeSet &newValues, UErrorCode &errorCode);
// Main properties trie; opened in the constructor, frozen with 16-bit values in build().
UTrie2 *pTrie;
// Trie for the additional properties; created in build() by compacting pv.
UTrie2 *props2Trie;
// Properties vectors with UPROPS_VECTOR_WORDS columns.
UPropsVectors *pv;
// Block values, indexed by (code point >> 4); filled in setProps().
UMutableCPTrie *mutableBlockTrie = nullptr;
// NOTE(review): presumably the immutable trie built from mutableBlockTrie;
// it is only closed in the visible code -- confirm where it is built.
UCPTrie *blockTrie = nullptr;
// 16-bit units: Script_Extensions lists (last code has bit 15 set)
// and (Script code, Script_Extensions index) pairs, appended by setProps().
UnicodeString scriptExtensions;
// NOTE(review): presumably the serialized blockTrie and its length; confirm in build().
uint8_t blockTrieBytes[100000];
int32_t blockTrieSize = 0;
};
/**
 * Opens the mutable structures that setProps() fills in: the main properties
 * trie, the properties vectors (UPROPS_VECTOR_WORDS columns) and the mutable
 * Block trie.
 * Every step that leaves errorCode in a failure state is reported on stderr;
 * construction itself always completes, so callers must check errorCode.
 */
CorePropsBuilder::CorePropsBuilder(UErrorCode &errorCode)
        : pTrie(utrie2_open(0, 0, &errorCode)), props2Trie(nullptr), pv(nullptr) {
    // Main trie: initial value 0 and error value 0.
    if(U_FAILURE(errorCode)) {
        fprintf(stderr, "genprops error: corepropsbuilder utrie2_open() failed - %s\n",
                u_errorName(errorCode));
    }

    // Properties vectors for the additional properties.
    pv=upvec_open(UPROPS_VECTOR_WORDS, &errorCode);
    if(U_FAILURE(errorCode)) {
        fprintf(stderr, "genprops error: corepropsbuilder upvec_open() failed - %s\n",
                u_errorName(errorCode));
    }

    // Separate trie for the Block property.
    mutableBlockTrie=umutablecptrie_open(0, 0, &errorCode);
    if(U_FAILURE(errorCode)) {
        fprintf(stderr, "genprops/Block error: umutablecptrie_open() failed: %s\n",
                u_errorName(errorCode));
    }
}
/**
 * Releases every structure held by this builder.
 * props2Trie and blockTrie start out as nullptr and may still be nullptr here.
 * NOTE(review): this relies on the close functions tolerating nullptr.
 */
CorePropsBuilder::~CorePropsBuilder() {
    // Block tries
    ucptrie_close(blockTrie);
    umutablecptrie_close(mutableBlockTrie);
    // Additional properties: vectors and the trie compacted from them
    upvec_close(pv);
    utrie2_close(props2Trie);
    // Main properties trie
    utrie2_close(pTrie);
}
// Records the given version as the dataVersion of the UDataInfo header,
// replacing the placeholder from the dataInfo initializer.
void
CorePropsBuilder::setUnicodeVersion(const UVersionInfo version) {
    // Copy all of dataVersion (its four version bytes).
    uprv_memcpy(dataInfo.dataVersion, version, sizeof(dataInfo.dataVersion));
}
/**
 * Encodes value/den in the fraction-20 format:
 * numerator 1, 3, 5 or 7 over a denominator of 20 times a power of 2 (20..640).
 * The numerator index (value/2 = 0..3) goes into bits 1..0 and
 * the power of 2 (0..5) into bits 4..2 of the offset from UPROPS_NTV_FRACTION20_START.
 *
 * @return the encoded ntv, or -1 if the fraction does not fit this format
 */
int32_t encodeFractional20(int32_t value, int32_t den) {
    if(den<20 || 640<den) { return -1; }
    if(value!=1 && value!=3 && value!=5 && value!=7) { return -1; }
    int32_t encoded=value/2;  // numerator index 0..3

    // Halve the denominator until it reaches 20, counting the halvings in steps of 4.
    while(den>20) {
        if((den&1)!=0) {
            return -1;  // odd denominator, halving would lose the low bit
        }
        den/=2;
        encoded+=4;
    }
    // Anything that dropped below 20 was not 20 times a power of 2.
    return den==20 ? UPROPS_NTV_FRACTION20_START+encoded : -1;
}
/**
 * Encodes value/den in the fraction-32 format:
 * numerator 1, 3, 5 or 7 over a denominator of 32 times a power of 2 (32..256).
 * The numerator index (value/2 = 0..3) goes into bits 1..0 and
 * the power of 2 (0..3) into bits 3..2 of the offset from UPROPS_NTV_FRACTION32_START.
 *
 * @return the encoded ntv, or -1 if the fraction does not fit this format
 */
int32_t encodeFractional32(int32_t value, int32_t den) {
    if(den<32 || 256<den) { return -1; }
    if(value!=1 && value!=3 && value!=5 && value!=7) { return -1; }
    int32_t encoded=value/2;  // numerator index 0..3

    // Halve the denominator until it reaches 32, counting the halvings in steps of 4.
    while(den>32) {
        if((den&1)!=0) {
            return -1;  // odd denominator, halving would lose the low bit
        }
        den/=2;
        encoded+=4;
    }
    // Anything that dropped below 32 was not 32 times a power of 2.
    return den==32 ? UPROPS_NTV_FRACTION32_START+encoded : -1;
}
// For nt=U_NT_NUMERIC.
// Parses the Numeric_Value string s ("[-]digits" or "[-]digits/digits") and
// encodes it into the ntv bit field described in the file format comment:
// small integer, fraction, large single-digit integer, sexagesimal integer,
// fraction-20 or fraction-32.
// The start parameter is not used in this function.
// Returns the encoded ntv. If the string cannot be parsed completely or the
// value cannot be encoded, prints an error, sets errorCode to
// U_ILLEGAL_ARGUMENT_ERROR and still returns ntv (-1 when encoding failed).
int32_t
encodeNumericValue(UChar32 start, const char *s, UErrorCode &errorCode) {
const char *original=s;
/* get a possible minus sign */
UBool isNegative;
if(*s=='-') {
isNegative=true;
++s;
} else {
isNegative=false;
}
// value: integer or numerator; den: denominator (0 if there is none);
// exp: number of trailing zeros seen by the large-number path; ntv<0 marks failure.
int32_t value=0, den=0, exp=0, ntv=0;
// First character after the parsed number; must point to the terminating NUL at the end.
char *numberLimit;
/* try large, single-significant-digit numbers, may otherwise overflow strtoul() */
if('1'<=s[0] && s[0]<='9' && s[1]=='0' && s[2]=='0') {
value=s[0]-'0';
numberLimit=const_cast<char *>(s);
// Count all zeros after the leading digit; stops on the first other character.
while(*(++numberLimit)=='0') {
++exp;
}
} else {
/* normal number parsing */
unsigned long ul=uprv_strtoul(s, &numberLimit, 10);
// Reject: no digits, junk other than '/' after the number, or too large for int32_t.
if(s==numberLimit || (*numberLimit!=0 && *numberLimit!='/') || ul>0x7fffffff) {
ntv=-1;
} else {
value=(int32_t)ul;
}
if(ntv>=0 && *numberLimit=='/') {
/* fractional value, get the denominator */
s=numberLimit+1;
ul=uprv_strtoul(s, &numberLimit, 10);
// The denominator must be the rest of the string, non-zero, and fit into int32_t.
if(s==numberLimit || *numberLimit!=0 || ul==0 || ul>0x7fffffff) {
ntv=-1;
} else {
den=(int32_t)ul;
}
}
}
if(isNegative) {
value=-(int32_t)value;
}
if(ntv<0) {
// pass
} else if(den==0 && value>=0) {
// Non-negative integer without a denominator.
if(exp==2 && (value*100)<=UPROPS_NTV_MAX_SMALL_INT) {
/* small integer parsed like a large one */
ntv=UPROPS_NTV_NUMERIC_START+value*100;
} else if(exp==0) {
if(value<=UPROPS_NTV_MAX_SMALL_INT) {
/* small integer */
ntv=UPROPS_NTV_NUMERIC_START+value;
} else {
/* large integer parsed like a small one */
/* split the value into mantissa and exponent, base 10 */
int32_t mant=value;
while((mant%10)==0) {
mant/=10;
++exp;
}
// Note: value<=0x7fffffff guarantees exp<=33
if(mant<=9) {
// Same encoding as the large, single-significant-digit branch below.
ntv=((mant+14)<<5)+(exp-2);
} else {
// Try sexagesimal (base 60) numbers.
mant=value;
exp=0;
while((mant%60)==0) {
mant/=60;
++exp;
}
// mant<=9 implies exp>=1 here because value>UPROPS_NTV_MAX_SMALL_INT.
if(mant<=9 && exp<=4) {
ntv=((mant+0xbf)<<2)+(exp-1);
} else {
ntv=-1;
}
}
}
} else if(2<=exp && exp<=33 && 1<=value && value<=9) {
/* large, single-significant-digit integer */
ntv=((value+14)<<5)+(exp-2);
} else {
ntv=-1;
}
} else if(exp==0 && -1<=value && value<=17 && 1<=den && den<=16) {
/* fraction */
ntv=((value+12)<<4)+(den-1);
} else if(exp==0 && value==-1 && den==0) {
/* -1 parsed with den=0, encoded as pseudo-fraction -1/1 */
ntv=((value+12)<<4);
} else if(exp==0 && (ntv=encodeFractional20(value, den))>=0) {
// fits into fractional-20 format
} else if(exp==0 && (ntv=encodeFractional32(value, den))>=0) {
// fits into fractional-32 format
} else {
ntv=-1;
}
// Also an error if the large-number path stopped before the end of the string.
if(ntv<0 || *numberLimit!=0) {
fprintf(stderr, "genprops error: unable to encode numeric value nv=%s\n", original);
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
}
return ntv;
}
uint32_t encodeIdentifierType(const UnicodeSet &idType, UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) { return 0; }
if(idType.isEmpty()) {
fprintf(stderr, "genprops error: data line has an empty Identifier_Type\n");
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
if(idType.contains(U_ID_TYPE_EXCLUSION) && idType.contains(U_ID_TYPE_LIMITED_USE)) {
// By definition, Exclusion and Limited_Use are mutually exclusive.
// We rely on that for the data structure.
fprintf(stderr,
"genprops error: data line has both Identifier_Type Exclusion and Limited_Use\n");
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
uint32_t result=UPROPS_ID_TYPE_NOT_CHARACTER;
UnicodeSetIterator iter(idType);
while(iter.next()) {
int32_t type=iter.getCodepoint();
if(type>=UPRV_LENGTHOF(uprops_idTypeToEncoded)) {
fprintf(stderr,
"genprops error: data line contains unknown (new?) Identifier_Type %d\n",
(int)type);
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
uint32_t encodedType=uprops_idTypeToEncoded[type];
if((encodedType & UPROPS_ID_TYPE_BIT)==0) {
if(idType.size()>1) {
fprintf(stderr,
"genprops error: data line contains uncombinable Identifier_Type values\n");
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
result=encodedType;
} else {
result|=encodedType;
}
}
return result&~UPROPS_ID_TYPE_BIT;
}
void
CorePropsBuilder::setGcAndNumeric(const UniProps &props, const UnicodeSet &newValues,
UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) { return; }
UChar32 start=props.start;
UChar32 end=props.end;
int32_t type=props.getIntProp(UCHAR_NUMERIC_TYPE);
const char *nvString=props.numericValue;
if(type!=U_NT_NONE && nvString==nullptr && start==end) {
fprintf(stderr, "genprops error: cp line has Numeric_Type but no Numeric_Value\n");
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
return;
}
if(!newValues.contains(UCHAR_GENERAL_CATEGORY) && !newValues.contains(UCHAR_NUMERIC_VALUE)) {
return;
}
int32_t ntv=UPROPS_NTV_NONE; // numeric type & value
if(nvString!=nullptr && uprv_strcmp(nvString, "NaN")!=0) {
int32_t digitValue=props.digitValue;
if( type<=U_NT_NONE || U_NT_NUMERIC<type ||
((type==U_NT_DECIMAL || type==U_NT_DIGIT) && digitValue<0)
) {
fprintf(stderr, "genprops error: nt=%d but nv=%s\n",
(int)type, nvString==nullptr ? "nullptr" : nvString);
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
return;
}
switch(type) {
case U_NT_NONE:
ntv=UPROPS_NTV_NONE;
break;
case U_NT_DECIMAL:
ntv=UPROPS_NTV_DECIMAL_START+digitValue;
break;
case U_NT_DIGIT:
ntv=UPROPS_NTV_DIGIT_START+digitValue;
break;
case U_NT_NUMERIC:
if(digitValue>=0) {
ntv=UPROPS_NTV_NUMERIC_START+digitValue;
} else {
ntv=encodeNumericValue(start, nvString, errorCode);
if(U_FAILURE(errorCode)) {
return;
}
}
default:
break; // unreachable
}
}
uint32_t value=
(uint32_t)props.getIntProp(UCHAR_GENERAL_CATEGORY) |
(ntv<<UPROPS_NUMERIC_TYPE_VALUE_SHIFT);
if(start==end) {
utrie2_set32(pTrie, start, value, &errorCode);
} else {
utrie2_setRange32(pTrie, start, end, value, true, &errorCode);
}
if(U_FAILURE(errorCode)) {
fprintf(stderr, "error: utrie2_setRange32(properties trie %04lX..%04lX) failed - %s\n",
(long)start, (long)end, u_errorName(errorCode));
}
}
// Maps one binary property to one bit in the properties vectors.
struct PropToBinary {
    int32_t prop;      // UProperty
    int32_t vecWord;   // properties vector word (column) that holds the bit
    int32_t vecShift;  // bit number within that word
};
// Binary properties stored in the properties vectors.
// Each entry is { property, vector word, bit number }; setProps() turns the
// bit number into a single-bit mask with U_MASK() and sets or clears that bit.
// All entries use vector word 1.
const PropToBinary
propToBinaries[]={
{ UCHAR_WHITE_SPACE, 1, UPROPS_WHITE_SPACE },
{ UCHAR_DASH, 1, UPROPS_DASH },
// Note: The Hyphen property is stabilized since Unicode 4.0
// and deprecated since Unicode 6.0.
{ UCHAR_HYPHEN, 1, UPROPS_HYPHEN },
{ UCHAR_QUOTATION_MARK, 1, UPROPS_QUOTATION_MARK },
{ UCHAR_TERMINAL_PUNCTUATION, 1, UPROPS_TERMINAL_PUNCTUATION },
// Note: The Hex_Digit and ASCII_Hex_Digit properties are probably stable enough
// so that they could be hardcoded.
{ UCHAR_HEX_DIGIT, 1, UPROPS_HEX_DIGIT },
{ UCHAR_ASCII_HEX_DIGIT, 1, UPROPS_ASCII_HEX_DIGIT },
{ UCHAR_IDEOGRAPHIC, 1, UPROPS_IDEOGRAPHIC },
{ UCHAR_DIACRITIC, 1, UPROPS_DIACRITIC },
{ UCHAR_EXTENDER, 1, UPROPS_EXTENDER },
// Note: The Noncharacter_Code_Point property is probably stable enough
// so that it could be hardcoded.
{ UCHAR_NONCHARACTER_CODE_POINT, 1, UPROPS_NONCHARACTER_CODE_POINT },
// Note: The Grapheme_Link property is deprecated since Unicode 5.0
// because it is a "Duplication of ccc=9" (UAX #44).
{ UCHAR_GRAPHEME_LINK, 1, UPROPS_GRAPHEME_LINK },
{ UCHAR_IDS_BINARY_OPERATOR, 1, UPROPS_IDS_BINARY_OPERATOR },
{ UCHAR_IDS_TRINARY_OPERATOR, 1, UPROPS_IDS_TRINARY_OPERATOR },
{ UCHAR_RADICAL, 1, UPROPS_RADICAL },
{ UCHAR_UNIFIED_IDEOGRAPH, 1, UPROPS_UNIFIED_IDEOGRAPH },
{ UCHAR_DEPRECATED, 1, UPROPS_DEPRECATED },
{ UCHAR_LOGICAL_ORDER_EXCEPTION, 1, UPROPS_LOGICAL_ORDER_EXCEPTION },
{ UCHAR_S_TERM, 1, UPROPS_S_TERM },
{ UCHAR_VARIATION_SELECTOR, 1, UPROPS_VARIATION_SELECTOR },
// Note: Pattern_Syntax & Pattern_White_Space are available via
// the internal PatternProps class and need not be stored here any more.
{ UCHAR_PATTERN_SYNTAX, 1, UPROPS_PATTERN_SYNTAX },
{ UCHAR_PATTERN_WHITE_SPACE, 1, UPROPS_PATTERN_WHITE_SPACE },
{ UCHAR_XID_START, 1, UPROPS_XID_START },
{ UCHAR_XID_CONTINUE, 1, UPROPS_XID_CONTINUE },
{ UCHAR_MATH, 1, UPROPS_MATH },
{ UCHAR_ALPHABETIC, 1, UPROPS_ALPHABETIC },
{ UCHAR_GRAPHEME_EXTEND, 1, UPROPS_GRAPHEME_EXTEND },
{ UCHAR_DEFAULT_IGNORABLE_CODE_POINT, 1, UPROPS_DEFAULT_IGNORABLE_CODE_POINT },
{ UCHAR_ID_START, 1, UPROPS_ID_START },
{ UCHAR_ID_CONTINUE, 1, UPROPS_ID_CONTINUE },
{ UCHAR_GRAPHEME_BASE, 1, UPROPS_GRAPHEME_BASE },
{ UCHAR_PREPENDED_CONCATENATION_MARK, 1, UPROPS_PREPENDED_CONCATENATION_MARK },
};
// Maps one enumerated (int) property to a bit field in the properties vectors.
struct PropToEnum {
    int32_t prop;      // UProperty
    int32_t vecWord;   // properties vector word (column) that holds the field
    int32_t vecShift;  // left shift that moves the value into the field
    uint32_t vecMask;  // mask of the field's bits, already shifted
};
// Enumerated properties stored in the properties vectors.
// Each entry is { property, vector word, shift, mask }; setProps() shifts the
// property value left by the shift and asserts that it fits into the mask.
// Decomposition_Type occupies the lowest bits of word 2 (shift 0).
const PropToEnum
propToEnums[]={
{ UCHAR_EAST_ASIAN_WIDTH, 0, UPROPS_EA_SHIFT, UPROPS_EA_MASK },
{ UCHAR_DECOMPOSITION_TYPE, 2, 0, UPROPS_DT_MASK },
{ UCHAR_GRAPHEME_CLUSTER_BREAK, 2, UPROPS_GCB_SHIFT, UPROPS_GCB_MASK },
{ UCHAR_WORD_BREAK, 2, UPROPS_WB_SHIFT, UPROPS_WB_MASK },
{ UCHAR_SENTENCE_BREAK, 2, UPROPS_SB_SHIFT, UPROPS_SB_MASK },
{ UCHAR_LINE_BREAK, 2, UPROPS_LB_SHIFT, UPROPS_LB_MASK },
};
// Stores the properties of the range props.start..props.end that are listed in newValues:
// - gc and numeric type & value go into the main trie (via setGcAndNumeric()),
// - binary and enumerated properties, Age, Script/Script_Extensions and
//   Identifier_Type go into the properties vectors (pv),
// - Block goes into the mutable Block trie.
// On failure, sets errorCode and prints a message to stderr.
void
CorePropsBuilder::setProps(const UniProps &props, const UnicodeSet &newValues,
UErrorCode &errorCode) {
setGcAndNumeric(props, newValues, errorCode);
if(U_FAILURE(errorCode)) { return; }
UChar32 start=props.start;
UChar32 end=props.end;
// End of the range for the properties vectors; extended for the all-code-points line.
UChar32 pvecEnd=end;
if(start==0 && end==0x10ffff) {
// Also set bits for initialValue and errorValue.
pvecEnd=UPVEC_MAX_CP;
}
// Set binary property values: one bit per property.
if(newValues.containsSome(0, UCHAR_BINARY_LIMIT-1)) {
for(int32_t i=0; i<LENGTHOF(propToBinaries); ++i) {
const PropToBinary &p2b=propToBinaries[i];
U_ASSERT(p2b.vecShift<32);
if(newValues.contains(p2b.prop)) {
uint32_t mask=U_MASK(p2b.vecShift);
uint32_t value= props.binProps[p2b.prop] ? mask : 0;
upvec_setValue(pv, start, pvecEnd, p2b.vecWord, value, mask, &errorCode);
}
}
}
// Set int property values.
if(newValues.containsSome(UCHAR_INT_START, UCHAR_INT_LIMIT-1)) {
for(int32_t i=0; i<LENGTHOF(propToEnums); ++i) {
const PropToEnum &p2e=propToEnums[i];
U_ASSERT(p2e.vecShift<32);
if(newValues.contains(p2e.prop)) {
uint32_t mask=p2e.vecMask;
uint32_t value=(uint32_t)(props.getIntProp(p2e.prop)<<p2e.vecShift);
// The shifted value must fit into the bit field.
U_ASSERT((value&mask)==value);
upvec_setValue(pv, start, pvecEnd, p2e.vecWord, value, mask, &errorCode);
}
}
}
// Age: only major.minor versions within the bit field limits can be stored.
if(newValues.contains(UCHAR_AGE)) {
if(props.age[0]>UPROPS_AGE_MAJOR_MAX || props.age[1]>UPROPS_AGE_MINOR_MAX ||
props.age[2]!=0 || props.age[3]!=0) {
char buffer[U_MAX_VERSION_STRING_LENGTH];
u_versionToString(props.age, buffer);
fprintf(stderr, "genprops error: age %s cannot be encoded\n", buffer);
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
return;
}
// Pack major:minor with the minor version in the low 2 bits.
uint32_t version=(props.age[0]<<2)|props.age[1];
upvec_setValue(pv, start, pvecEnd,
0, version<<UPROPS_AGE_SHIFT, UPROPS_AGE_MASK,
&errorCode);
}
// Block: stored in its own trie, indexed by (code point >> 4).
if (newValues.contains(UCHAR_BLOCK)) {
uint32_t value = props.getIntProp(UCHAR_BLOCK);
// The range must cover whole rows of 16 code points (xxx0..xxxF),
// and the value must not exceed UPROPS_MAX_BLOCK.
if ((start & 0xf) != 0 || (end & 0xf) != 0xf || value > UPROPS_MAX_BLOCK) {
fprintf(stderr, "genprops error: %04lX..%04lX Block 0x%x cannot be encoded\n",
(long)start, (long)end, (int)value);
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
umutablecptrie_setRange(mutableBlockTrie, start >> 4, end >> 4, value, &errorCode);
}
// Set the script value if the Script_Extensions revert to {Script}.
// Otherwise we would have to duplicate the code for doing so.
// Script and Script_Extensions share a bit field, so that by setting it to just the script
// we remove the Script_Extensions.
// (We do not just set the script bit in newValues because that is const.)
// For example, for U+3000:
// block;3000..303F;age=1.1;...;sc=Zyyy;scx=Bopo Hang Hani Hira Kana Yiii;vo=U
// cp;3000;...;gc=Zs;lb=BA;na=IDEOGRAPHIC SPACE;...;SB=SP;scx=<script>;WSpace
UBool revertToScript=
newValues.contains(UCHAR_SCRIPT_EXTENSIONS) && props.scx.isEmpty() &&
!newValues.contains(UCHAR_SCRIPT);
if(newValues.contains(UCHAR_SCRIPT) || revertToScript) {
int32_t script=props.getIntProp(UCHAR_SCRIPT);
// Use UPROPS_SCRIPT_X_MASK:
// When writing a Script code, remove Script_Extensions bits as well.
// If needed, they will get written again.
upvec_setValue(pv, start, pvecEnd, 0, script, UPROPS_SCRIPT_X_MASK, &errorCode);
}
// Write a new (Script, Script_Extensions) value if there are Script_Extensions
// and either Script or Script_Extensions are new on the current line.
// (If only Script is new, then it just clobbered the relevant bits.)
if( !props.scx.isEmpty() &&
(newValues.contains(UCHAR_SCRIPT) || newValues.contains(UCHAR_SCRIPT_EXTENSIONS))
) {
UnicodeString codes; // vector of 16-bit UScriptCode values
UnicodeSetIterator iter(props.scx);
while(iter.next()) { codes.append((char16_t)iter.getCodepoint()); }
// Set bit 15 on the last script code, for termination.
// (props.scx is not empty here, so codes has at least one unit.)
int32_t length=codes.length();
codes.setCharAt(length-1, (char16_t)(codes[length-1]|0x8000));
// Find this list of codes in the Script_Extensions data so far, or add this list.
int32_t index=scriptExtensions.indexOf(codes);
if(index<0) {
index=scriptExtensions.length();
scriptExtensions.append(codes);
}
// Encode the (Script, Script_Extensions index) pair.
int32_t script=props.getIntProp(UCHAR_SCRIPT);
uint32_t scriptX;
if(script==USCRIPT_COMMON) {
scriptX=UPROPS_SCRIPT_X_WITH_COMMON;
} else if(script==USCRIPT_INHERITED) {
scriptX=UPROPS_SCRIPT_X_WITH_INHERITED;
} else {
// Store an additional pair of 16-bit units for an unusual main Script code
// together with the Script_Extensions index.
UnicodeString codeIndexPair;
codeIndexPair.append((char16_t)script).append((char16_t)index);
// From here on, index refers to the pair rather than to the list.
index=scriptExtensions.indexOf(codeIndexPair);
if(index<0) {
index=scriptExtensions.length();
scriptExtensions.append(codeIndexPair);
}
scriptX=UPROPS_SCRIPT_X_WITH_OTHER;
}
// The index shares the bit field with script codes and must fit the same limit.
if(index>UPROPS_MAX_SCRIPT) {
fprintf(stderr, "genprops: Script_Extensions indexes overflow bit fields\n");
errorCode=U_BUFFER_OVERFLOW_ERROR;
return;
}
scriptX|=index;
upvec_setValue(pv, start, pvecEnd, 0, scriptX, UPROPS_SCRIPT_X_MASK, &errorCode);
}
// Identifier_Type: encoded bit set in vector word 2.
if(newValues.contains(UCHAR_IDENTIFIER_TYPE)) {
uint32_t encodedType=encodeIdentifierType(props.idType, errorCode);
upvec_setValue(
pv, start, pvecEnd, 2,
encodedType << UPROPS_2_ID_TYPE_SHIFT, UPROPS_2_ID_TYPE_MASK,
&errorCode);
}
// Reports any failure accumulated in errorCode by the setter calls above.
if(U_FAILURE(errorCode)) {
fprintf(stderr, "genprops error: unable to set props2 values for %04lX..%04lX: %s\n",
(long)start, (long)end, u_errorName(errorCode));
}
}
// Header index array for the data; every slot starts at zero and the
// used ones are assigned in CorePropsBuilder::build().
int32_t indexes[UPROPS_INDEX_COUNT]={};

// Serialized form of the main trie and its length in bytes
// (written by utrie2_serialize() in CorePropsBuilder::build()).
uint8_t trieBlock[100000];
int32_t trieSize;

// Serialized form of the additional-properties trie and its length in bytes
// (written by utrie2_serialize() in CorePropsBuilder::build()).
uint8_t props2TrieBlock[100000];
int32_t props2TrieSize;

// Total data size in bytes, computed in CorePropsBuilder::build()
// as 4 times the final 32-bit offset.
int32_t totalSize;
/**
 * Finalizes the core properties data after all values have been set.
 *
 * Steps, in order:
 *  - freeze and serialize the main trie (pTrie) into trieBlock,
 *  - compact the properties vectors (pv) into props2Trie and serialize it
 *    into props2TrieBlock,
 *  - pad scriptExtensions to an even number of 16-bit units,
 *  - build the immutable Block trie and serialize it into blockTrieBytes,
 *  - fill in indexes[] with the offsets (counted in 32-bit units) of each
 *    structure, and with the maximum-value fields,
 *  - unless beQuiet is set, print size statistics to stdout.
 *
 * @param errorCode ICU in/out error code. If it already indicates a failure
 *                  the function returns immediately. If a step fails, a
 *                  message is printed to stderr and the function returns with
 *                  errorCode still set to the failure.
 */
void
CorePropsBuilder::build(UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return; }

    // Main properties trie.
    utrie2_freeze(pTrie, UTRIE2_16_VALUE_BITS, &errorCode);
    if(U_FAILURE(errorCode)) {
        fprintf(stderr,
                "genprops/core error: utrie2_freeze(main trie) failed: %s\n",
                u_errorName(errorCode));
        return;
    }
    trieSize=utrie2_serialize(pTrie, trieBlock, (int32_t)sizeof(trieBlock), &errorCode);
    if(U_FAILURE(errorCode)) {
        fprintf(stderr,
                "genprops/core error: utrie2_serialize(main trie) failed: %s (length %ld)\n",
                u_errorName(errorCode), (long)trieSize);
        return;
    }

    // Additional properties: compact the vectors into a trie with row indexes.
    props2Trie=upvec_compactToUTrie2WithRowIndexes(pv, &errorCode);
    if(U_FAILURE(errorCode)) {
        fprintf(stderr, "genprops/core error: unable to build trie for additional properties: %s\n",
                u_errorName(errorCode));
        return;
    }
    props2TrieSize=utrie2_serialize(props2Trie,
                                    props2TrieBlock, (int32_t)sizeof(props2TrieBlock),
                                    &errorCode);
    if(U_FAILURE(errorCode)) {
        fprintf(stderr,
                "genprops/core error: utrie2_freeze(additional properties)+utrie2_serialize() "
                "failed: %s\n",
                u_errorName(errorCode));
        return;
    }

    // Only the row count is needed here; the array pointer itself is not used.
    int32_t pvRows;
    upvec_getArray(pv, &pvRows, nullptr);
    int32_t pvCount=pvRows*UPROPS_VECTOR_WORDS;

    /* round up scriptExtensions to multiple of 4 bytes */
    if(scriptExtensions.length()&1) {
        scriptExtensions.append((char16_t)0);
    }

    // Block trie.
    blockTrie = umutablecptrie_buildImmutable(
        mutableBlockTrie, UCPTRIE_TYPE_SMALL, UCPTRIE_VALUE_BITS_16, &errorCode);
    if (U_FAILURE(errorCode)) {
        fprintf(stderr,
                "genprops/Block error: umutablecptrie_buildImmutable() failed: %s\n",
                u_errorName(errorCode));
        return;
    }
    blockTrieSize = ucptrie_toBinary(blockTrie,
                                     blockTrieBytes, (int32_t)sizeof(blockTrieBytes),
                                     &errorCode);
    if (U_FAILURE(errorCode)) {
        // Report the length returned by ucptrie_toBinary() for the Block trie,
        // not the size of the main trie.
        fprintf(stderr,
                "genprops/Block error: ucptrie_toBinary() failed: %s (length %ld)\n",
                u_errorName(errorCode), (long)blockTrieSize);
        return;
    }
    U_ASSERT((blockTrieSize & 3) == 0);  // multiple of 4 bytes

    /* set indexes */
    // "offset" counts 32-bit units from the start of the data;
    // it begins just past the indexes[] array itself.
    int32_t offset=sizeof(indexes)/4;       /* uint32_t offset to the properties trie */
    offset+=trieSize>>2;
    indexes[UPROPS_PROPS32_INDEX]=          /* set indexes to the same offsets for empty */
    indexes[UPROPS_EXCEPTIONS_INDEX]=       /* structures from the old format version 3 */
    indexes[UPROPS_EXCEPTIONS_TOP_INDEX]=   /* so that less runtime code has to be changed */
    indexes[UPROPS_ADDITIONAL_TRIE_INDEX]=offset;

    offset+=props2TrieSize/4;
    indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]=offset;
    indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX]=UPROPS_VECTOR_WORDS;
    offset+=pvCount;
    indexes[UPROPS_SCRIPT_EXTENSIONS_INDEX]=offset;
    // scriptExtensions holds 16-bit units, so two of them make one 32-bit unit.
    offset+=scriptExtensions.length()/2;
    indexes[UPROPS_BLOCK_TRIE_INDEX]=offset;
    offset+=blockTrieSize/4;
    indexes[UPROPS_RESERVED_INDEX_8]=offset;
    indexes[UPROPS_DATA_TOP_INDEX]=offset;
    totalSize=4*offset;

    // Maximum property values, packed with the same shifts as the data fields.
    indexes[UPROPS_MAX_VALUES_INDEX]=
        (((int32_t)U_EA_COUNT-1)<<UPROPS_EA_SHIFT)|
        ((int32_t)USCRIPT_CODE_LIMIT-1);
    indexes[UPROPS_MAX_VALUES_2_INDEX]=
        (((int32_t)U_LB_COUNT-1)<<UPROPS_LB_SHIFT)|
        (((int32_t)U_SB_COUNT-1)<<UPROPS_SB_SHIFT)|
        (((int32_t)U_WB_COUNT-1)<<UPROPS_WB_SHIFT)|
        (((int32_t)U_GCB_COUNT-1)<<UPROPS_GCB_SHIFT)|
        ((int32_t)U_DT_COUNT-1);
    indexes[UPROPS_MAX_VALUES_OTHER_INDEX]=
        (int32_t)UBLOCK_COUNT-1;

    if(!beQuiet) {
        puts("* uprops.icu stats *");
        printf("trie size in bytes: %5u\n", (int)trieSize);
        printf("size in bytes of additional props trie:%5u\n", (int)props2TrieSize);
        printf("number of additional props vectors: %5u\n", (int)pvRows);
        printf("number of 32-bit words per vector: %5u\n", UPROPS_VECTOR_WORDS);
        printf("number of 16-bit scriptExtensions: %5u\n", (int)scriptExtensions.length());
        printf("size in bytes of Block trie: %5u\n", (int)blockTrieSize);
        printf("data size: %6ld\n", (long)totalSize);
    }
}
void
CorePropsBuilder::writeCSourceFile(const char *path, UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) { return; }
int32_t pvRows;
const uint32_t *pvArray=upvec_getArray(pv, &pvRows, nullptr);
int32_t pvCount=pvRows*UPROPS_VECTOR_WORDS;
FILE *f=usrc_create(path, "uchar_props_data.h", 2016,
"icu/tools/unicode/c/genprops/corepropsbuilder.cpp");
if(f==nullptr) {
errorCode=U_FILE_ACCESS_ERROR;
return;
}
fputs("#ifdef INCLUDED_FROM_UCHAR_C\n\n", f);