Permalink
Browse files

Treat CJK Ideographs and letters as a distinct Unicode category.

  • Loading branch information...
zufuliu committed Sep 30, 2018
1 parent 14442ca commit 04e5a100c286ef348a8d1f7524f899233e83643d
@@ -1240,7 +1240,8 @@ const int catRanges[] = {
139121,
139139,
139169,
139268,
139294,
147460,
149821,
149828,
149981,
@@ -2294,21 +2295,21 @@ const int catRanges[] = {
395185,
395221,
395293,
395300,
395326,
398077,
398117,
398196,
398243,
398308,
398334,
398348,
398372,
398398,
401265,
401283,
401380,
401406,
401437,
401572,
402973,
402980,
403006,
406013,
406037,
406090,
@@ -2317,7 +2318,7 @@ const int catRanges[] = {
407421,
407573,
408733,
409092,
409118,
409621,
410621,
410634,
@@ -2332,10 +2333,10 @@ const int catRanges[] = {
415765,
417789,
417813,
425988,
426014,
636637,
636949,
638980,
639006,
1310237,
1310724,
1311395,
@@ -2623,7 +2624,7 @@ const int catRanges[] = {
1387078,
1387165,
1387505,
1387524,
1387550,
1388477,
1388549,
1388646,
@@ -2724,17 +2725,17 @@ const int catRanges[] = {
1408477,
1408520,
1408861,
1409028,
1409054,
1766557,
1766916,
1766942,
1767677,
1767780,
1767806,
1769373,
1769499,
1835036,
2039812,
2039838,
2051549,
2051588,
2051614,
2055005,
2056193,
2056445,
@@ -2857,19 +2858,19 @@ const int catRanges[] = {
2092109,
2092142,
2092177,
2092228,
2092254,
2092547,
2092580,
2092606,
2094019,
2094084,
2094110,
2095101,
2095172,
2095198,
2095389,
2095428,
2095454,
2095645,
2095684,
2095710,
2095901,
2095940,
2095966,
2096061,
2096147,
2096210,
@@ -3452,7 +3453,8 @@ const int catRanges[] = {
3210845,
3211268,
3235453,
3538948,
3538974,
3539012,
3548157,
3550724,
3563421,
@@ -3768,17 +3770,17 @@ const int catRanges[] = {
4145181,
4148245,
4148701,
4194308,
4194334,
5561085,
5562372,
5562398,
5695165,
5695492,
5695518,
5702621,
5702660,
5702686,
5887069,
5887492,
5887518,
6126653,
6225924,
6225950,
6243293,
29360186,
29360221,
@@ -3927,7 +3929,7 @@ bool IsIdStart(int character) noexcept {
}
const CharacterCategory c = CategoriseCharacter(character);
return (c == ccLl || c == ccLu || c == ccLt || c == ccLm || c == ccLo
|| c == ccNl);
|| c == ccNl || c == ccCJK);
}
// UAX #31 defines ID_Continue as
@@ -3942,7 +3944,7 @@ bool IsIdContinue(int character) noexcept {
}
const CharacterCategory c = CategoriseCharacter(character);
return (c == ccLl || c == ccLu || c == ccLt || c == ccLm || c == ccLo
|| c == ccNl || c == ccMn || c == ccMc || c == ccNd || c == ccPc);
|| c == ccNl || c == ccMn || c == ccMc || c == ccNd || c == ccPc || c == ccCJK);
}
// XID_Start is ID_Start modified for Normalization Form KC in UAX #31
@@ -17,7 +17,8 @@ enum CharacterCategory {
ccPc, ccPd, ccPs, ccPe, ccPi, ccPf, ccPo,
ccSm, ccSc, ccSk, ccSo,
ccZs, ccZl, ccZp,
ccCc, ccCf, ccCs, ccCo, ccCn
ccCc, ccCf, ccCs, ccCo, ccCn,
ccCJK
};
CharacterCategory CategoriseCharacter(int character) noexcept;
@@ -15,17 +15,31 @@ def findCategories(filename):
print(values)
return [v[2:] for v in values]
def isCJKLetter(uch):
    """Return True if the Unicode name of `uch` marks it as a CJK letter.

    A character qualifies when its Unicode name contains 'CJK', 'HIRAGANA',
    'KATAKANA' or 'HANGUL'. Characters that have no name in the Unicode
    database are reported as not CJK.

    uch -- a single-character string.
    """
    # Supplying a default avoids the ValueError that unicodedata.name()
    # raises for unnamed code points, without a catch-all `except` that
    # would also hide KeyboardInterrupt/SystemExit and genuine errors.
    name = unicodedata.name(uch, '').upper()
    return any(tag in name for tag in ('CJK', 'HIRAGANA', 'KATAKANA', 'HANGUL'))
def updateCharacterCategory(filename):
values = ["// Created with Python %s, Unicode %s" % (
platform.python_version(), unicodedata.unidata_version)]
category = unicodedata.category(chr(0))
startRange = 0
for ch in range(sys.maxunicode):
uch = chr(ch)
if unicodedata.category(uch) != category:
current = unicodedata.category(uch)
if current == 'Lo' and isCJKLetter(uch):
current = 'CJK'
if current != category:
value = startRange * 32 + categories.index(category)
values.append("%d," % value)
category = unicodedata.category(uch)
category = current
startRange = ch
value = startRange * 32 + categories.index(category)
values.append("%d," % value)
@@ -15,7 +15,7 @@ class CharClassify {
CharClassify() noexcept;
enum cc {
ccSpace, ccNewLine, ccWord, ccPunctuation
ccSpace, ccNewLine, ccWord, ccPunctuation, ccCJK
};
void SetDefaultCharClasses(bool includeWordClass) noexcept;
void SetCharClasses(const unsigned char *chars, cc newCharClass) noexcept;
@@ -1646,10 +1646,12 @@ CharClassify::cc Document::WordCharacterClass(unsigned int ch) const noexcept {
case ccSo:
return CharClassify::ccPunctuation;
case ccCJK:
return CharClassify::ccCJK;
}
} else {
// Asian DBCS
return CharClassify::ccWord;
return CharClassify::ccCJK;
}
}
return charClass.GetClass(static_cast<unsigned char>(ch));
@@ -1104,7 +1104,7 @@ BEGIN
LTEXT "No wrap indent|Wrap indent by 1 character|Wrap indent by 2 characters|Wrap indent by 1 level|Wrap indent by 2 levels|Wrap indent as first subline|Wrap indent 1 level more than first subline|Wrap indent 2 levels more than first subline",200,0,0,615,8,NOT WS_VISIBLE
LTEXT "No visual indicators before wrap|Show visual indicators before wrap (near text)|Show visual indicators before wrap (near borders)",201,0,0,418,8,NOT WS_VISIBLE
LTEXT "No visual indicators after wrap|Show visual indicators after wrap (near text)|Show visual indicators after wrap (near borders)",202,0,0,402,8,NOT WS_VISIBLE
LTEXT "No wrap|Wrap text between words|Wrap text between any characters (preferred for CJK text)|Wrap text on whitespace",203,0,0,187,8,NOT WS_VISIBLE
LTEXT "No wrap|Wrap text between words|Wrap text between any characters|Wrap text on whitespace",203,0,0,187,8,NOT WS_VISIBLE
END
IDD_PAGESETUP DIALOGEX 5, 5, 356, 260

0 comments on commit 04e5a10

Please sign in to comment.