Skip to content

Commit

Permalink
Treat CJK Ideographs and letters as a distinct Unicode category.
Browse files Browse the repository at this point in the history
  • Loading branch information
zufuliu committed Sep 30, 2018
1 parent 14442ca commit 04e5a10
Show file tree
Hide file tree
Showing 6 changed files with 56 additions and 37 deletions.
64 changes: 33 additions & 31 deletions scintilla/lexlib/CharacterCategory.cxx
Expand Up @@ -1240,7 +1240,8 @@ const int catRanges[] = {
139121, 139121,
139139, 139139,
139169, 139169,
139268, 139294,
147460,
149821, 149821,
149828, 149828,
149981, 149981,
Expand Down Expand Up @@ -2294,21 +2295,21 @@ const int catRanges[] = {
395185, 395185,
395221, 395221,
395293, 395293,
395300, 395326,
398077, 398077,
398117, 398117,
398196, 398196,
398243, 398243,
398308, 398334,
398348, 398348,
398372, 398398,
401265, 401265,
401283, 401283,
401380, 401406,
401437, 401437,
401572, 401572,
402973, 402973,
402980, 403006,
406013, 406013,
406037, 406037,
406090, 406090,
Expand All @@ -2317,7 +2318,7 @@ const int catRanges[] = {
407421, 407421,
407573, 407573,
408733, 408733,
409092, 409118,
409621, 409621,
410621, 410621,
410634, 410634,
Expand All @@ -2332,10 +2333,10 @@ const int catRanges[] = {
415765, 415765,
417789, 417789,
417813, 417813,
425988, 426014,
636637, 636637,
636949, 636949,
638980, 639006,
1310237, 1310237,
1310724, 1310724,
1311395, 1311395,
Expand Down Expand Up @@ -2623,7 +2624,7 @@ const int catRanges[] = {
1387078, 1387078,
1387165, 1387165,
1387505, 1387505,
1387524, 1387550,
1388477, 1388477,
1388549, 1388549,
1388646, 1388646,
Expand Down Expand Up @@ -2724,17 +2725,17 @@ const int catRanges[] = {
1408477, 1408477,
1408520, 1408520,
1408861, 1408861,
1409028, 1409054,
1766557, 1766557,
1766916, 1766942,
1767677, 1767677,
1767780, 1767806,
1769373, 1769373,
1769499, 1769499,
1835036, 1835036,
2039812, 2039838,
2051549, 2051549,
2051588, 2051614,
2055005, 2055005,
2056193, 2056193,
2056445, 2056445,
Expand Down Expand Up @@ -2857,19 +2858,19 @@ const int catRanges[] = {
2092109, 2092109,
2092142, 2092142,
2092177, 2092177,
2092228, 2092254,
2092547, 2092547,
2092580, 2092606,
2094019, 2094019,
2094084, 2094110,
2095101, 2095101,
2095172, 2095198,
2095389, 2095389,
2095428, 2095454,
2095645, 2095645,
2095684, 2095710,
2095901, 2095901,
2095940, 2095966,
2096061, 2096061,
2096147, 2096147,
2096210, 2096210,
Expand Down Expand Up @@ -3452,7 +3453,8 @@ const int catRanges[] = {
3210845, 3210845,
3211268, 3211268,
3235453, 3235453,
3538948, 3538974,
3539012,
3548157, 3548157,
3550724, 3550724,
3563421, 3563421,
Expand Down Expand Up @@ -3768,17 +3770,17 @@ const int catRanges[] = {
4145181, 4145181,
4148245, 4148245,
4148701, 4148701,
4194308, 4194334,
5561085, 5561085,
5562372, 5562398,
5695165, 5695165,
5695492, 5695518,
5702621, 5702621,
5702660, 5702686,
5887069, 5887069,
5887492, 5887518,
6126653, 6126653,
6225924, 6225950,
6243293, 6243293,
29360186, 29360186,
29360221, 29360221,
Expand Down Expand Up @@ -3927,7 +3929,7 @@ bool IsIdStart(int character) noexcept {
} }
const CharacterCategory c = CategoriseCharacter(character); const CharacterCategory c = CategoriseCharacter(character);
return (c == ccLl || c == ccLu || c == ccLt || c == ccLm || c == ccLo return (c == ccLl || c == ccLu || c == ccLt || c == ccLm || c == ccLo
|| c == ccNl); || c == ccNl || c == ccCJK);
} }


// UAX #31 defines ID_Continue as // UAX #31 defines ID_Continue as
Expand All @@ -3942,7 +3944,7 @@ bool IsIdContinue(int character) noexcept {
} }
const CharacterCategory c = CategoriseCharacter(character); const CharacterCategory c = CategoriseCharacter(character);
return (c == ccLl || c == ccLu || c == ccLt || c == ccLm || c == ccLo return (c == ccLl || c == ccLu || c == ccLt || c == ccLm || c == ccLo
|| c == ccNl || c == ccMn || c == ccMc || c == ccNd || c == ccPc); || c == ccNl || c == ccMn || c == ccMc || c == ccNd || c == ccPc || c == ccCJK);
} }


// XID_Start is ID_Start modified for Normalization Form KC in UAX #31 // XID_Start is ID_Start modified for Normalization Form KC in UAX #31
Expand Down
3 changes: 2 additions & 1 deletion scintilla/lexlib/CharacterCategory.h
Expand Up @@ -17,7 +17,8 @@ enum CharacterCategory {
ccPc, ccPd, ccPs, ccPe, ccPi, ccPf, ccPo, ccPc, ccPd, ccPs, ccPe, ccPi, ccPf, ccPo,
ccSm, ccSc, ccSk, ccSo, ccSm, ccSc, ccSk, ccSo,
ccZs, ccZl, ccZp, ccZs, ccZl, ccZp,
ccCc, ccCf, ccCs, ccCo, ccCn ccCc, ccCf, ccCs, ccCo, ccCn,
ccCJK
}; };


CharacterCategory CategoriseCharacter(int character) noexcept; CharacterCategory CategoriseCharacter(int character) noexcept;
Expand Down
18 changes: 16 additions & 2 deletions scintilla/scripts/GenerateCharacterCategory.py
Expand Up @@ -15,17 +15,31 @@ def findCategories(filename):
print(values) print(values)
return [v[2:] for v in values] return [v[2:] for v in values]


def isCJKLetter(uch):
	"""Tell whether the Unicode character name of `uch` marks it as CJK text.

	uch: a single-character string.
	Returns True when the character's Unicode name contains 'CJK',
	'HIRAGANA', 'KATAKANA' or 'HANGUL'; returns False otherwise, including
	for code points that have no name at all.
	"""
	# unicodedata.name() raises ValueError for unnamed code points unless a
	# default is supplied. Passing '' handles that case explicitly instead of
	# hiding every possible exception behind a bare `except`.
	name = unicodedata.name(uch, '').upper()
	return any(keyword in name for keyword in ('CJK', 'HIRAGANA', 'KATAKANA', 'HANGUL'))

def updateCharacterCategory(filename): def updateCharacterCategory(filename):
values = ["// Created with Python %s, Unicode %s" % ( values = ["// Created with Python %s, Unicode %s" % (
platform.python_version(), unicodedata.unidata_version)] platform.python_version(), unicodedata.unidata_version)]
category = unicodedata.category(chr(0)) category = unicodedata.category(chr(0))
startRange = 0 startRange = 0
for ch in range(sys.maxunicode): for ch in range(sys.maxunicode):
uch = chr(ch) uch = chr(ch)
if unicodedata.category(uch) != category: current = unicodedata.category(uch)
if current == 'Lo' and isCJKLetter(uch):
current = 'CJK'
if current != category:
value = startRange * 32 + categories.index(category) value = startRange * 32 + categories.index(category)
values.append("%d," % value) values.append("%d," % value)
category = unicodedata.category(uch) category = current
startRange = ch startRange = ch
value = startRange * 32 + categories.index(category) value = startRange * 32 + categories.index(category)
values.append("%d," % value) values.append("%d," % value)
Expand Down
2 changes: 1 addition & 1 deletion scintilla/src/CharClassify.h
Expand Up @@ -15,7 +15,7 @@ class CharClassify {
CharClassify() noexcept; CharClassify() noexcept;


enum cc { enum cc {
ccSpace, ccNewLine, ccWord, ccPunctuation ccSpace, ccNewLine, ccWord, ccPunctuation, ccCJK
}; };
void SetDefaultCharClasses(bool includeWordClass) noexcept; void SetDefaultCharClasses(bool includeWordClass) noexcept;
void SetCharClasses(const unsigned char *chars, cc newCharClass) noexcept; void SetCharClasses(const unsigned char *chars, cc newCharClass) noexcept;
Expand Down
4 changes: 3 additions & 1 deletion scintilla/src/Document.cxx
Expand Up @@ -1646,10 +1646,12 @@ CharClassify::cc Document::WordCharacterClass(unsigned int ch) const noexcept {
case ccSo: case ccSo:
return CharClassify::ccPunctuation; return CharClassify::ccPunctuation;


case ccCJK:
return CharClassify::ccCJK;
} }
} else { } else {
// Asian DBCS // Asian DBCS
return CharClassify::ccWord; return CharClassify::ccCJK;
} }
} }
return charClass.GetClass(static_cast<unsigned char>(ch)); return charClass.GetClass(static_cast<unsigned char>(ch));
Expand Down
2 changes: 1 addition & 1 deletion src/Notepad2.rc
Expand Up @@ -1104,7 +1104,7 @@ BEGIN
LTEXT "No wrap indent|Wrap indent by 1 character|Wrap indent by 2 characters|Wrap indent by 1 level|Wrap indent by 2 levels|Wrap indent as first subline|Wrap indent 1 level more than first subline|Wrap indent 2 levels more than first subline",200,0,0,615,8,NOT WS_VISIBLE LTEXT "No wrap indent|Wrap indent by 1 character|Wrap indent by 2 characters|Wrap indent by 1 level|Wrap indent by 2 levels|Wrap indent as first subline|Wrap indent 1 level more than first subline|Wrap indent 2 levels more than first subline",200,0,0,615,8,NOT WS_VISIBLE
LTEXT "No visual indicators before wrap|Show visual indicators before wrap (near text)|Show visual indicators before wrap (near borders)",201,0,0,418,8,NOT WS_VISIBLE LTEXT "No visual indicators before wrap|Show visual indicators before wrap (near text)|Show visual indicators before wrap (near borders)",201,0,0,418,8,NOT WS_VISIBLE
LTEXT "No visual indicators after wrap|Show visual indicators after wrap (near text)|Show visual indicators after wrap (near borders)",202,0,0,402,8,NOT WS_VISIBLE LTEXT "No visual indicators after wrap|Show visual indicators after wrap (near text)|Show visual indicators after wrap (near borders)",202,0,0,402,8,NOT WS_VISIBLE
LTEXT "No wrap|Wrap text between words|Wrap text between any characters (preferred for CJK text)|Wrap text on whitespace",203,0,0,187,8,NOT WS_VISIBLE LTEXT "No wrap|Wrap text between words|Wrap text between any characters|Wrap text on whitespace",203,0,0,187,8,NOT WS_VISIBLE
END END


IDD_PAGESETUP DIALOGEX 5, 5, 356, 260 IDD_PAGESETUP DIALOGEX 5, 5, 356, 260
Expand Down

0 comments on commit 04e5a10

Please sign in to comment.