Add character classification tables for ANSI code page Central Europe…

…an (Windows-1250), Cyrillic (Windows-1251), Western European (Windows-1252), Greek (Windows-1253), Turkish (Windows-1254), Hebrew (Windows-1255), Arabic (Windows-1256), Baltic (Windows-1257), Vietnamese (Windows-1258), and Thai (Windows-874). Improve word selection and auto-completion (issue #36) for ANSI-encoded documents on systems with these code pages.
zufuliu · Jan 13, 2019 · f449b46 · f449b46
1 parent 30c3e4c
commit f449b46
Show file tree

Hide file tree

Showing 10 changed files with 428 additions and 153 deletions.
diff --git a/scintilla/include/Scintilla.h b/scintilla/include/Scintilla.h
@@ -276,6 +276,7 @@ typedef sptr_t (*SciFnDirect)(sptr_t ptr, unsigned int iMessage, uptr_t wParam,
 #define SCI_GETCARETPERIOD 2075
 #define SCI_SETCARETPERIOD 2076
 #define SCI_SETWORDCHARS 2077
+#define SCI_SETCHARCLASSESEX 2065
 #define SCI_GETWORDCHARS 2646
 #define SCI_BEGINUNDOACTION 2078
 #define SCI_ENDUNDOACTION 2079

diff --git a/scintilla/include/Scintilla.iface b/scintilla/include/Scintilla.iface
@@ -624,6 +624,9 @@ set void SetCaretPeriod=2076(int periodMilliseconds,)
 # First sets defaults like SetCharsDefault.
 set void SetWordChars=2077(, string characters)
 
+# call CharClassify::SetCharClassesEx()
+set void SetCharClassesEx=2065(int length, string characters)
+
 # Get the set of characters making up words for when moving or selecting by word.
 # Returns the number of characters
 get int GetWordChars=2646(, stringresult characters)

diff --git a/scintilla/scripts/GenerateCharTable.py b/scintilla/scripts/GenerateCharTable.py
@@ -1,82 +1,82 @@
 #!/usr/bin/env python3
 
 def GenerateUTF8Table():
-    def BytesFromLead(leadByte):
-        # Single byte or invalid
-        if leadByte < 0xC2:
-            return 1
-        if leadByte < 0xE0:
-            return 2
-        if leadByte < 0xF0:
-            return 3
-        if leadByte < 0xF5:
-            return 4
-        # Characters longer than 4 bytes not possible in current UTF-8
-        return 1
+	def BytesFromLead(leadByte):
+		# Single byte or invalid
+		if leadByte < 0xC2:
+			return 1
+		if leadByte < 0xE0:
+			return 2
+		if leadByte < 0xF0:
+			return 3
+		if leadByte < 0xF5:
+			return 4
+		# Characters longer than 4 bytes not possible in current UTF-8
+		return 1
 
-    def UTF8IsLeadByte(ch):
-        if ch <= 0x7F:
-            return True
-        if ch >= 0xC2 and ch <= 0xF4:
-            return True
-        return False
+	def UTF8IsLeadByte(ch):
+		if ch <= 0x7F:
+			return True
+		if ch >= 0xC2 and ch <= 0xF4:
+			return True
+		return False
 
-    def UTF8IsTrailByte(ch):
-        return ch >= 0x80 and ch <= 0xBF
+	def UTF8IsTrailByte(ch):
+		return ch >= 0x80 and ch <= 0xBF
 
-    def UTF8IntervalNumber(ch):
-        # UTF8-1
-        if ch <= 0x7F:
-            return 0
-        # UTF8-tail
-        if ch <= 0x8F:
-            return 1
-        if ch <= 0x9F:
-            return 2
-        if ch <= 0xBF:
-            return 3
-        # UTF8-invalid
-        if ch == 0xC0 or ch == 0xC1:
-            return 11
-        # UTF8-2
-        if ch >= 0xC2 and ch <= 0xDF:
-            return 4
-        # UTF8-3
-        if ch == 0xE0:
-            return 5
-        if ch >= 0xE1 and ch <= 0xEC:
-            return 6
-        if ch == 0xED:
-            return 7
-        if ch == 0xEE or ch == 0xEF:
-            return 6
-        # UTF8-4
-        if ch == 0xF0:
-            return 8
-        if ch >= 0xF1 and ch <= 0xF3:
-            return 9
-        if ch == 0xF4:
-            return 10
-        # UTF8-invalid
-        return 11
+	def UTF8IntervalNumber(ch):
+		# UTF8-1
+		if ch <= 0x7F:
+			return 0
+		# UTF8-tail
+		if ch <= 0x8F:
+			return 1
+		if ch <= 0x9F:
+			return 2
+		if ch <= 0xBF:
+			return 3
+		# UTF8-invalid
+		if ch == 0xC0 or ch == 0xC1:
+			return 11
+		# UTF8-2
+		if ch >= 0xC2 and ch <= 0xDF:
+			return 4
+		# UTF8-3
+		if ch == 0xE0:
+			return 5
+		if ch >= 0xE1 and ch <= 0xEC:
+			return 6
+		if ch == 0xED:
+			return 7
+		if ch == 0xEE or ch == 0xEF:
+			return 6
+		# UTF8-4
+		if ch == 0xF0:
+			return 8
+		if ch >= 0xF1 and ch <= 0xF3:
+			return 9
+		if ch == 0xF4:
+			return 10
+		# UTF8-invalid
+		return 11
 
-    def MakeUTF8ClassifyMask(ch):
-        mask = BytesFromLead(ch)
-        if UTF8IsTrailByte(ch):
-            mask |= 1 << 3
+	def MakeUTF8ClassifyMask(ch):
+		mask = BytesFromLead(ch)
+		if UTF8IsTrailByte(ch):
+			mask |= 1 << 3
 
-        number = UTF8IntervalNumber(ch)
-        mask |= number << 4
-        return mask
+		number = UTF8IntervalNumber(ch)
+		mask |= number << 4
+		return mask
 
-    UTF8ClassifyTable = []
-    for i in range(0, 255, 16):
-        line = ', '.join('0x%02X' % MakeUTF8ClassifyMask(ch) for ch in range(i, i + 16)) + ','
-        line += ' // %02X - %02X' % (i, i + 15)
-        UTF8ClassifyTable.append(line)
+	UTF8ClassifyTable = []
+	for i in range(0, 255, 16):
+		line = ', '.join('0x%02X' % MakeUTF8ClassifyMask(ch) for ch in range(i, i + 16)) + ','
+		line += ' // %02X - %02X' % (i, i + 15)
+		UTF8ClassifyTable.append(line)
 
-    print('UTF8ClassifyTable:', len(UTF8ClassifyTable))
-    print('\n'.join(UTF8ClassifyTable))
+	print('UTF8ClassifyTable:', len(UTF8ClassifyTable))
+	print('\n'.join(UTF8ClassifyTable))
 
 def GenerateUnicodeControlCharacters():
 	ucc_table = [
@@ -108,5 +108,5 @@ def GenerateUnicodeControlCharacters():
 		print(utf8str, 'U+%04X' % ord(ucc))
 
 if __name__ == '__main__':
-    GenerateUnicodeControlCharacters();
-    GenerateUTF8Table()
+	GenerateUnicodeControlCharacters();
+	GenerateUTF8Table()