Skip to content

Commit

Permalink
Add character classification tables for ANSI code page Central Europe…
Browse files Browse the repository at this point in the history
…an (Windows-1250), Cyrillic (Windows-1251), Western European (Windows-1252), Greek (Windows-1253), Turkish (Windows-1254), Hebrew (Windows-1255), Arabic (Windows-1256), Baltic (Windows-1257), Vietnamese (Windows-1258), and Thai (Windows-874).

Improve word selection and auto-completion (issue #36) for ANSI-encoded documents on systems with these code pages.
  • Loading branch information
zufuliu committed Jan 13, 2019
1 parent 30c3e4c commit f449b46
Show file tree
Hide file tree
Showing 10 changed files with 428 additions and 153 deletions.
1 change: 1 addition & 0 deletions scintilla/include/Scintilla.h
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,7 @@ typedef sptr_t (*SciFnDirect)(sptr_t ptr, unsigned int iMessage, uptr_t wParam,
#define SCI_GETCARETPERIOD 2075
#define SCI_SETCARETPERIOD 2076
#define SCI_SETWORDCHARS 2077
#define SCI_SETCHARCLASSESEX 2065
#define SCI_GETWORDCHARS 2646
#define SCI_BEGINUNDOACTION 2078
#define SCI_ENDUNDOACTION 2079
Expand Down
3 changes: 3 additions & 0 deletions scintilla/include/Scintilla.iface
Original file line number Diff line number Diff line change
Expand Up @@ -624,6 +624,9 @@ set void SetCaretPeriod=2076(int periodMilliseconds,)
# First sets defaults like SetCharsDefault.
set void SetWordChars=2077(, string characters)

# call CharClassify::SetCharClassesEx()
set void SetCharClassesEx=2065(int length, string characters)

# Get the set of characters making up words for when moving or selecting by word.
# Returns the number of characters
get int GetWordChars=2646(, stringresult characters)
Expand Down
142 changes: 71 additions & 71 deletions scintilla/scripts/GenerateCharTable.py
Original file line number Diff line number Diff line change
@@ -1,82 +1,82 @@
#!/usr/bin/env python3

def GenerateUTF8Table():
    """Print the UTF-8 byte classification table as rows of hex literals.

    Each byte value 0x00-0xFF is mapped to a mask built as follows:

    * bits 0-2: result of ``BytesFromLead`` (1 to 4);
    * bit 3: set when the byte is a trail byte (0x80-0xBF);
    * bits 4 and up: result of ``UTF8IntervalNumber`` (0 to 11).

    Output is written to stdout: one header line with the number of rows,
    then 16 rows of 16 masks, each row ending with a ``// XX - YY`` comment
    giving the byte range it covers.

    Returns:
        None. The table is only printed.
    """

    def BytesFromLead(leadByte):
        """Return the sequence length implied by a lead byte (1 if invalid)."""
        # Single byte or invalid
        if leadByte < 0xC2:
            return 1
        if leadByte < 0xE0:
            return 2
        if leadByte < 0xF0:
            return 3
        if leadByte < 0xF5:
            return 4
        # Characters longer than 4 bytes not possible in current UTF-8
        return 1

    def UTF8IsLeadByte(ch):
        """Return True for ASCII bytes and for lead bytes 0xC2-0xF4.

        Not used when building the table below; kept from the original script.
        """
        return ch <= 0x7F or 0xC2 <= ch <= 0xF4

    def UTF8IsTrailByte(ch):
        """Return True when ch lies in the continuation range 0x80-0xBF."""
        return 0x80 <= ch <= 0xBF

    def UTF8IntervalNumber(ch):
        """Return the interval number (0-11) assigned to byte value ch.

        NOTE(review): the trail range is split into three intervals,
        presumably so a consumer can validate the restricted second byte
        after 0xE0 / 0xED / 0xF0 / 0xF4 -- confirm against the C++ user of
        this table.
        """
        # UTF8-1
        if ch <= 0x7F:
            return 0
        # UTF8-tail
        if ch <= 0x8F:
            return 1
        if ch <= 0x9F:
            return 2
        if ch <= 0xBF:
            return 3
        # UTF8-invalid
        if ch in (0xC0, 0xC1):
            return 11
        # UTF8-2
        if ch <= 0xDF:
            return 4
        # UTF8-3
        if ch == 0xE0:
            return 5
        if ch <= 0xEC:
            return 6
        if ch == 0xED:
            return 7
        if ch <= 0xEF:
            return 6
        # UTF8-4
        if ch == 0xF0:
            return 8
        if ch <= 0xF3:
            return 9
        if ch == 0xF4:
            return 10
        # UTF8-invalid
        return 11

    def MakeUTF8ClassifyMask(ch):
        """Combine byte count, trail flag and interval number into one mask."""
        mask = BytesFromLead(ch)
        if UTF8IsTrailByte(ch):
            mask |= 1 << 3

        number = UTF8IntervalNumber(ch)
        mask |= number << 4
        return mask

    UTF8ClassifyTable = []
    # 256 byte values, 16 per row.
    for i in range(0, 256, 16):
        line = ', '.join('0x%02X' % MakeUTF8ClassifyMask(ch) for ch in range(i, i + 16)) + ','
        line += ' // %02X - %02X' % (i, i + 15)
        UTF8ClassifyTable.append(line)

    print('UTF8ClassifyTable:', len(UTF8ClassifyTable))
    print('\n'.join(UTF8ClassifyTable))

def GenerateUnicodeControlCharacters():
ucc_table = [
Expand Down Expand Up @@ -108,5 +108,5 @@ def GenerateUnicodeControlCharacters():
print(utf8str, 'U+%04X' % ord(ucc))

if __name__ == '__main__':
    # Script entry point: run both generators, in the original order.
    GenerateUnicodeControlCharacters()
    GenerateUTF8Table()

0 comments on commit f449b46

Please sign in to comment.