In [50]:
# Sample text mixing 1-byte (ASCII), 3-byte (CJK) and 4-byte (emoji) UTF-8
# sequences, encoded once so the cells below can walk the raw bytes.
utf8_bytes = "Hello, 世界 😊".encode(encoding="utf-8")

def get_char_length(byte):
    """Return how many bytes the UTF-8 sequence starting with ``byte`` occupies.

    The length is read from the high-order bit pattern of the lead byte:
    ``0xxxxxxx`` -> 1, ``110xxxxx`` -> 2, ``1110xxxx`` -> 3, ``11110xxx`` -> 4.

    Args:
        byte: integer value of the first byte of a sequence.

    Returns:
        The sequence length, 1 to 4.

    Raises:
        ValueError: if ``byte`` matches none of the four lead-byte patterns
            (for example a continuation byte ``10xxxxxx``).
    """
    # (right shift that isolates the prefix bits, expected prefix, length)
    lead_patterns = (
        (7, 0b0, 1),
        (5, 0b110, 2),
        (4, 0b1110, 3),
        (3, 0b11110, 4),
    )
    for shift, prefix, length in lead_patterns:
        if byte >> shift == prefix:
            return length
    raise ValueError(f"unknown byte {byte:02X}")

# Walk the encoded buffer one character at a time: the lead byte of each
# sequence says how many bytes to consume before the next character starts.
i = 0
while i < len(utf8_bytes):
    length = get_char_length(utf8_bytes[i])
    chunk = utf8_bytes[i:i + length]  # slice once, reuse for decode and hex dump
    ch = chunk.decode("utf-8")
    hex_bytes = " ".join(f"{b:02X}" for b in chunk)
    # Code points are conventionally written "U+" followed by at least four
    # hex digits (e.g. U+0048), not "+U48".
    print(f"Character {ch}; Code Point U+{ord(ch):04X}; Bytes {hex_bytes};")
    i += length
    
        

Character H; Code Point +U48; Bytes 48;
Character e; Code Point +U65; Bytes 65;
Character l; Code Point +U6C; Bytes 6C;
Character l; Code Point +U6C; Bytes 6C;
Character o; Code Point +U6F; Bytes 6F;
Character ,; Code Point +U2C; Bytes 2C;
Character  ; Code Point +U20; Bytes 20;
Character 世; Code Point +U4E16; Bytes E4 B8 96;
Character 界; Code Point +U754C; Bytes E7 95 8C;
Character  ; Code Point +U20; Bytes 20;
Character 😊; Code Point +U1F60A; Bytes F0 9F 98 8A;


In [21]:
import unicodedata

# "hello, world" spelled with look-alike non-ASCII symbols and letters.
s = "h℮ℓℓ◎, w◎ґłⅾ‼"
# Show the same text three ways: rendered, ASCII-escaped, and UTF-8 encoded.
print(s, ascii(s), bytes(s, encoding="utf-8"), sep="\n")
print()

# One line per character: the character, its general category, and its name.
for ch in s:
    category = unicodedata.category(ch)
    name = unicodedata.name(ch)
    print(ch, category, name, sep="; ")

h℮ℓℓ◎, w◎ґłⅾ‼
'h\u212e\u2113\u2113\u25ce, w\u25ce\u0491\u0142\u217e\u203c'
b'h\xe2\x84\xae\xe2\x84\x93\xe2\x84\x93\xe2\x97\x8e, w\xe2\x97\x8e\xd2\x91\xc5\x82\xe2\x85\xbe\xe2\x80\xbc'

h; Ll; LATIN SMALL LETTER H
℮; So; ESTIMATED SYMBOL
ℓ; Ll; SCRIPT SMALL L
ℓ; Ll; SCRIPT SMALL L
◎; So; BULLSEYE
,; Po; COMMA
 ; Zs; SPACE
w; Ll; LATIN SMALL LETTER W
◎; So; BULLSEYE
ґ; Ll; CYRILLIC SMALL LETTER GHE WITH UPTURN
ł; Ll; LATIN SMALL LETTER L WITH STROKE
ⅾ; Nl; SMALL ROMAN NUMERAL FIVE HUNDRED
‼; Po; DOUBLE EXCLAMATION MARK


In [22]:
# Encode the sample string as UTF-8 and inspect the resulting bytes object.
b = s.encode(encoding="utf-8")
print(b, type(b), "len=", len(b))
# Decoding with the same codec gives the text back.
print(str(b, encoding="utf-8"))

b'h\xe2\x84\xae\xe2\x84\x93\xe2\x84\x93\xe2\x97\x8e, w\xe2\x97\x8e\xd2\x91\xc5\x82\xe2\x85\xbe\xe2\x80\xbc' <class 'bytes'> len= 29
h℮ℓℓ◎, w◎ґłⅾ‼


## Normalize Unicode

In [None]:
# Two spellings that render the same: s1 uses the single code point U+00F1,
# s2 uses a plain "n" followed by the combining tilde U+0303.
s1 = "Spicy Jalape\u00f1o"
s2 = "Spicy Jalapen\u0303o"
print("str: ", s1, s2)
print("ascii: ", *map(ascii, (s1, s2)))
# Compared code point by code point, the two strings differ.
print("compare: ", s1 == s2)
print("len: ", *map(len, (s1, s2)))


str:  Spicy Jalapeño Spicy Jalapeño
ascii:  'Spicy Jalape\xf1o' 'Spicy Jalapen\u0303o'
compare:  False
len:  14 15


In [26]:
import unicodedata

# NFC (canonical composition): bring both spellings to the composed form.
t1, t2 = (unicodedata.normalize("NFC", text) for text in (s1, s2))
print("str: ", t1, t2)
print("ascii: ", *map(ascii, (t1, t2)))
print("compare: ", t1 == t2)
print("len: ", *map(len, (t1, t2)))


str:  Spicy Jalapeño Spicy Jalapeño
ascii:  'Spicy Jalape\xf1o' 'Spicy Jalape\xf1o'
compare:  True
len:  14 14


In [27]:
import unicodedata

# NFD (canonical decomposition): bring both spellings to the decomposed form.
t3, t4 = (unicodedata.normalize("NFD", text) for text in (s1, s2))
print("str: ", t3, t4)
print("ascii: ", *map(ascii, (t3, t4)))
print("compare: ", t3 == t4)
print("len: ", *map(len, (t3, t4)))

str:  Spicy Jalapeño Spicy Jalapeño
ascii:  'Spicy Jalapen\u0303o' 'Spicy Jalapen\u0303o'
compare:  True
len:  15 15


In [28]:
import unicodedata

# U+FB01 is a single "fi" ligature code point.
s = "\ufb01"
print(s)
# Canonical forms first, then the compatibility (K) forms.
print(*(unicodedata.normalize(form, s) for form in ("NFC", "NFD")))
print(*(unicodedata.normalize(form, s) for form in ("NFKD", "NFKC")))

ﬁ
ﬁ ﬁ
fi fi


In [None]:
import unicodedata

def remove_diacritics(text):
    """Return ``text`` with combining marks (accents, tildes, ...) removed.

    The text is decomposed with NFD first so that precomposed characters such
    as U+00F1 are split into a base letter plus a combining mark. Filtering on
    ``unicodedata.combining`` alone only works on already-decomposed input.

    Args:
        text: the string to strip.

    Returns:
        A new string containing only the characters whose combining class is 0.
    """
    decomposed = unicodedata.normalize("NFD", text)
    return "".join(c for c in decomposed if not unicodedata.combining(c))


# remove diacritical marks
print(remove_diacritics("Spicy Jalapen\u0303o"))

Spicy Jalapeno


## Unicode + RegEx

In [32]:
import re

# Raw string: "\d" is not a valid escape in a normal Python string literal
# (it triggers an invalid-escape warning), so hand the backslash to the regex
# engine untouched.
pat = re.compile(r"\d+")
print(pat.match("123"))
# For str patterns, \d matches any Unicode decimal digit, not only 0-9.
print(pat.match("\u0661\u0662\u0663"))  # ١٢٣

<re.Match object; span=(0, 3), match='123'>
<re.Match object; span=(0, 3), match='١٢٣'>


In [None]:
import re

# One character class built from three code point ranges; adjacent string
# literals are concatenated into a single pattern.
arabic = re.compile(
    "["
    "\u0600-\u06ff"  # Arabic
    "\u0750-\u077f"  # Arabic Supplement
    "\u08a0-\u08ff"  # Arabic Extended-A
    "]+"
)
print(arabic.match("\u0661\u0662\u0663"))

<re.Match object; span=(0, 3), match='١٢٣'>


In [39]:
pat = re.compile("stra\u00dfe", re.IGNORECASE)
s = "straße"
print(pat.match(s))
# str.upper() expands "ß" to "SS", so the upper-cased text is "STRASSE".
# IGNORECASE does not apply that one-to-many mapping, hence no match here.
print(pat.match(s.upper()))

<re.Match object; span=(0, 6), match='straße'>
None
