In [14]:
from helper import *

In [16]:
from pathlib import Path

In [18]:
import pandas as pd 

In [12]:
DROP_TABLES_FLAG = False

In [15]:
old_db_file = "../../old_db/hanzi-20160115.sqlite"

In [17]:
Path(old_db_file).exists()

True

In [21]:
with DBConn(Path(old_db_file)) as _conn:
    sql_select = "select * from wg_zi_shape order by zi"
    df = pd.read_sql(sql_select, _conn).fillna("")

In [22]:
df.head()

Unnamed: 0,id,zi,z_back,z_up_left,z_upper,z_up_right,z_left,z_center,z_right,z_low_left,z_lower,z_low_right,z_front,note,z_lowleft,z_upright,z_lowright,z_upleft
0,86,,,,,,,,,,,,,,,,,
1,266,,,,,,,,,,,,,,,,,
2,866,,,,,,,,,,,,,,,,,
3,428,㐬,,,云,,,,,,川,,,,,,,
4,841,㒸,,,丷,,,豕,,,,,,,,,,


In [23]:
df.to_csv("old_db_zi.csv", index=False)

## create tables 

In [8]:
sql_create = open("schema.sql", encoding="utf-8").read()

print(sql_create)

-- table to store zi (unicode), description, and their translations in English (en), Spanish (es)
create table if not exists t_zi (
    id INTEGER PRIMARY KEY AUTOINCREMENT,

    zi text NOT NULL,
    desc text,

    zi_en text,
    desc_en text,

    zi_es text,
    desc_es text,

    pinyin text,
    nstrokes INTEGER,

    is_radical INTEGER,
    is_zi INTEGER,
    is_traditional INTEGER,

    -- sys_cols
    is_active INTEGER,
    uid INTEGER,                    -- FK to t_user.id
    uid_created INTEGER,
    ts text,
    ts_created text,

    CONSTRAINT uk_zi UNIQUE (zi)
);


-- table to store disected zi parts
create table if not exists  t_zi_part (
    id INTEGER PRIMARY KEY AUTOINCREMENT,

    zi text NOT NULL,

    lu text,  -- left up part
    l text,  -- left part
    ld text,  -- left down part
    u text,  -- up part
    m text,  -- mid part
    d text,  -- down part
    ru text,  -- right up part
    r text,  -- right part
    rd text,  -- right down part
    mf text,  -- 

In [9]:
with DBConn() as _conn:
    db_run_sql(sql_create, _conn)

## verify tables 

In [10]:
with DBConn() as _conn:
    sql_select = f"""
        SELECT name FROM sqlite_master WHERE type='table' and name like 't_%';
    """
    df = db_run_sql(sql_select, _conn)
    tables = df["name"].to_list()
    print(f"Tables: {tables}")

Tables: ['t_zi', 't_zi_part', 't_zi_shufa', 't_zi_word', 't_zi_text', 't_zi_media', 't_user']


## drop tables 

In [13]:
if DROP_TABLES_FLAG:
    with DBConn() as _conn:
        for t in tables:
            sql_drop = f"""
                drop table IF EXISTS {t};
            """
            print(sql_drop)
            db_run_sql(sql_drop, _conn)

## Radicals

https://www.wikiwand.com/en/List_of_radicals_in_Unicode

In [1]:
radicals = """
U+2F0x	⼀	⼁	⼂	⼃	⼄	⼅	⼆	⼇	⼈	⼉	⼊	⼋	⼌	⼍	⼎	⼏
U+2F1x	⼐	⼑	⼒	⼓	⼔	⼕	⼖	⼗	⼘	⼙	⼚	⼛	⼜	⼝	⼞	⼟
U+2F2x	⼠	⼡	⼢	⼣	⼤	⼥	⼦	⼧	⼨	⼩	⼪	⼫	⼬	⼭	⼮	⼯
U+2F3x	⼰	⼱	⼲	⼳	⼴	⼵	⼶	⼷	⼸	⼹	⼺	⼻	⼼	⼽	⼾	⼿
U+2F4x	⽀	⽁	⽂	⽃	⽄	⽅	⽆	⽇	⽈	⽉	⽊	⽋	⽌	⽍	⽎	⽏
U+2F5x	⽐	⽑	⽒	⽓	⽔	⽕	⽖	⽗	⽘	⽙	⽚	⽛	⽜	⽝	⽞	⽟
U+2F6x	⽠	⽡	⽢	⽣	⽤	⽥	⽦	⽧	⽨	⽩	⽪	⽫	⽬	⽭	⽮	⽯
U+2F7x	⽰	⽱	⽲	⽳	⽴	⽵	⽶	⽷	⽸	⽹	⽺	⽻	⽼	⽽	⽾	⽿
U+2F8x	⾀	⾁	⾂	⾃	⾄	⾅	⾆	⾇	⾈	⾉	⾊	⾋	⾌	⾍	⾎	⾏
U+2F9x	⾐	⾑	⾒	⾓	⾔	⾕	⾖	⾗	⾘	⾙	⾚	⾛	⾜	⾝	⾞	⾟
U+2FAx	⾠	⾡	⾢	⾣	⾤	⾥	⾦	⾧	⾨	⾩	⾪	⾫	⾬	⾭	⾮	⾯
U+2FBx	⾰	⾱	⾲	⾳	⾴	⾵	⾶	⾷	⾸	⾹	⾺	⾻	⾼	⾽	⾾	⾿
U+2FCx	⿀	⿁	⿂	⿃	⿄	⿅	⿆	⿇	⿈	⿉	⿊	⿋	⿌	⿍	⿎	⿏
U+2FDx	⿐	⿑	⿒	⿓	⿔	⿕	
"""

In [2]:
zi = [i.strip() for i in radicals.split("\n") if i.strip()]

In [4]:
rad = []
for i in zi:
    rad += i.split("\t")[1:]

In [5]:
rad

['⼀',
 '⼁',
 '⼂',
 '⼃',
 '⼄',
 '⼅',
 '⼆',
 '⼇',
 '⼈',
 '⼉',
 '⼊',
 '⼋',
 '⼌',
 '⼍',
 '⼎',
 '⼏',
 '⼐',
 '⼑',
 '⼒',
 '⼓',
 '⼔',
 '⼕',
 '⼖',
 '⼗',
 '⼘',
 '⼙',
 '⼚',
 '⼛',
 '⼜',
 '⼝',
 '⼞',
 '⼟',
 '⼠',
 '⼡',
 '⼢',
 '⼣',
 '⼤',
 '⼥',
 '⼦',
 '⼧',
 '⼨',
 '⼩',
 '⼪',
 '⼫',
 '⼬',
 '⼭',
 '⼮',
 '⼯',
 '⼰',
 '⼱',
 '⼲',
 '⼳',
 '⼴',
 '⼵',
 '⼶',
 '⼷',
 '⼸',
 '⼹',
 '⼺',
 '⼻',
 '⼼',
 '⼽',
 '⼾',
 '⼿',
 '⽀',
 '⽁',
 '⽂',
 '⽃',
 '⽄',
 '⽅',
 '⽆',
 '⽇',
 '⽈',
 '⽉',
 '⽊',
 '⽋',
 '⽌',
 '⽍',
 '⽎',
 '⽏',
 '⽐',
 '⽑',
 '⽒',
 '⽓',
 '⽔',
 '⽕',
 '⽖',
 '⽗',
 '⽘',
 '⽙',
 '⽚',
 '⽛',
 '⽜',
 '⽝',
 '⽞',
 '⽟',
 '⽠',
 '⽡',
 '⽢',
 '⽣',
 '⽤',
 '⽥',
 '⽦',
 '⽧',
 '⽨',
 '⽩',
 '⽪',
 '⽫',
 '⽬',
 '⽭',
 '⽮',
 '⽯',
 '⽰',
 '⽱',
 '⽲',
 '⽳',
 '⽴',
 '⽵',
 '⽶',
 '⽷',
 '⽸',
 '⽹',
 '⽺',
 '⽻',
 '⽼',
 '⽽',
 '⽾',
 '⽿',
 '⾀',
 '⾁',
 '⾂',
 '⾃',
 '⾄',
 '⾅',
 '⾆',
 '⾇',
 '⾈',
 '⾉',
 '⾊',
 '⾋',
 '⾌',
 '⾍',
 '⾎',
 '⾏',
 '⾐',
 '⾑',
 '⾒',
 '⾓',
 '⾔',
 '⾕',
 '⾖',
 '⾗',
 '⾘',
 '⾙',
 '⾚',
 '⾛',
 '⾜',
 '⾝',
 '⾞',
 '⾟',
 '⾠',
 '⾡',
 '⾢',
 '⾣',
 '⾤',
 '⾥',
 '⾦'

In [6]:
data = []
for r in rad:
    data.append([r, 1, 0])

In [7]:
import pandas as pd 

In [8]:
df = pd.DataFrame(data, columns=["zi","is_radical","is_zi"])

In [9]:
df

Unnamed: 0,zi,is_radical,is_zi
0,⼀,1,0
1,⼁,1,0
2,⼂,1,0
3,⼃,1,0
4,⼄,1,0
...,...,...,...
209,⿑,1,0
210,⿒,1,0
211,⿓,1,0
212,⿔,1,0


In [10]:
df.to_csv("radicals.csv", index=False)

## convert legacy encoding GB2312 to UTF-8

In [4]:
radicals_2 = open("radicals-2.md", encoding="utf-8").read().split("\n")

In [5]:
radicals_2

['# zi (pinyin, meaning)',
 '一 (yī, one)',
 '丨 (gùn, line)',
 '丶 (zhǔ, dot)',
 '丿 (piě, slash)',
 '乙 (yǐ, second)',
 '亅 (gōu, hook)',
 '二 (èr, two)',
 '亠 (tóu, lid)',
 '人 (rén, person)',
 '儿 (ér, legs)',
 '入 (rù, enter)',
 '八 (bā, eight)',
 '冂 (jiǒng, down box)',
 '冖 (mì, cover)',
 '冫 (bīng, ice)',
 '几 (jī, table)',
 '凵 (qǔ, open box)',
 '刀 (dāo, knife)',
 '刂 (biāo, blade)',
 '力 (li, strength)',
 '十 (shí, ten)',
 '匕 (bǐ, spoon)',
 '冓 (yǒng, long)',
 '冖 (mì, cover)',
 '口 (kǒu, mouth)',
 '日 (rì, sun)',
 '月 (yuè, moon)',
 '木 (mù, tree)',
 '火 (huǒ, fire)',
 '土 (tǔ, earth)',
 '金 (jīn, metal)',
 '水 (shuǐ, water)',
 '火 (huǒ, fire)',
 '艹 (cǎo, grass)',
 '虫 (chóng, insect)',
 '彡 (shān, hair)',
 '彳 (chén, walk)',
 '亻 (rén, person)',
 '车 (chē, vehicle)',
 '衣 (yī, clothing)',
 '彐 (tiáo, garment)',
 '彡 (shān, hair)',
 '米 (mǐ, rice)',
 '禾 (hé, grain)',
 '貝 (bèi, shell)',
 '耒 (li, plow)',
 '牜 (niú, cow)',
 '手 (shǒu, hand)',
 '支 (zhī, branch)',
 '攴 (pō, strike)',
 '文 (wén, writing)',
 '卜 (bǔ, divinati

In [6]:
person_1 = "⼈"   # GB2312 encoding
person_2 = "人"   # UTF-8 encoding

In [7]:
person_1 == person_2

False

In [22]:
type(person_1), type(person_2)

(str, str)

In [23]:
ucode_1 = person_1.encode("utf-8")
# ucode_1g = person_1.encode("gb2312")

ucode_2 = person_2.encode("utf-8")
print(ucode_1, ucode_2)

UnicodeEncodeError: 'gb2312' codec can't encode character '\u2f08' in position 0: illegal multibyte sequence

Claude AI answers:
    
```
You're absolutely right. The same Chinese character can have different Unicode encodings depending on the encoding form used.

The two Unicode encodings you mentioned for the character "人" are:

b'\xe2\xbc\x88' - This is the Unicode code point U+0088 in the GB2312 encoding.

b'\xe4\xba\xba' - This is the code point U+4EBA in the UTF-8 encoding.

The reason is that Chinese has tens of thousands of characters, and early encodings like GB2312 could only encode a subset of the most commonly used characters.

UTF-8 was later created to be able to encode all Unicode characters, including the entire Chinese character set.

So the same Chinese character maps to different code points in legacy encodings like GB2312 vs the unified Unicode encodings like UTF-8.

To summarize:

```

- Same Chinese character can have different Unicode code points in different encodings
- Legacy encodings like GB2312 encode only a subset of Chinese characters
- UTF-8 encodes the full Unicode character set including all Chinese chars
- Always use UTF-8 for Chinese text to ensure consistency

The key is to convert any legacy encoding to UTF-8 to unify the encoding for consistent handling of Chinese text in programs.
    




In [10]:
gb2312_byte_string = b'\xe2\xbc\x88' 

# Decode from GB2312 to Unicode string
unicode_string = gb2312_byte_string.decode('gb2312')

# Encode the Unicode string to UTF-8
utf8_byte_string = unicode_string.encode('utf-8')

print(utf8_byte_string)

UnicodeDecodeError: 'gb2312' codec can't decode byte 0x88 in position 2: incomplete multibyte sequence

In [11]:
import chardet

data = b'\xe2\xbc\x88'
encoding = chardet.detect(data)['encoding']

print(encoding)

Windows-1252


In [12]:
encoding = chardet.detect(gb2312_byte_string)['encoding']
unicode_string = gb2312_byte_string.decode(encoding)
utf8_byte_string = unicode_string.encode('utf-8')
print(utf8_byte_string)

b'\xc3\xa2\xc2\xbc\xcb\x86'


In [14]:
print(utf8_byte_string.decode("utf-8"))

â¼ˆ


In [7]:
import codecs

def gb2312_to_utf8(string):
    return codecs.decode(string, 'gb2312').encode('utf-8')

In [21]:
import re

def replace_emoji(string):
    return re.sub(r'[\U00010000-\U0010ffff]', '', string)

# Example usage:
# gb2312_string = "⼈".encode("gb2312") # b'\xe2\xbc\x88' # 
gb2312_string = '你好，世界！'
gb2312_string = replace_emoji('你好，世界！')

utf8_string = gb2312_to_utf8(gb2312_string)
print(utf8_string)

TypeError: decoding with 'gb2312' codec failed (TypeError: a bytes-like object is required, not 'str')

In [24]:
import chardet

string = "你好，世界！"

encoding = chardet.detect(string)

print(encoding['encoding'])

TypeError: Expected object of type bytes or bytearray, got: <class 'str'>

In [10]:
import chardet

strings = ["长", "一","一","不", "上", "⼈", "人", "你好", "⼉", "儿"]

for s in strings:

    # Convert the str object to a bytes object.
    bytes_string = s.encode('utf-8')

    # Detect the encoding of the bytes object.
    encoding = chardet.detect(bytes_string)['encoding']
    
#     utf8_str  = gb2312_to_utf8(bytes_string)

    print(f"{s} : {bytes_string} : {encoding} : ")

长 : b'\xe9\x95\xbf' : Windows-1252 : 
一 : b'\xe4\xb8\x80' : Windows-1252 : 
一 : b'\xe4\xb8\x80' : Windows-1252 : 
不 : b'\xe4\xb8\x8d' : utf-8 : 
上 : b'\xe4\xb8\x8a' : Windows-1252 : 
⼈ : b'\xe2\xbc\x88' : Windows-1252 : 
人 : b'\xe4\xba\xba' : ISO-8859-1 : 
你好 : b'\xe4\xbd\xa0\xe5\xa5\xbd' : utf-8 : 
⼉ : b'\xe2\xbc\x89' : Windows-1252 : 
儿 : b'\xe5\x84\xbf' : Windows-1252 : 


UnicodeDecodeError: 'gb2312' codec can't decode byte 0x80 in position 2: incomplete multibyte sequence


"⼈" not found in uncode.org Unihan db

