Update 100 most commonly used parts or simple Zis

In [13]:
import pandas as pd 
from utils import *



In [2]:
def create_df_from_list(data_list):
    # Create DataFrame directly from the list of dictionaries
    df = pd.DataFrame(data_list)
    
    # Optional: Convert is_zi to boolean type
#     df['is_zi'] = df['is_zi'].astype(bool)
    
    return df

def merge_zi_dataframes(df1, df2):
    """
    Merge two dataframes on 'zi' column using left merge
    
    Parameters:
    df1: First DataFrame with columns ['zi', 'is_zi']
    df2: Second DataFrame with columns ['zi', 'pinyin']
    
    Returns:
    Merged DataFrame with columns ['zi', 'is_zi', 'pinyin']
    """
    # Perform left merge on 'zi' column
    merged_df = pd.merge(
        left=df1,
        right=df2,
        on='zi',
        how='left'
    ).fillna("")
    
    return merged_df

In [3]:
DEBUG = False  # True

In [4]:
file_xlsx = r"elemental_zi_v3.xlsx"

In [5]:
df = pd.read_excel(file_xlsx, sheet_name="elemental_zi_v3").fillna("") 

In [6]:
df

Unnamed: 0,zi,is_zi,id_kangxi,meaning,pinyin,n_strokes,term,examples,variant
0,丨,0,2.0,line,gùn / gǔn,1,,十、中、串、丰,
1,丶,0,3.0,dot,zhǔ,1,丶字旁,丸、凡、丹、户,
2,丿,0,,slash,piě,1,,乂、乃、久、八,
3,乀,0,,slash,piě,1,,乂、乃、久、八,
4,乁,0,,slash,piě,1,,,
...,...,...,...,...,...,...,...,...,...
417,黄,1,,yellow,huáng,12,,,黃
418,黑,1,203.0,black,hēi,12,,墨、黓、黔,
419,鼓,1,207.0,drum,gǔ,13,,鼕、鼖、鼗,鼔
420,鼻,1,209.0,nose,bí,14,,鼽、鼾、齁,


In [7]:
with DBConn() as _conn:
    df.to_sql("t_elemental_zi", _conn, if_exists="replace", index=False)

In [8]:
elements = df["zi"].to_list()

In [9]:
len(elements) , elements[:10]

(422, ['丨', '丶', '丿', '乀', '乁', '乚', '乛', '亅', '一', '乙'])

### Frequency

In [10]:
data_freq = []
with DBConn() as _conn:
    for x in elements:
        
        if x not in ['日', '乙', '口', '目'] and DEBUG: continue 

        sql_1 = f""" 
with hot1 as (
	select 
		  case when zi_left_up = '{x}' then 1 else 0 end as n_left_up
		, case when zi_left = '{x}' then 1 else 0 end as n_left
		, case when zi_left_down = '{x}' then 1 else 0 end as n_left_down
		, case when zi_up = '{x}' then 1 else 0 end as n_up
		, case when zi_mid = '{x}' then 1 else 0 end as n_mid
		, case when zi_down = '{x}' then 1 else 0 end as n_down
		, case when zi_right_up = '{x}' then 1 else 0 end as n_right_up
		, case when zi_right = '{x}' then 1 else 0 end as n_right
		, case when zi_right_down = '{x}' then 1 else 0 end as n_right_down
		, case when zi_mid_in = '{x}' then 1 else 0 end as n_mid_in
		, case when zi_mid_out = '{x}' then 1 else 0 end as n_mid_out
	from t_zi_part 
)
, sum1 as (
	select 
		sum(n_left_up) as n1, sum(n_left) as n2, sum(n_left_down) as n3,
		sum(n_up) as n4, sum(n_mid) as n5, sum(n_down) as n6,
		sum(n_right_up) as n7, sum(n_right) as n8, sum(n_right_down) as n9,
		sum(n_mid_in) as n10, sum(n_mid_out) as n11
	from hot1
)
select '{x}' as zi, (n1+n2+n3+n4+n5+n6+n7+n8+n9+n10+n11) as n_frequency
from sum1
;        
        
        """
        df_1 = pd.read_sql(sql_1, _conn)
        r = df_1.to_dict("records")
        print(r)
        data_freq += r
        
#         n_frequency = r[0].get('freq', 0)
#         sql_stmt = f"""
#             update t_elemental_zi 
#             set n_frequency = '{n_frequency}'
#                 where zi='{x}' and n_frequency is null 
#             ;
#         """
#         print(sql_stmt)
#         db_run_sql(sql_stmt, _conn, debug=False)
            


[{'zi': '丨', 'n_frequency': 25}]
[{'zi': '丶', 'n_frequency': 26}]
[{'zi': '丿', 'n_frequency': 40}]
[{'zi': '乀', 'n_frequency': 3}]
[{'zi': '乁', 'n_frequency': 1}]
[{'zi': '乚', 'n_frequency': 9}]
[{'zi': '乛', 'n_frequency': 3}]
[{'zi': '亅', 'n_frequency': 1}]
[{'zi': '一', 'n_frequency': 73}]
[{'zi': '乙', 'n_frequency': 11}]
[{'zi': '⺀', 'n_frequency': 0}]
[{'zi': '丷', 'n_frequency': 26}]
[{'zi': '乂', 'n_frequency': 23}]
[{'zi': '龴', 'n_frequency': 2}]
[{'zi': '亠', 'n_frequency': 27}]
[{'zi': '亻', 'n_frequency': 213}]
[{'zi': '冂', 'n_frequency': 17}]
[{'zi': '冖', 'n_frequency': 35}]
[{'zi': '冫', 'n_frequency': 29}]
[{'zi': '凵', 'n_frequency': 7}]
[{'zi': '⺈', 'n_frequency': 0}]
[{'zi': '刂', 'n_frequency': 45}]
[{'zi': '勹', 'n_frequency': 14}]
[{'zi': '匚', 'n_frequency': 14}]
[{'zi': '匸', 'n_frequency': 5}]
[{'zi': '卩', 'n_frequency': 8}]
[{'zi': '㔾', 'n_frequency': 6}]
[{'zi': '⺁', 'n_frequency': 0}]
[{'zi': '厶', 'n_frequency': 25}]
[{'zi': '讠', 'n_frequency': 102}]
[{'zi': '丁', 'n_frequ

[{'zi': '电', 'n_frequency': 3}]
[{'zi': '白', 'n_frequency': 32}]
[{'zi': '皮', 'n_frequency': 17}]
[{'zi': '皿', 'n_frequency': 30}]
[{'zi': '目', 'n_frequency': 79}]
[{'zi': '矛', 'n_frequency': 8}]
[{'zi': '矢', 'n_frequency': 16}]
[{'zi': '石', 'n_frequency': 71}]
[{'zi': '示', 'n_frequency': 13}]
[{'zi': '禾', 'n_frequency': 78}]
[{'zi': '穴', 'n_frequency': 41}]
[{'zi': '立', 'n_frequency': 34}]
[{'zi': '鸟', 'n_frequency': 62}]
[{'zi': '龙', 'n_frequency': 10}]
[{'zi': '囟', 'n_frequency': 2}]
[{'zi': '尧', 'n_frequency': 9}]
[{'zi': '屰', 'n_frequency': 3}]
[{'zi': '⺮', 'n_frequency': 108}]
[{'zi': '⺶', 'n_frequency': 0}]
[{'zi': '⺷', 'n_frequency': 1}]
[{'zi': '聿', 'n_frequency': 5}]
[{'zi': '艮', 'n_frequency': 19}]
[{'zi': '虍', 'n_frequency': 12}]
[{'zi': '覀', 'n_frequency': 7}]
[{'zi': '交', 'n_frequency': 19}]
[{'zi': '共', 'n_frequency': 15}]
[{'zi': '各', 'n_frequency': 20}]
[{'zi': '合', 'n_frequency': 18}]
[{'zi': '吉', 'n_frequency': 14}]
[{'zi': '向', 'n_frequency': 3}]
[{'zi': '吕', 'n_fre

In [11]:
data_freq

[{'zi': '丨', 'n_frequency': 25},
 {'zi': '丶', 'n_frequency': 26},
 {'zi': '丿', 'n_frequency': 40},
 {'zi': '乀', 'n_frequency': 3},
 {'zi': '乁', 'n_frequency': 1},
 {'zi': '乚', 'n_frequency': 9},
 {'zi': '乛', 'n_frequency': 3},
 {'zi': '亅', 'n_frequency': 1},
 {'zi': '一', 'n_frequency': 73},
 {'zi': '乙', 'n_frequency': 11},
 {'zi': '⺀', 'n_frequency': 0},
 {'zi': '丷', 'n_frequency': 26},
 {'zi': '乂', 'n_frequency': 23},
 {'zi': '龴', 'n_frequency': 2},
 {'zi': '亠', 'n_frequency': 27},
 {'zi': '亻', 'n_frequency': 213},
 {'zi': '冂', 'n_frequency': 17},
 {'zi': '冖', 'n_frequency': 35},
 {'zi': '冫', 'n_frequency': 29},
 {'zi': '凵', 'n_frequency': 7},
 {'zi': '⺈', 'n_frequency': 0},
 {'zi': '刂', 'n_frequency': 45},
 {'zi': '勹', 'n_frequency': 14},
 {'zi': '匚', 'n_frequency': 14},
 {'zi': '匸', 'n_frequency': 5},
 {'zi': '卩', 'n_frequency': 8},
 {'zi': '㔾', 'n_frequency': 6},
 {'zi': '⺁', 'n_frequency': 0},
 {'zi': '厶', 'n_frequency': 25},
 {'zi': '讠', 'n_frequency': 102},
 {'zi': '丁', 'n_frequ

In [12]:
df_freq = create_df_from_list(data_freq)

In [13]:
df_freq

Unnamed: 0,zi,n_frequency
0,丨,25
1,丶,26
2,丿,40
3,乀,3
4,乁,1
...,...,...
417,黄,0
418,黑,23
419,鼓,3
420,鼻,3


In [14]:
df.head()

Unnamed: 0,zi,is_zi,id_kangxi,meaning,pinyin,n_strokes,term,examples,variant
0,丨,0,2.0,line,gùn / gǔn,1,,十、中、串、丰,
1,丶,0,3.0,dot,zhǔ,1,丶字旁,丸、凡、丹、户,
2,丿,0,,slash,piě,1,,乂、乃、久、八,
3,乀,0,,slash,piě,1,,乂、乃、久、八,
4,乁,0,,slash,piě,1,,,


In [15]:
df = merge_zi_dataframes(df, df_freq)
df.head(10)

Unnamed: 0,zi,is_zi,id_kangxi,meaning,pinyin,n_strokes,term,examples,variant,n_frequency
0,丨,0,2.0,line,gùn / gǔn,1,,十、中、串、丰,,25
1,丶,0,3.0,dot,zhǔ,1,丶字旁,丸、凡、丹、户,,26
2,丿,0,,slash,piě,1,,乂、乃、久、八,,40
3,乀,0,,slash,piě,1,,乂、乃、久、八,,3
4,乁,0,,slash,piě,1,,,,1
5,乚,0,,second,yǐ,1,,九、乞、也,,9
6,乛,0,,second,yǐ,1,,九、乞、也,,3
7,亅,0,6.0,hook,gōu / jué,1,,了、矛、事,,1
8,一,1,1.0,one,yī,1,一字旁,王、丁、七、三,丶 丨 亅 丿 乀 ⺄ 乁 乙 乚 乛,73
9,乙,1,,second,yǐ,1,,九、乞、也,,11


In [16]:
with DBConn() as _conn:
    df.to_sql("t_ele_zi", _conn, if_exists="replace", index=False) 

### merge category

In [17]:
with DBConn() as _conn:
    sql_2 = f""" 
        select 
            p.zi, p.category, p.sub_category
        from t_part p
        where trim(p.category || '') != ''
;        
        
    """
    df_2 = pd.read_sql(sql_2, _conn).fillna("")

In [18]:
df_2

Unnamed: 0,zi,category,sub_category
0,⺌,概念-,
1,⺍,概念-,
2,氺,天文-水,
3,𤴔,人-生理,
4,罓,社会-,tool
...,...,...,...
385,⺁,radical,
386,龺,radical,
387,毋,概念-,
388,⺪,人-生理,


In [19]:
df = merge_zi_dataframes(df, df_2)
df.head(10)

Unnamed: 0,zi,is_zi,id_kangxi,meaning,pinyin,n_strokes,term,examples,variant,n_frequency,category,sub_category
0,丨,0,2.0,line,gùn / gǔn,1,,十、中、串、丰,,25,radical,
1,丶,0,3.0,dot,zhǔ,1,丶字旁,丸、凡、丹、户,,26,radical,
2,丿,0,,slash,piě,1,,乂、乃、久、八,,40,radical,
3,乀,0,,slash,piě,1,,乂、乃、久、八,,3,radical,
4,乁,0,,slash,piě,1,,,,1,radical,
5,乚,0,,second,yǐ,1,,九、乞、也,,9,radical,
6,乛,0,,second,yǐ,1,,九、乞、也,,3,radical,
7,亅,0,6.0,hook,gōu / jué,1,,了、矛、事,,1,radical,
8,一,1,1.0,one,yī,1,一字旁,王、丁、七、三,丶 丨 亅 丿 乀 ⺄ 乁 乙 乚 乛,73,数理-计算,number
9,乙,1,,second,yǐ,1,,九、乞、也,,11,数理-天干,


In [20]:
df.to_csv("elemental_zi_v4.csv", index=False)

In [21]:
with DBConn() as _conn:
    df["notes"] = ""
    df.to_sql("t_ele_zi", _conn, if_exists="replace", index=False)

#### convert id_kangxi to int, trim(meaning)

In [22]:
with DBConn() as _conn:
    sql_stmt = f"""
        update t_ele_zi 
        set id_kangxi = cast(id_kangxi as INT),
            meaning = trim(meaning),
            n_frequency = cast(n_frequency as INT)
        ;
    """
    db_run_sql(sql_stmt, _conn, debug=False)     

In [14]:
with DBConn() as _conn:
    sql_2 = f""" 
        select 
            n_strokes, zi
        from t_ele_zi
        order by n_strokes, zi
;        
        
    """
    df_2 = pd.read_sql(sql_2, _conn).fillna("")

In [15]:
df_2

Unnamed: 0,n_strokes,zi
0,1,一
1,1,丨
2,1,丶
3,1,丿
4,1,乀
...,...,...
417,12,黄
418,12,黍
419,12,黑
420,13,鼓


### get zi_part columns

In [10]:
s = """
		  case when zi_left_up = '{x}' then 1 else 0 end as n_left_up
		, case when zi_left = '{x}' then 1 else 0 end as n_left
		, case when zi_left_down = '{x}' then 1 else 0 end as n_left_down
		, case when zi_up = '{x}' then 1 else 0 end as n_up
		, case when zi_mid = '{x}' then 1 else 0 end as n_mid
		, case when zi_down = '{x}' then 1 else 0 end as n_down
		, case when zi_right_up = '{x}' then 1 else 0 end as n_right_up
		, case when zi_right = '{x}' then 1 else 0 end as n_right
		, case when zi_right_down = '{x}' then 1 else 0 end as n_right_down
		, case when zi_mid_in = '{x}' then 1 else 0 end as n_mid_in
		, case when zi_mid_out = '{x}' then 1 else 0 end as n_mid_out

"""

cols = []
for i in s.split("\n"):
#     print(i)
    if not i.strip(): continue
    j = i.split("=")
#     print(j[0])
    x = j[0].replace(",", "").replace("case when", "").strip()
    cols.append(x)

cols

['zi_left_up',
 'zi_left',
 'zi_left_down',
 'zi_up',
 'zi_mid',
 'zi_down',
 'zi_right_up',
 'zi_right',
 'zi_right_down',
 'zi_mid_in',
 'zi_mid_out']

In [12]:
" is not null or  ".join(cols)

'zi_left_up is not null or  zi_left is not null or  zi_left_down is not null or  zi_up is not null or  zi_mid is not null or  zi_down is not null or  zi_right_up is not null or  zi_right is not null or  zi_right_down is not null or  zi_mid_in is not null or  zi_mid_out'