In [1]:
from utils import *

### zi ~ part decomposition

In [2]:
# Build the zi -> part decomposition table.
# The query works in three steps (one CTE each):
#   1. parts     - the set of valid parts: active rows of t_part plus active
#                  t_zi rows flagged as_part='Y'.
#   2. zi_part   - the eleven positional columns of t_zi_part unpivoted into
#                  (part, zi) pairs, one UNION ALL branch per position.
#   3. zi_part_2 - the distinct (zi, part) pairs whose part is non-blank and
#                  appears in `parts`.
# The final SELECT returns those pairs ordered by zi, then part.
sql_stmt = """

-- unique parts
with parts as (  
    select zi from t_part where is_active = 'Y'
    union 
    select zi from t_zi where is_active = 'Y' and as_part='Y'
)
-- zi vs part
, zi_part as (
    select zi_left_up as part, zi from t_zi_part 
    union all
    select zi_left as part, zi from t_zi_part 
    union all
    select zi_left_down as part, zi from t_zi_part 
    union all
    select zi_up as part, zi from t_zi_part 
    union all
    select zi_mid as part, zi from t_zi_part 
    union all
    select zi_down as part, zi from t_zi_part 
    union all
    select zi_right_up as part, zi from t_zi_part 
    union all
    select zi_right as part, zi from t_zi_part 
    union all
    select zi_right_down as part, zi from t_zi_part 
    union all
    select zi_mid_in as part, zi from t_zi_part 
    union all
    select zi_mid_out as part, zi from t_zi_part 
)
-- unique zi,part
, zi_part_2 as (
    select distinct zp.zi, zp.part 
    from zi_part zp 
    join parts p 
        on zp.part = p.zi
    where zp.part is not null and trim(zp.part) !='' 
)
select * from zi_part_2 
order by zi,part;
"""

# NOTE(review): DBConn and pd are not imported by name in this notebook;
# presumably both arrive via `from utils import *` - confirm.
# Run the query and replace any missing values in the result with "".
with DBConn() as _conn:
    df_part = pd.read_sql(sql_stmt, _conn).fillna("")

In [3]:
# Preview the (zi, part) pairs loaded by the decomposition query.
df_part

Unnamed: 0,zi,part
0,㐬,云
1,㐬,川
2,㐱,人
3,㐱,彡
4,㑗,人
...,...,...
13940,𪙉,齿
13941,𪙊,兼
13942,𪙊,齿
13943,𪙑,齿


In [4]:
# Save the (zi, part) pairs as CSV; index=False leaves out the DataFrame index column.
df_part.to_csv("zi_decomposed-v1.csv", index=False)

### part frequency — number of distinct zi that contain each part

In [5]:
# Part frequency: for each part, count how many distinct zi contain it.
# The three CTEs (parts, zi_part, zi_part_2) are identical to the ones in the
# decomposition query; only the final SELECT differs. It groups the distinct
# (zi, part) pairs by part and sorts by count descending, then by part.
# The count column is not aliased, so it comes back named "count(zi)".
sql_stmt = """

-- unique parts
with parts as (  
    select zi from t_part where is_active = 'Y'
    union 
    select zi from t_zi where is_active = 'Y' and as_part='Y'
)
-- zi vs part
, zi_part as (
    select zi_left_up as part, zi from t_zi_part 
    union all
    select zi_left as part, zi from t_zi_part 
    union all
    select zi_left_down as part, zi from t_zi_part 
    union all
    select zi_up as part, zi from t_zi_part 
    union all
    select zi_mid as part, zi from t_zi_part 
    union all
    select zi_down as part, zi from t_zi_part 
    union all
    select zi_right_up as part, zi from t_zi_part 
    union all
    select zi_right as part, zi from t_zi_part 
    union all
    select zi_right_down as part, zi from t_zi_part 
    union all
    select zi_mid_in as part, zi from t_zi_part 
    union all
    select zi_mid_out as part, zi from t_zi_part 
)
-- unique zi,part
, zi_part_2 as (
    select distinct zp.zi, zp.part 
    from zi_part zp 
    join parts p 
        on zp.part = p.zi
    where zp.part is not null and trim(zp.part) !='' 
)
--select * from zi_part_2 order by zi,part;
-- count part frequency
select part,count(zi) from zi_part_2 
group by part -- having count(zi) > 10
order by count(zi) desc, part;
"""

# NOTE(review): DBConn and pd are not imported by name in this notebook;
# presumably both arrive via `from utils import *` - confirm.
# Run the query and replace any missing values in the result with "".
with DBConn() as _conn:
    df_part_freq = pd.read_sql(sql_stmt, _conn).fillna("")

In [7]:
# Preview the per-part counts, most frequent part first.
df_part_freq

Unnamed: 0,part,count(zi)
0,木,510
1,氵,374
2,口,334
3,纟,315
4,讠,308
...,...,...
418,闰,1
419,阜,1
420,麦,1
421,龴,1


In [8]:
# Save the per-part counts as CSV; the count column keeps its SQL name "count(zi)".
df_part_freq.to_csv("zi_part_freq.csv", index=False)

### doubles — zi made of one part repeated twice (up/down or left/right)

In [15]:
# Doubles: rows of t_zi_part in which the same non-blank part fills exactly two
# positions and the nine remaining positional columns are NULL or blank.
# Two layouts are collected and combined with UNION:
#   - stacked:      zi_up   = zi_down
#   - side by side: zi_left = zi_right
# NOTE(review): the filters treat NULL and '' as equally "empty", but the final
# ORDER BY zi_left, zi_up may sort the two differently, which would explain why
# the preview is not one single run sorted by zi_up - confirm against the data.
sql_stmt = """
with x as (
    select * from t_zi_part 
    where zi_up = zi_down 
        and (zi_up is not null and trim(zi_up) !='')
        -- others null
        and (zi_mid is null or trim(zi_mid) ='')
        and (zi_mid_in is null or trim(zi_mid_in) ='')
        and (zi_mid_out is null or trim(zi_mid_out) ='')

        and (zi_left_up is null or trim(zi_left_up) ='')
        and (zi_left is null or trim(zi_left) ='')
        and (zi_left_down is null or trim(zi_left_down) ='')

        and (zi_right_up is null or trim(zi_right_up) ='')
        and (zi_right is null or trim(zi_right) ='')
        and (zi_right_down is null or trim(zi_right_down) ='')

    union 

    select * from t_zi_part 
    where zi_left = zi_right
        and (zi_left is not null and trim(zi_left) !='')
        -- others null
        and (zi_mid is null or trim(zi_mid) ='')
        and (zi_mid_in is null or trim(zi_mid_in) ='')
        and (zi_mid_out is null or trim(zi_mid_out) ='')

        and (zi_left_up is null or trim(zi_left_up) ='')
        and (zi_up is null or trim(zi_up) ='')
        and (zi_right_up is null or trim(zi_right_up) ='')

        and (zi_left_down is null or trim(zi_left_down) ='')
        and (zi_down is null or trim(zi_down) ='')
        and (zi_right_down is null or trim(zi_right_down) ='')

)
select * from x order by zi_left, zi_up
"""

# Run the query and replace any missing values in the result with "".
with DBConn() as _conn:
    df_double = pd.read_sql(sql_stmt, _conn).fillna("")

In [16]:
# Preview the doubles; every t_zi_part column is shown because the query selects *.
df_double

Unnamed: 0,zi,u_id,zi_left_up,zi_left,zi_left_down,zi_up,zi_mid,zi_down,zi_right_up,zi_right,zi_right_down,zi_mid_out,zi_mid_in,ts,desc_cn,is_active,id_shuowen,hsk_note,desc_en
0,吕,6941,,,,口,,口,,,,,,2024-01-27 00:56:21,,Y,,from HSK,"pitchpipe, pitch standard, one of the twelve s..."
1,哥,3896,,,,可,,可,,,,,,2024-01-27 08:55:35,,Y,,from HSK,elder brother
2,多,200,,,,夕,,夕,,,,,,2024-01-15 12:50:41,,Y,,from HSK,many; much/too many
3,出,2417,,,,山,,山,,,,,,2024-01-14 15:15:41,,Y,,from HSK,to go out/to come out/to occur/to produce/to g...
4,炎,11544,,,,火,,火,,,,,,2024-01-26 23:04:27,,Y,,from HSK,flame/inflammation/-itis
5,爻,13837,,,,乂,,乂,,,,,,2024-01-25 02:10:54,,Y,,,
6,圭,13732,,,,土,,土,,,,,,2024-01-21 18:14:29,,Y,,,
7,戋,13901,,,,戈,,戈,,,,,,2024-01-28 11:09:59,,Y,,,
8,戔,13900,,,,戈,,戈,,,,,,2024-01-28 11:09:30,,Y,,,
9,羽,12309,,习,,,,,,习,,,,2024-01-17 00:11:33,,Y,,from HSK,feather/5th note in pentatonic scale


In [17]:
# Save the doubles as CSV; index=False leaves out the DataFrame index column.
df_double.to_csv("zi_doubles.csv", index=False)

### triples — zi made of one part repeated three times (one on top, two below)

In [18]:
# Triples: rows of t_zi_part in which one non-blank part appears three times -
# once on top (zi_up) and once in each lower corner (zi_left_down,
# zi_right_down), e.g. 品 or 森 - and every other position is NULL or blank.
#
# zi_down is part of the "others null" check: without it, a zi that has the
# three matching parts plus an additional bottom part would be reported as a
# pure triple. The doubles and quads queries check every unused position in
# the same way.
#
# NOTE(review): NULL and '' both count as "empty" in the filters, but
# ORDER BY zi_left, zi_up may sort them into separate groups - the ordering is
# left as it was so existing output order is preserved.
sql_stmt = """
with x as (
    select * from t_zi_part 
    where zi_up = zi_left_down and zi_up = zi_right_down
        and (zi_up is not null and trim(zi_up) !='')
        -- others null
        and (zi_mid is null or trim(zi_mid) ='')
        and (zi_mid_in is null or trim(zi_mid_in) ='')
        and (zi_mid_out is null or trim(zi_mid_out) ='')

        and (zi_left_up is null or trim(zi_left_up) ='')
        and (zi_left is null or trim(zi_left) ='')

        and (zi_right_up is null or trim(zi_right_up) ='')
        and (zi_right is null or trim(zi_right) ='')

        and (zi_down is null or trim(zi_down) ='')

)
select * from x order by zi_left, zi_up
"""

# Run the query and replace any missing values in the result with "".
with DBConn() as _conn:
    df_triple = pd.read_sql(sql_stmt, _conn).fillna("")

In [19]:
# Preview the triples; every t_zi_part column is shown because the query selects *.
df_triple

Unnamed: 0,zi,u_id,zi_left_up,zi_left,zi_left_down,zi_up,zi_mid,zi_down,zi_right_up,zi_right,zi_right_down,zi_mid_out,zi_mid_in,ts,desc_cn,is_active,id_shuowen,hsk_note,desc_en
0,众,199,,,人,人,,,,,人,,,2024-01-15 12:50:32,,Y,,from HSK,crowd; multitude/many
1,品,280,,,口,口,,,,,口,,,2024-01-25 01:35:16,,Y,,from HSK,(bound form) article; commodity
2,晶,5502,,,日,日,,,,,日,,,2024-01-23 22:31:46,,Y,,,
3,矗,2455,,,直,直,,,,,直,,,2024-01-28 13:29:42,,Y,,from HSK,lofty/upright
4,劦,13699,,,力,力,,,,,力,,,2024-01-21 12:55:56,,Y,,,
5,厽,13828,,,厶,厶,,,,,厶,,,2024-01-23 22:27:18,,Y,,,
6,叒,13866,,,又,又,,,,,又,,,2024-01-27 10:47:57,,Y,,6,
7,惢,13852,,,心,心,,,,,心,,,2024-01-25 22:16:16,,Y,,,
8,森,9077,,,木,木,,,,,木,,,2024-01-20 05:26:02,木多皃。从林从木。讀若曾參之參。,Y,3838.0,in HSK,
9,毳,13706,,,毛,毛,,,,,毛,,,2024-01-21 13:14:04,,Y,,,


In [20]:
# Save the triples as CSV; index=False leaves out the DataFrame index column.
df_triple.to_csv("zi_triples.csv", index=False)

### quads — zi made of one part repeated in all four corners

In [21]:
# Quads: rows of t_zi_part in which the same non-blank part fills all four
# corner positions (zi_left_up, zi_left_down, zi_right_up, zi_right_down) and
# the seven remaining positional columns are NULL or blank.
# NOTE(review): zi_left and zi_up are required to be empty by the filter, so
# the ORDER BY zi_left, zi_up can only separate NULL from '' values - confirm
# whether ordering by zi_left_up was intended.
sql_stmt = """
with x as (
    select * from t_zi_part 
    where zi_left_up = zi_left_down and zi_left_up = zi_right_up and zi_left_up = zi_right_down
        and (zi_left_up is not null and trim(zi_left_up) !='')
        -- others null
        and (zi_mid is null or trim(zi_mid) ='')
        and (zi_mid_in is null or trim(zi_mid_in) ='')
        and (zi_mid_out is null or trim(zi_mid_out) ='')

        and (zi_left is null or trim(zi_left) ='')
        and (zi_right is null or trim(zi_right) ='')

        and (zi_up is null or trim(zi_up) ='')
        and (zi_down is null or trim(zi_down) ='')

)
select * from x order by zi_left, zi_up
"""

# Run the query and replace any missing values in the result with "".
with DBConn() as _conn:
    df_quad = pd.read_sql(sql_stmt, _conn).fillna("")

In [22]:
# Preview the quads; every t_zi_part column is shown because the query selects *.
df_quad

Unnamed: 0,zi,u_id,zi_left_up,zi_left,zi_left_down,zi_up,zi_mid,zi_down,zi_right_up,zi_right,zi_right_down,zi_mid_out,zi_mid_in,ts,desc_cn,is_active,id_shuowen,hsk_note,desc_en
0,叕,13675,又,,又,,,,又,,又,,,2024-01-21 11:29:31,,Y,,,
1,㗊,13831,口,,口,,,,口,,口,,,2024-01-23 22:30:37,,Y,,,
2,㸚,13868,乂,,乂,,,,乂,,乂,,,2024-01-27 10:54:51,,Y,,,


In [23]:
# Save the quads as CSV; index=False leaves out the DataFrame index column.
df_quad.to_csv("zi_quads.csv", index=False)