### Categorize kanji

In [1]:
# imports
import pandas
import logging

from dataclasses import dataclass

# Module-level logger for this notebook.
logger = logging.getLogger(__name__)
# Only the record layout (timestamp, level, message) and the date format are
# configured here; no `level` argument is passed to basicConfig.
logging.basicConfig(format='%(asctime)s %(levelname)-4s %(message)s', datefmt='%m/%d %H:%M:%S')

# data classes
@dataclass        
class Kanji:
    """Plain data holder with an integer ``rank`` and a string ``name``.

    NOTE(review): none of the visible cells instantiate this class; the
    categorisation code works on pandas rows instead — confirm whether it is
    still needed.
    """
    rank: int  # presumably an ordering position for the kanji — TODO confirm
    name: str  # presumably the kanji character or its label — TODO confirm

In [2]:
# Load the MAIN sheet of the workbook; every later cell reads kanji from this frame.
df_kanji = pandas.read_excel(
    "1500 KANJI COMPONENTS - ver. 1.2.xlsx",
    sheet_name="MAIN",
)

# Replace the sheet's own header with the fixed names used throughout the notebook.
df_kanji.columns = [
    "CHAR",
    "COMPONENTS1",
    "COMPONENTS2",
    "COMPONENTS3",
    "COMPONENTS4",
    "COMPONENTS5",
    "ON READING",
    "KUN READING",
    "KEYWORD",
    "SRL",
    "TYPE",
    "FREQ",
    "TAGS",
]
df_kanji

Unnamed: 0,CHAR,COMPONENTS1,COMPONENTS2,COMPONENTS3,COMPONENTS4,COMPONENTS5,ON READING,KUN READING,KEYWORD,SRL,TYPE,FREQ,TAGS
0,一,,,,,,イチ、イツ,ひと・つ,one,5,STEM,10.800000,
1,二,,,,,,ニ,ふた・つ,two,5,MEAN,128.300000,
2,三,,,,,,サン,みっ・つ,three,5,MEAN,120.700000,
3,四,,,,,,シ,よっ・つ、よん、よ,four,5,MEAN,312.073333,
4,五,,,,,,ゴ,いつ・つ,five,5,MEAN,315.626667,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,気,气,㐅,,,,キ、ケ,,atmosphere,1,OTHER,64.000000,
1496,風,𠘨,䖝,,,,フウ、フ,かぜ,1 wind 2 style,1,OTHER,289.700000,
1497,乳,⺤,子,乚,,,ニュウ,ち、ちち,milk,1,OTHER,1067.000000,
1498,興,臼,同,ハ,,,コウ、キョウ,,interest,1,OTHER,649.600000,


In [3]:
# Keyword -> group mapping sheet; find_keyword() reads its KEYWORD and GROUP columns.
df_keyword = pandas.read_excel(
    "1500 KANJI COMPONENTS - ver. 1.2.xlsx",
    sheet_name="keyword.list",
)
# df_keyword

In [4]:
# Stem kanji -> group mapping sheet; find_stem() and fifth_rule() read its
# "STEM KANJI" and GROUP columns.
df_stem = pandas.read_excel(
    "1500 KANJI COMPONENTS - ver. 1.2.xlsx",
    sheet_name="stem.list",
)
# df_stem

### Algorithm

In [16]:
# Pick the kanji to run through the rules below: the first row whose CHAR is '弁'.
# (.iloc[0] raises IndexError if the character is not in the sheet.)
random_row = df_kanji.loc[df_kanji["CHAR"].eq('弁')].iloc[0]
random_row

CHAR                       弁
COMPONENTS1                厶
COMPONENTS2                廾
COMPONENTS3              NaN
COMPONENTS4              NaN
COMPONENTS5              NaN
ON READING                ベン
KUN READING              NaN
KEYWORD        clarification
SRL                        1
TYPE                  VISUAL
FREQ                  1609.5
TAGS                     NaN
Name: 61, dtype: object

In [19]:
# Fixed bucket names used by the first rule for SPECIAL / OTHER kanji.
special_grp = '78 special'
other_grp = '77 other'

# One empty list per group: keyword-sheet groups first, then stem-sheet groups,
# then the fixed buckets. A name that occurs more than once keeps the position
# of its first occurrence and still ends up with a fresh empty list.
# NOTE(review): the displayed result contains both '32 season of the year' and
# '32 seasons of the year' — the two sheets appear to spell this group
# differently; confirm against the workbook.
categorization = {}
for source in (df_keyword, df_stem):
    for grp in source["GROUP"].unique():
        categorization[grp] = []
for grp in (other_grp, special_grp, "visual"):
    categorization[grp] = []

# Kanji waiting for another kanji to be categorised, keyed by that kanji's CHAR.
queue_categorization = {}
categorization

{'1 numbers': [],
 '2 family': [],
 '3 colors': [],
 '31 cardinals': [],
 '32 season of the year': [],
 '33 parts of the day': [],
 '77 other': [],
 '4 Sunday': [],
 '5 Monday': [],
 '6 Tuesday': [],
 '7 flame': [],
 '8 Wednesday': [],
 '9 alcohol': [],
 '10 Thursday': [],
 '11 Friday': [],
 '12 Saturday': [],
 '13 human': [],
 '14 woman': [],
 '15 child': [],
 '16 ear': [],
 '17 eye': [],
 '18 to see': [],
 '19 mouth': [],
 '20 tongue': [],
 '21 talk': [],
 '22 beard': [],
 '23 heart': [],
 '24 arm': [],
 '25 leg': [],
 '26 kneel': [],
 '27 stand': [],
 '28 run': [],
 '29 route': [],
 '30 direction': [],
 '32 seasons of the year': [],
 '34 car': [],
 '35 ship': [],
 '36 rain': [],
 '37 gate': [],
 '38 roof A': [],
 '39 roof B': [],
 '40 roof C': [],
 '41 roof D': [],
 '42 temple': [],
 '43 insect': [],
 '44 dog': [],
 '45 hound': [],
 '46 sheep': [],
 '47 cow': [],
 '48 horse': [],
 '49 bird': [],
 '50 feather': [],
 '51 field': [],
 '52 mound': [],
 '53 grass': [],
 '54 thread': [],


In [21]:
# print("char: ", random_row)

# first rule (check keyword)
def find_keyword(row):
    """Return the keyword-sheet group for ``row["KEYWORD"]``.

    Looks for rows of ``df_keyword`` whose ``KEYWORD`` equals the row's
    keyword and returns the ``GROUP`` of the first one, or the string
    ``"none"`` when there is no such row.
    """
    matches = df_keyword.loc[df_keyword["KEYWORD"] == row["KEYWORD"], "GROUP"]
    return "none" if matches.empty else matches.iloc[0]

def find_stem(row):
    """Return the stem-sheet group for the kanji in ``row["CHAR"]``.

    Looks for rows of ``df_stem`` whose ``STEM KANJI`` equals the row's
    character and returns the ``GROUP`` of the first one, or the string
    ``"none"`` when the kanji is not listed as a stem.
    """
    matches = df_stem.loc[df_stem["STEM KANJI"] == row["CHAR"], "GROUP"]
    return "none" if matches.empty else matches.iloc[0]
    
def append_categorization(char, row, is_first):
    """Put ``row`` into a category and bring along the kanji queued behind it.

    Args:
        char: Key into ``categorization`` — despite the name this is the
            group name (e.g. ``"visual"``), not a kanji character.
        row: The kanji row being categorised; ``row["CHAR"]`` identifies it
            in ``queue_categorization``.
        is_first: ``True`` places the row (and its queued kanji) at the
            front of the group, ``False`` at the end.

    The rule functions file waiting kanji in ``queue_categorization`` under
    the CHAR of the kanji they depend on, so the queue has to be looked up by
    ``row["CHAR"]``. Looking it up by the group name, as this function did
    before, never matches and the queued kanji were silently dropped.

    In both modes the result is ``row`` immediately followed by its queued
    kanji in the order they were queued. The queue entry itself is left in
    place, as before.
    """
    queued = queue_categorization.get(row["CHAR"], [])
    block = [row, *queued]
    bucket = categorization[char]
    if is_first:
        # Slice assignment keeps `block` in order at the head of the list.
        bucket[:0] = block
    else:
        bucket.extend(block)

def find_cluster_1_2_components(component, row):
    """Return the other kanji that carry ``row[component]`` in slot 1 or 2.

    ``component`` is the name of the column of ``row`` to take the component
    from. The result is the subset of ``df_kanji`` whose ``COMPONENTS1`` or
    ``COMPONENTS2`` equals that value, excluding the kanji in ``row`` itself.
    """
    target = row[component]
    has_component = df_kanji[["COMPONENTS1", "COMPONENTS2"]].eq(target).any(axis=1)
    is_other_kanji = df_kanji["CHAR"].ne(row["CHAR"])
    return df_kanji[has_component & is_other_kanji]

def find_cluster_components(component, row):
    """Return the other kanji that carry ``row[component]`` in slots 2 to 5.

    ``component`` is the name of the column of ``row`` to take the component
    from. ``COMPONENTS1`` of the candidates is deliberately not inspected
    here; the result also excludes the kanji in ``row`` itself.
    """
    target = row[component]
    searched_slots = ["COMPONENTS2", "COMPONENTS3", "COMPONENTS4", "COMPONENTS5"]
    has_component = df_kanji[searched_slots].eq(target).any(axis=1)
    is_other_kanji = df_kanji["CHAR"].ne(row["CHAR"])
    return df_kanji[has_component & is_other_kanji]

def find_cluster_all_components(component, row):
    """Return the other kanji that carry ``row[component]`` in any slot.

    ``component`` is the name of the column of ``row`` to take the component
    from. All five ``COMPONENTS*`` columns of ``df_kanji`` are searched and
    the kanji in ``row`` itself is excluded from the result.
    """
    target = row[component]
    searched_slots = [
        "COMPONENTS1",
        "COMPONENTS2",
        "COMPONENTS3",
        "COMPONENTS4",
        "COMPONENTS5",
    ]
    has_component = df_kanji[searched_slots].eq(target).any(axis=1)
    is_other_kanji = df_kanji["CHAR"].ne(row["CHAR"])
    return df_kanji[has_component & is_other_kanji]

def find_onyomi(random_row, vr_cluster):
    """Queue ``random_row`` behind a cluster kanji that shares its on-reading.

    Args:
        random_row: The kanji row being classified. Its ``TYPE`` is set to
            ``"VR"`` when a match is found.
        vr_cluster: Candidate kanji (rows of the kanji frame) to compare
            readings against.

    The row's ``ON READING`` is split on ``"、"`` and compared with the
    complete ``ON READING`` cell of every candidate. When nothing matches the
    row is handed on to ``fifth_rule``. Otherwise it is appended to
    ``queue_categorization`` under the CHAR of the matching kanji; with
    several matches the one with the highest ``SRL`` is used (the first such
    row on ties).
    """
    on_reading = random_row["ON READING"]
    # An empty reading cell is loaded as NaN (a float) rather than a string;
    # treat it as "no readings" instead of failing on .split().
    vr_crowns = on_reading.split("、") if isinstance(on_reading, str) else []

    onyomi = vr_cluster[vr_cluster["ON READING"].isin(vr_crowns)]
    if onyomi.empty:
        fifth_rule(random_row)
        return

    if len(onyomi.index) > 1:
        print("kanji > 1")
        crown = onyomi[onyomi["SRL"] == onyomi["SRL"].max()].iloc[0]
    else:
        print("kanji = 1")
        crown = onyomi.iloc[0]

    random_row["TYPE"] = "VR"
    queue_categorization.setdefault(crown["CHAR"], []).append(random_row)

def seventh_rule(random_row):
    """Placeholder for the seventh rule: only prints a trace line.

    ``random_row`` is accepted but not used yet. Reached from ``sixth_rule``
    when no kanji shares the row's first or second component.
    """
    print("7. rule")
                
def sixth_rule(random_row):
    """Sixth rule: group the kanji by kanji sharing its first two components.

    Collects every other kanji that has the row's ``COMPONENTS1`` or
    ``COMPONENTS2`` in one of its own first two slots (a kanji found through
    both components is listed twice) and then:

    * no such kanji: hand the row on to ``seventh_rule``;
    * more than one entry: append the row to the ``"visual"`` category;
    * exactly one entry: set the row's ``TYPE`` to ``"VISUAL"`` and queue it
      in ``queue_categorization`` under that kanji's CHAR.
    """
    print("6. rule")
    lookalikes = pandas.concat([
        find_cluster_1_2_components(slot, random_row)
        for slot in ("COMPONENTS1", "COMPONENTS2")
    ])

    if lookalikes.empty:
        seventh_rule(random_row)
        return

    if len(lookalikes.index) > 1:
        append_categorization("visual", random_row, False)
        return

    random_row["TYPE"] = "VISUAL"
    anchor_char = lookalikes.iloc[0]["CHAR"]
    queue_categorization.setdefault(anchor_char, []).append(random_row)
                
def fifth_rule(random_row):
    """Fifth rule: categorise the kanji through a stem among its components.

    Looks up ``df_stem`` rows whose ``STEM KANJI`` equals the row's
    ``COMPONENTS1``, ``COMPONENTS2`` or ``COMPONENTS3``.

    * no stem found: hand the row on to ``sixth_rule`` (``TYPE`` untouched);
    * otherwise ``TYPE`` becomes ``"FORM"`` when ``SRL`` is 1 and ``"MEAN"``
      for any other value; with exactly one stem the row is appended to that
      stem's group, with several stems only a TODO message is printed.
    """
    print("5. rule")
    stem_kanji = df_stem["STEM KANJI"]
    is_component_stem = (
        stem_kanji.eq(random_row["COMPONENTS1"])
        | stem_kanji.eq(random_row["COMPONENTS2"])
        | stem_kanji.eq(random_row["COMPONENTS3"])
    )
    group_stem = df_stem.loc[is_component_stem, "GROUP"]

    if group_stem.empty:
        sixth_rule(random_row)
        return

    random_row["TYPE"] = "FORM" if random_row["SRL"] == 1 else "MEAN"

    if len(group_stem) == 1:
        append_categorization(group_stem.iloc[0], random_row, False)
    else:
        print("more stems TODO")
                
def fourth_rule(random_row):
    """Fourth rule: look for a reading match among kanji sharing a component.

    First gathers the other kanji that carry one of the row's components 2-5
    in their own slots 2-5. If there are any, they are passed to
    ``find_onyomi``. Otherwise the search is widened to all five components
    in all five slots; a non-empty result is passed to ``find_onyomi``, and
    an empty one currently only prints a TODO message (no further rule runs).
    """
    print("4. rule")
    lower_slots = ("COMPONENTS2", "COMPONENTS3", "COMPONENTS4", "COMPONENTS5")
    vr_cluster = pandas.concat([
        find_cluster_components(slot, random_row) for slot in lower_slots
    ])

    if not vr_cluster.empty:
        print("vr clusters")
        find_onyomi(random_row, vr_cluster)
        return

    every_slot = ("COMPONENTS1",) + lower_slots
    vr_all_cluster = pandas.concat([
        find_cluster_all_components(slot, random_row) for slot in every_slot
    ])

    if vr_all_cluster.empty:
        # Third condition of the rule is not implemented yet.
        print("4. rule - 3rd condition TODO")
    else:
        find_onyomi(random_row, vr_all_cluster)
            
# Run `random_row` through the rule chain, stopping at the first rule that applies:
#   1. its keyword is in the keyword sheet  -> group chosen by the row's TYPE
#   2. it is listed as a stem kanji         -> front of the stem's group
#   3. it is component 1 or 2 of other kanji that share one of its on-readings
#      -> tagged as a crown and queued behind the best such kanji
#   otherwise                               -> fourth_rule and the rules after it
first_rule = find_keyword(random_row)
second_rule = find_stem(random_row)

if first_rule != "none":
    print("1. rule")
    if random_row["TYPE"] == "MEAN":
        append_categorization(first_rule, random_row, False)
    elif random_row["TYPE"] == "SPECIAL":
        append_categorization(special_grp, random_row, False)
    elif random_row["TYPE"] == "OTHER":
        append_categorization(other_grp, random_row, False)
    else:
        print("ERROR: missing grp")
elif second_rule != "none":
    print("2. rule")
    if random_row["TYPE"] == "STEM":
        append_categorization(second_rule, random_row, True)
    else:
        print("ERROR: missing rule")
else:
    # Kanji that use this kanji as their first or second component.
    components = df_kanji[
        (df_kanji["COMPONENTS1"] == random_row["CHAR"])
        | (df_kanji["COMPONENTS2"] == random_row["CHAR"])
    ]
    print("components count: " + str(len(components.index)))
    if components.empty:
        fourth_rule(random_row)
    else:
        print("3. rule")
        on_reading = random_row["ON READING"]
        # An empty reading cell is loaded as NaN (a float), which has no
        # .split(); treat it as "no readings" so the chain can continue.
        vr_crowns = on_reading.split("、") if isinstance(on_reading, str) else []
        print(vr_crowns)
        onyomi = components[components["ON READING"].isin(vr_crowns)]
        if onyomi.empty:
            print("onyomi empty")
            fourth_rule(random_row)
        else:
            print("3. rule a) b)")
            if len(onyomi.index) > 1:
                print("kanji > 1")
                print(onyomi)
                # Several candidates: take the one with the highest SRL
                # (the first such row on ties).
                max_srl_kanji = onyomi[onyomi["SRL"] == onyomi["SRL"].max()].iloc[0]
            else:
                print("kanji = 1")
                max_srl_kanji = onyomi.iloc[0]

            # The column is named "TAGS"; writing to "TAG" only added a stray
            # extra entry to the row and left the real column empty.
            random_row["TAGS"] = "CROWN_TAG"
            random_row["TYPE"] = "VR"
            queue_categorization.setdefault(max_srl_kanji["CHAR"], []).append(random_row)

categorization
# queue_categorization
# find_keyword(random_row)
# find_stem(random_row)

components count: 0
4. rule
vr clusters
5. rule
6. rule


{'1 numbers': [],
 '2 family': [],
 '3 colors': [],
 '31 cardinals': [],
 '32 season of the year': [],
 '33 parts of the day': [],
 '77 other': [],
 '4 Sunday': [],
 '5 Monday': [],
 '6 Tuesday': [],
 '7 flame': [],
 '8 Wednesday': [],
 '9 alcohol': [],
 '10 Thursday': [],
 '11 Friday': [],
 '12 Saturday': [],
 '13 human': [],
 '14 woman': [],
 '15 child': [],
 '16 ear': [],
 '17 eye': [],
 '18 to see': [],
 '19 mouth': [],
 '20 tongue': [],
 '21 talk': [],
 '22 beard': [],
 '23 heart': [],
 '24 arm': [],
 '25 leg': [],
 '26 kneel': [],
 '27 stand': [],
 '28 run': [],
 '29 route': [],
 '30 direction': [],
 '32 seasons of the year': [],
 '34 car': [],
 '35 ship': [],
 '36 rain': [],
 '37 gate': [],
 '38 roof A': [],
 '39 roof B': [],
 '40 roof C': [],
 '41 roof D': [],
 '42 temple': [],
 '43 insect': [],
 '44 dog': [],
 '45 hound': [],
 '46 sheep': [],
 '47 cow': [],
 '48 horse': [],
 '49 bird': [],
 '50 feather': [],
 '51 field': [],
 '52 mound': [],
 '53 grass': [],
 '54 thread': [],


In [None]:
# df[df["COMPONENTS1"] == "c"]
# mb = mb.dropna(subset=['ON READING'])
# mb[mb["ON READING"].str.ns("フク")]