In [1]:
import pickle
import collections


def flatten(x):
    if isinstance(x, collections.Iterable):
        return [a for i in x for a in flatten(i)]
    else:
        return [x]

def strip_ideographic(text):
    # Ideographic_Description_Characters = ["⿰", "⿱", "⿲", "⿳", "⿴", "⿵", "⿶", "⿷", "⿸", "⿹", "⿺", "⿻"]
    Ideographic_Description_Characters = "⿰⿱⿲⿳⿴⿵⿶⿷⿸⿹⿺⿻"
    translator = str.maketrans("", "", Ideographic_Description_Characters)
    return text.translate(translator)


def get_all_word_bukken(filename="IDS-UCS-Basic.txt"):
    bukkens = []
    words = []  # actually chars
    word_bukken = {}
    for i, line in enumerate(open(filename, "r").readlines()):
        if line[0] != "U":  # not start with U+XXXX means it is not a word
            continue
        line = line.split()
        word = line[1]
        components = line[2]
        components = strip_ideographic(components)
        bukken = []
        while ";" in components:
            bukken.append(components[:components.find(";") + 1])
            components = components[components.find(";") + 1:]
        while len(components) > 1:
            bukken.append(components[0])
            components = components[1:]
        bukken.append(components)
        words.append(word)
        word_bukken[words.index(word)] = bukken
        if len(bukken) == 1 and bukken[0] == word:
            bukkens.append(word)

    def expand_bukken(bukken):
        expanded_bukken = []
        for b in bukken:
            if b in bukkens:
                expanded_bukken.append(bukkens.index(b))
            else:
                if b in words:
                    expanded_bukken.append(expand_bukken(word_bukken[words.index(b)]))
                else:
                    bukkens.append(b)
                    expanded_bukken.append(bukkens.index(b))
        return expanded_bukken

    for i_word, i_bukken in word_bukken.items():
        b_list = expand_bukken(i_bukken)
        b_list = flatten(b_list)
        word_bukken[i_word] = b_list
    return words, bukkens, word_bukken


def get_all_character(filename="IDS-UCS-Basic.txt"):
    chars = []

    for i, line in enumerate(open(filename, "r").readlines()):
        if line[0] != "U":  # not start with U+XXXX means it is not a word
            continue
        line = line.split()
        char = line[1]
        chars.append(char)
    return chars

In [2]:
words, bukkens, word_bukken = get_all_word_bukken("IDS-UCS-Basic.txt")

In [3]:
bukkenList_word = {}
for word, bukkenList in word_bukken.items():
    bukkenList=','.join(map(str,bukkenList))
    if bukkenList not in bukkenList_word.keys():
        bukkenList_word[bukkenList] = [word]
    else:
        bukkenList_word[bukkenList].append(word)

In [4]:
similar_words = [ws for bukkenList, ws in bukkenList_word.items() if len(ws)>1]

In [5]:
for w_list in similar_words:
    print([words[x] for x in w_list])

['仌', '从']
['冈', '罓']
['勀', '勊']
['勇', '勈']
['另', '叻']
['叧', '叨']
['吅', '吕']
['呐', '呙']
['呗', '员']
['咠', '咡']
['員', '唄']
['喦', '嵒']
['夐', '敻']
['娄', '籹']
['季', '秄']
['屺', '岂']
['屻', '岃']
['屾', '岀']
['岋', '岌']
['岑', '岒']
['岝', '岞']
['岧', '岹']
['岫', '峀']
['岭', '岺']
['峆', '峇']
['峈', '峉']
['峏', '耑']
['峒', '峝']
['峗', '峞']
['峛', '峢']
['峨', '峩']
['峯', '峰']
['崐', '崑']
['崒', '崪']
['崓', '崮']
['崕', '崖']
['崘', '崙']
['崛', '崫']
['崟', '崯']
['崠', '崬']
['崳', '嵛']
['嵏', '嵕']
['嵩', '嵪']
['嵯', '嵳']
['嵷', '嵸']
['嶃', '嶄']
['嶋', '嶌']
['嶕', '嶣']
['嶚', '嶛']
['嶡', '嶥']
['嶢', '嶤']
['嶪', '嶫']
['巃', '巄']
['巖', '巗']
['巘', '巚']
['弑', '弒']
['悥', '訫']
['抛', '拋']
['抳', '抿']
['插', '揷']
['早', '旪']
['旭', '旮']
['旰', '旱']
['旻', '旼']
['昉', '昘']
['昒', '易']
['昞', '昺']
['星', '甠']
['晀', '晁']
['晃', '晄']
['晌', '晑']
['晚', '晩']
['晟', '晠']
['景', '晾']
['暈', '暉']
['暏', '暑']
['曄', '曅']
['朞', '期']
['杍', '李']
['杠', '杢']
['松', '枩']
['柤', '査']
['查', '柦']
['柰', '标']
['棗', '棘']
['毗', '毘']
['泵', '砅']
['炅', '炚']
['炎', '炏']
['烘', '烡']
['町', '甼']

In [6]:
print(len(similar_words), ' "similar" characters in totally ', len(words), 'characters in IDS-UCS-Basic.')

109  "similar" characters in totally  20902 characters in IDS-UCS-Basic.


In [8]:
import pickle
with open('words_bukkens_word_bukken.pkl', 'wb') as f:
    pickle.dump((words, bukkens, word_bukken),f)