In [69]:
import json

In [70]:
# with open('CHAR_TO_IDS_FULL_UNIFIED.json') as f:
#     char_to_ids = json.load(f)
# with open('char_to_alts.json') as f:
#     char_to_alts = json.load(f)

import json
from collections import defaultdict

# Both files hold CJK text, so the encoding is pinned to UTF-8 instead of
# relying on the platform default (which is not UTF-8 on every system).
with open('char_to_alts.json', encoding='utf-8') as f:
    # defaultdict so that looking up a character with no recorded alternates
    # yields an empty list instead of raising KeyError.
    char_to_alts = defaultdict(list, json.load(f))

with open('CHAR_TO_IDS_FULL_UNIFIED.json', encoding='utf-8') as f:
    # Keep only the first breakdown listed for each character; characters with
    # no breakdown at all keep an empty list as a placeholder.
    char_list = {
        k: (v[0] if v else [])
        for k, v in json.load(f).items()
    }

In [71]:
# Flat list of every alternate glyph, in the order the lists appear in
# char_to_alts (duplicates are kept).
alts = [alt for variants in char_to_alts.values() for alt in variants]

# Reverse index: alternate glyph -> standard character.  Built in one pass over
# char_to_alts rather than rescanning every entry for each alternate.  When an
# alternate is listed under several characters, setdefault keeps the first one
# encountered, which is the same entry the rescanning lookup selected.
alt_to_standard = {}
for standard, variants in char_to_alts.items():
    for alt in variants:
        alt_to_standard.setdefault(alt, standard)
def standardize(s, k):
    """Return ``s`` with every alternate glyph replaced by its standard form.

    Each symbol of ``s`` that is a key of ``alt_to_standard`` is swapped for
    the mapped value; every other symbol is copied through unchanged.  An
    empty ``s`` gives an empty string.

    ``k`` is accepted but not used by this function.
    """
    return ''.join(alt_to_standard.get(symbol, symbol) for symbol in s)

In [72]:
# Rebuild char_list with every breakdown passed through standardize(); the
# keys (the characters themselves) are left untouched.
char_list = dict(
    (character, standardize(ids, character))
    for character, ids in char_list.items()
)

In [73]:
# Manually sets the breakdown stored for '老'.
# NOTE(review): `words` is not assigned in any cell shown here, so this line
# depends on kernel state from elsewhere — presumably an earlier name for
# `char_list`; confirm, because a fresh-kernel run raises NameError unless an
# earlier cell defines it.
words['老'] = '⿱耂匕'

In [74]:
# Displays the breakdown stored for '馬'.
# NOTE(review): relies on `words`, which no cell shown here defines — confirm
# where it comes from.
words['馬']

'⿹⿺㇉⿻三丨灬'

In [75]:
# Number of components each layout operator takes.
comp_to_n_children = {
    '⿰': 2, '⿱': 2,
    '⿲': 3, '⿳': 3,
    '⿴': 2, '⿵': 2, '⿶': 2, '⿷': 2,
    '⿸': 2, '⿹': 2, '⿺': 2, '⿻': 2,
}

# The same operators grouped by arity, listed in the order they appear above.
compositions = {
    label: [op for op, arity in comp_to_n_children.items() if arity == n]
    for label, n in (('binary', 2), ('ternary', 3))
}

In [76]:
class Composition:
    """Operator node of a breakdown tree.

    Holds the operator symbol and the ordered nodes it combines.
    """

    def __init__(self, data):
        # Operator symbol this node stands for.
        self.glyph = data
        # Tag used to tell operator nodes from character nodes.
        self.type = 'comp'
        # Node this operator hangs under; filled in by the code that builds the tree.
        self.parent = None
        # Components in reading order; each instance gets its own list.
        self.children = []

class Char:
    """Character node of a breakdown tree.

    Holds the character itself, its own breakdown (if any) and the operator
    nodes it appears under.
    """

    def __init__(self, data):
        # The character this node stands for.
        self.glyph = data
        # Tag used to tell character nodes from operator nodes.
        self.type = 'char'
        # Operator nodes that list this character as a component; each
        # instance gets its own list.
        self.parents = []
        # Root operator of this character's own breakdown, or None if it has none.
        self.child = None

In [77]:
# Registry of every character node created so far, keyed by glyph.
chars = {}
# Unparsed remainder of the breakdown currently being consumed.
leftover = None


def recurse(curr, num_children_expected, parent):
    """Attach ``num_children_expected`` components to ``curr``.

    Symbols are consumed one at a time from the front of the module-level
    ``leftover``.  An operator symbol becomes a new ``Composition`` whose
    ``parent`` is set to the ``parent`` argument and whose own components are
    parsed recursively.  Any other symbol is looked up in (or added to) the
    module-level ``chars`` registry, and ``curr`` is recorded among that
    character's parents.

    Raises IndexError if ``leftover`` runs out before all components are read.
    """
    global leftover
    global chars
    for _slot in range(num_children_expected):
        # Take the next symbol off the front of the pending input.
        symbol, leftover = leftover[0], leftover[1:]
        takes_three = symbol in compositions['ternary']
        if takes_three or symbol in compositions['binary']:
            node = Composition(symbol)
            node.parent = parent
            curr.children.append(node)
            recurse(node, num_children_expected=3 if takes_three else 2, parent=node)
            continue
        # Plain character: reuse the shared node so the same glyph is one
        # object across all breakdowns.
        leaf = chars.get(symbol)
        if leaf is None:
            leaf = chars[symbol] = Char(symbol)
        leaf.parents.append(curr)
        curr.children.append(leaf)

def _require_complete_breakdown(input_word, breakdown):
    """Raise ValueError if ``breakdown`` ends before its expression is complete.

    Uses the same arity rules as the parser: the leading symbol opens three
    slots if it is a ternary operator and two otherwise; after that, each
    symbol fills one slot, and an operator symbol opens two or three new ones.
    """
    ternary = compositions['ternary']
    binary = compositions['binary']
    open_slots = 3 if breakdown[0] in ternary else 2
    for symbol in breakdown[1:]:
        if open_slots == 0:
            # The parser stops reading once every slot is filled, so anything
            # after this point is never looked at.
            break
        open_slots -= 1
        if symbol in ternary:
            open_slots += 3
        elif symbol in binary:
            open_slots += 2
    if open_slots:
        raise ValueError(
            f'incomplete breakdown for {input_word!r}: {breakdown!r} '
            f'is missing {open_slots} component(s)'
        )


def process(input_word, breakdown):
    """Parse ``breakdown`` and hang the resulting tree under ``input_word``.

    Args:
        input_word: the character being decomposed.
        breakdown: its decomposition in prefix notation (operator first,
            then its components).

    Returns:
        The ``Char`` node for ``input_word`` with ``child`` set to the root
        operator, or ``None`` when the breakdown is empty or a single symbol
        (nothing to decompose).

    Raises:
        ValueError: if the breakdown stops before every operator has all of
            its components.  The check runs before anything is modified, so a
            rejected breakdown leaves ``chars`` and existing nodes untouched.
    """
    global leftover
    global chars
    if len(breakdown) <= 1:
        # trivial
        return
    # Validate first: parsing mutates the shared registry as it goes, so a
    # failure half-way through would leave a partial tree behind.
    _require_complete_breakdown(input_word, breakdown)

    first = breakdown[0]
    word = chars.get(input_word)
    if word is None:
        word = chars[input_word] = Char(input_word)
    # NOTE: processing the same word again replaces word.child, but recurse()
    # only ever appends to a leaf's parents, so leaves of the earlier tree
    # keep pointing at it.
    root = Composition(first)
    root.parent = word
    word.child = root
    leftover = breakdown[1:]

    num_children_expected = 3 if first in compositions['ternary'] else 2
    recurse(root, num_children_expected=num_children_expected, parent=root)
    return word

In [78]:
def bfs(root):
    """Print the tree under ``root`` breadth-first, one node per line.

    Each line is a depth marker followed by the node's glyph: ``#`` repeated
    ``level`` times for character nodes, ``$`` repeated ``level`` times for
    operator nodes (so the root line has an empty marker).  A character node
    that has its own breakdown is expanded as well.

    ``root`` may be ``None`` (what ``process`` returns for a trivial
    breakdown); nothing is printed in that case.
    """
    # Local import: deque gives O(1) pops from the front of the queue.
    from collections import deque

    if root is None:
        return
    pending = deque([(0, root)])
    while pending:
        level, curr = pending.popleft()
        if curr.type == 'char':
            print('#' * level, curr.glyph)
            if curr.child:
                pending.append((level + 1, curr.child))
        else:
            print('$' * level, curr.glyph)
            pending.extend((level + 1, c) for c in curr.children)

In [79]:
# Smoke test: parse one ternary and one binary breakdown and print each tree.
word, breakdown = '佥', '⿳亼𭕄一'
a = process(word, breakdown)
bfs(a)

word, breakdown = '但', '⿰亻旦'
b = process(word, breakdown)
bfs(b)

 佥
$ ⿳
## 亼
## 𭕄
## 一
 但
$ ⿰
## 亻
## 旦


In [80]:
# Feed every non-empty breakdown to process(); entries it cannot parse are
# printed (character, breakdown, and both lengths) and the loop moves on.
for i, char in enumerate(char_list):
    breakdown = char_list[char]
    if breakdown:
        try:
            process(char, breakdown)
        except Exception:
            print(char, breakdown, len(char), len(breakdown))

𠀌 ⿻⿱一丨 1 4
𠁔 ⿱⿻並甹 1 4
𠂶 ⿱丿⿹⿺㇉⿱一⿰一丿 1 10
𠃛 ⿰⿷𰀄亅 1 4
𠃢 ⿻⿰丿丨⿹𠃊 1 6
𠃬 ⿻⿷己匚 1 4
𠄷 ⿱⿲𠔼 1 3
𠇇 ⿰人⿺⿻二丶 1 6
𠑻 ⿳亠儿 1 3
𠦂 ⿻十 1 2
𠨰 ⿸厂⿻乙 1 4
𠬶 ⿳⿻一冖又 1 5
𡆢 ⿴囗 1 2
𡖈 ⿱⿴丨⿴丨 1 5
𡦹 ⿱丶⿵𠘨 1 4
𡧑 ⿳宀大 1 3
𡰣 ⿱丿 1 2
𢀓 ⿻工 1 2
𢀖 ⿱工 1 2
𢋪 ⿸广⿳⿻⿰丨丨八⿲𠀉乂彐 1 12
𤔔 ⿳爪龴⿵冂⿱厶 1 7
𤕈 ⿱爪⿲呂⿱⿱丨呂 1 8
𥪐 ⿱立⿰⿺乙仌 1 6
𥪑 ⿱立⿰⿺乙牛 1 6
𥸧 米⿰丁 1 3
𦉭 ⿻儿⿹𠃊 1 4
𦱉 ⿱艸 1 2
𧩶 ⿰言 1 2
𧭹 ⿱言 1 2
𨸏 ⿱丿⿰丨⿳ 1 5
𩻝 ⿰魚 1 2
𩼁 ⿰魚 1 2
𩼧 ⿰魚 1 2
𪛄 ⿳㽞𠮥⿰𢑑⿺乙⿰丨 1 9
