In [4]:
import json
from collections import defaultdict

In [5]:
# Both inputs are JSON files containing CJK text. Decode them explicitly as
# UTF-8 so loading does not depend on the platform's default locale encoding.
with open('CHAR_TO_IDS_FULL_UNIFIED.json', encoding='utf-8') as f:
    char_to_ids = json.load(f)
# Wrap in a defaultdict so that looking up a key absent from the file yields
# an empty list instead of raising KeyError.
with open('char_to_alts.json', encoding='utf-8') as f:
    char_to_alts = defaultdict(list, json.load(f))

In [55]:
# Hand-written IDS entries that are assigned directly.
char_to_ids.update({
    '䆗': ['⿱穴叫'],
    '㵝': ['⿰氵裔'],
    '䨴': ['⿱雨對'],
})

# Additional (character, IDS string) pairs. Keeping each pair on one line
# makes the character/decomposition correspondence explicit; the two
# index-aligned lists consumed further down are derived from this table.
_more_pairs = [
    ('㕉', '⿸厂合'),
    ('㛂', '⿰女𠂬'),
    ('㛹', '⿰女便'),
    ('㝗', '⿱宀良'),
    ('㝩', '⿱宀康'),
    ('㟅', '⿰山兇'),
    ('㩻', '⿰危支'),
    ('㯓', '⿰木翕'),
    ('㱥', '⿰歹夌'),
    ('㲪', '⿰登毛'),
    ('㺄', '⿰犭俞'),
    ('㺒', '⿰犭翏'),
    ('䑃', '⿰月蒙'),
    ('䔿', '⿱艹尊'),
    ('䗊', '⿰虫炎'),
    ('䚧', '⿰角翏'),
    ('䥈', '⿰釒莽'),
    ('䧢', '⿰阝區'),
    ('䧱', '⿰九隹'),
    ('䱭', '⿰魚恆'),
    ('䲛', '⿰魚瞢'),
    ('䴀', '⿰童鳥'),
    ('䴊', '⿰義鳥'),
    ('䴌', '⿰鳥蒙'),
    ('𣶏', '⿰氵疌'),
]
more_char = [glyph for glyph, _ in _more_pairs]
more_comp = [ids for _, ids in _more_pairs]

In [56]:
# Register every extra character with its single hand-written IDS string.
# more_char and more_comp are index-aligned, so zip pairs them up.
for a, b in zip(more_char, more_comp):
    char_to_ids.update({a: [b]})

In [57]:
def _build_alt_to_standard(alts_by_char):
    """Invert {standard char: [alternative glyphs]} into {alternative: standard}.

    When an alternative glyph is listed under several standard characters,
    the first one in iteration order wins (setdefault never overwrites).
    This is a single pass over the data instead of rescanning the whole
    mapping once per alternative glyph.
    """
    inverted = {}
    for standard, variants in alts_by_char.items():
        for variant in variants:
            inverted.setdefault(variant, standard)
    return inverted


# Every alternative glyph, flattened in char_to_alts iteration order.
# A nested comprehension avoids the quadratic list copying of sum(..., []).
alts = [alt for variants in char_to_alts.values() for alt in variants]
alt_to_standard = _build_alt_to_standard(char_to_alts)
def standardize(s, k):
    """Return `s` with each alternative glyph replaced by its standard character.

    Every character of `s` is looked up in the module-level `alt_to_standard`
    mapping; characters that are not keys of that mapping are kept as they are.

    Args:
        s: the string to rewrite (an IDS string at the call site in this file).
        k: accepted for the caller's convenience but not used by the function.

    Returns:
        The rewritten string.
    """
    return ''.join(alt_to_standard.get(glyph, glyph) for glyph in s)

In [58]:
# Rewrite every IDS string so that alternative glyphs are replaced by their
# standard characters; the mapping keys and the per-character list order are
# left as they were.
char_to_ids = dict(
    (char, [standardize(ids, char) for ids in ids_list])
    for char, ids_list in char_to_ids.items()
)

In [59]:
# Number of operands taken by each composition operator that can appear in
# an IDS string. Only the two three-part operators take 3; the rest take 2.
_TERNARY_COMPS = '⿲⿳'
comp_to_n_children = {
    comp: 3 if comp in _TERNARY_COMPS else 2
    for comp in '⿰⿱⿲⿳⿴⿵⿶⿷⿸⿹⿺⿻'
}

In [60]:
class Node:
    """A single node of a parsed IDS tree.

    Attributes:
        glyph: the character this node stands for.
        type_: a tag supplied by the creator; the code in this file passes
            'comp' for composition operators and 'char' for everything else.
        parent: the owning node, or None until a caller assigns one.
        children: child nodes in the order they were appended.
    """

    def __init__(self, glyph, type_):
        self.glyph, self.type_ = glyph, type_
        self.parent, self.children = None, []

    def __repr__(self):
        return "<Node '{}' with {} children>".format(self.glyph, len(self.children))

In [61]:
def recurse(parent, n_children, leftover):
    """Attach the next `n_children` sub-trees read from `leftover` to `parent`.

    `leftover` is the not-yet-consumed tail of an IDS string. Each child is
    either a plain character (a leaf) or a composition operator listed in
    `comp_to_n_children`, in which case its own operands are parsed
    recursively before the next sibling is read.

    Args:
        parent: Node that receives the children.
        n_children: how many children to read for `parent`.
        leftover: remaining characters of the IDS string.

    Returns:
        The characters still unconsumed once all children of `parent` are
        built. Strings are immutable, so the remainder has to be handed back
        explicitly; without this, a parent would re-read the characters that
        a nested operator had already consumed and build a wrong tree.

    Raises:
        IndexError: if `leftover` runs out before all children are read.
    """
    for _ in range(n_children):
        curr, leftover = leftover[0], leftover[1:]
        type_ = 'comp' if curr in comp_to_n_children else 'char'
        curr_node = Node(curr, type_)
        curr_node.parent = parent
        parent.children.append(curr_node)
        if type_ == 'comp':
            # Continue after whatever the nested operator consumed.
            leftover = recurse(curr_node, comp_to_n_children[curr], leftover=leftover)
    return leftover
            
# char = '夒'
# parses = set(char_to_ids[char])
# for parse in parses:
#     if not parse:
#         continue
#     n_children = comp_to_n_children[parse[0]]
#     leftover = (c for c in parse[1:])
#     root = Node(parse[0], 'comp')
#     recurse(parent=root, n_children=n_children)

In [62]:
# Parse every IDS string of every character into a Node tree.
# ParseDict maps each character to a list of (ids_string, root_node) tuples,
# one per distinct IDS string that parsed without error.
ParseDict = {}

for char, parses in char_to_ids.items():
    ParseDict[char] = []
    # dict.fromkeys removes duplicate strings while keeping their list order.
    # Iterating a set() of strings has a run-dependent order under hash
    # randomization, which made the stored lists differ between runs.
    for parse_string in dict.fromkeys(parses):
        try:
            # KeyError: the string does not start with a composition operator.
            # IndexError: the string is empty or ends before every operator
            # has received all of its operands.
            n_children = comp_to_n_children[parse_string[0]]
            leftover = parse_string[1:]
            root = Node(parse_string[0], 'comp')
            recurse(parent=root, n_children=n_children, leftover=leftover)
        except (IndexError, KeyError):
            # Catch only the failures malformed input produces, so that
            # KeyboardInterrupt and genuine bugs are no longer swallowed.
            print('malformed input at:', char, parse_string, parses)
        else:
            ParseDict[char].append((parse_string, root))

malformed input at: 𠄏 了 ['了']
malformed input at: 𠄔 予 ['予']
malformed input at: 𠄷 ⿱⿲𠔼 ['⿱⿲𠔼']
malformed input at: 𠑻 ⿳亠儿 ['⿳亠儿']
malformed input at: 𠦂 ⿻十 ['⿻十']
malformed input at: 𠨰 ⿸厂⿻乚 ['⿸厂⿻乚']
malformed input at: 𡆢 ⿴囗 ['⿴囗']
malformed input at: 𡖈 ⿱⿴丨⿴丨 ['⿱⿴丨⿴丨']
malformed input at: 𡦹 ⿱丶⿵𠘨 ['⿱丶⿵𠘨']
malformed input at: 𡧑 ⿳宀大 ['⿳宀大']
malformed input at: 𡰣 ⿱丿 ['⿱丿']
malformed input at: 𡿦 巛 ['巛']
malformed input at: 𢀑 ⿻丅 ['⿷工𠃍', '⿻丅']
malformed input at: 𢀓 ⿻工 ['⿻工']
malformed input at: 𢀖 ⿱工 ['⿱工']
malformed input at: 𣥄 正 ['正']
malformed input at: 𤔔 ⿳爪龴⿵冂⿱厶 ['⿳爪龴⿵冂⿱厶', '⿳爪龴⿵冂⿱厶又']
malformed input at: 𥸧 米⿰丁 ['米⿰丁']
malformed input at: 𦉭 ⿻儿⿹𠃊 ['⿻儿⿹𠃊']
malformed input at: 𦱉 ⿱艸 ['⿱艸']
malformed input at: 𧩶 ⿰言 ['⿰言']
malformed input at: 𧭹 ⿱言 ['⿱言']
malformed input at: 𨙨 邑 ['邑']
malformed input at: 𨸏 ⿱丿⿰丨⿳ ['⿱丿⿰丨⿳']
malformed input at: 𩻝 ⿰魚 ['⿰魚']
malformed input at: 𩼁 ⿰魚 ['⿰魚']
malformed input at: 𩼧 ⿰魚 ['⿰魚']
malformed input at: 𪛄 ⿳㽞𠮥⿰𢑑⿺乚⿰丨 ['⿳㽞𠮥⿰𢑑⿺乚⿰丨']


In [63]:
import pickle as pkl

# Persist the parsed trees so they can be reloaded without re-parsing.
with open('data/database.pkl', mode='wb') as db_file:
    pkl.dump(ParseDict, db_file)

In [64]:
from database import load_database

In [65]:
# Load the database through the project's `database.load_database` helper.
# NOTE(review): presumably this reads the data/database.pkl written above —
# confirm in database.py.
db = load_database()

In [66]:
# Spot-check: look up one of the hand-written entries in the loaded database.
db['䆗']

[('⿱穴叫', <Node '⿱' with 2 children>)]

In [67]:
# Spot-check a second hand-written entry. The IDS strings were passed through
# standardize() before parsing, so the stored string may differ from the
# literal assigned by hand (alternative glyphs replaced by standard ones).
db['㵝']

[('⿰水裔', <Node '⿰' with 2 children>)]