In [1]:
import json
from collections import defaultdict

# with open("char_to_ids_singles.json") as f:
#     words = json.load(f)

# Both inputs are JSON holding non-ASCII text (keys such as '老'), so the
# encoding is stated explicitly rather than left to the platform default,
# which is not UTF-8 on every system.
with open('CHAR_TO_ALTS.json', encoding='utf-8') as f:
    # defaultdict(list): looking up a character that has no entry yields [].
    char_to_alts = defaultdict(list, json.load(f))

with open('CHAR_TO_IDS.json', encoding='utf-8') as f:
    # Every value is a sequence of decomposition strings; only the first one
    # is kept for each character.
    words = {k: v[0] for k, v in json.load(f).items()}

In [2]:
# Flat list of every alternate form, in file order (duplicates are kept).
alts = [alt for alt_list in char_to_alts.values() for alt in alt_list]

# Map each alternate form to the first standard character that lists it.
# A single pass with setdefault gives the same first-match-wins result (and
# the same key order) as rescanning char_to_alts once per alternate, without
# the quadratic cost.
alt_to_standard = {}
for standard, alt_list in char_to_alts.items():
    for alt in alt_list:
        alt_to_standard.setdefault(alt, standard)
def standardize(s, k):
    """Return ``s`` with each alternate form replaced by its standard character.

    Every symbol of ``s`` is looked up in the module-level ``alt_to_standard``
    mapping; symbols without an entry are copied through unchanged.

    Args:
        s: The string to normalise.
        k: Accepted for the caller's convenience; not used by this function.

    Returns:
        The normalised string.
    """
    return ''.join(alt_to_standard.get(symbol, symbol) for symbol in s)

In [3]:
# Rewrite every stored decomposition so that alternate component forms are
# replaced by their standard characters.
words = {char: standardize(ids, char) for char, ids in words.items()}

In [4]:
# Hand-set the decomposition used for 老 (耂 over 匕), replacing whatever
# entry, if any, was loaded for it.
words['老'] = '⿱耂匕'

In [5]:
# Spot-check one entry: show the decomposition currently stored for 馬.
words['馬']

'⿹⿺㇉⿻三丨火'

In [6]:
# Description operators grouped by the number of operands each one takes:
# 'binary' operators combine two components, 'ternary' operators three.
compositions = dict(
    binary=list('⿰⿱⿴⿵⿶⿷⿸⿹⿺⿻'),
    ternary=list('⿲⿳'),
)

In [7]:
class Composition:
    """Operator node in a character's decomposition tree.

    Holds one description operator (for example ``⿰``) together with the
    operands it combines.
    """

    def __init__(self, data: str) -> None:
        # Constant tag; traversal code checks it to tell node kinds apart.
        self.classification = "composition"
        # The operator symbol itself.
        self.symbol = data
        # Operands in reading order; each is a Char or a nested Composition.
        self.children = []
        # Owner of this node: the Char being decomposed for a top-level
        # operator, or the enclosing Composition for a nested one.
        self.parent = None

In [8]:
class Char:
    """A character (or component) node in the decomposition graph.

    One node is shared by every decomposition in which the character appears.
    """

    def __init__(self, data: str) -> None:
        # Constant tag; traversal code checks it to tell node kinds apart.
        self.classification = "char"
        # The character this node stands for.
        self.symbol = data
        # Root Composition of this character's own decomposition; stays None
        # until the character itself has been processed.
        self.child = None
        # Every Composition that has this character as a direct operand.
        self.parents = []

In [9]:
# Registry of every Char node built so far, keyed by its symbol, so that each
# character is represented by a single shared node.
chars = {}
# Unparsed tail of the breakdown string currently being processed: process()
# sets it and recurse() consumes it one symbol at a time.
leftover = None
def recurse(curr, num_children_expected, parent):
    """Attach the next ``num_children_expected`` operands to ``curr``.

    Symbols are read from the front of the module-level ``leftover`` string.
    An operator symbol becomes a nested ``Composition`` whose own operands are
    read recursively; any other symbol is linked to its shared ``Char`` node
    from ``chars`` (created on first sight).

    Args:
        curr: The Composition receiving the operands.
        num_children_expected: How many operands to read for ``curr``.
        parent: Stored as ``parent`` on each nested Composition created here.

    Raises:
        IndexError: ``leftover`` ran out before all operands were read.
            Nodes attached up to that point stay attached.
    """
    global leftover
    for _ in range(num_children_expected):
        symbol = leftover[0]
        is_ternary = symbol in compositions['ternary']

        if not is_ternary and symbol not in compositions['binary']:
            # Plain component: reuse the shared node and link it both ways.
            if symbol not in chars:
                chars[symbol] = Char(symbol)
            leaf = chars[symbol]
            curr.children.append(leaf)
            leaf.parents.append(curr)
            leftover = leftover[1:]
            continue

        # Operator: consume it, then read its own operands recursively.
        leftover = leftover[1:]
        node = Composition(symbol)
        curr.children.append(node)
        node.parent = parent
        recurse(node, num_children_expected=3 if is_ternary else 2, parent=node)

def _count_missing_operands(breakdown):
    """Return how many operands are still unfilled when ``breakdown`` ends.

    Walks the string the same way ``process``/``recurse`` consume it: the
    first symbol opens the root's operand slots, each later symbol fills one
    slot, and an operator symbol opens two or three more. A result of 0 means
    the parse will complete.
    """
    pending = 3 if breakdown[0] in compositions['ternary'] else 2
    for symbol in breakdown[1:]:
        if pending == 0:
            # The parser stops reading here; anything further is never used.
            break
        pending -= 1
        if symbol in compositions['ternary']:
            pending += 3
        elif symbol in compositions['binary']:
            pending += 2
    return pending


def process(input_word, breakdown):
    """Build the decomposition tree for ``input_word`` and register it in ``chars``.

    The first symbol of ``breakdown`` becomes the root ``Composition``; the
    remaining symbols are attached by ``recurse``. Symbols left over after the
    tree is complete are ignored.

    The breakdown is checked before anything is modified, so a decomposition
    that ends too early no longer leaves a half-built tree in ``chars``.

    Note: processing the same character again replaces its ``child`` tree;
    links that the earlier tree added to its components' ``parents`` lists
    are not removed.

    Args:
        input_word: The character being decomposed.
        breakdown: Its decomposition in prefix notation.

    Returns:
        The ``Char`` node for ``input_word``, or ``None`` when ``breakdown``
        is a single symbol (nothing to decompose).

    Raises:
        IndexError: ``breakdown`` is empty or ends before every operator has
            all of its operands. ``chars`` is left untouched in that case.
    """
    global leftover

    if len(breakdown) == 1:
        # trivial
        return None
    if not breakdown:
        raise IndexError('empty breakdown for {!r}'.format(input_word))

    # Validate first: recurse() mutates the shared graph as it goes, so a
    # failure halfway through would otherwise leave partial nodes behind.
    missing = _count_missing_operands(breakdown)
    if missing:
        raise IndexError(
            'breakdown {!r} for {!r} ends {} component(s) short'.format(
                breakdown, input_word, missing
            )
        )

    first = breakdown[0]
    if input_word not in chars:
        chars[input_word] = Char(input_word)
    word = chars[input_word]

    root = Composition(first)
    root.parent = word
    word.child = root

    leftover = breakdown[1:]
    arity = 3 if first in compositions['ternary'] else 2
    recurse(root, num_children_expected=arity, parent=root)
    return word

In [10]:
def bfs(root):
    """Print the graph reachable from ``root`` breadth-first, one node per line.

    Each line starts with a marker repeated once per depth level ('#' for a
    character node, '$' for a composition node), followed by a space and the
    node's symbol. The root is at level 0, so its line has no marker.

    A node that is reachable along several paths is printed once per path,
    and no cycle detection is performed.

    Args:
        root: The node to start from; it must expose ``classification`` and
            ``symbol``, plus ``child`` (character nodes) or ``children``
            (composition nodes).
    """
    # Local import: the notebook's import cell only brings in defaultdict.
    from collections import deque

    # deque.popleft() is O(1); slicing a list to drop its head copies the
    # whole queue on every step.
    pending = deque([(0, root)])
    while pending:
        level, node = pending.popleft()
        if node.classification == 'char':
            print('#' * level, node.symbol)
            if node.child:
                pending.append((level + 1, node.child))
        else:
            print('$' * level, node.symbol)
            for member in node.children:
                pending.append((level + 1, member))

In [11]:
# Demo: build the tree for 佥 from a hand-written three-part (⿳) breakdown.
# `a` receives the Char node that process() returns.
word = '佥'
breakdown = '⿳亼𭕄一'
a = process(word, breakdown)

In [12]:
# Demo: build the tree for 但 from a hand-written two-part (⿰) breakdown.
# `b` receives the Char node that process() returns.
word = '但'
breakdown = '⿰亻旦'
b = process(word, breakdown)

In [13]:
# Print the tree for 佥: '#' marks characters, '$' marks operators, and the
# marker is repeated once per depth level.
bfs(a)

 佥
$ ⿳
## 亼
## 𭕄
## 一


In [14]:
# Print the tree for 但 in the same format.
bfs(b)

 但
$ ⿰
## 亻
## 旦


In [15]:
# Demo: build the tree for 旦, this time using the breakdown stored in
# `words` rather than a hand-written one. `c` receives the resulting Char node.
word = '旦'
breakdown = words['旦']
c = process(word, breakdown)

In [16]:
# Print the tree for 旦 built from the stored breakdown.
bfs(c)

 旦
$ ⿱
## 日
## 一


In [17]:
# Build the tree for every character in `words`. An entry that process()
# cannot handle is printed instead of aborting the run, so the remaining
# characters are still processed.
for char, breakdown in words.items():
    try:
        process(char, breakdown)
    except Exception:
        # Best-effort by design: report the offending entry and move on.
        print(char, breakdown)

专 ⿻二⿱丶
衰 ⿳亠𧘇
𠀌 ⿻⿱一丨
𠁔 ⿱⿻並甹
𠂶 ⿱丿⿹⿺㇉⿱一⿰一丿
𠃛 ⿰⿷𰀄亅
𠃢 ⿻⿰丿丨⿹𠃊
𠃬 ⿻⿷己匚
𠄷 ⿱⿲𠔼
𠇇 ⿰人⿺⿻二丶
𠑻 ⿳亠儿
𠦂 ⿻十
𠨰 ⿸厂⿻乙
𠬶 ⿳⿻一冖又
𡆢 ⿴囗
𡖈 ⿱⿴丨⿴丨
𡦹 ⿱丶⿵𠘨
𡧑 ⿳宀大
𡰣 ⿱丿
𢀓 ⿻工
𢀖 ⿱工
𢋪 ⿸广⿳⿻⿰丨丨八⿲𠀉乂彐
𤔔 ⿳爪龴⿵冂⿱厶
𤕈 ⿱爪⿲呂⿱⿱丨呂
𥪐 ⿱立⿰⿺乙仌
𥪑 ⿱立⿰⿺乙牛
𥸧 米⿰丁
𦉭 ⿻儿⿹𠃊
𦱉 ⿱艸
𧩶 ⿰言
𧭹 ⿱言
𨸏 ⿱丿⿰丨⿳
𩻝 ⿰魚
𩼁 ⿰魚
𩼧 ⿰魚
𪛄 ⿳㽞𠮥⿰𢑑⿺乙⿰丨


In [18]:
# Inspect the graph for 老 after the bulk run: components that were processed
# themselves now have their own sub-trees, so the output spans several levels.
bfs(chars['老'])

 老
$ ⿱
## 耂
## 匕
$$$ ⿻
$$$ ⿺
#### 丿
#### 土
#### 乙
#### 丿
$$$$$ ⿱
###### 十
###### 一
$$$$$$$ ⿻
######## 一
######## 丨


In [19]:
import sys

# Raise the interpreter's recursion limit to 8000.
# NOTE(review): presumably needed so that pickling `chars` in the next cell,
# whose nodes reference each other through long parent/child chains, does not
# hit RecursionError — confirm. Consider moving this import to the first cell.
sys.setrecursionlimit(8000)

In [20]:
import pickle as pkl

# Persist the whole character graph to disk. pickle records Char and
# Composition instances by reference to their classes, so whatever loads this
# file needs both classes available under the same module and name.
with open('CHAR_TO_IDS.pkl', 'wb') as f:
    pkl.dump(chars, f)