In [84]:
import os
import json
from alternative_replacements import make_bin_alt_replacements
from collections import defaultdict

In [85]:
cjk_ranges = [
        ( 0x4E00,  0x62FF),
        ( 0x6300,  0x77FF),
        ( 0x7800,  0x8CFF),
        ( 0x8D00,  0x9FCC),
        ( 0x3400,  0x4DBF),
        (0x20000, 0x215FF),
        (0x21600, 0x230FF),
        (0x23100, 0x245FF),
        (0x24600, 0x260FF),
        (0x26100, 0x275FF),
        (0x27600, 0x290FF),
        (0x29100, 0x2A6DF),
        (0x2A700, 0x2B734),
        (0x2B740, 0x2B81D),
        (0x2B820, 0x2CEAF),
        (0x2CEB0, 0x2EBEF),
        (0x2F800, 0x2FA1F),
        (0x30000, 0x3134A), # cjk ext-g
        (0x31c0,  0x31e3), # cjk strokes
        (0x2E80,  0x2EF3), # cjk radical supplement
        (0x2F00, 0x2fdf), # kangxi radicals
        (0x2FF0,  0x2FFB), #ideographic description block
        (0xF900, 0xFAFF) # cjk compatibility ideographs
    ]

exclusion_ranges = [
    (0x30A1, 0x30FF), # katakana
    (0x3041, 0x309f), # hiragana
    (0x3000, 0x303F) # cjk symbols/punct
]

def is_cjk(char):
    char = ord(char)
    for bottom, top in cjk_ranges:
        if char >= bottom and char <= top:
            return True
    return False

def should_be_excluded(char):
    char = ord(char)
    for bottom, top in exclusion_ranges:
        if bottom <= char <= top:
            return True
    return False

In [86]:
char_to_ids = defaultdict(list)

In [87]:
with open('RAW_01.json') as f:
    raw = json.load(f)

In [88]:
for char, strings in raw.items():
    ss = [s for s in strings if 'ids=' in s]
    if not ss: print(char, strings)
    for s in ss:
        s = make_bin_alt_replacements(s)
        split = s.replace('{', '').replace('}', '').split('|')
            
        ids = [sp for sp in split if 'ids=' in sp]
        assert len(ids) == 1
        forms = ids[0].replace('ids=', '').split(',')
        
        if not forms: print(char, strings)
        for f in forms:
            if ''.join([c for c in f if is_cjk(c)]):
                if any(should_be_excluded(c) for c in f):
                    continue
                char_to_ids[char].append(''.join([c for c in f if is_cjk(c)]))

身 ['{{zh-forms|alt=⿻㇒力-2nd round simp.}']
鬬 [' <!-- Not the same as ⿵鬥斲 -->====References====']


In [89]:
print(raw['专'])
make_bin_alt_replacements(raw['筑'][0])

['{{Han char|rn=1|rad=一|as=03|sn=4|four=|canj=QNI|ids=⿻二⿱ㄣ丶}']


'{{Han char|rn=118|rad=竹|as=06|sn=12|four=88117|canj=HMHN,HMNJ|ids=⿱[[竹]][[巩]]}'

In [90]:
for i in raw['专'][0][-10:]:
    print(i, is_cjk(i))

i False
d False
s False
= False
⿻ True
二 True
⿱ True
ㄣ False
丶 True
} False


In [91]:
len(char_to_ids)

21178

In [92]:
import re
# as per recommendation from @freylis, compile once only
CLEANR = re.compile('<.*?>') 

def cleanhtml(raw_html):
    cleantext = re.sub(CLEANR, '', raw_html)
    return cleantext

In [93]:
for path in os.listdir('missing_from_parse/'):
    if len(path) != 5:
        continue
    
    char = path[0]
    with open(os.path.join('missing_from_parse/', path)) as f:
        text = f.readlines()[0]
    clean = cleanhtml(text)
    clean = clean[clean.index('composition') + len('composition '):-2].split(' or ')
    
    for form in clean:
        f = ''.join([c for c in form if is_cjk(c)])
        if char == '𩼁':
            print(clean, f)
        if not f: continue
        if char in char_to_ids:
            if f not in char_to_ids[char]:
                char_to_ids[char].append(f)
        else:
            char_to_ids[char].append(f)

['⿰魚&#xfa26;'] ⿰魚


In [94]:
with open('CHAR_TO_IDS.json', 'w', encoding='utf-8') as f:
    json.dump(char_to_ids, f, ensure_ascii=False, indent=2)

In [95]:
ct = 0
for char, ids in char_to_ids.items():
    for i in ids:
        if len(i) == 2 and 0x2FF0 <= ord(i[0]) <= 0x2FFB:
            print(char, ids)
            ct += 1

print(f'total: {ct}')

𠦂 ['⿻十']
𡆢 ['⿴囗']
𡰣 ['⿱丿']
𢀑 ['⿷工𠃍', '⿻丅']
𢀓 ['⿻工']
𢀖 ['⿱工']
𦱉 ['⿱艹']
𧩶 ['⿰言']
𧭹 ['⿱言']
𩻝 ['⿰魚']
𩼁 ['⿰魚']
𩼧 ['⿰魚']
total: 12
