- https://chat.qwen.ai/c/cd9c27bf-94c6-4d9f-8e1d-ffeb54d55ac8

In [1]:
from pypinyin import pinyin, Style
from rapidfuzz import distance

# Function to extract Pinyin and split into initial/final
# pypinyin's Style is a plain IntEnum, not a set of bit flags, so a tone style
# cannot be "combined" with FINALS via `|`. Each supported full-pinyin style
# is mapped explicitly to the finals style that uses the same tone notation.
_FINALS_STYLE_FOR = {
    Style.NORMAL: Style.FINALS,
    Style.TONE: Style.FINALS_TONE,
    Style.TONE2: Style.FINALS_TONE2,
    Style.TONE3: Style.FINALS_TONE3,
}


def extract_pinyins(character, style=Style.NORMAL, heteronym=True):
    """
    Extract PinYin pronunciations (heteronyms) from character.

    Only the first character of ``character`` is analysed (the first entry
    of pypinyin's result is taken).

    Supported styles for the full pinyin (and, with matching tone notation,
    for the finals):
        - Style.NORMAL (default) : ['xing', 'hang', 'heng']
        - Style.TONE   : ['xíng', 'háng', 'héng', 'xìng', 'hàng']
        - Style.TONE2  : ['xi2ng', 'ha2ng', 'he2ng', 'xi4ng', 'ha4ng']
        - Style.TONE3  : ['xing2', 'hang2', 'heng2', 'xing4', 'hang4']
    Any other style falls back to toneless finals (Style.FINALS).

    Initials never carry a tone, so they are always requested with
    Style.INITIALS regardless of ``style``.

    Returns a tuple:
        pinyins (拼音), initials (声母), finals (韵母)

    The three lists are not index-aligned: pypinyin returns each list without
    duplicates, e.g. 行 -> (['xing', 'hang', 'heng'], ['x', 'h'],
    ['ing', 'ang', 'eng']).
    """
    # Previously `Style.INITIALS | style` / `Style.FINALS | style` were used.
    # With integer enum values that silently selects a different style for
    # TONE (finals lose their tone) and for TONE3 (value 8 OR-ed in yields an
    # unrelated style number).
    finals_style = _FINALS_STYLE_FOR.get(style, Style.FINALS)

    pinyins = pinyin(character, style=style, heteronym=heteronym)[0]
    initials = pinyin(character, style=Style.INITIALS, heteronym=heteronym)[0]
    finals = pinyin(character, style=finals_style, heteronym=heteronym)[0]
    return pinyins, initials, finals

def calculate_similarity(pron1, pron2):
    """
    Return the similarity of two pronunciation strings.

    The score is one minus rapidfuzz's normalized Levenshtein distance
    between ``pron1`` and ``pron2``.
    """
    normalized_gap = distance.Levenshtein.normalized_distance(pron1, pron2)
    return 1 - normalized_gap

def get_similarity(char, comp, threshold=0.0, first_only=True, style=Style.NORMAL, heteronym=True, debug=True):
    """
    Score how closely the pinyin of ``char`` matches that of ``comp``.

    Two scores are produced:
        1. raw similarity     - compares full pinyin strings. With
           ``first_only`` only the first listed reading of each side is
           compared; otherwise the best score over all reading pairs is kept.
        2. refined similarity - best score over all pairs of finals. It is
           computed only when the raw score is >= ``threshold`` and stays at
           -1 otherwise. This step always considers every final, whatever
           the value of ``first_only``.

    When ``debug`` is true the extracted pinyin tuples are printed first.

    Returns a tuple of tuple (scores are strings formatted to 3 decimals):
        (max_raw_similarity, max_refined_similarity), (char, char_pinyins), (comp, phon_pinyins)
    """
    char_pinyins = extract_pinyins(char, style=style, heteronym=heteronym)
    phon_pinyins = extract_pinyins(comp, style=style, heteronym=heteronym)
    if debug:
        print(f"Character: {char_pinyins}")
        print(f"Phonetic Component: {phon_pinyins}")

    char_readings, comp_readings = char_pinyins[0], phon_pinyins[0]
    char_finals, comp_finals = char_pinyins[2], phon_pinyins[2]

    # Step 1: raw similarity on the full pinyin strings.
    if first_only:
        best_raw = calculate_similarity(char_readings[0], comp_readings[0])
    else:
        best_raw = max(
            (calculate_similarity(a, b) for a in char_readings for b in comp_readings),
            default=-1,
        )

    # Step 2: refined similarity on the finals, gated by the raw score.
    best_refined = -1
    if best_raw >= threshold:
        best_refined = max(
            (calculate_similarity(a, b) for a in char_finals for b in comp_finals),
            default=-1,
        )

    scores = (f"{best_raw:.3f}", f"{best_refined:.3f}")
    return scores, (char, char_pinyins), (comp, phon_pinyins)


## Evaluate pinyin styles

In [2]:
# Character with several readings, used to show how each style renders them.
character = "行"

# The four pinyin styles that extract_pinyins documents as supported.
styles = [Style.NORMAL, Style.TONE, Style.TONE2, Style.TONE3]

# Print every reading of the character under each style.
# NOTE: `style` and `x` remain defined at notebook scope after the loop.
for style in styles:
    x = pinyin(character, style=style, heteronym=True)[0]
    print(f"style={style}\n \tpinyins = {x}")

# Last expression of the cell: the notebook displays the returned
# (pinyins, initials, finals) tuple.
extract_pinyins(character, style=Style.NORMAL)

style=0
 	pinyins = ['xing', 'hang', 'heng']
style=1
 	pinyins = ['xíng', 'háng', 'héng', 'xìng', 'hàng']
style=2
 	pinyins = ['xi2ng', 'ha2ng', 'he2ng', 'xi4ng', 'ha4ng']
style=8
 	pinyins = ['xing2', 'hang2', 'heng2', 'xing4', 'hang4']


(['xing', 'hang', 'heng'], ['x', 'h'], ['ing', 'ang', 'eng'])

## Test : 始 = 女 + 厶 + 口

In [11]:
# Settings shared by the test cells that follow.
# STYLE: pinyin style passed to get_similarity; the trailing comments list the alternatives.
STYLE = Style.NORMAL # Style.TONE  # Style.TONE2  # Style.TONE3
# THRESHOLD: minimum raw similarity required before get_similarity compares the finals.
THRESHOLD = 0.4  # 0.0
# FIRST_ONLY: when True, the raw score compares only the first listed reading of each side.
FIRST_ONLY = True

In [12]:
# Example: score 始 against the candidate phonetic component 厶,
# using the notebook-level STYLE / THRESHOLD / FIRST_ONLY settings.
character, phonetic_component = "始", "厶"

# get_similarity prints its debug lines (debug defaults to True); the result tuple is printed after.
x = get_similarity(character, phonetic_component, threshold=THRESHOLD, first_only=FIRST_ONLY, style=STYLE, heteronym=True)
print(x)

Character: (['shi'], ['sh'], ['i'])
Phonetic Component: (['si', 'mou'], ['s', 'm'], ['i', 'ou'])
(('0.667', '1.000'), ('始', (['shi'], ['sh'], ['i'])), ('厶', (['si', 'mou'], ['s', 'm'], ['i', 'ou'])))


In [13]:
# Example: score 始 against the candidate phonetic component 台.
# With FIRST_ONLY = True only the first listed reading of 台 takes part in
# the raw comparison; its other readings are ignored for that score.
character, phonetic_component = "始", "台" 

x = get_similarity(character, phonetic_component, threshold=THRESHOLD, first_only=FIRST_ONLY, style=STYLE, heteronym=True)
print(x)

Character: (['shi'], ['sh'], ['i'])
Phonetic Component: (['tai', 'yi', 'si'], ['t', 's'], ['ai', 'i'])
(('0.333', '-1.000'), ('始', (['shi'], ['sh'], ['i'])), ('台', (['tai', 'yi', 'si'], ['t', 's'], ['ai', 'i'])))


In [14]:
# Example: score 始 against the component 口.
character, phonetic_component = "始", "口"

# A refined score of -1 in the result means the raw score fell below THRESHOLD.
x = get_similarity(character, phonetic_component, threshold=THRESHOLD, first_only=FIRST_ONLY, style=STYLE, heteronym=True)
print(x)

Character: (['shi'], ['sh'], ['i'])
Phonetic Component: (['kou'], ['k'], ['ou'])
(('0.000', '-1.000'), ('始', (['shi'], ['sh'], ['i'])), ('口', (['kou'], ['k'], ['ou'])))


## Test : 行 + 圭 = 街

Google Gemini was used to decompose the Chinese characters; see https://gemini.google.com/app/308c1d21326b3639

In [23]:
# Example: score 街 against the component 行.
# 行 has several readings; with FIRST_ONLY = True only the first one is used for the raw score.
character, phonetic_component = "街", "行"

x = get_similarity(character, phonetic_component, threshold=THRESHOLD, first_only=FIRST_ONLY, style=STYLE, heteronym=True)
print(x)

Character: (['jie'], ['j'], ['ie'])
Phonetic Component: (['xing', 'hang', 'heng'], ['x', 'h'], ['ing', 'ang', 'eng'])
(('0.250', '-1.000'), ('街', (['jie'], ['j'], ['ie'])), ('行', (['xing', 'hang', 'heng'], ['x', 'h'], ['ing', 'ang', 'eng'])))


In [24]:
# Example: score 街 against the component 圭.
character, phonetic_component = "街", "圭"

# Prints the debug pinyin breakdown first, then the result tuple.
x = get_similarity(character, phonetic_component, threshold=THRESHOLD, first_only=FIRST_ONLY, style=STYLE, heteronym=True)
print(x)

Character: (['jie'], ['j'], ['ie'])
Phonetic Component: (['gui'], ['g'], ['uei'])
(('0.000', '-1.000'), ('街', (['jie'], ['j'], ['ie'])), ('圭', (['gui'], ['g'], ['uei'])))


## Test : 行 = 彳 (chì) + 亍 (chù)

In [21]:
# Example: score 行 against its left-hand component 彳.
character, phonetic_component = "行", "彳"

# Both sides have several readings; FIRST_ONLY decides whether the raw score uses only the first of each.
x = get_similarity(character, phonetic_component, threshold=THRESHOLD, first_only=FIRST_ONLY, style=STYLE, heteronym=True)
print(x)

Character: (['xing', 'hang', 'heng'], ['x', 'h'], ['ing', 'ang', 'eng'])
Phonetic Component: (['chi', 'fu'], ['ch', 'f'], ['i', 'u'])
(('0.000', '-1.000'), ('行', (['xing', 'hang', 'heng'], ['x', 'h'], ['ing', 'ang', 'eng'])), ('彳', (['chi', 'fu'], ['ch', 'f'], ['i', 'u'])))


In [22]:
# Example: score 行 against its right-hand component 亍.
character, phonetic_component = "行", "亍"

# Prints the debug pinyin breakdown first, then the result tuple.
x = get_similarity(character, phonetic_component, threshold=THRESHOLD, first_only=FIRST_ONLY, style=STYLE, heteronym=True)
print(x)

Character: (['xing', 'hang', 'heng'], ['x', 'h'], ['ing', 'ang', 'eng'])
Phonetic Component: (['chu'], ['ch'], ['u'])
(('0.000', '-1.000'), ('行', (['xing', 'hang', 'heng'], ['x', 'h'], ['ing', 'ang', 'eng'])), ('亍', (['chu'], ['ch'], ['u'])))


## Test : 初 = 衤 + 刀

初: 始也。从刀从衣。裁衣之始也。

In [16]:
# Example: score 初 against the radical form 衤.
character, phonetic_component = "初", "衤"

# Prints the debug pinyin breakdown first, then the result tuple.
x = get_similarity(character, phonetic_component, threshold=THRESHOLD, first_only=FIRST_ONLY, style=STYLE, heteronym=True)
print(x)

Character: (['chu'], ['ch'], ['u'])
Phonetic Component: (['yi'], [''], ['i'])
(('0.000', '-1.000'), ('初', (['chu'], ['ch'], ['u'])), ('衤', (['yi'], [''], ['i'])))


In [17]:
# Example: score 初 against the full character 衣 (instead of the radical form 衤).
character, phonetic_component = "初", "衣"

# Prints the debug pinyin breakdown first, then the result tuple.
x = get_similarity(character, phonetic_component, threshold=THRESHOLD, first_only=FIRST_ONLY, style=STYLE, heteronym=True)
print(x)

Character: (['chu'], ['ch'], ['u'])
Phonetic Component: (['yi'], [''], ['i'])
(('0.000', '-1.000'), ('初', (['chu'], ['ch'], ['u'])), ('衣', (['yi'], [''], ['i'])))


In [18]:
# Example: score 初 against the component 刀.
character, phonetic_component = "初", "刀"

# Prints the debug pinyin breakdown first, then the result tuple.
x = get_similarity(character, phonetic_component, threshold=THRESHOLD, first_only=FIRST_ONLY, style=STYLE, heteronym=True)
print(x)

Character: (['chu'], ['ch'], ['u'])
Phonetic Component: (['dao', 'diao'], ['d'], ['ao', 'iao'])
(('0.000', '-1.000'), ('初', (['chu'], ['ch'], ['u'])), ('刀', (['dao', 'diao'], ['d'], ['ao', 'iao'])))


## Test : 弓 + 长 = 张

In [19]:
# Example: score 张 against the candidate phonetic component 长.
# 长 has more than one reading; with FIRST_ONLY = True the raw score uses only the first listed one.
character, phonetic_component = "张", "长"

x = get_similarity(character, phonetic_component, threshold=THRESHOLD, first_only=FIRST_ONLY, style=STYLE, heteronym=True)
print(x)

Character: (['zhang'], ['zh'], ['ang'])
Phonetic Component: (['zhang', 'chang'], ['zh', 'ch'], ['ang'])
(('1.000', '1.000'), ('张', (['zhang'], ['zh'], ['ang'])), ('长', (['zhang', 'chang'], ['zh', 'ch'], ['ang'])))


In [20]:
# Example: score 张 against the component 弓.
character, phonetic_component = "张", "弓"

# The threshold test in get_similarity is inclusive (>=), so a raw score
# exactly equal to THRESHOLD still triggers the finals comparison.
x = get_similarity(character, phonetic_component, threshold=THRESHOLD, first_only=FIRST_ONLY, style=STYLE, heteronym=True)
print(x)

Character: (['zhang'], ['zh'], ['ang'])
Phonetic Component: (['gong'], ['g'], ['ong'])
(('0.400', '0.667'), ('张', (['zhang'], ['zh'], ['ang'])), ('弓', (['gong'], ['g'], ['ong'])))
