### Pinyin annotators for Python 3

In [1]:
!pip install g2pc
!pip install xpinyin
!pip install pypinyin
!pip install g2pM



In [50]:
from g2pc import G2pC
from g2pc import G2pCFH
from g2pc import G2pCUD
from g2pM import G2pM
from xpinyin import Pinyin
from pypinyin import pinyin, lazy_pinyin, Style
import pypinyin
import re

# Build each annotator once at module level so the prediction helpers below
# reuse the same instances instead of re-creating them per call.
g2p = G2pC()  # default G2pC; used when pos is neither 'FH' nor 'UD' (reported as 'G2pC-pkuseg')
g2pFH = G2pCFH()  # G2pC variant selected with pos='FH' (reported as 'G2pC-FastHan')
g2pUD = G2pCUD()  # G2pC variant selected with pos='UD' (reported as 'G2pC-UDPipe')
g2pm = G2pM()  # G2pM model, selected with model='g2pm'
xp = Pinyin()  # xpinyin converter used by prediction_xp

In [51]:
# Punctuation to strip from annotator output. The prediction helpers test
# tokens with `token not in PUNCT`; on a str this is a *substring* test, so a
# token is dropped when it is any contiguous substring of this literal
# (a single mark, a lone space, or a run of up to three dashes).
PUNCT = '，。 “：？！ ”·；———'

In [44]:
# Reading files

# Regex that cuts an annotation string into syllables: the shortest run of
# characters that ends in a digit (the tone number). Written as a raw string so
# that `\d` is not parsed as an (invalid) string escape sequence.
num = re.compile(r'.+?\d')

# Input sentences, one per line, with surrounding whitespace removed.
with open('test.txt', 'r', encoding='utf-8') as f:
    test = [line.strip() for line in f]

# Gold annotations, one sentence per line, split into per-syllable chunks.
with open('target_new.txt', 'r', encoding='utf-8') as f:
    target = [num.findall(line.strip()) for line in f]

In [45]:
# Corpus size: number of gold sentences, and total number of gold syllables
# summed over all sentences (the denominator used for accuracy).
N_sent = len(target)
N = sum(map(len, target))

In [46]:
# G2pC with a selectable POS tagger (pkuseg / FastHan / UDPipe), or G2pM

def prediction_g2p(pos='pku', model='g2pc'):
    """Annotate every sentence in `test` with G2pC or G2pM.

    Args:
        pos: Which G2pC instance to use when `model` is not 'g2pm':
            'FH' -> `g2pFH`, 'UD' -> `g2pUD`, anything else -> `g2p`.
        model: 'g2pm' selects `g2pm`; any other value selects a G2pC instance.

    Returns:
        A list with one entry per sentence; each entry is the list of
        syllable strings obtained by splitting the concatenated annotation
        with the module-level `num` regex.
    """
    if model == 'g2pm':
        def annotate(sentence):
            # G2pM output is consumed as-is, one string per item.
            return g2pm(sentence, tone=True, char_split=False)
    else:
        tagger = g2pFH if pos == 'FH' else g2pUD if pos == 'UD' else g2p

        def annotate(sentence):
            # Field 2 of every G2pC item is used as the annotation
            # (presumably the pinyin field -- confirm against the g2pC docs).
            return (item[2] for item in tagger(sentence))

    predictions = []
    for sentence in test:
        # Drop punctuation tokens (substring test against PUNCT).
        kept = [token for token in annotate(sentence) if token not in PUNCT]
        # Glue everything together, remove spaces, then re-split on tone digits.
        flattened = ''.join(kept).replace(' ', '')
        predictions.append(re.findall(num, flattened))
    return predictions

In [18]:
# Xpinyin 

def prediction_xp():
    """Annotate every sentence in `test` with xpinyin.

    Each sentence is transcribed by `xp.get_pinyin` with ' ' as separator and
    `tone_marks='numbers'`, split on whitespace, and stripped of tokens that
    are substrings of `PUNCT`.

    Returns:
        A list with one list of syllable strings per sentence.
    """
    def is_syllable(token):
        return token not in PUNCT

    return [
        list(filter(is_syllable,
                    xp.get_pinyin(sentence, ' ', tone_marks='numbers').split()))
        for sentence in test
    ]

In [26]:
# Pypinyin

def prediction_pyp():
    """Annotate every sentence in `test` with pypinyin.

    Calls `pinyin` with `style=pypinyin.TONE3` and `heteronym=False`, keeps the
    first element of each returned item, and drops those that are substrings
    of `PUNCT`.

    Returns:
        A list with one list of syllable strings per sentence.
    """
    # NOTE(review): with TONE3, pypinyin appears to leave neutral-tone
    # syllables without a digit unless `neutral_tone_with_five=True` is passed;
    # such syllables could never equal a digit-terminated gold syllable.
    # Confirm against the gold format before comparing scores across tools.
    predictions = []
    for sentence in test:
        readings = pinyin(sentence, style=pypinyin.TONE3, heteronym=False)
        first_choices = (candidates[0] for candidates in readings)
        predictions.append([s for s in first_choices if s not in PUNCT])
    return predictions

In [37]:
# Calculate the fraction of correctly annotated syllables (accuracy)

def metrics(name, res, gold=None, total=None):
    """Compute the syllable-level accuracy of one annotator.

    Predictions and gold syllables are compared position by position within
    each sentence; when the two lists differ in length the surplus items are
    ignored by `zip` (and, for a short prediction, still count in `total`).

    Args:
        name: Label of the annotator, returned unchanged.
        res: Predictions, one list of syllable strings per sentence.
        gold: Reference annotation in the same layout. Defaults to the
            module-level `target`.
        total: Denominator of the accuracy. Defaults to the module-level `N`
            when `gold` is omitted, otherwise to the number of syllables
            in `gold`.

    Returns:
        A `(name, accuracy)` tuple where accuracy is the fraction of matching
        syllables formatted with `str`.

    Raises:
        ZeroDivisionError: If `total` is 0.
    """
    if gold is None:
        # Backward-compatible path: fall back to the notebook-level globals.
        gold = target
        if total is None:
            total = N
    elif total is None:
        total = sum(len(sentence) for sentence in gold)

    hits = sum(
        1
        for pred, true in zip(res, gold)
        for pr, tr in zip(pred, true)
        if pr == tr
    )
    return name, str(hits / total)

In [34]:
import pandas as pd
# Empty results table; rows of (model label, accuracy) are added later
# through `result.loc[...]`.
result = pd.DataFrame(columns=['model', 'accuracy'])

In [47]:
# One (label, prediction thunk) pair per annotator, in report order. The
# thunks defer the expensive prediction until its row is being filled.
runs = [
    ('G2pC-pkuseg', lambda: prediction_g2p()),
    ('G2pC-FastHan', lambda: prediction_g2p('FH')),
    ('G2pC-UDPipe', lambda: prediction_g2p('UD')),
    ('G2pM-pkuseg', lambda: prediction_g2p(model='g2pm')),
    ('xpinyin', lambda: prediction_xp()),
    ('pypinyin', lambda: prediction_pyp()),
]
for row, (label, predict) in enumerate(runs):
    result.loc[row] = metrics(label, predict())

loading vocabulary file C:\Users\Xiaomi\.fastNLP\fasthan\fasthan_base\vocab.txt
Load pre-trained BERT parameters from file C:\Users\Xiaomi\.fastNLP\fasthan\fasthan_base\model.bin.
loading vocabulary file C:\Users\Xiaomi\.fastNLP\fasthan\fasthan_base\vocab.txt
Load pre-trained BERT parameters from file C:\Users\Xiaomi\.fastNLP\fasthan\fasthan_base\model.bin.
loading vocabulary file C:\Users\Xiaomi\.fastNLP\fasthan\fasthan_base\vocab.txt
Load pre-trained BERT parameters from file C:\Users\Xiaomi\.fastNLP\fasthan\fasthan_base\model.bin.
loading vocabulary file C:\Users\Xiaomi\.fastNLP\fasthan\fasthan_base\vocab.txt
Load pre-trained BERT parameters from file C:\Users\Xiaomi\.fastNLP\fasthan\fasthan_base\model.bin.
loading vocabulary file C:\Users\Xiaomi\.fastNLP\fasthan\fasthan_base\vocab.txt
Load pre-trained BERT parameters from file C:\Users\Xiaomi\.fastNLP\fasthan\fasthan_base\model.bin.
loading vocabulary file C:\Users\Xiaomi\.fastNLP\fasthan\fasthan_base\vocab.txt
Load pre-trained BER

In [48]:
# Persist the results table to result.csv without writing the row index.
result.to_csv('result.csv', index=False)

In [49]:
# Display the results table as the cell output.
result

Unnamed: 0,model,accuracy
0,G2pC-pkuseg,0.9029982363315696
1,G2pC-FastHan,0.8994708994708994
2,G2pC-UDPipe,0.8800705467372134
3,G2pM-pkuseg,0.8871252204585538
4,xpinyin,0.8306878306878307
5,pypinyin,0.7742504409171076
