# 创建拼音映射字典

In [7]:
import json
import re

## 加载拼音表

In [2]:
!wget https://raw.githubusercontent.com/wu-jingtao/training-data/master/speech_recognition/chinese/拼音字典/phonetic.txt

--2021-04-22 02:01:17--  https://raw.githubusercontent.com/wu-jingtao/training-data/master/speech_recognition/chinese/%E6%8B%BC%E9%9F%B3%E5%AD%97%E5%85%B8/phonetic.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1737 (1.7K) [text/plain]
Saving to: ‘phonetic.txt’


2021-04-22 02:01:17 (28.9 MB/s) - ‘phonetic.txt’ saved [1737/1737]



In [50]:
# Read the syllable list: every non-blank line of phonetic.txt becomes one entry
# of `phonetic`, with surrounding whitespace removed.
# - The encoding is explicit so the result does not depend on the platform default.
# - Blank lines are skipped: an empty syllable would generate the string keys
#   '1'..'5' in the tables below, which clash with the integer index keys once
#   json.dump turns those into strings.
with open('phonetic.txt', encoding='utf-8') as f:
  stripped_lines = (line.strip() for line in f)
  phonetic = [syllable for syllable in stripped_lines if syllable]

## 创建 `5` 种声调的拼音映射字典

`1` 为[阴平](https://baike.baidu.com/item/%E9%98%B4%E5%B9%B3)、`2` 为[阳平](https://baike.baidu.com/item/%E9%98%B3%E5%B9%B3)、`3` 为[上声](https://baike.baidu.com/item/%E4%B8%8A%E5%A3%B0)、`4` 为[去声](https://baike.baidu.com/item/%E5%8E%BB%E5%A3%B0)、`5` 为`轻声`
    


In [51]:
# Two-way lookup table for five tones.
# Layout: 'count' -> number of indices handed out so far,
#         int index -> syllable + tone digit, syllable + tone digit -> int index.
intonation_5 = {'count': 0}

for pinyin in phonetic:
  # Hand out one fresh index per tone digit 1-5, registering both directions.
  for tone in '12345':
    index = intonation_5['count']
    syllable = pinyin + tone
    intonation_5[index] = syllable
    intonation_5[syllable] = index
    intonation_5['count'] = index + 1

  # A syllable written without a tone, or with tone digit 0, is treated as the neutral tone (5).
  neutral_index = intonation_5[pinyin + '5']
  intonation_5[pinyin] = neutral_index
  intonation_5[pinyin + '0'] = neutral_index

## 创建 `4` 种声调的拼音映射字典

`1` 为[阴平](https://baike.baidu.com/item/%E9%98%B4%E5%B9%B3)、`2` 为[阳平](https://baike.baidu.com/item/%E9%98%B3%E5%B9%B3)、`3` 为[上声](https://baike.baidu.com/item/%E4%B8%8A%E5%A3%B0)、`4` 为[去声](https://baike.baidu.com/item/%E5%8E%BB%E5%A3%B0)

In [52]:
# Two-way lookup table for four tones.
# Layout: 'count' -> number of indices handed out so far,
#         int index -> syllable + tone digit, syllable + tone digit -> int index.
intonation_4 = {'count': 0}

for pinyin in phonetic:
  # Hand out one fresh index per tone digit 1-4, registering both directions.
  for tone in '1234':
    index = intonation_4['count']
    syllable = pinyin + tone
    intonation_4[index] = syllable
    intonation_4[syllable] = index
    intonation_4['count'] = index + 1

  # The neutral tone (5), the bare spelling and tone digit 0 are all folded into the first tone.
  first_tone_index = intonation_4[pinyin + '1']
  for alias in (pinyin + '5', pinyin, pinyin + '0'):
    intonation_4[alias] = first_tone_index

## 创建无声调的拼音映射字典

In [53]:
# Two-way lookup table that ignores tone.
# Layout: 'count' -> number of indices handed out so far,
#         int index -> bare syllable, bare syllable -> int index.
intonation_0 = {'count': 0}

for pinyin in phonetic:
  # One index per syllable, registered in both directions.
  index = intonation_0['count']
  intonation_0[index] = pinyin
  intonation_0[pinyin] = index
  intonation_0['count'] = index + 1

  # Every toned spelling (tone digits 0-5) resolves to the index of its bare syllable.
  toneless_index = intonation_0[pinyin]
  for tone in '012345':
    intonation_0[pinyin + tone] = toneless_index

## 创建无声调加模糊音拼音映射字典

模糊音参考的是 [百度拼音输入法](https://jingyan.baidu.com/article/636f38bb35428fd6b84610f5.html)

In [54]:
# Fuzzy-sound rewrite rules. Each entry pairs a compiled regex ('target') with
# the text that replaces its match ('replace').
# Patterns anchored with ^ rewrite the start of a syllable, patterns anchored
# with $ rewrite its end.
# When the rules are applied one after another in list order, the output of an
# earlier rule feeds the later ones (e.g. n -> l -> r), and 'iang$' / 'uang$'
# no longer find a match because 'ang$' has already rewritten those endings.
fuzzy_pinyin = [
  {'target': re.compile(regex), 'replace': substitute}
  for regex, substitute in (
    # Syllable starts
    ('^ch', 'c'),
    ('^sh', 's'),
    ('^zh', 'z'),
    ('^n', 'l'),
    ('^h', 'f'),
    ('^l', 'r'),
    # Syllable ends
    ('ang$', 'an'),
    ('eng$', 'en'),
    ('ing$', 'in'),
    ('iang$', 'ian'),
    ('uang$', 'uan'),
  )
]

In [55]:
# Toneless lookup table in which syllables that collapse to the same fuzzy form
# share one index.
# Layout: 'count' -> number of distinct fuzzy forms seen so far,
#         int index -> fuzzy form,
#         fuzzy form / original syllable / original syllable + tone digit (0-5) -> int index.
intonation_fuzzy = {'count': 0}

for pinyin in phonetic:
  # Run the syllable through every rule in order; later rules see the output
  # of earlier ones.
  fuzzy = pinyin
  for pattern in fuzzy_pinyin:
    fuzzy = pattern['target'].sub(pattern['replace'], fuzzy)

  # Allocate a new index only the first time this fuzzy form shows up.
  if fuzzy not in intonation_fuzzy:
    intonation_fuzzy[intonation_fuzzy["count"]] = fuzzy
    intonation_fuzzy[fuzzy] = intonation_fuzzy["count"]
    intonation_fuzzy["count"] += 1

  fuzzy_index = intonation_fuzzy[fuzzy]

  # Register the bare (toneless) original spelling as well, so that a lookup
  # such as intonation_fuzzy['zhang'] works the same way it does in the other
  # tables instead of raising KeyError. An existing entry is never overwritten.
  if pinyin not in intonation_fuzzy:
    intonation_fuzzy[pinyin] = fuzzy_index

  # Every toned spelling (tone digits 0-5) resolves to the fuzzy form's index.
  for intonation in range(6):
    pronunciation = pinyin + str(intonation)
    intonation_fuzzy[pronunciation] = fuzzy_index

## 保存结果

In [56]:
# Write each lookup table to its own JSON file in the working directory.
# Note that JSON object keys are always strings, so json.dump stores the
# integer index keys as their decimal string form.
for filename, table in (
  ('intonation_5.json', intonation_5),
  ('intonation_4.json', intonation_4),
  ('intonation_0.json', intonation_0),
  ('intonation_fuzzy.json', intonation_fuzzy),
):
  with open(filename, 'w') as f:
    json.dump(table, f)