In [21]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests
import hanja
import pinyin
from pyjyutping import jyutping

In [2]:
def char_info(char):
    """Look up one character on ytenx.org and return its parsed reading.

    Parameters
    ----------
    char : str
        Character to look up; sent as the ``dzih`` query parameter.

    Returns
    -------
    pandas.DataFrame
        * One row with columns ``char``, ``tone``, ``rime``, ``ipa`` and
          ``tone_class`` when the first ``<p>`` element of the page parses.
        * One row holding only ``char`` when that paragraph starts with a
          bare newline.
        * An empty frame when the page cannot be parsed.

    Raises
    ------
    requests.RequestException
        Network-level failures (including the timeout) are not caught here,
        so a broken connection is not mistaken for "character not found".
    """
    info = {'char': char}
    # `params=` lets requests percent-encode the character; the timeout keeps
    # one stalled connection from hanging a long scraping loop forever.
    response = requests.get(
        'https://ytenx.org/zim',
        params={'dzih': char, 'dzyen': 1, 'jtkb': 1, 'jtkd': 1, 'jtdt': 1, 'jtgt': 1},
        timeout=30,
    )
    try:
        soup = BeautifulSoup(response.text, 'lxml')
        entry = list(soup.find_all('p')[0])
        if entry[0] == '\n':
            return pd.DataFrame(info, index=[0])
        # Last character of the leading text node is the tone name.
        info['tone'] = entry[0][-1]
        info['rime'] = entry[1].text
        # Second whitespace-separated token of the fifth child is the reading.
        reading = entry[4].text.split()[1]
        if info['tone'] == '入':
            info['ipa'] = reading[:-1]
            info['tone_class'] = 'E'
        elif info['tone'] == '平':
            info['ipa'] = reading[:-1]
            info['tone_class'] = 'L'
        else:
            # Other tones: the character before the final one is the tone
            # letter, so it is split off from the reading.
            info['ipa'] = reading[:-2]
            info['tone_class'] = reading[-2]
        return pd.DataFrame(info, index=[0])
    except Exception:
        # Best-effort parse: any unexpected page layout yields an empty frame.
        # `Exception` (not a bare except) so Ctrl-C still interrupts the scrape.
        return pd.DataFrame()

In [3]:
# Spot-check: a character whose tone is 平 (see the output row below).
char_info('明')

Unnamed: 0,char,tone,rime,ipa,tone_class
0,明,平,庚,mjaeng,L


In [4]:
# Spot-check: a character whose tone is 上 (see the output row below).
char_info('起')

Unnamed: 0,char,tone,rime,ipa,tone_class
0,起,上,止,khi,X


In [5]:
# Spot-check: a character whose tone is 去 (see the output row below).
char_info('會')

Unnamed: 0,char,tone,rime,ipa,tone_class
0,會,去,泰,hwaj,H


In [6]:
# Spot-check: a character whose tone is 入 (see the output row below).
char_info('烈')

Unnamed: 0,char,tone,rime,ipa,tone_class
0,烈,入,薛,ljet,E


In [9]:
# Load the list of characters to look up; the first CSV column becomes the index.
df = pd.read_csv('../../data/char_list.csv', index_col=0)

In [10]:
# Display the character list (the rendered output below is truncated in the middle).
df

Unnamed: 0,char
0,㑹
1,㒿
2,㕙
3,㕮
4,㖀
...,...
8563,𫇢
8564,𬋖
8565,𮪃
8566,𮫃


In [11]:
%%time
dictionary = pd.concat([char_info(i) for i in df.char])

Wall time: 30min 9s


In [13]:
# Summarise the scraped table: entry count, columns and per-column non-null counts.
dictionary.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8443 entries, 0 to 0
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   char        8443 non-null   object
 1   tone        8010 non-null   object
 2   rime        8010 non-null   object
 3   ipa         8010 non-null   object
 4   tone_class  8010 non-null   object
dtypes: object(5)
memory usage: 395.8+ KB


In [14]:
# Persist the raw scrape so the slow download does not have to be repeated.
# (Plain string literal: the path has no placeholders, so no f-prefix is needed.)
dictionary.to_csv('../../data/dictionary.csv')

In [17]:
# Add a pinyin column by passing each character to pinyin.get.
dictionary['pinyin'] = dictionary['char'].map(pinyin.get)

In [36]:
# Add a jyutping column by passing each character to jyutping.convert.
dictionary['jyutping'] = dictionary['char'].map(jyutping.convert)

In [38]:
# Add a hangul column: hanja.translate in 'substitution' mode, one character at a time.
dictionary['hangul'] = [
    hanja.translate(character, 'substitution')
    for character in dictionary['char']
]

In [39]:
# Preview rows 1000-1029 (by position) with every column.
dictionary.iloc[1000:1030]

Unnamed: 0,char,tone,rime,ipa,tone_class,pinyin,jyutping,hangul
0,噲,去,夬,khwaej,H,kuài,faai3,쾌
0,噴,平,魂,phwon,L,pēn,pan3,분
0,嚀,平,青,neng,L,níng,ning4,영
0,嚄,入,陌,hwaek,E,huò,o2,획
0,嚅,平,虞,nyu,L,rú,jyu4,유
0,嚇,去,禡,xae,H,xià,haak3,하
0,嚌,去,霽,dzej,H,jì,zai6,제
0,嚎,平,豪,haw,L,háo,hou4,호
0,嚏,去,霽,tej,H,tì,tai3,체
0,嚕,,,,,lū,lou1,노


In [40]:
# Save the scraped table together with the pinyin, jyutping and hangul columns.
dictionary.to_csv('../../data/dictionary_expanded.csv')

In [47]:
# Reload the raw scrape and keep only the rows where every field was found.
# keep_default_na=False / na_values=[''] means only genuinely empty cells are
# treated as missing. With pandas' default NA tokens a reading that happens to
# be spelled like one of them (e.g. "nan" or "null") would be parsed as NaN
# and then silently removed by dropna().
clean_dictionary = (
    pd.read_csv(
        '../../data/dictionary.csv',
        index_col=0,
        keep_default_na=False,
        na_values=[''],
    )
    .dropna()
    .reset_index(drop=True)
)
# Re-derive the modern readings for the surviving characters.
clean_dictionary['pinyin'] = clean_dictionary['char'].map(pinyin.get)
clean_dictionary['jyutping'] = clean_dictionary['char'].map(jyutping.convert)
clean_dictionary['hangul'] = clean_dictionary['char'].map(lambda x: hanja.translate(x, 'substitution'))

In [62]:
# Reduce every rime label to its first character.
clean_dictionary['rime'] = [label[0] for label in clean_dictionary['rime']]

In [121]:
# One row per distinct rime, in order of first appearance. The default
# RangeIndex (0..n-1) replaces the previous hard-coded range(0, 208), which
# raised as soon as the number of distinct rimes was not exactly 208.
rimes = pd.DataFrame({'rime': clean_dictionary['rime'].unique()})
# Write only the rime column; the index is not saved.
rimes.to_csv('../../data/rimes.csv', index=False)

In [122]:
# Expose each rime's row label as an explicit `rime_index` column.
rimes['rime_index'] = rimes.index

In [140]:
# Look up the row (and therefore the index) assigned to the rime 送.
rimes.loc[rimes.rime == '送']

Unnamed: 0,rime,rime_index
91,送,91


In [128]:
# Attach rime_index to every character by looking its rime up in `rimes`.
# Left join: every dictionary row is kept.
rime_lookup = rimes.set_index('rime')
clean_dictionary = clean_dictionary.join(rime_lookup, on='rime', how='left')

In [130]:
# meter_class is 1 for tone_class 'L' and 0 for every other tone class.
clean_dictionary['meter_class'] = [
    1 if tone_class == 'L' else 0
    for tone_class in clean_dictionary['tone_class']
]

In [131]:
# Preview rows 1000-1009 (by position) after adding rime_index and meter_class.
clean_dictionary.iloc[1000:1010]

Unnamed: 0,char,tone,rime,ipa,tone_class,pinyin,jyutping,hangul,rime_index,meter_class
1000,圭,平,齊,kwej,L,gūi,gwai1,규,5,1
1001,圮,上,旨,bij,X,pǐ,pei2,비,134,0
1002,圯,平,之,yi,L,yí,ji4,이,67,1
1003,地,去,至,dij,H,dì,dei6,지,72,0
1004,圻,平,微,gj+j,L,qí,kei4,기,107,1
1005,址,上,止,tsyi,X,zhǐ,zi2,지,88,0
1006,坂,上,阮,pjon,X,bǎn,baan2,판,119,0
1007,均,平,諄,kjwin,L,jūn,gwan1,균,0,1
1008,坊,平,陽,bjang,L,fāng,fong1,방,78,1
1009,坌,上,吻,bjun,X,bèn,ban6,분,146,0


In [161]:
# Load the rime table workbook. Later cells read its L, X, H, E and rime_index columns.
rime_table = pd.read_excel('../../data/guangyun_rime_table.xlsx')

In [167]:
# Keep only the first character of each rime name in the four tone columns;
# non-string cells (e.g. blanks read as NaN) become NaN.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
for tone_column in ['L', 'X', 'H', 'E']:
    rime_table[tone_column] = rime_table[tone_column].map(
        lambda cell: cell[0] if isinstance(cell, str) else np.nan
    )

In [None]:
# Debug aid: print the Python type of every value in the L column.
# (The closing parenthesis of print() was missing, which made this cell a SyntaxError.)
for value in rime_table.L:
    print(type(value))

In [241]:
# Build one {rime: rime_index} dict per string cell in rows 0-78 of the tone
# columns. Columns are visited in the order E, L, X, H and rows in ascending
# order within each column; non-string cells are skipped.
rime_index = [
    {rime_table[tone_column][row]: rime_table.rime_index[row]}
    for tone_column in ('E', 'L', 'X', 'H')
    for row in range(0, 79)
    if type(rime_table[tone_column][row]) == str
]

In [243]:
# Flatten the list of single-entry dicts into one lookup table.
# If a rime appears more than once, the later entry wins.
rime_indices = {
    rime: index
    for pair in rime_index
    for rime, index in pair.items()
}

In [171]:
# Save the rime table as CSV next to the original workbook.
rime_table.to_csv('../../data/guangyun_rime_table.csv')

In [169]:
# Overwrite rime_index with the placeholder 999 on every row.
# NOTE(review): the next cell assigns the whole column again from
# `rime_indices`, so this placeholder does not appear to survive; confirm
# whether this cell is still needed.
clean_dictionary['rime_index'] = 999

In [246]:
# Replace rime_index with the value looked up in `rime_indices` for each row's rime.
# NOTE(review): per the pandas docs, Series.map with a dict gives NaN for keys
# that are missing, so unmapped rimes show up as NaN rather than 999.
clean_dictionary['rime_index'] = clean_dictionary.rime.map(rime_indices)

In [248]:
# Sanity check: list characters whose rime was not found in `rime_indices`.
# The mapping step assigns the whole rime_index column, so an unmapped rime is
# NaN there. Comparing against the old 999 placeholder could never match.
clean_dictionary[clean_dictionary['rime_index'].isna()]

Unnamed: 0,char,tone,rime,ipa,tone_class,pinyin,jyutping,hangul,rime_index,meter_class


In [247]:
# Preview rows 1000-1009 (by position) with the remapped rime_index values.
clean_dictionary.iloc[1000:1010]

Unnamed: 0,char,tone,rime,ipa,tone_class,pinyin,jyutping,hangul,rime_index,meter_class
1000,圭,平,齊,kwej,L,gūi,gwai1,규,5,1
1001,圮,上,旨,bij,X,pǐ,pei2,비,3,0
1002,圯,平,之,yi,L,yí,ji4,이,3,1
1003,地,去,至,dij,H,dì,dei6,지,3,0
1004,圻,平,微,gj+j,L,qí,kei4,기,3,1
1005,址,上,止,tsyi,X,zhǐ,zi2,지,3,0
1006,坂,上,阮,pjon,X,bǎn,baan2,판,6,0
1007,均,平,諄,kjwin,L,jūn,gwan1,균,6,1
1008,坊,平,陽,bjang,L,fāng,fong1,방,11,1
1009,坌,上,吻,bjun,X,bèn,ban6,분,6,0


In [249]:
# Write the final cleaned dictionary to disk.
clean_dictionary.to_csv('../../data/dictionary_cleaned.csv')