# Scratch: fixing dataset
- **11/29**: Fixed the problem where the code wasn't differentiating between `NaN`s and truly empty fields.

In [1]:
import numpy as np
import pandas as pd
import pickle

In [2]:
# Load the preprocessed table and discard the columns named Column5-Column8.
# NOTE(review): pickle.load executes arbitrary code from the file; only use
# this on a pickle produced by this project's own preprocessing step.
with open('preprocessing/dataset/processed2.pkl', 'rb') as pkl_file:
    sinoxenic = pickle.load(pkl_file).drop(
        columns=['Column5', 'Column6', 'Column7', 'Column8'],
    )

In [3]:
# Spot-check a single row (position 5000) before the fix, so it can be
# compared against the same row after empty fields are converted.
sinoxenic.iloc[5000]

character           渰
wiki_id         19172
mando_onset         ∅
mando_nucl         ya
mando_coda          n
mando_tone          3
canto_onset         j
canto_nucl          i
canto_coda          m
canto_tone          2
jp_on_first          
jp_on_rest           
jp_kan_first         
jp_kan_rest          
kor_0               ㅇ
kor_1               ㅓ
kor_2               ㅁ
Name: 5001, dtype: object

In [4]:
# Turn "truly empty" fields into NaN so that the one-hot encoding step
# (pd.get_dummies ignores NaN by default) does not create a dummy column for
# the blank category.
# Only empty strings and None/NaN count as missing here: a plain truthiness
# test (`not x`) would also wipe legitimate falsy values such as a numeric 0.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0, and
# `mask` replaces the element-wise `applymap`, deprecated since pandas 2.1.
sinoxenic_fixed = sinoxenic.mask(sinoxenic.isin(['']) | sinoxenic.isna(), np.nan)

In [5]:
# Same row (position 5000) after the fix: previously blank fields should now
# be reported as NaN.
sinoxenic_fixed.iloc[5000]

character           渰
wiki_id         19172
mando_onset         ∅
mando_nucl         ya
mando_coda          n
mando_tone          3
canto_onset         j
canto_nucl          i
canto_coda          m
canto_tone          2
jp_on_first       NaN
jp_on_rest        NaN
jp_kan_first      NaN
jp_kan_rest       NaN
kor_0               ㅇ
kor_1               ㅓ
kor_2               ㅁ
Name: 5001, dtype: object

In [6]:
# One-hot encode every column after the first two (`character`, `wiki_id`).
# A single get_dummies call yields the same columns, in the same order and
# with the same `<column>_<value>` names, as dropping and joining one column
# at a time, without copying the whole frame once per encoded column.
# NaN entries get no dummy column (dummy_na defaults to False).
# NOTE: this rebinds `sinoxenic_fixed`; re-running the cell on its own would
# encode the dummy columns again, so re-create `sinoxenic_fixed` first.
to_encode = sinoxenic_fixed.columns[2:]
sinoxenic_fixed = pd.get_dummies(sinoxenic_fixed, columns=list(to_encode))

In [7]:
# Names of the dummy columns that are switched on for the row at position 5000.
{
    label
    for label, flag in sinoxenic_fixed.iloc[5000].items()
    if flag == 1
}

{'canto_coda_m',
 'canto_nucl_i',
 'canto_onset_j',
 'canto_tone_2',
 'kor_0_ㅇ',
 'kor_1_ㅓ',
 'kor_2_ㅁ',
 'mando_coda_n',
 'mando_nucl_ya',
 'mando_onset_∅',
 'mando_tone_3'}

In [8]:
# Read the tab-separated pronunciation table and preview its first rows.
mc = pd.read_csv(
    'preprocessing/dataset/pron/mc-full-table.tsv',
    delimiter='\t',
)
mc.head()

Unnamed: 0,character,index,tone_label,Zhengzhang_onset,Zhengzhang_nucleus,Zhengzhang_coda,Pan_onset,Pan_nucleus,Pan_coda,Shao_onset,...,Shao_coda,Pulleyblank_onset,Pulleyblank_nucleus,Pulleyblank_coda,Li_onset,Li_nucleus,Li_coda,Karlgren_onset,Karlgren_nucleus,Karlgren_coda
0,一,5611001,checked,ʔ,iɪ,t̚,ʔ,i,t̚,ʔ,...,t̚,ʔ,i,t̚,ʔ,iĕ,t̚,ʔ,i̯ĕ,t̚
1,丁,5611004,level,ʈ,ˠɛ,ŋ,ʈ,ᵚæ,ŋ,ȶ,...,ŋ,ʈ,əɨj,ŋ,ȶ,ɛ,ŋ,ȶ,æ,ŋ
2,丂,5611009,rising,kʰ,ɑu,∅,kʰ,ɑu,∅,kʰ,...,∅,kʰ,aw,∅,kʰ,ɑu,∅,kʰ,ɑu,∅
3,七,5611012,checked,t͡sʰ,iɪ,t̚,t͡sʰ,i,t̚,t͡sʰ,...,t̚,t͡sʰ,i,t̚,t͡sʰ,iĕ,t̚,t͡sʰ,i̯ĕ,t̚
4,丄,5611016,rising,d͡ʑ,ɨɐ,ŋ,d͡ʑ,iɐ,ŋ,d͡ʑ,...,ŋ,d͡ʑ,ɨa,ŋ,ʑ,ia,ŋ,ʑ,i̯a,ŋ


In [9]:
# Keep the identifying columns, the tone label and only the Karlgren_* columns.
cols = [
    'character', 'index', 'tone_label',
    'Karlgren_onset', 'Karlgren_nucleus', 'Karlgren_coda',
]
mc_karlgren = mc.loc[:, cols]
mc_karlgren.head()

Unnamed: 0,character,index,tone_label,Karlgren_onset,Karlgren_nucleus,Karlgren_coda
0,一,5611001,checked,ʔ,i̯ĕ,t̚
1,丁,5611004,level,ȶ,æ,ŋ
2,丂,5611009,rising,kʰ,ɑu,∅
3,七,5611012,checked,t͡sʰ,i̯ĕ,t̚
4,丄,5611016,rising,ʑ,i̯a,ŋ


In [10]:
# One-hot encode the tone label and the three Karlgren_* columns, then swap
# the raw columns for their dummy-encoded counterparts.
encoded_cols = ['tone_label', 'Karlgren_onset', 'Karlgren_nucleus', 'Karlgren_coda']
tones, onsets, nuclei, codas = (
    pd.get_dummies(mc_karlgren[name], prefix=name) for name in encoded_cols
)
karlgren_full = (
    mc_karlgren
    .drop(columns=encoded_cols)
    .join([tones, onsets, nuclei, codas])
)
karlgren_full.shape

(19453, 113)

In [11]:
# Align both encoded tables on `character` (left join, keeping every row of
# `sinoxenic_fixed`) and discard the two identifier columns.
full_matrix = (
    sinoxenic_fixed
    .set_index('character')
    .join(karlgren_full.set_index('character'), how='left')
    .drop(columns=['wiki_id', 'index'])
)

In [12]:
# Remove the row labelled '㠛' and the 'Karlgren_nucleus_iwɐ' column
# (malformed, not very important).
# errors='ignore' keeps this cell re-runnable: it rebinds `full_matrix`, so a
# second run would otherwise raise KeyError on the already-dropped labels.
full_matrix = full_matrix.drop(
    index='㠛',
    columns='Karlgren_nucleus_iwɐ',
    errors='ignore',
)
full_matrix.shape

(15250, 466)

In [13]:
# Write the final matrix to disk; the `character` index is kept as the first
# CSV column.
full_matrix.to_csv(
    'model/1129-fixed-data-matrix-karlgren.csv',
    index=True,
)