# MLP-2: Combining pronunciation and phonetic component data

In [1]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score, train_test_split, KFold
from sklearn.metrics import confusion_matrix
from tensorflow.python.client import device_lib

# Show which compute devices TensorFlow reports on this machine.
local_devices = device_lib.list_local_devices()
print(local_devices)

Using TensorFlow backend.


[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 9541365452800904555
]


In [32]:
# Read the data matrix, index it by its 'character' column, then keep the
# columns named in full-ordering.json (in the order that file lists them).
matrix = pd.read_csv('model/1129-fixed-data-matrix-karlgren.csv')
matrix = matrix.set_index('character')
with open('full-ordering.json', encoding='utf-8') as f:
    column_order = json.load(f)
matrix = matrix[column_order]

In [33]:
# Lookup table keyed by character; getps() reads it to find a character's
# hypothesized phonetic series.
with open('hypothesized_phonetic_series.json', encoding='utf8') as series_file:
    js = json.load(series_file)

In [34]:
def getps(char):
    """Return the value stored for ``char`` in ``js``, or 0 if it has no entry."""
    return js.get(char, 0)
    
def rowIndex(row):
    """Pass the row's label (``row.name``) to ``getps`` and return the result."""
    label = row.name
    return getps(label)

# Tag every row with the value getps() returns for its index label
# (0 for characters that have no entry in js). Only the label is needed, so
# iterate the index directly instead of materialising each row via apply().
matrix['ps'] = [getps(char) for char in matrix.index]

# One indicator column per distinct 'ps' value, named 'ps_<value>'.
one_hot = pd.get_dummies(matrix['ps'], prefix='ps')
dropval = 0  # columns with fewer than this many members are dropped; 0 keeps all
print(one_hot.shape)
# Series.iteritems() was removed in pandas 2.0; items() is the equivalent that
# exists in old and new pandas alike.
rare_cols = [col for col, val in one_hot.sum().items() if val < dropval]
one_hot = one_hot.drop(rare_cols, axis=1)
print(one_hot.shape)

# Swap the temporary 'ps' column for its one-hot expansion.
matrix = matrix.drop('ps', axis=1)
matrix = matrix.join(one_hot)
print(matrix.columns[:10])
print(matrix.columns[-10:])

(15250, 4097)
(15250, 4097)
Index(['mando_onset_b', 'mando_onset_c', 'mando_onset_ch', 'mando_onset_d',
       'mando_onset_f', 'mando_onset_g', 'mando_onset_h', 'mando_onset_j',
       'mando_onset_k', 'mando_onset_l'],
      dtype='object')
Index(['ps_𩠐', 'ps_𩧉', 'ps_𩰪', 'ps_𪁾', 'ps_𪇘', 'ps_𪈮', 'ps_𪙍', 'ps_𪚦', 'ps_𪚮',
       'ps_𪚱'],
      dtype='object')


In [37]:
# Columns whose name contains one of these markers are prediction targets;
# every other column is an input feature.
cols = matrix.columns
label_markers = ('Karlgren', 'tone_label')
features_cols = [c for c in cols if not any(marker in c for marker in label_markers)]
feature_set = set(features_cols)
labels_cols = [c for c in cols if c not in feature_set]

In [6]:
# Separate inputs from targets, then hold out 30% of the rows for testing
# (fixed random_state so the split is repeatable).
X_complete, Y_complete = matrix[features_cols], matrix[labels_cols]
X_train, X_test, Y_train, Y_test = train_test_split(
    X_complete,
    Y_complete,
    test_size=.3,
    random_state=0,
)

In [19]:
# One entry per label group, consumed by fit_keras_models as
#   (label column substring, layer spec, number of training epochs).
# A layer spec is a tuple of (value, kind) pairs: when kind is 'dropout' the
# value is the dropout rate, otherwise value is the Dense layer's unit count
# and kind is its activation. The softmax output layer is added by the
# training code and is not listed here.
labels_arch = [
    ('tone_label', ((32, 'relu'), (16, 'relu')), 10),
    ('Karlgren_onset', ((768, 'relu'), (0.3, 'dropout'), (192, 'relu')), 4),
    ('Karlgren_nucleus', ((1024, 'relu'), (0.3, 'dropout'), (192, 'relu')), 4),
    ('Karlgren_coda', ((96, 'relu'), (32, 'relu')), 5),
]

In [73]:
def _build_mlp(architecture, input_dim, n_bins):
    """Build and compile the softmax classifier described by ``architecture``.

    ``architecture`` is a non-empty sequence of ``(value, kind)`` pairs. The
    first pair must describe a Dense layer (units, activation); it receives
    ``input_dim``. For later pairs, ``kind == 'dropout'`` adds a Dropout layer
    with rate ``value``; anything else adds a Dense layer with ``value`` units
    and activation ``kind``. A final Dense softmax layer with ``n_bins`` units
    is always appended.
    """
    first_units, first_activation = architecture[0]
    model = Sequential()
    model.add(Dense(first_units, input_dim=input_dim, activation=first_activation))

    for value, kind in architecture[1:]:
        # Compare by value: `is` on a string literal only works by accident
        # of interning and would otherwise turn the dropout rate into a Dense
        # layer's unit count.
        if kind == 'dropout':
            model.add(Dropout(rate=value))
        else:
            # Only the first layer needs an input size; Sequential infers the
            # rest from the preceding layer.
            model.add(Dense(value, activation=kind))

    model.add(Dense(n_bins, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


def fit_keras_models(X, X_test, Y, Y_test, architectures=None):
    """Train one classifier per label group and score it on the test split.

    Parameters
    ----------
    X, X_test : feature tables for training and testing (``X.shape[1]`` is
        used as the network input size).
    Y, Y_test : DataFrames of one-hot label columns. For each group, the
        columns whose name contains the group's category string are selected.
    architectures : optional list of ``(category, layer_spec, num_epochs)``
        entries; defaults to the module-level ``labels_arch``.

    Returns
    -------
    (category_to_performance, category_to_cm) : two dicts keyed by category,
        holding the test accuracy and the confusion matrix returned by
        ``confusion_matrix(true, predicted, labels=range(n_bins))``.
    """
    if architectures is None:
        architectures = labels_arch

    category_to_performance = {}
    category_to_cm = {}
    for category, architecture, num_epochs in architectures:
        print("======= Starting training for " + category + " =======")

        Y_subset = Y[[x for x in Y.columns if category in x]]
        Y_test_subset = Y_test[[x for x in Y_test.columns if category in x]]
        print(len(Y_test_subset.columns), Y_test_subset.columns)
        n_bins = Y_subset.shape[1]

        def build_keras_nn_model():
            # Zero-argument factory, as expected by KerasClassifier's build_fn.
            return _build_mlp(architecture, X.shape[1], n_bins)

        KerasNN = KerasClassifier(
            build_fn=build_keras_nn_model,
            epochs=num_epochs,
            batch_size=256,
            verbose=1
        )
        KerasNN.fit(X, Y_subset)
        y_pred = KerasNN.predict(X_test)

        # Rename the one-hot columns to 0..n-1 so idxmax yields the true class
        # index in the same numbering as the predictions.
        Y_test_subset_numeric_idx = Y_test_subset.copy()
        Y_test_subset_numeric_idx.columns = list(range(len(Y_test_subset_numeric_idx.columns)))
        true_y = np.array(Y_test_subset_numeric_idx.idxmax(axis=1))

        cnf_matrix = confusion_matrix(true_y, y_pred, labels=list(range(n_bins)))
        acc = np.sum(true_y == y_pred) / len(true_y)
        print(cnf_matrix, cnf_matrix.shape, max(y_pred), acc)
        category_to_performance[category] = acc
        category_to_cm[category] = cnf_matrix

    return category_to_performance, category_to_cm

In [74]:
# Train every label group on the training split and evaluate on the held-out
# split; keep the per-group accuracies and confusion matrices.
performance, cms = fit_keras_models(X_train, X_test, Y_train, Y_test)

4 Index(['tone_label_checked', 'tone_label_departing', 'tone_label_level',
       'tone_label_rising'],
      dtype='object')
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[[ 813   12   11    7]
 [  40  534  161  112]
 [  16  100 1800  144]
 [   9   91  174  562]] (4, 4) 3 0.808765808983864
36 Index(['Karlgren_onset_bʱ', 'Karlgren_onset_p', 'Karlgren_onset_pʰ',
       'Karlgren_onset_m', 'Karlgren_onset_dʱ', 'Karlgren_onset_tʰ',
       'Karlgren_onset_d͡zʱ', 'Karlgren_onset_n', 'Karlgren_onset_s',
       'Karlgren_onset_t', 'Karlgren_onset_t͡s', 'Karlgren_onset_t͡sʰ',
       'Karlgren_onset_z', 'Karlgren_onset_ȡʱ', 'Karlgren_onset_d͡ʑʰ',
       'Karlgren_onset_ȵʑ', 'Karlgren_onset_t͡ɕ', 'Karlgren_onset_t͡ɕʰ',
       'Karlgren_onset_ȶ', 'Karlgren_onset_ȶʰ', 'Karlgren_onset_ɕ',
       'Karlgren_onset_ʑ', 'Karlgren_onset_l', 'Karlgren_onset_ɖ͡ʐʰ',
       'Karlgren_onset_ɖ͡ʐʱ', 'Karlgren_onset_ʂ', 'Karlgren_onset_ʈ͡ʂ',
      

In [23]:
import seaborn as sns

In [103]:
THRES = 1

for category, _, _ in labels_arch:
    counts = cms[category]
    # Transpose and divide by the row totals of the original matrix, so each
    # column of the resulting frame is scaled by one original row's total.
    row_totals = counts.astype(float).sum(axis=1)
    class_names = [x.replace(category, '')[1:] for x in labels_cols if category in x]
    cm_norm_df = pd.DataFrame(counts.T / row_totals, index=class_names, columns=class_names)

    # principal submatrix: discard classes whose diagonal entry is NaN
    # (0/0, i.e. an all-zero row in the count matrix) or exceeds THRES.
    dropped = []
    for name in class_names:
        diag = cm_norm_df.loc[name, name]
        if pd.isna(diag) or diag > THRES:
            dropped.append(name)
    cm_norm_df = cm_norm_df.drop(dropped).drop(dropped, axis=1)

    # Collect every non-zero off-diagonal cell as (rate, column label, row label).
    remaining = cm_norm_df.columns
    mis = [
        (cm_norm_df.loc[r, c], c, r)
        for r in remaining
        for c in remaining
        if r != c and cm_norm_df.loc[r, c]
    ]
    top_confusions = sorted(mis, reverse=True)[:10]
    print([(y, z) for x, y, z in top_confusions])

    # Heatmap rendering, currently disabled:
#     ax = plt.figure(figsize=(len(cm_norm_df) / 2, len(cm_norm_df) / 2), dpi=200)
#     plt.title(category)
#     sns.heatmap(cm_norm_df, square=True)
#     plt.yticks(rotation=0)
#     plt.savefig('figs/' + category + '_cm_dpi200.png')
#     plt.show()

[('rising', 'level'), ('departing', 'level'), ('departing', 'rising'), ('rising', 'departing'), ('level', 'rising'), ('level', 'departing'), ('departing', 'checked'), ('checked', 'departing'), ('checked', 'level'), ('rising', 'checked')]
[('d͡ʑʰ', 'ʑ'), ('pʰ', 'p'), ('pʰ', 'bʱ'), ('ʂ', 's'), ('t͡ɕʰ', 't͡ɕ'), ('ȶʰ', 't͡ɕʰ'), ('z', 's'), ('bʱ', 'p'), ('t͡s', 'd͡zʱ'), ('ʈ͡ʂ', 't͡ɕ')]
[('i̯ɑ', 'i̯u'), ('wă', 'uɑ'), ('wə', 'wæ'), ('wə', 'uo'), ('iwei', 'iei'), ('wăi', 'uɑ̆i'), ('ɑi', 'ɑ̆i'), ('i̯ɛi', 'iei'), ('wɐ', 'wɑ'), ('wɐ', 'wæ')]
[('p̚', '∅'), ('k̚', '∅'), ('t̚', '∅'), ('t̚', 'n'), ('t̚', 'k̚'), ('m', 'n'), ('n', '∅'), ('∅', 't̚'), ('∅', 'k̚'), ('m', 'p̚')]


  """
