# Part 03 - Modeling

### 01 - Import libraries

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import librosa
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Conv2D, MaxPooling2D, Flatten

### 02 - Unpickle the features

In [2]:
# Load the train / test / hold-out frames produced earlier in the pipeline.
# NOTE: unpickling executes arbitrary code, so only load pickles you created
# yourself or otherwise trust.
df_tr, df_ts, df_ho = (
    pd.read_pickle(pkl_path)
    for pkl_path in (
        '../pickles/train.pkl',
        '../pickles/test.pkl',
        '../pickles/hold_out.pkl',
    )
)

### 03 - Transform into tensorflow-friendly format

In [3]:
# Fit the label encoder on the training languages only, then reuse that same
# mapping to turn every split's 'lang' column into integer class ids.
encoder = LabelEncoder().fit(df_tr['lang'])
labels = encoder.classes_
y_tr, y_ts, y_ho = (
    encoder.transform(split_df['lang'])
    for split_df in (df_tr, df_ts, df_ho)
)
print(labels)

['cn' 'tw']


In [11]:
# Stack each split's per-clip feature matrices into a single array, then add a
# trailing channel axis so every split has the (n_clips, 128, 256, 1) layout
# that the Conv2D input layer declares via input_shape=(128, 256, 1).
X_tr, X_ts, X_ho = (
    np.array(split_df['features'].tolist())
    for split_df in (df_tr, df_ts, df_ho)
)
X_tr = X_tr.reshape((len(X_tr), 128, 256, 1))
X_ts = X_ts.reshape((len(X_ts), 128, 256, 1))
X_ho = X_ho.reshape((len(X_ho), 128, 256, 1))

In [12]:
# Check the shape of every feature tensor and its matching label vector
# (train, test, hold-out — in that order).
for split_array in (X_tr, y_tr, X_ts, y_ts, X_ho, y_ho):
    print(split_array.shape)

(5924, 128, 256, 1)
(5924,)
(740, 128, 256, 1)
(740,)
(740, 128, 256, 1)
(740,)


### 04 - CNN Model

In [121]:
# Three Conv2D -> MaxPooling2D -> Dropout stages with doubling filter counts
# (32, 64, 128), followed by a dense head ending in a 2-way softmax: one output
# unit per language label.
model = Sequential([
    Conv2D(32, (5, 5), activation='relu', input_shape=(128, 256, 1), padding='same'),
    MaxPooling2D((3, 3)),
    Dropout(0.25),
    Conv2D(64, (5, 5), activation='relu', padding='same'),
    MaxPooling2D((3, 3)),
    Dropout(0.25),
    Conv2D(128, (5, 5), activation='relu', padding='same'),
    MaxPooling2D((3, 3)),
    Dropout(0.25),
    Flatten(),
    Dense(256, activation='relu'),
    Dense(2, activation='softmax'),
])

In [122]:
# Print the layer-by-layer output shapes and parameter counts of the CNN.
model.summary()

Model: "sequential_27"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_88 (Conv2D)           (None, 128, 256, 32)      832       
_________________________________________________________________
max_pooling2d_88 (MaxPooling (None, 42, 85, 32)        0         
_________________________________________________________________
dropout_86 (Dropout)         (None, 42, 85, 32)        0         
_________________________________________________________________
conv2d_89 (Conv2D)           (None, 42, 85, 64)        51264     
_________________________________________________________________
max_pooling2d_89 (MaxPooling (None, 14, 28, 64)        0         
_________________________________________________________________
dropout_87 (Dropout)         (None, 14, 28, 64)        0         
_________________________________________________________________
conv2d_90 (Conv2D)           (None, 14, 28, 128)     

In [117]:
# The targets are integer class ids produced by LabelEncoder (not one-hot
# vectors), so the sparse variant of categorical cross-entropy is used. The
# loss is selected by its string name; the network's final layer already
# applies softmax, which matches that loss's default of expecting
# probabilities rather than logits.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [118]:
# Upper bound on training epochs, and where the checkpointed weights go.
EPOCHS = 100
checkpoint_filepath = '../temp_checkpoint/'

# Write weights only (not the full model), and only when the validation
# accuracy reaches a new maximum, so the checkpoint always holds the best epoch.
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    checkpoint_filepath,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    save_weights_only=True,
)

In [119]:
# Train on the training split; the test split serves as validation data and
# therefore drives the checkpoint callback's val_accuracy monitoring.
history = model.fit(
    X_tr,
    y_tr,
    validation_data=(X_ts, y_ts),
    epochs=EPOCHS,
    callbacks=[model_checkpoint_callback],
)

Train on 5924 samples, validate on 740 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100


In [120]:
# Restore the weights written by the checkpoint callback (the epoch with the
# highest validation accuracy), then score the model on the hold-out split.
checkpoint_filepath = '../temp_checkpoint/'
model.load_weights(checkpoint_filepath)
# NOTE: despite the `val_` prefix, these are hold-out metrics (X_ho / y_ho).
val_loss, val_acc = model.evaluate(X_ho, y_ho, verbose=1)



In [91]:
# Export step, left disabled (uncomment to write the model to ../saved_model/20200604/):
# model.save('../saved_model/20200604/')

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: ../saved_model/20200604/assets


### 05 - Predict with personal recording

In [4]:
# Load the model previously exported to ../saved_model/20200604.
new_model = tf.keras.models.load_model('../saved_model/20200604')

In [27]:
# Print the loaded model's layer-by-layer output shapes and parameter counts.
new_model.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_19 (Conv2D)           (None, 128, 256, 32)      832       
_________________________________________________________________
max_pooling2d_19 (MaxPooling (None, 42, 85, 32)        0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 42, 85, 32)        0         
_________________________________________________________________
conv2d_20 (Conv2D)           (None, 42, 85, 64)        51264     
_________________________________________________________________
max_pooling2d_20 (MaxPooling (None, 14, 28, 64)        0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 14, 28, 64)        0         
_________________________________________________________________
conv2d_21 (Conv2D)           (None, 14, 28, 128)      

In [5]:
# same function from preprocessing notebook
def wav_to_img(path):
    """Convert an audio file into a fixed-width log-mel spectrogram.

    Loads at most the first 2.97 seconds of ``path`` with librosa, computes a
    128-band mel power spectrogram (512-point FFT, 50% overlap between
    consecutive frames, frequencies capped at 10 kHz) and converts it to
    decibels relative to the spectrogram's own maximum.

    Parameters
    ----------
    path : str or path-like
        Audio file readable by ``librosa.load``.

    Returns
    -------
    np.ndarray
        The dB-scaled mel spectrogram. When it has at most 256 frames it is
        centre-padded along the time axis (axis 1) to exactly 256 frames.
        A spectrogram with more than 256 frames cannot be centre-padded and
        is returned unchanged, so callers that need a fixed width must check
        the shape themselves.

    Raises
    ------
    Exception
        Any error raised by librosa while loading or transforming the audio
        propagates to the caller (nothing is silently swallowed).
    """
    n_frames = 256  # target width of the time axis
    audio, sr = librosa.load(path, duration=2.97)

    # parameters for calculating spectrogram in mel scale
    fmax = 10000  # maximum frequency considered
    fft_window_points = 512
    hop_size = fft_window_points // 2  # 50% overlap between consecutive frames
    n_mels = 128

    # Keyword arguments (y=..., ref=...) are accepted by both older and newer
    # librosa releases; newer ones reject the positional forms.
    spec = librosa.feature.melspectrogram(
        y=audio,
        sr=sr,
        n_mels=n_mels,
        n_fft=fft_window_points,
        hop_length=hop_size,
        fmax=fmax,
    )
    spec_gram = librosa.power_to_db(spec, ref=np.max)

    # pad_center cannot shrink its input, so hand over-long spectrograms back
    # untouched. This explicit check replaces a bare `except:` that also hid
    # unrelated failures.
    if spec_gram.shape[1] > n_frames:
        return spec_gram
    return librosa.util.pad_center(spec_gram, size=n_frames, axis=1)

Test case 1 (Non-native Speaker, Male Recording)

In [9]:
# separate the test cases
# NOTE(review): os.listdir() returns entries in arbitrary order, so these
# positional indices only select the intended recordings if the listing order
# matches the one they were chosen against — confirm before relying on them.
file_names1 = os.listdir('../test_audio/')
index_nums1 = [3, 4, 5, 6, 8]
output1 = [file_names1[idx] for idx in index_nums1]

# One spectrogram per selected file, stacked and given a trailing channel axis
# so the batch matches the network's (128, 256, 1) input shape.
test_cases1 = [
    wav_to_img(os.path.join('../test_audio/', audio_name))
    for audio_name in output1
]
test_arr1 = np.array(test_cases1)
test_arr1 = test_arr1.reshape((len(test_cases1), 128, 256, 1))

In [19]:
# Score the recordings with the model loaded from disk in this section
# (`new_model`), so the cell does not depend on a `model` object left over
# from the training cells.
prediction1 = new_model.predict(test_arr1)
# Most probable class index per clip, mapped back to its language label.
pred_val = np.argmax(prediction1, axis=1)
encoder.inverse_transform(pred_val)

array(['cn', 'cn', 'tw', 'cn', 'cn'], dtype=object)

Test case 2 (Taiwanese Female Recording)

In [7]:
# separate the test cases
# NOTE(review): os.listdir() returns entries in arbitrary order, so these
# positional indices only select the intended recordings if the listing order
# matches the one they were chosen against — confirm before relying on them.
file_names2 = os.listdir('../test_audio/')
index_nums2 = [0, 1, 2, 7, 9]
output2 = [file_names2[idx] for idx in index_nums2]

# One spectrogram per selected file, stacked and given a trailing channel axis
# so the batch matches the network's (128, 256, 1) input shape.
test_cases2 = [
    wav_to_img(os.path.join('../test_audio/', audio_name))
    for audio_name in output2
]
test_arr2 = np.array(test_cases2)
test_arr2 = test_arr2.reshape((len(test_cases2), 128, 256, 1))

In [20]:
# Score the recordings with the model loaded from disk in this section
# (`new_model`), so the cell does not depend on a `model` object left over
# from the training cells.
prediction2 = new_model.predict(test_arr2)
# Most probable class index per clip, mapped back to its language label.
pred_val = np.argmax(prediction2, axis=1)
encoder.inverse_transform(pred_val)

array(['cn', 'tw', 'cn', 'cn', 'tw'], dtype=object)