# Import libraries

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import os
from scipy.io import wavfile
from collections import defaultdict, Counter
from scipy import signal
import numpy as np
import librosa
import sklearn
import random
from unicodedata import normalize
from keras.layers import Dense
from keras import Model
from keras import Input
from keras.utils import to_categorical
from keras.regularizers import l2
from keras.layers import Dense, TimeDistributed, Dropout, Bidirectional, GRU, BatchNormalization, Activation, LeakyReLU, LSTM, Flatten, RepeatVector, Permute, Multiply, Conv2D, MaxPooling2D

In [None]:
# Attach Google Drive to the Colab runtime; the dataset and model paths used
# in this notebook live underneath this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


# Load Data

In [None]:
# Google Drive folders that the loading cells scan for .wav files.
DATASET_ROOT = '/content/drive/My Drive/leanai/ML/STT/dataset'
DATA_TRAIN = DATASET_ROOT + '/train'
DATA_TEST = DATASET_ROOT + '/test'

# Preprocessing

In [None]:
# Initialise the data-set containers and the helpers used while extracting
# MFCC (Mel-Frequency Cepstral Coefficients) features.
# pad1d / pad2d truncate or zero-pad an array so that every sample ends up
# with the same size.

# Filled by the loading cells with (padded_mfcc, label) tuples.
trainset, testset = [], []

# Per-split placeholders (each name gets its own, independent list).
train_X, train_mfccs, train_y = [], [], []
test_X, test_mfccs, test_y = [], [], []


def pad1d(a, i):
  """Return `a` cut to its first `i` items, or right-padded with zeros to length `i`."""
  if a.shape[0] > i:
    return a[0: i]
  return np.hstack((a, np.zeros(i - a.shape[0])))


def pad2d(a, i):
  """Return `a` cut to its first `i` columns, or right-padded with zero columns to width `i`."""
  if a.shape[1] > i:
    return a[:, 0:i]
  missing_cols = i - a.shape[1]
  return np.hstack((a, np.zeros((a.shape[0], missing_cols))))


# NOTE(review): presumably framing parameters in seconds. They are not read by
# any cell visible here, and 0.0010 does not match the hop_length=160 used at
# 16 kHz (which would be 0.010) -- confirm before relying on them.
frame_length = 0.025
frame_stride = 0.0010

In [None]:
# Load each training WAV file, extract MFCC features with librosa, and label
# the sample from the first character of its file name. The filler sounds
# '어', '음' and '그' map to classes 0, 1 and 2. The collected samples are
# shuffled afterwards so their order does not follow the directory listing.
from sklearn import preprocessing

# First character of the NFC-normalised file name -> class index.
label_by_prefix = {'어': 0, '음': 1, '그': 2}

# Start from an empty list so re-running this cell does not duplicate samples.
trainset.clear()

for raw_name in os.listdir(DATA_TRAIN):
  # Compare against the composed Hangul literals using the NFC form, but keep
  # raw_name for opening the file: the normalised spelling is not guaranteed
  # to be the one that exists on disk.
  filename = normalize('NFC', raw_name)
  try:
    # Match the extension itself, not any '.wav' substring in the name.
    if not filename.endswith('.wav'):
      continue

    label = label_by_prefix.get(filename[0])
    if label is None:
      # Not one of the three known classes; skip before decoding the audio.
      continue

    wav, sr = librosa.load(os.path.join(DATA_TRAIN, raw_name), sr=16000)

    # n_fft=400 / hop_length=160 are 25 ms windows with a 10 ms hop at 16 kHz.
    mfcc = librosa.feature.mfcc(y=wav, sr=sr, n_mfcc=100, n_fft=400, hop_length=160)
    # Standardise each coefficient row along the time axis.
    mfcc = preprocessing.scale(mfcc, axis=1)
    # Truncate or zero-pad the time axis to exactly 40 frames.
    padded_mfcc = pad2d(mfcc, 40)

    trainset.append((padded_mfcc, label))
  except Exception as e:
    # Report which file failed, then stop: a broken sample should not be
    # silently dropped from the training data.
    print(filename, e)
    raise

random.shuffle(trainset)

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m


In [None]:
# Load each test WAV file with the same MFCC pipeline and labelling rule as
# the training data: the first character of the file name ('어', '음', '그')
# selects class 0, 1 or 2.
# Imported here as well so this cell does not depend on an earlier cell
# having loaded the sklearn.preprocessing submodule.
from sklearn import preprocessing

# First character of the NFC-normalised file name -> class index.
label_by_prefix = {'어': 0, '음': 1, '그': 2}

# Start from an empty list so re-running this cell does not duplicate samples.
testset.clear()

for raw_name in os.listdir(DATA_TEST):
  # Compare against the composed Hangul literals using the NFC form, but keep
  # raw_name for opening the file: the normalised spelling is not guaranteed
  # to be the one that exists on disk.
  filename = normalize('NFC', raw_name)
  try:
    # Match the extension itself, not any '.wav' substring in the name.
    if not filename.endswith('.wav'):
      continue

    label = label_by_prefix.get(filename[0])
    if label is None:
      # Not one of the three known classes; skip before decoding the audio.
      continue

    wav, sr = librosa.load(os.path.join(DATA_TEST, raw_name), sr=16000)

    # n_fft=400 / hop_length=160 are 25 ms windows with a 10 ms hop at 16 kHz.
    mfcc = librosa.feature.mfcc(y=wav, sr=sr, n_mfcc=100, n_fft=400, hop_length=160)
    # Standardise each coefficient row along the time axis.
    mfcc = preprocessing.scale(mfcc, axis=1)
    # Truncate or zero-pad the time axis to exactly 40 frames.
    padded_mfcc = pad2d(mfcc, 40)

    testset.append((padded_mfcc, label))
  except Exception as e:
    # Report which file failed, then stop instead of silently skipping it.
    print(filename, e)
    raise

random.shuffle(testset)



In [None]:
# Turn the lists of (mfcc, label) tuples into dense arrays: one stacked
# feature array and one one-hot label matrix per split.

# The loading cells assign the labels 0, 1 and 2. Passing the class count
# explicitly keeps both label matrices 3 columns wide even when a split
# happens to contain no sample of the highest class (to_categorical would
# otherwise infer the width from the largest label present).
NUM_CLASSES = 3

train_mfccs = np.array([features for (features, _label) in trainset])
train_y = to_categorical(
    np.array([label for (_features, label) in trainset]),
    num_classes=NUM_CLASSES,
)

test_mfccs = np.array([features for (features, _label) in testset])
test_y = to_categorical(
    np.array([label for (_features, label) in testset]),
    num_classes=NUM_CLASSES,
)

print('train_mfccs:', train_mfccs.shape)
print('train_y:', train_y.shape)

print('test_mfccs:', test_mfccs.shape)
print('test_y:', test_y.shape)


train_mfccs: (2033, 100, 40)
train_y: (2033, 3)
test_mfccs: (150, 100, 40)
test_y: (150, 3)


# Training

In [None]:
# Append a trailing axis of size 1 to each feature array so every sample has
# an explicit channel dimension for the Conv2D layers.
train_X_ex = train_mfccs[..., np.newaxis]
test_X_ex = test_mfccs[..., np.newaxis]

print('train X shape:', train_X_ex.shape)
print('test X shape:', test_X_ex.shape)

train X shape: (2033, 100, 40, 1)
test X shape: (150, 100, 40, 1)


In [None]:
# CNN classifier over the MFCC feature map (one sample of train_X_ex).
ip = Input(shape=train_X_ex[0].shape)

# Three convolution/pooling stages. Each stage must consume the previous
# stage's output `m`; passing `ip` to every Conv2D rebinds `m` and throws the
# earlier stages away, leaving a single-convolution network.
#
# Pooling is 2x2 rather than 4x4: with unpadded 4x4 kernels, three chained
# 4x4 pools would shrink a (100, 40, 1) input below the kernel size.
# For a (100, 40, 1) input the feature map goes
#   97x37 -> 48x18 -> 45x15 -> 22x7 -> 19x4 -> 9x2.
m = Conv2D(32, kernel_size=(4,4), activation='relu')(ip)
m = MaxPooling2D(pool_size=(2,2))(m)

m = Conv2D(32*2, kernel_size=(4,4), activation='relu')(m)
m = MaxPooling2D(pool_size=(2,2))(m)

m = Conv2D(32*3, kernel_size=(4,4), activation='relu')(m)
m = MaxPooling2D(pool_size=(2,2))(m)

# Collapse the feature map into one vector per sample for the dense head.
m = Flatten()(m)

# Two L2-regularised hidden layers.
m = Dense(64, activation='relu', kernel_regularizer=l2(0.001))(m)

m = Dense(32, activation='relu', kernel_regularizer=l2(0.001))(m)

# One softmax output per class (labels 0, 1, 2).
op = Dense(3, activation='softmax')(m)

model = Model(ip, op)

model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 100, 40, 1)]      0         
                                                                 
 conv2d_5 (Conv2D)           (None, 97, 37, 96)        1632      
                                                                 
 max_pooling2d_5 (MaxPoolin  (None, 24, 9, 96)         0         
 g2D)                                                            
                                                                 
 flatten_1 (Flatten)         (None, 20736)             0         
                                                                 
 dense_3 (Dense)             (None, 64)                1327168   
                                                                 
 dense_4 (Dense)             (None, 32)                2080      
                                                           

In [None]:
# Configure loss, optimiser and reported metric, then train on the training
# arrays. `history` keeps the object returned by fit().
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

fit_options = dict(epochs=50, batch_size=128, verbose=2)
history = model.fit(train_X_ex, train_y, **fit_options)


Epoch 1/50
16/16 - 13s - loss: 1.6606 - accuracy: 0.3473 - 13s/epoch - 811ms/step
Epoch 2/50
16/16 - 10s - loss: 1.2325 - accuracy: 0.3974 - 10s/epoch - 629ms/step
Epoch 3/50
16/16 - 11s - loss: 1.1700 - accuracy: 0.4840 - 11s/epoch - 697ms/step
Epoch 4/50
16/16 - 12s - loss: 1.1009 - accuracy: 0.5155 - 12s/epoch - 773ms/step
Epoch 5/50
16/16 - 12s - loss: 1.0280 - accuracy: 0.5691 - 12s/epoch - 726ms/step
Epoch 6/50
16/16 - 10s - loss: 0.9699 - accuracy: 0.6026 - 10s/epoch - 651ms/step
Epoch 7/50
16/16 - 13s - loss: 0.9089 - accuracy: 0.6390 - 13s/epoch - 782ms/step
Epoch 8/50
16/16 - 14s - loss: 0.8640 - accuracy: 0.6695 - 14s/epoch - 851ms/step
Epoch 9/50
16/16 - 13s - loss: 0.7901 - accuracy: 0.7054 - 13s/epoch - 803ms/step
Epoch 10/50
16/16 - 11s - loss: 0.7324 - accuracy: 0.7516 - 11s/epoch - 681ms/step
Epoch 11/50
16/16 - 13s - loss: 0.7671 - accuracy: 0.7083 - 13s/epoch - 823ms/step
Epoch 12/50
16/16 - 13s - loss: 0.6676 - accuracy: 0.7609 - 13s/epoch - 826ms/step
Epoch 13/50
1

# Save model

In [None]:
# Write the trained model to Google Drive.
model_path = '/content/drive/My Drive/leanai/ML/STT/model/multi_model.h5'
# Create the target folder first so saving does not fail when it is missing.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
model.save(model_path)