In [1]:
import os
import librosa   #for audio processing
import matplotlib.pyplot as plt
import numpy as np
from scipy.io import wavfile #for audio processing
from matplotlib import pyplot
import warnings
warnings.filterwarnings("ignore")

Import requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.[0m
  from numba.decorators import jit as optional_jit
Import of 'jit' requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.[0m
  from numba.decorators import jit as optional_jit


In [None]:
# Clone the repository that contains the audio dataset used below.
!git clone https://github.com/ym769/AudioProcessing.git

In [None]:
# Plot one "bird" recording in the time domain.
train_audio_path = 'AudioProcessing/train/audio/'
bird_wav = 'bird/01bcfc0c_nohash_0.wav'
samples, sample_rate = librosa.load(train_audio_path + bird_wav, sr=16000)  # sr = sample rate to load at (Hz)

# One x value per sample, expressed in seconds (sample index / sample rate).
# Building the axis from len(samples) keeps x and y the same length even when
# the clip is not exactly one second long.
time_axis = np.arange(len(samples)) / sample_rate

fig = plt.figure(figsize=(14, 8))
ax1 = fig.add_subplot(111)
ax1.set_title('Raw wave of ' + 'train/audio/' + bird_wav, color="white")
ax1.set_xlabel('time', color="white")
ax1.set_ylabel('Amplitude', color="white")
ax1.plot(time_axis, samples)

In [None]:
# Plot one "cat" recording in the time domain.
train_audio_path = 'AudioProcessing/train/audio/'
cat_wav = 'cat/004ae714_nohash_0.wav'
samples, sample_rate = librosa.load(train_audio_path + cat_wav, sr=16000)  # sr = sample rate to load at (Hz)

# One x value per sample, expressed in seconds (sample index / sample rate).
# Building the axis from len(samples) keeps x and y the same length even when
# the clip is not exactly one second long.
time_axis = np.arange(len(samples)) / sample_rate

fig = plt.figure(figsize=(14, 8))
ax1 = fig.add_subplot(111)
# The title is derived from the path that was actually loaded so the two cannot drift apart.
ax1.set_title('Raw wave of ' + 'train/audio/' + cat_wav, color="white")
ax1.set_xlabel('time', color="white")
ax1.set_ylabel('Amplitude', color="white")
ax1.plot(time_axis, samples)

In [None]:
# Inspect the samples of the "cat" clip (004ae714_nohash_0.wav) loaded above;
# they were read as an array of 16000 numbers.
print(samples)
len(samples)

In [None]:
# Downsample the clip currently held in `samples` from its loaded rate to 8 kHz.
# orig_sr / target_sr are passed by keyword: newer librosa releases no longer
# accept them positionally, and older releases accept the keywords as well.
# The result goes into its own variable so that re-running this cell always
# starts from the original clip instead of resampling already-resampled data.
samples_8k = librosa.resample(samples, orig_sr=sample_rate, target_sr=8000)
print(samples_8k)
len(samples_8k)

In [None]:
# Collect the samples of every clip under train/audio for the selected labels.
# Every file is decoded and resampled, so this cell takes a while to run.
train_audio_path = 'AudioProcessing/train/audio/'
labels = ["bird","cat","happy"]

all_wave = []   # one 1-D sample array per kept clip
all_label = []  # label of the clip at the same position in all_wave
for label in labels:
    label_dir = os.path.join(train_audio_path, label)
    # os.listdir returns entries in arbitrary order; sorting makes the sample
    # order (and any later positional indexing) the same on every run.
    waves = sorted(f for f in os.listdir(label_dir) if f.endswith('.wav'))
    for wav in waves:
        samples, sample_rate = librosa.load(os.path.join(label_dir, wav), sr=8000)
        # Keep only clips of exactly 8000 samples (1 s at 8 kHz); the later
        # cells reshape the data to a fixed length of 8000.
        if len(samples) == 8000:
            all_wave.append(samples)
            all_label.append(label)

In [None]:
# all_label holds the ground-truth label of each clip; all_wave holds the
# matching sample arrays. Show the first five of each.
for preview in (all_label[:5], all_wave[:5]):
    print(preview)

In [None]:
# Encode the string labels as integers. `classes` keeps the encoder's class
# names so that integer i can be mapped back to classes[i] later on.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(all_label)
classes = [class_name for class_name in le.classes_]
y

In [None]:
# Convert the integer labels to one-hot vectors, since this is a multi-class problem.
# to_categorical is imported directly from keras.utils: the np_utils submodule
# is not available in current Keras releases, while this import works in old and new ones.
from keras.utils import to_categorical

# Encode from all_label via the fitted encoder rather than from `y` itself, so
# re-running this cell does not one-hot encode an already one-hot matrix.
y = to_categorical(le.transform(all_label), num_classes=len(labels))

In [None]:
# Show the one-hot label of the first clip (index 0) and of the 301st clip (index 300).
for sample_idx in (0, 300):
    print(y[sample_idx])

In [None]:
# Conv1D requires 3-D input, so reshape the 2-D array of clips to 3-D by
# adding a trailing axis of size 1: (n, 8000) -> (n, 8000, 1).
all_wave = np.reshape(np.array(all_wave), (-1, 8000, 1))

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the clips for validation, stratified on the labels.
# random_state pins the shuffle so the same split (and therefore comparable
# training curves) is produced on every run.
x_tr, x_val, y_tr, y_val = train_test_split(
    np.asarray(all_wave),
    np.asarray(y),
    stratify=y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [None]:
# Peek at the first three training labels and the matching training inputs.
head = slice(3)
print(y_tr[head])
x_tr[head]

In [None]:
# Peek at the first three validation labels and the matching validation inputs.
head = slice(3)
print(y_val[head])
x_val[head]

In [None]:
from keras.layers import Dense, Dropout, Flatten, Conv1D, Input, MaxPooling1D
from keras.models import Model
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import backend as K

# Reset Keras' global state so that re-running this cell builds a fresh model.
K.clear_session()

# Each clip is fed in as 8000 time steps with a single channel.
inputs = Input(shape=(8000,1))

# Two convolutional stages, each Conv1D -> MaxPooling1D(3) -> Dropout(0.3):
#   stage 1:  8 filters, kernel size 13
#   stage 2: 16 filters, kernel size 11
conv = inputs
for n_filters, kernel_size in ((8, 13), (16, 11)):
    conv = Conv1D(n_filters, kernel_size, padding='valid', activation='relu', strides=1)(conv)
    conv = MaxPooling1D(3)(conv)
    conv = Dropout(0.3)(conv)

# Flatten the feature maps, then a fully connected layer with dropout.
conv = Flatten()(conv)
conv = Dense(256, activation='relu')(conv)
conv = Dropout(0.3)(conv)

# One softmax output per label.
outputs = Dense(len(labels), activation='softmax')(conv)

model = Model(inputs, outputs)
model.summary()

In [None]:
# Define the loss function, the optimizer and the metrics used for training.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Stop training once val_loss has not improved by at least 1e-4 for 10 epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10, min_delta=0.0001)

# Save the model to best_model.hdf5 whenever validation accuracy reaches a new best.
# The model is compiled with metrics=['accuracy'], which Keras >= 2.3 / tf.keras 2.x
# log under the exact name given, i.e. 'val_accuracy'. Monitoring 'val_acc' there
# only produces a warning and no checkpoint is ever written.
mc = ModelCheckpoint('best_model.hdf5', monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')

In [None]:
# Train on the training split, evaluating on the validation split after each epoch.
# NOTE(review): `es` is created with patience=10, so with only 10 epochs early
# stopping can effectively never trigger — consider raising epochs or lowering patience.
history = model.fit(
    x_tr,
    y_tr,
    validation_data=(x_val, y_val),
    epochs=10,
    batch_size=32,
    callbacks=[es, mc],
)

In [None]:
# Plot training loss against validation loss, one point per epoch.
for history_key, legend_name in (('loss', 'train'), ('val_loss', 'valid')):
    pyplot.plot(history.history[history_key], label=legend_name)
pyplot.legend()
pyplot.show()

In [None]:
def predict(audio):
    """Return the predicted class name for a single clip.

    Parameters
    ----------
    audio : array-like with a ``reshape`` method
        Exactly 8000 values in total; it is reshaped to a batch of one
        clip with shape ``(1, 8000, 1)`` before being passed to the model.

    Returns
    -------
    The entry of the module-level ``classes`` list at the position of the
    highest score that the module-level ``model`` assigns to the clip.
    """
    batch = audio.reshape(1, 8000, 1)
    scores = model.predict(batch)[0]
    best = np.argmax(scores)
    return classes[best]

In [None]:
import random

# Pick one validation clip at random and compare its true label ("Audio")
# with the label predicted by the model ("Text").
index = random.randrange(len(x_val))
samples = x_val[index].ravel()  # flatten the clip back to a 1-D array
true_label = classes[np.argmax(y_val[index])]
print("Audio:", true_label)
print("Text:", predict(samples))