In [None]:
import numpy as np
from pydub import AudioSegment
import random
import sys
import io
import os
import glob
import IPython
from td_utils import *
%matplotlib inline

In [None]:
import tensorflow
from keras.callbacks import ModelCheckpoint
from keras.models import Model, load_model, Sequential
from keras.layers import Dense, Activation, Dropout, Input, Masking, TimeDistributed, LSTM, Conv1D
from keras.layers import GRU, Bidirectional, BatchNormalization, Reshape
from keras.optimizers import Adam
from keras import backend as K

In [None]:
# Number of spectrogram time steps fed to the model for each example.
Tx = 5511
# Number of frequency bins at each spectrogram time step.
n_Freq = 101
# Number of time steps in the model output. This equals (Tx - 15) // 4 + 1,
# the length produced by the kernel-15, stride-4 Conv1D at the front of the model.
Ty = 1375

In [None]:
def f1(y_true, y_pred):
    """
    F1 score built from Keras backend ops, usable as a Keras metric.

    Counts are taken after clipping to [0, 1] and rounding, so predictions are
    effectively thresholded at 0.5 before being counted.

    Arguments:
    y_true -- tensor of ground-truth labels
    y_pred -- tensor of model outputs, same shape as y_true

    Returns:
    Scalar tensor: 2 * precision * recall / (precision + recall + epsilon)
    """
    eps = K.epsilon()  # guards every division against a zero denominator

    # The true-positive count is shared by precision and recall.
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))

    precision = true_positives / (predicted_positives + eps)
    recall = true_positives / (possible_positives + eps)
    return 2*((precision*recall)/(precision+recall+eps))

## Model

In [None]:
# input = frequency spectrogram, labeled location of audio samples

def model(input_shape):
    """
    Build the (uncompiled) trigger-word detection network.

    Layer order: Conv1D(196, kernel 15, stride 4) -> BatchNorm -> ReLU ->
    Dropout, then two GRU(128) stages each followed by Dropout and BatchNorm,
    one further Dropout, and a time-distributed single sigmoid unit.

    Arguments:
    input_shape -- shape of one input example, forwarded to Input(shape=...)

    Returns:
    A keras Model mapping the input to one sigmoid output per time step of the
    recurrent sequence.
    """
    X_input = Input(shape=input_shape)

    # Convolutional front end: the stride of 4 shortens the time axis.
    # NOTE(review): Keras Dropout takes the fraction to DROP, so 0.8 here
    # discards 80% of activations -- confirm that is intended.
    features = Conv1D(196, kernel_size=15, strides=4)(X_input)
    features = BatchNormalization()(features)
    features = Activation('relu')(features)
    features = Dropout(0.8)(features)

    # Two stacked GRU stages; return_sequences keeps an output at every step.
    for _ in range(2):
        features = GRU(units=128, return_sequences=True)(features)
        features = Dropout(0.8)(features)
        features = BatchNormalization()(features)
    # Extra dropout applied only after the second GRU stage.
    features = Dropout(0.8)(features)

    # Per-time-step sigmoid output.
    outputs = TimeDistributed(Dense(1, activation="sigmoid"))(features)

    return Model(inputs=X_input, outputs=outputs)

In [None]:
# Build the network for inputs of shape (Tx, n_Freq).
# NOTE: this rebinds the name `model` from the builder function to the built
# network, so re-running this cell would call the network rather than the
# builder. Re-run the cell that defines the function first.
model = model(input_shape = (Tx, n_Freq))

In [None]:
# Model.summary() prints the layer table itself and returns None, so wrapping
# it in print() only appended a stray "None" line to the output.
model.summary()

## Training the Model

In [None]:
# Collects one `history.history` dict per training run so that repeated runs of
# the training cell can be plotted together. Re-running this cell resets it.
training_history = []

In [None]:
# Compile with Adam and binary cross-entropy, tracking accuracy.
opt = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, decay=0.01)
model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])

# NOTE(review): both paths are empty strings in this notebook; fill in the
# training inputs (X) and labels (Y) .npy paths before running.
X = np.load("")
Y = np.load("")

# Train, keep this run's metric curves, and save the model to disk.
history = model.fit(X, Y, batch_size=10, epochs=10)
training_history.append(history.history)
model.save("twd_model.h5")

In [None]:
# Concatenate the accuracy curves of every recorded training run, in order.
# Older Keras releases store the metric under 'acc' and newer ones under
# 'accuracy'; accept either (a KeyError is still raised if neither exists).
new_history = [
    value
    for run in training_history
    for value in (run['acc'] if 'acc' in run else run['accuracy'])
]
plt.plot(new_history)
# History.history holds one value per epoch (not per batch), and Keras reports
# accuracy as a fraction in [0, 1] rather than a percentage.
plt.xlabel("Epoch")
plt.ylabel("Accuracy")

## Testing the Model

In [None]:
# Evaluate the trained model on the dev data five times and print the accuracy.
# NOTE(review): the loop variable is unused and both paths are empty strings,
# so every pass would load the same files -- presumably each iteration was
# meant to load a different dev file; fill in the paths and confirm.
for i in range(0,5):
    X_dev = np.load("")
    Y_dev = np.load("")
    # evaluate() returns [loss, accuracy] because the model was compiled with a single metric.
    loss, acc = model.evaluate(X_dev, Y_dev)
    print("Dev set accuracy = ", acc)

## Marking Predictions

In [None]:
def detect_triggerword(filename, threshold=0.5):
    """
    Run the trained model on one audio file and plot where it fires.

    The top subplot is left to graph_spectrogram; the bottom subplot shows the
    thresholded (True/False) per-time-step prediction.

    Arguments:
    filename -- path of the audio file, passed to graph_spectrogram
    threshold -- probability above which a time step counts as a detection
                 (default 0.5, the previously hard-coded value)

    Returns:
    predictions -- the raw array returned by model.predict (not thresholded)
    """
    plt.subplot(2, 1, 1)

    # graph_spectrogram is defined outside this notebook -- presumably it draws
    # into the current subplot and returns a (frequencies, time) array, hence
    # the transpose; confirm against its definition.
    x = graph_spectrogram(filename)
    x = np.transpose(x)
    x = np.expand_dims(x, axis=0)  # add a leading batch axis of size 1

    predictions = model.predict(x)
    probabilities = predictions[0, :, 0]  # first example, single output unit
    print(probabilities)
    new_prediction = probabilities > threshold

    plt.subplot(2, 1, 2)
    plt.plot(new_prediction)
    # The plotted series is the boolean detection, not the raw probability,
    # so label it as such.
    plt.ylabel('detected (p > {})'.format(threshold))
    plt.show()
    return predictions

In [None]:
# NOTE(review): the filename is an empty string in this notebook; set it to the
# audio file to analyse before running.
filename = ''
prediction = detect_triggerword(filename)
# Last expression of the cell: renders an audio player for the same file.
IPython.display.Audio(filename)