Syncing from Google Drive and GitHub. For more information on this workflow, refer to [this guide](https://zerowithdot.com/colab-github-workflow/).

In [18]:
from google.colab import drive
from os.path import join

ROOT = '/content/drive'     # mount point handed to drive.mount below
drive.mount(ROOT)           # mount Google Drive at ROOT (/content/drive)

# Clone the project repository via a shell escape; IPython substitutes the
# Python value of GIT_PATH for {GIT_PATH} before the command runs.
GIT_PATH = "https://github.com/ybchen97/filler_detection.git"
!git clone "{GIT_PATH}"

Install the packages listed in requirements.txt into this notebook's environment.

In [19]:
# Install the dependencies listed in the repository's requirements file.
# NOTE(review): the path is absolute and Colab-specific (/content/...), presumably the
# directory produced by the `git clone` cell — confirm before running on another machine.
!pip install -r '/content/filler_detection/requirements.txt'

Import libraries and set up configuration variables.

In [28]:
import numpy as np
from pydub import AudioSegment
from pydub.playback import play
import random
import sys
import io
import os
import glob
import IPython
import wave
import pylab
from tf_utils import *
import matplotlib.pyplot as plt
from scipy import signal
from scipy.io import wavfile

# Import files for trigger-word detection model
from keras.callbacks import ModelCheckpoint
from keras.models import Model, load_model, Sequential
from keras.layers import Dense, Activation, Dropout, Input, Masking, TimeDistributed, LSTM, Conv1D
from keras.layers import GRU, Bidirectional, BatchNormalization, Reshape
from keras.optimizers import Adam

In [29]:
# Candidate repository roots. Both end with "/" because other cells build paths by
# plain string concatenation (REPO_DIRECTORY + <relative path>).
LOCAL_MACHINE_DIRECTORY = "./"
# Folder name matches the repository cloned from .../filler_detection.git
# (underscore, not hyphen), which is also the name used by the pip install path.
COLAB_DIRECTORY = "./filler_detection/"
REPO_DIRECTORY = LOCAL_MACHINE_DIRECTORY # set this!

In [39]:
# Locations of data and example files.
# CHIME_FILE and AUDIO_IGNORED_EXAMPLES_DIRECTORY are appended directly to
# REPO_DIRECTORY elsewhere in this notebook, so directory values keep their trailing "/".
# NOTE(review): the other names are not referenced in the cells visible here —
# presumably they are also relative to REPO_DIRECTORY; confirm against their users.
POSITIVE_DIRECTORY = "raw_data/positive_data/"
BACKGROUND_DIRECTORY = "raw_data/background_data/"
NEGATIVES_DIRECTORY = "raw_data/google_dataset/"
NEGATIVES_TRUNCATED_DIRECTORY = "raw_data/google_dataset_truncated/"
AUDIO_IGNORED_EXAMPLES_DIRECTORY = "audio_ignored_examples/"
# Individual file names / paths.
POSITIVE_EXAMPLE = "jh_1.wav"
AUDIO_EXAMPLE = "example_train.wav"
CHIME_FILE = "audio_examples/chime.wav"  # sound overlaid on the clip at each detection

## Load Pre-Trained Model

In [73]:
# Load the saved Keras model from the repository root and print its layer summary.
model = load_model("{}/trained_model.h5".format(REPO_DIRECTORY))
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 5490, 129)         0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 1369, 196)         379456    
_________________________________________________________________
batch_normalization_1 (Batch (None, 1369, 196)         784       
_________________________________________________________________
activation_1 (Activation)    (None, 1369, 196)         0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 1369, 196)         0         
_________________________________________________________________
gru_1 (GRU)                  (None, 1369, 128)         124800    
_________________________________________________________________
dropout_2 (Dropout)          (None, 1369, 128)         0   

## Preprocessing audio into 10-second clips

In [34]:
# Preprocess the audio to the correct format
def preprocess_audio(filename, duration_ms=10000, frame_rate=123000, output_filename=None):
    """
    Normalise a WAV file to a fixed-length, mono clip at a fixed frame rate.

    The clip is cut to at most `duration_ms`, laid over a silent segment of
    exactly `duration_ms` (so shorter clips are padded with silence), converted
    to one channel, resampled to `frame_rate` and written back as WAV.

    Arguments:
    filename -- path of the WAV file to read
    duration_ms -- target clip length in milliseconds (default 10000)
    frame_rate -- target frame rate in Hz (default 123000); with the defaults,
                  the notebook's recorded run shows a (129, 5490) spectrogram,
                  which matches the loaded model's (5490, 129) input
    output_filename -- where to write the result; defaults to `filename`,
                       i.e. the input file is overwritten in place

    Returns:
    None
    """
    print("PREPROCESSING...")
    if output_filename is None:
        # Historical behaviour: replace the input file with its processed version.
        output_filename = filename

    # Trim or pad audio segment to duration_ms
    padding = AudioSegment.silent(duration=duration_ms)
    segment = AudioSegment.from_wav(filename)[:duration_ms]
    segment = padding.overlay(segment)
    # Down-mix to a single channel
    segment = segment.set_channels(1)
    # Resample to the target frame rate
    segment = segment.set_frame_rate(frame_rate)
    # Export as wav
    segment.export(output_filename, format='wav')

## Filler word prediction

In [35]:
def detect_triggerword(filename):
    """
    Compute the spectrogram of a WAV file and feed it to the loaded model.

    Argument:
    filename -- path of the WAV file to score

    Returns:
    predictions -- the value returned by model.predict for the one-item batch
    """
    rate, audio = wavfile.read(filename)
    spec = signal.spectrogram(audio, rate)[2]
    print(spec.shape)

    # Swap the first two axes ((freqs, Tx) -> (Tx, freqs)), then prepend a
    # batch axis of length 1 before handing the array to the model.
    batch = np.swapaxes(spec, 0, 1)[np.newaxis, ...]
    return model.predict(batch)

In [66]:
def count_filler_word(filename, threshold, min_gap_steps=75):
    """
    Count filler-word detections in an audio file and mark each one with a chime.

    The file is first normalised by preprocess_audio (which rewrites `filename`
    in place) and scored by detect_triggerword. Every output step whose
    probability exceeds `threshold` counts as a detection, provided more than
    `min_gap_steps` steps have passed since the previous detection (or since
    the start). A chime is overlaid at the matching position and the result is
    exported as "chime_output.wav" in AUDIO_IGNORED_EXAMPLES_DIRECTORY.

    Arguments:
    filename -- Audio file to run prediction on
    threshold -- Probability above which trigger word considered present
    min_gap_steps -- Minimum number of output steps between two detections
                     (default 75)

    Returns:
    detections -- number of detections (chimes inserted)
    """

    preprocess_audio(filename)
    audio_clip = AudioSegment.from_wav(filename)
    # os.path.join copes with REPO_DIRECTORY given with or without a trailing "/".
    chime = AudioSegment.from_wav(os.path.join(REPO_DIRECTORY, CHIME_FILE))
    predictions = detect_triggerword(filename)
    Ty = predictions.shape[1]

    detections = 0
    # Step 1: Initialize the number of consecutive output steps to 0
    consecutive_timesteps = 0
    # Step 2: Loop over the output steps in the y
    for i in range(Ty):
        # Step 3: Increment consecutive output steps
        consecutive_timesteps += 1
        # Step 4: If prediction is higher than the threshold and more than
        # min_gap_steps consecutive output steps have passed
        if predictions[0, i, 0] > threshold and consecutive_timesteps > min_gap_steps:
            # Step 5: Superpose the chime at the time matching output step i
            position_ms = ((i / Ty) * audio_clip.duration_seconds) * 1000
            audio_clip = audio_clip.overlay(chime, position=position_ms)
            # Step 6: Reset consecutive output steps to 0
            consecutive_timesteps = 0
            detections += 1

    output_path = os.path.join(REPO_DIRECTORY, AUDIO_IGNORED_EXAMPLES_DIRECTORY, "chime_output.wav")
    audio_clip.export(output_path, format='wav')
    return detections

In [69]:
# Run the detector on the sample clip; the second argument is the probability
# threshold used to decide whether a filler word is present.
count_filler_word(
    f"{REPO_DIRECTORY}{AUDIO_IGNORED_EXAMPLES_DIRECTORY}test_1.wav",
    threshold=0.3,
)

PREPROCESSING...
(129, 5490)


In [70]:
# Embed an audio player for the exported clip with the chimes overlaid.
IPython.display.Audio(f"{REPO_DIRECTORY}{AUDIO_IGNORED_EXAMPLES_DIRECTORY}chime_output.wav")