# Stripping Silence

We use this notebook to compare different techniques for cutting silences on the LJ speech dataset.

One of the problems with the LJ (Linda Johnson) Speech dataset is that the audio is cut between sentences; therefore, there can be an unpredictable amount of silence at the beginning and end of each clip.

In [None]:
import re
import sys
import random

# Setup the "PYTHONPATH": put the directory two levels above the working directory first on the
# import path, so the `src` package imported by later cells can be found.
sys.path.insert(0, '../../')

In [None]:
from IPython.display import display, Audio, Markdown

class UnnormalizedAudio(Audio):
    """ ``Audio`` display object that encodes samples as 16-bit PCM without rescaling them.

    Samples are expected to be floats in ``[-1, 1]``; each one is multiplied by 32767 and written
    as a little-endian signed 16-bit value, so the relative loudness of the input is preserved.

    NOTE(review): this overrides the private ``_make_wav(data, rate)`` hook of ``Audio``;
    presumably the installed IPython calls it with this signature -- confirm when upgrading.
    """

    def _make_wav(self, data, rate):
        """ Transform a numpy array to a PCM bytestring

        Args:
            data (array-like): 1D mono samples, or (only when numpy is installed) a 2D array
                laid out as ``(channels, samples)``.
            rate (int): Frame rate written to the WAV header.

        Returns:
            bytes: Contents of a 16-bit PCM WAV file.

        Raises:
            ValueError: If numpy is installed and ``data`` is neither 1D nor 2D.
            TypeError: If numpy is not installed and ``data`` is a nested sequence.
        """
        import struct
        from io import BytesIO
        import wave

        try:
            import numpy as np
        except ImportError:
            np = None

        if np is not None:
            data = np.array(data, dtype=float)
            if len(data.shape) == 1:
                nchan = 1
            elif len(data.shape) == 2:
                # In wave files,channels are interleaved. E.g.,
                # "L1R1L2R2..." for stereo. See
                # http://msdn.microsoft.com/en-us/library/windows/hardware/dn653308(v=vs.85).aspx
                # for channel ordering
                nchan = data.shape[0]
                data = data.T.ravel()
            else:
                raise ValueError('Array audio input must be a 1D or 2D array')
            # Clip before the cast: values outside [-1, 1] would otherwise overflow int16 and
            # wrap around. Converting the whole array at once also avoids packing per sample.
            frames = (np.clip(data, -1.0, 1.0) * 32767).astype('<i2').tobytes()
        else:
            # check that it is a "1D" list
            first = next(iter(data), None)  # fails if not an iterable
            try:
                iter(first)
            except TypeError:
                # this means it's not a nested list, which is what we want
                pass
            else:
                # Raised outside the ``try`` so that the ``except TypeError`` above cannot
                # swallow it.
                raise TypeError('Only lists of mono audio are '
                                'supported if numpy is not installed')
            # Clip for the same reason as above; '<h' cannot pack out-of-range values.
            scaled = [int(max(-1.0, min(1.0, x)) * 32767) for x in data]
            frames = struct.pack('<%dh' % len(scaled), *scaled)
            nchan = 1

        fp = BytesIO()
        waveobj = wave.open(fp, mode='wb')
        try:
            waveobj.setnchannels(nchan)
            waveobj.setframerate(rate)
            waveobj.setsampwidth(2)
            waveobj.setcomptype('NONE', 'NONE')
            waveobj.writeframes(frames)
            val = fp.getvalue()
        finally:
            # Always release the writer, even when one of the header setters rejects its input.
            waveobj.close()

        return val

In [None]:
from src import hparams

# NOTE(review): presumably `set_hparams` has to run before `get_dataset` -- confirm in `src.hparams`.
hparams.set_hparams()
# `train` is the collection that `evaluate_trim` iterates over; `dev` is not used by the cells below.
train, dev = hparams.get_dataset()

In [None]:
%matplotlib inline

from matplotlib import pyplot

def plot_waveform(signals, labels=None):
    """ Draw one or more waveforms as lines on a single wide figure.

    Args:
        signals (list): Signals to draw; each one becomes a line in the plot.
        labels (list of str, optional): Legend entries, matched to ``signals`` by position. The
            legend is only drawn when labels are given.
    """
    show_legend = labels is not None
    pyplot.figure(figsize=(20, 5))
    for position, waveform in enumerate(signals):
        pyplot.plot(waveform, label=labels[position] if show_legend else None)
    if show_legend:
        pyplot.legend()
    # Keep the vertical axis fixed so that separate plots can be compared by eye.
    pyplot.ylim(-1, 1)
    pyplot.show()

In [None]:
from IPython.display import display, Audio, Markdown
from tqdm import tqdm

import IPython
import numpy as np

from src.audio import read_audio


def evaluate_trim(trim, max_rows=1000, max_samples=5000, top_k=5, sample_rate=24000):
    """ Evaluate a silence trimming algorithm.

    Loads the audio of the first ``max_rows`` rows of ``train``, runs ``trim`` on each one and,
    for every row that came back shorter, computes the drop in summed absolute amplitude divided
    by the number of samples removed. The ``top_k`` rows with the largest such value are shown
    with their text, audio players and waveform plots.

    Args:
        trim (callable): Called as ``trim(audio_path, signal)`` with the file path as ``str`` and
            the loaded signal; must return the trimmed signal.
        max_rows (int): Maximum rows to evaluate.
        max_samples (int): Currently not used by this function; kept so existing calls that pass
            it keep working.
        top_k (int): Top k signals to display.
        sample_rate (int): Sample rate the audio is loaded at and played back with.
    """
    # Imported here so that the function does not rely on a later notebook cell having run.
    import librosa

    results = []
    for row in tqdm(train[:max_rows]):
        audio_path = str(row.audio_path)
        # The rate returned by ``load`` is ignored: ``sr`` is passed explicitly, and
        # ``sample_rate`` is reused for playback below.
        signal, _ = librosa.core.load(audio_path, sr=sample_rate)
        trimmed = trim(audio_path, signal)
        length_difference = len(signal) - len(trimmed)
        # Rows the trimmer left untouched carry no information (and would divide by zero).
        if length_difference > 0:
            removed_energy = np.sum(np.absolute(signal)) - np.sum(np.absolute(trimmed))
            results.append({'signal': np.array(signal),
                            'trimmed': np.array(trimmed),
                            'energy_difference': removed_energy / length_difference,
                            'length_difference': length_difference,
                            'filename': row.audio_path,
                            'text': row.text})

    # Largest energy per removed sample first: these are the rows where the trimmer most likely
    # cut into something that was not silence.
    results = sorted(results, key=lambda r: r['energy_difference'], reverse=True)
    display(Markdown('### Top %d Signals Affected by Trim' % top_k))
    for result in (results[:top_k]):
        display(Markdown('File: %s' % (result['filename'],)))
        display(Markdown('Text: %s' % (result['text'],)))
        display(Markdown('Energy Difference: %f' % (result['energy_difference'],)))
        display(Markdown('Length Difference: %d' % (result['length_difference'],)))

        display(Markdown('Trimmed:'))
        display(UnnormalizedAudio(result['trimmed'], rate=sample_rate))
        plot_waveform([result['trimmed']])

        display(Markdown('Signal:'))
        display(UnnormalizedAudio(result['signal'], rate=sample_rate))
        plot_waveform([result['signal']])

        display(Markdown('---'))

## Librosa

In [None]:
import sys
import librosa

def trim(_, s):
    """ Trim signal ``s`` with ``librosa.effects.trim``; the file path argument is ignored. """
    trimmed, _interval = librosa.effects.trim(s, frame_length=1024, hop_length=256)
    return trimmed

# ``sys.maxsize`` lifts the row limit, so every row of the training data is evaluated.
evaluate_trim(trim, top_k=20, max_rows=sys.maxsize)

## PyDub

In [None]:
import pydub
from pydub import AudioSegment

def pydub_trim(wav, _):
    """ Strip silence from a WAV file with PyDub.

    Args:
        wav (str): Path of the WAV file to load.
        _: Unused; present so the function can be called as ``trim(audio_path, signal)``.

    Returns:
        The samples of the stripped segment, as returned by
        ``AudioSegment.get_array_of_samples``.
    """
    sound = AudioSegment.from_wav(wav)
    # ``strip_silence`` returns a new segment rather than modifying ``sound`` in place, so the
    # result has to be kept; returning ``sound`` would hand back the untrimmed audio.
    stripped = pydub.effects.strip_silence(sound)
    # NOTE(review): these are presumably raw integer PCM samples at the file's own frame rate,
    # while ``evaluate_trim`` compares them with a float signal loaded by librosa -- confirm
    # that the scale and sample rate match before trusting the reported differences.
    return stripped.get_array_of_samples()

# Evaluate the PyDub trimmer using the default settings of `evaluate_trim`.
evaluate_trim(pydub_trim)

## SOX

Reference:
https://digitalcardboard.com/blog/2009/08/25/the-sox-of-silence/

In [None]:
import os

# Absolute path of the scratch WAV file that `sox_trim` writes to; it is removed at the end of
# this cell.
destination = os.path.abspath('temp.wav')
    
def sox_trim(wav, _):
    """ Trim a WAV file with the SoX command line tool.

    Runs the SoX ``silence`` effect, reverses the audio, runs it again and reverses back, writing
    the result to ``destination``.

    Args:
        wav (str): Path of the WAV file to trim.
        _: Unused; present so the function can be called as ``trim(audio_path, signal)``.

    Returns:
        Whatever ``read_audio`` returns for the file written to ``destination``.

    Raises:
        subprocess.CalledProcessError: If ``sox`` exits with a non-zero status.
    """
    import subprocess

    # An argument list (no shell) keeps paths with spaces or shell metacharacters intact, and
    # ``check_call`` stops on failure instead of silently reading a stale ``destination`` left
    # by an earlier call.
    command = ['sox', wav, destination,
               'silence', '1', '0.01', '1%', 'reverse',
               'silence', '1', '0.01', '1%', 'reverse']
    subprocess.check_call(command)
    return read_audio(destination)

try:
    evaluate_trim(sox_trim)
finally:
    # Clean up: remove the scratch file even when the evaluation fails, and tolerate the case
    # where it was never written.
    if os.path.isfile(destination):
        os.remove(destination)
assert not os.path.isfile(destination)

## ibab/tensorflow-wavenet



In [None]:
def trim_silence(_, audio, threshold=0.1, frame_length=2048):
    '''Removes silence at the beginning and end of a sample.

    Args:
        _: Unused; present so the function can be called as ``trim(audio_path, signal)``.
        audio (array-like): Samples to trim; converted to a float array.
        threshold (float): Frames whose RMS energy is greater than this value count as
            non-silent.
        frame_length (int): Frame length used for the RMS energy; reduced to the length of
            ``audio`` when the audio is shorter.

    Returns:
        np.ndarray: The slice of ``audio`` from the sample index of the first non-silent frame up
        to the sample index of the last non-silent frame, or an empty array when no frame exceeds
        ``threshold`` or ``audio`` is empty.
    '''
    # ``np.float`` was only an alias of the builtin ``float`` and no longer exists in current
    # NumPy releases; the builtin behaves identically on every version.
    audio = np.array(audio, dtype=float)
    if audio.size == 0:
        # Nothing to analyse; a zero frame length would not be a valid RMS window.
        return audio[0:0]
    if audio.size < frame_length:
        frame_length = audio.size
    # librosa renamed ``feature.rmse`` to ``feature.rms``; use whichever this version provides.
    # ``y`` is passed by keyword because newer versions no longer accept it positionally.
    rms = getattr(librosa.feature, 'rms', None) or librosa.feature.rmse
    energy = rms(y=audio, frame_length=frame_length)
    frames = np.nonzero(energy > threshold)
    # ``frames`` holds one index array per axis of ``energy``; entry 1 is taken as the frame
    # positions, converted to sample offsets.
    indices = librosa.core.frames_to_samples(frames)[1]

    # Note: indices can be an empty array, if the whole audio was silence.
    return audio[indices[0]:indices[-1]] if indices.size else audio[0:0]

# Evaluate the energy-threshold trimmer using the default settings of `evaluate_trim`.
evaluate_trim(trim_silence)