In [23]:
import scipy.io.wavfile as wav
import numpy as np
import shutil
import os

In [24]:
DATASET_EVENT_DURATION_SECS = 3
UNIFORM_SAMPLE_RATE = 8000

def load_dataset_file(path):
    p = path.find('.')
    if p >= 0:
        path = path[:p]
    sample_rate, samples = wav.read(f'{path}.wav')
    meta = []
    with open(f'{path}.meta', 'r') as f:
        for line in f:
            vals = [x.strip() for x in line.strip().split(',')]
            if len(vals) == 2:
                vals = ['TRAIN', *vals]
            assert len(vals) == 3
            vals[1] = int(vals[1])
            meta.append(vals)
    return sample_rate, samples, meta

class FFTFilter:
    def __init__(self, init_sample, fold_ratio):
        self.background_freq = np.fft.fft(init_sample)
        self.energy_thresh = 0
        self.fold_ratio = fold_ratio
    def step(self, sample):
        freq = np.fft.fft(sample)
        energy = np.sum((freq - self.background_freq)**2)
        keep = energy > self.energy_thresh
        
        self.background_freq = (1 - self.fold_ratio) * self.background_freq + self.fold_ratio * freq
        self.energy_thresh = (1 - self.fold_ratio) * self.energy_thresh + self.fold_ratio * energy
        
        return keep

In [22]:
input_file = 'test-input.wav'
output_dir = 'filter-output'
seg_secs = 1
fold_ratio = 0.1

In [3]:
sample_rate, samples = wav.read(input_file)
seg_samples = round(seg_secs * sample_rate)
segs = [samples[seg_samples*i:seg_samples*(i+1)] for i in range(len(samples) // seg_samples)]
assert len(segs) > 0 and all(x.shape == segs[0].shape for x in segs)

In [4]:
background_freq = np.fft.fft(segs[0])
energy_thresh = 0
keeps = []
for i, seg in enumerate(segs):
    freq = np.fft.fft(seg)
    energy = np.sum((freq - background_freq)**2)
    if energy > energy_thresh:
        keeps.append((i, seg))
    background_freq = (1 - fold_ratio) * background_freq + fold_ratio * freq
    energy_thresh = (1 - fold_ratio) * energy + fold_ratio * energy_thresh

print(f'before: {len(segs)}')
print(f'after:  {len(keeps)}')

before: 1215
after:  590


In [5]:
shutil.rmtree(output_dir, ignore_errors = True)
os.mkdir(output_dir)
for i, keep in keeps:
    wav.write(f'{output_dir}/keep-{i}.wav', sample_rate, keep)

In [20]:
shutil.rmtree(output_dir, ignore_errors = True)
os.mkdir(output_dir)
sample_rate, samples, meta = load_dataset_file('dataset-partial/Dog/FreeSounds/Cambodia_Siem_Reap_Street_Dog_Barking.wav')
for i, entry in enumerate(meta):
    

[['TRAIN', 39256, 'Dog'],
 ['TEST', 56644, 'Dog'],
 ['TRAIN', 60295, 'Dog'],
 ['TRAIN', 96321, 'Ignore'],
 ['TRAIN', 109153, 'Ignore'],
 ['TRAIN', 116755, 'Ignore'],
 ['TRAIN', 119260, 'Ignore'],
 ['TRAIN', 127762, 'Ignore'],
 ['TRAIN', 178390, 'Dog'],
 ['TRAIN', 182111, 'Dog'],
 ['TRAIN', 199716, 'Dog'],
 ['TEST', 202658, 'Dog'],
 ['TRAIN', 204823, 'Dog'],
 ['TRAIN', 240449, 'Ignore'],
 ['TRAIN', 269440, 'Ignore'],
 ['TRAIN', 271616, 'Ignore'],
 ['TRAIN', 287938, 'Ignore'],
 ['TRAIN', 297921, 'Ignore'],
 ['TRAIN', 307072, 'Ignore'],
 ['TRAIN', 317246, 'Ignore'],
 ['TRAIN', 343730, 'Ignore'],
 ['TEST', 368710, 'Ignore'],
 ['TRAIN', 387346, 'Ignore'],
 ['TRAIN', 391400, 'Ignore'],
 ['TRAIN', 410930, 'Ignore'],
 ['TRAIN', 428718, 'Ignore'],
 ['TRAIN', 443648, 'Ignore'],
 ['TRAIN', 460749, 'Ignore'],
 ['TRAIN', 464222, 'Ignore'],
 ['TRAIN', 473858, 'Ignore'],
 ['TRAIN', 479001, 'Ignore'],
 ['TEST', 482004, 'Ignore'],
 ['TRAIN', 488774, 'Ignore'],
 ['TRAIN', 500096, 'Ignore'],
 ['TRAIN', 5