In [1]:
import dataloader
import mfcc
import numpy as np
import torch
import random
import scipy.signal as sig

In [2]:
raw_dataset = dataloader.get_dataset(None, 8192)
print({ k: len(v) for k, v in raw_dataset.items() })

{'Train': 895, 'Clapping': 546, 'BackgroundSounds': 8192, 'Laughing': 216, 'Whistle': 21, 'Whistling': 22, 'Horn': 1146, 'Dog': 2768, 'Noise': 1950, 'Aircraft': 1119, 'Crow': 621, 'Scream': 36, 'Siren': 2363, 'Sneezing': 154, 'BirdChirp': 1299, 'Rooster': 402, 'Wind': 1408, 'Gunshot': 8, 'Frog': 3608, 'CarDoor': 452, 'VehicleExhaust': 292, 'Engine': 5845, 'Coughing': 736, 'Cat': 242, 'Bell': 702, 'Thunder': 488, 'Beeping': 98, 'Unknown': 293, 'Insects': 1986, 'Sheep': 191, 'Basketball': 752, 'Drums': 6193, 'Cow': 168, 'GunNoise': 2448, 'PowerTool': 1537, 'GlassBreak': 982, 'Skateboarding': 76, 'Jackhammer': 2882, 'HandSaw': 332}


In [9]:
print(f'gpu enabled: {torch.cuda.is_available()}')
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

import mfcc_vae_1 as vae
encoder = vae.Encoder(embedding_size = 16).to(device)
encoder.load_state_dict(torch.load('mfcc-untested-1/encoder-F16-A0.9-E256-L171.pt'))

encoder.eval()

def prep_sample(x):
    s = mfcc.mfcc_spectrogram_for_learning(x, dataloader.UNIFORM_SAMPLE_RATE)
    with torch.no_grad():
        encoder_input = torch.tensor(s.reshape(1, *s.shape), dtype=torch.float32).to(device)
        mean, logstd = encoder.forward(encoder_input)
        assert mean.shape == (1, encoder.embedding_size)
        return np.array(mean.cpu()).reshape(-1), np.array(logstd.exp().cpu()).reshape(-1)

dataset = { k: [prep_sample(x) for x in v] for k, v in raw_dataset.items() }

def distance_metric(a, b):
    assert a.shape[-1:] == (16,)
    assert b.shape == (16,)

    return np.sum((a - b)**2, axis = -1)

gpu enabled: True


In [82]:
benchmark_dataset = dict(random.sample([(k,v) for k,v in dataset.items() if len(v) >= 256], 10))
print({ k: len(v) for k, v in benchmark_dataset.items() })
max_clusters = 32
def benchmark(filter_class, filter_thresh):
    print(f'bench: {filter_class.__name__}')
    print()
    print('heterogeneous class tests:')
    vals = []
    for i in range(16):
        f = filter_class(max_clusters, encoder.embedding_size, filter_thresh)
        keeps = []
        reps = 30
        for rep in range(reps):
            for label, samples in benchmark_dataset.items():
                if f.insert(*random.choice(samples)):
                    keeps.append(label)
        vals.append(100 * len(keeps) / len(benchmark_dataset) / reps)
        print(f'iter {i:>4}: kept {len(keeps)}/{len(benchmark_dataset) * reps} ({vals[-1]:.2f}%)')
    print(f'mean: {np.mean(vals):>5.2f} std: {np.std(vals):>5.2f}')

    print()
    print('homogenous class tests:')
    vals = []
    for label, samples in benchmark_dataset.items():
        f = filter_class(max_clusters, encoder.embedding_size, filter_thresh)
        keeps = []
        reps = 300
        for rep in range(reps):
            if f.insert(*random.choice(samples)):
                keeps.append(i)
        vals.append(100 * len(keeps) / reps)
        print(f'{label:>16}: kept {len(keeps):>4}/{reps} ({vals[-1]:>5.2f}%)')
    print(f'mean: {np.mean(vals):>5.2f} std: {np.std(vals):>5.2f}')

{'HandSaw': 332, 'VehicleExhaust': 292, 'Insects': 1986, 'Frog': 3608, 'Rooster': 402, 'Wind': 1408, 'Aircraft': 1119, 'Clapping': 546, 'BirdChirp': 1299, 'Train': 895}


In [83]:
class ClusterFilterAug2:
    def __init__(self, max_clusters, embedding_size, thresh):
        self.means = np.zeros((max_clusters, embedding_size))
        self.weights = np.zeros((max_clusters,))
        self.max_clusters = max_clusters
        self.base_radius = thresh
    def insert(self, mean, std):
        assert mean.shape == self.means.shape[1:] and mean.shape == std.shape and self.means.shape[0] == self.max_clusters and self.weights.shape == (self.max_clusters,)
        l2_norm = np.sqrt(np.sum((self.means - mean) ** 2, axis = 1))
        close = l2_norm <= np.sqrt(self.weights) * self.base_radius
        
        if np.any(close):
            center = (np.sum((self.means[close].T * self.weights[close]).T, axis = 0) + mean) / (np.sum(self.weights[close]) + 1)
            weight = np.sum(self.weights[close]) + 1
            self.means = np.concatenate([
                self.means[~close],
                [ center ],
                np.zeros((self.means.shape[0] - (np.sum(~close) + 1), self.means.shape[1])),
            ])
            self.weights = np.concatenate([
                self.weights[~close],
                [ weight ],
                np.zeros((self.weights.shape[0] - (np.sum(~close) + 1),)),
            ])
            return False
        else:
            self.means = np.concatenate([
                self.means[1:],
                [ mean ],
            ])
            self.weights = np.concatenate([
                self.weights[1:],
                [ 1 ],
            ])
            return True

benchmark(ClusterFilterAug2, 0.2)

bench: ClusterFilterAug2

heterogeneous class tests:
iter    0: kept 280/300 (93.33%)
iter    1: kept 260/300 (86.67%)
iter    2: kept 284/300 (94.67%)
iter    3: kept 272/300 (90.67%)
iter    4: kept 255/300 (85.00%)
iter    5: kept 272/300 (90.67%)
iter    6: kept 285/300 (95.00%)
iter    7: kept 278/300 (92.67%)
iter    8: kept 269/300 (89.67%)
iter    9: kept 272/300 (90.67%)
iter   10: kept 276/300 (92.00%)
iter   11: kept 262/300 (87.33%)
iter   12: kept 270/300 (90.00%)
iter   13: kept 279/300 (93.00%)
iter   14: kept 282/300 (94.00%)
iter   15: kept 262/300 (87.33%)
mean: 90.79 std:  2.91

homogenous class tests:
         HandSaw: kept   80/300 (26.67%)
  VehicleExhaust: kept  207/300 (69.00%)
         Insects: kept   52/300 (17.33%)
            Frog: kept   56/300 (18.67%)
         Rooster: kept  181/300 (60.33%)
            Wind: kept   47/300 (15.67%)
        Aircraft: kept   31/300 (10.33%)
        Clapping: kept  161/300 (53.67%)
       BirdChirp: kept  149/300 (49.67%)
  

In [36]:
class ClusterFilterAug:
    def __init__(self, max_clusters, embedding_size, thresh):
        self.means = np.zeros((max_clusters, embedding_size)) + np.inf
        self.radii = np.zeros((max_clusters,))
        self.max_clusters = max_clusters
        self.base_radius = thresh
    def insert(self, mean, std):
        assert mean.shape == self.means.shape[1:] and mean.shape == std.shape and self.means.shape[0] == self.max_clusters and self.radii.shape == (self.max_clusters,)
        l2_norm = np.sqrt(np.sum((self.means - mean) ** 2, axis = 1))
        close = l2_norm <= self.radii
        
        if np.any(close):
            center = np.mean(self.means[close], axis = 0)
            radius = np.sqrt(np.sum(self.radii[close]**2))

#             center = (np.sum(self.means[close], axis = 0) + mean) / (np.sum(close) + 1)
#             radius = np.sqrt(np.sum(self.radii[close]**2) + self.base_radius**2)
        
            self.means = np.concatenate([
                self.means[~close],
                [ center ],
                np.zeros((self.means.shape[0] - (np.sum(~close) + 1), self.means.shape[1])) + np.inf,
            ])
            self.radii = np.concatenate([
                self.radii[~close],
                [ radius ],
                np.zeros((self.radii.shape[0] - (np.sum(~close) + 1),)),
            ])
            return False
        else:
            self.means = np.concatenate([
                self.means[1:],
                [ mean ],
            ])
            self.radii = np.concatenate([
                self.radii[1:],
                [ self.base_radius ],
            ])
            return True

benchmark(ClusterFilterAug, .25)

bench: ClusterFilterAug

heterogeneous class tests:
iter    0: kept 280/300 (93.33%)
iter    1: kept 277/300 (92.33%)
iter    2: kept 265/300 (88.33%)
iter    3: kept 260/300 (86.67%)
iter    4: kept 275/300 (91.67%)
iter    5: kept 277/300 (92.33%)
iter    6: kept 271/300 (90.33%)
iter    7: kept 280/300 (93.33%)
iter    8: kept 275/300 (91.67%)
iter    9: kept 278/300 (92.67%)
iter   10: kept 272/300 (90.67%)
iter   11: kept 269/300 (89.67%)
iter   12: kept 274/300 (91.33%)
iter   13: kept 268/300 (89.33%)
iter   14: kept 274/300 (91.33%)
iter   15: kept 274/300 (91.33%)
mean: 91.02 std:  1.76

homogenous class tests:
      GlassBreak: kept  268/300 (89.33%)
      Jackhammer: kept  163/300 (54.33%)
       PowerTool: kept  249/300 (83.00%)
         CarDoor: kept  203/300 (67.67%)
        Clapping: kept  212/300 (70.67%)
           Siren: kept  198/300 (66.00%)
           Noise: kept  117/300 (39.00%)
BackgroundSounds: kept  252/300 (84.00%)
         HandSaw: kept  188/300 (62.67%)
   

In [42]:
class ClusterFilter:
    def __init__(self, max_clusters, embedding_size, thresh):
        self.means = np.zeros((max_clusters, embedding_size)) + np.inf
        self.pos = 0
        self.thresh = thresh
    def insert(self, mean, std):
        assert mean.shape == self.means.shape[1:] and mean.shape == std.shape
        l2_norm = np.sqrt(np.sum((self.means - mean) ** 2, axis = 1))
        keep = np.min(l2_norm) > self.thresh
        self.means[self.pos,:] = mean
        self.pos = (self.pos + 1) % self.means.shape[0]
        return keep

benchmark(ClusterFilter, 0.3)

bench: ClusterFilter

heterogeneous class tests:
iter    0: kept 261/300 (87.00%)
iter    1: kept 265/300 (88.33%)
iter    2: kept 273/300 (91.00%)
iter    3: kept 264/300 (88.00%)
iter    4: kept 253/300 (84.33%)
iter    5: kept 266/300 (88.67%)
iter    6: kept 257/300 (85.67%)
iter    7: kept 259/300 (86.33%)
iter    8: kept 250/300 (83.33%)
iter    9: kept 259/300 (86.33%)
iter   10: kept 269/300 (89.67%)
iter   11: kept 262/300 (87.33%)
iter   12: kept 271/300 (90.33%)
iter   13: kept 262/300 (87.33%)
iter   14: kept 265/300 (88.33%)
iter   15: kept 258/300 (86.00%)
mean: 87.38 std:  2.00

homogenous class tests:
      GlassBreak: kept  252/300 (84.00%)
      Jackhammer: kept  175/300 (58.33%)
       PowerTool: kept  241/300 (80.33%)
         CarDoor: kept  172/300 (57.33%)
        Clapping: kept  216/300 (72.00%)
           Siren: kept  199/300 (66.33%)
           Noise: kept  142/300 (47.33%)
BackgroundSounds: kept  238/300 (79.33%)
         HandSaw: kept  183/300 (61.00%)
      

In [48]:
class DistributionFilter:
    def __init__(self, max_clusters, embedding_size, thresh):
        self.means = np.zeros((max_clusters, embedding_size))
        self.stds = np.zeros((max_clusters, embedding_size))
        self.pos = 0
        self.thresh = thresh
    def insert(self, mean, std):
        assert mean.shape == self.means.shape[1:] and mean.shape == std.shape
        kl_div = np.sum(-0.5 * (1 + 2 * np.log(np.maximum(self.stds / std, 1e-20)) - (std**2 + (self.means - mean)**2) / std**2), axis = 1)
        keep = np.min(kl_div) > self.thresh
        self.means[self.pos,:] = mean
        self.stds[self.pos,:] = std
        self.pos = (self.pos + 1) % self.means.shape[0]
        return keep

benchmark(DistributionFilter, 40)

bench: DistributionFilter

heterogeneous class tests:
iter    0: kept 262/300 (87.33%)
iter    1: kept 264/300 (88.00%)
iter    2: kept 253/300 (84.33%)
iter    3: kept 262/300 (87.33%)
iter    4: kept 258/300 (86.00%)
iter    5: kept 260/300 (86.67%)
iter    6: kept 269/300 (89.67%)
iter    7: kept 262/300 (87.33%)
iter    8: kept 256/300 (85.33%)
iter    9: kept 263/300 (87.67%)
iter   10: kept 266/300 (88.67%)
iter   11: kept 262/300 (87.33%)
iter   12: kept 270/300 (90.00%)
iter   13: kept 264/300 (88.00%)
iter   14: kept 267/300 (89.00%)
iter   15: kept 271/300 (90.33%)
mean: 87.69 std:  1.59

homogenous class tests:
      GlassBreak: kept  217/300 (72.33%)
      Jackhammer: kept  195/300 (65.00%)
       PowerTool: kept  269/300 (89.67%)
         CarDoor: kept  163/300 (54.33%)
        Clapping: kept  226/300 (75.33%)
           Siren: kept  255/300 (85.00%)
           Noise: kept  130/300 (43.33%)
BackgroundSounds: kept  245/300 (81.67%)
         HandSaw: kept  237/300 (79.00%)
 