In [1]:
import os
import thinkdsp as tp
import numpy as np
# import librosa
# from librosa.display import specshow
import matplotlib.pyplot as plt
from pathlib import Path
from tqdm.notebook import tqdm
import multiprocessing as mp
from PIL import Image
import tensorflow as tf
import shutil
import copy

In [40]:
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift
import numpy as np

# Took it from here: https://github.com/iver56/audiomentations#usage-example
augment = Compose([
    AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
    TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),
    PitchShift(min_semitones=-4, max_semitones=4, p=0.5),
    Shift(min_fraction=-0.5, max_fraction=0.5, p=0.5),
])

In [7]:
import re

def rename(path):
    path = re.sub(r'train_audio', 'train_spec_other', path)
    path = re.sub(r'wav$', 'png', path)
    
    return path

def wav_to_png(path):
    wave_ = tp.read_wave(path)
    display(wave_.make_audio())
    for i in range(20):
        wave = copy.deepcopy(wave_)
        wave.ys = augment(wave.ys, sample_rate=wave.framerate)
        spectrogram = wave.make_spectrogram(seg_length=1024)
        spectrogram.plot(high=5000)
        
        # Save augmented audio
        audio_path = path[:-4] + f'_{i}' + path[-4:]
        audio_path = re.sub('train_audio', 'train_audio_augmented', audio_path)
        os.makedirs(os.path.dirname(audio_path), exist_ok=True)
        tp.WavFileWriter(filename=audio_path, framerate=wave.framerate).write(wave)
        
        # Save spectrogram
        plt.axis('off')
        png_path = rename(audio_path)
        os.makedirs(os.path.dirname(png_path), exist_ok=True)
        plt.savefig(png_path, bbox_inches='tight', pad_inches=0)

In [9]:
wavs = list(map(str, Path('train_audio_augmented').rglob('*.wav')))


In [43]:

with mp.Pool() as pool:
    pool.map(wav_to_png, tqdm(wavs))

  0%|          | 0/6398 [00:00<?, ?it/s]

In [2]:
with tf.device('/gpu:1'):
    pretrained_model = tf.keras.models.load_model('my_model', compile=False)

In [3]:
class_names = sorted(os.listdir('train_audio'))
class_names

['down', 'go', 'left', 'no', 'right', 'stop', 'up', 'yes']

In [10]:
predicted = []
real = []
for path in tqdm(wavs):
    spec = rename(path)
    with Image.open(spec) as img:
        img = np.asarray(img)[None, :, :, :3]

    cls = pretrained_model(img).numpy().argmax()
    cls_name = class_names[cls]
    predicted.append(cls_name)
    
    real_cls = re.findall(r'/(\w+)/', path)[0]
    real.append(real_cls)
    
#     dst = os.path.join('train_audio_for_distill', 
#                        cls_name, os.path.basename(path)
#                       )
#     os.makedirs(os.path.dirname(dst), exist_ok=True)
#     shutil.copy(path, dst)

  0%|          | 0/127960 [00:00<?, ?it/s]

Pretrained model's accuracy on the training dataset.

In [11]:
np.sum(np.array(predicted) == np.array(real)) / len(real)

0.565997186620819