In [66]:
import os
from tqdm import tqdm_notebook
from functools import partial
import json
import numpy as np
import scipy.io.wavfile
import librosa
import seaborn as sns
import matplotlib.pyplot as plt
from skimage.transform import resize

import scipy.misc

In [2]:
# Root folder of the recordings; each participant's files live in a sub-folder of it.
wav_dir = '../../audio_mnist'
# JSON metadata file stored directly inside the recordings folder.
meta_fpath = wav_dir + '/audioMNIST_meta.txt'

In [3]:
# Load the participant metadata (a JSON object keyed by participant index).
# JSON text is UTF-8 by specification, so decode it explicitly instead of relying
# on the platform's locale-dependent default encoding.
with open(meta_fpath, encoding='utf-8') as json_f:
    meta = json.load(json_f)

In [4]:
# Split the participant indices by the 'gender' field of their metadata entry.
# The comparison is case-insensitive; entries with any other value are skipped.
male_idxs = []
female_idxs = []
idxs_by_gender = {'male': male_idxs, 'female': female_idxs}
for idx, info in meta.items():
    gender = info['gender'].lower()
    if gender in idxs_by_gender:
        idxs_by_gender[gender].append(idx)

In [5]:
def get_wav_fpaths_from_participant_indices(idxs):
    """
    Return the paths of every file found in the given participants' folders.

    idxs: iterable of participant folder names located directly under `wav_dir`.

    The result is a list of '{wav_dir}/{idx}/{fname}' strings, grouped by participant
    in the order of `idxs` and sorted by file name within each participant.
    """
    wav_fpaths = []
    for idx in idxs:
        participant_dir = f'{wav_dir}/{idx}'
        # os.listdir returns entries in arbitrary order; sorting keeps the list (and
        # anything indexed by position in it) reproducible across runs and machines.
        for fname in sorted(os.listdir(participant_dir)):
            wav_fpaths.append(f'{participant_dir}/{fname}')
    return wav_fpaths

In [6]:
# Gather the recording paths per gender, then build one combined list together with
# a parallel label array: 0 marks a male recording, 1 marks a female recording.
male_wav_fpaths = get_wav_fpaths_from_participant_indices(male_idxs)
female_wav_fpaths = get_wav_fpaths_from_participant_indices(female_idxs)
wav_fpaths = [*male_wav_fpaths, *female_wav_fpaths]
labels = np.array([0 for _ in male_wav_fpaths] + [1 for _ in female_wav_fpaths])

In [46]:
def pad_signal(signal, target_len):
    """
    Zero-pad a 1-D signal to `target_len` samples at a random offset.

    signal: 1-D array of samples.
    target_len: desired number of samples after padding.

    Returns a new array of length `target_len` in which the original samples sit
    behind a randomly chosen number of leading zeros, the rest of the padding
    following them. A signal that is already `target_len` samples or longer is
    returned unchanged (it is not truncated).
    """
    # The original body read an undefined name `s` here instead of the parameter.
    num_zeros_needed = target_len - len(signal)

    if num_zeros_needed <= 0:
        return signal

    # randint's upper bound is exclusive, so at least one zero always ends up at the back.
    num_zeros_front = np.random.randint(num_zeros_needed)
    num_zeros_back = num_zeros_needed - num_zeros_front
    return np.pad(signal, (num_zeros_front, num_zeros_back), mode='constant')

In [10]:
# Weight applied to the previous sample when pre_emphasis subtracts it from the current one.
PRE_EMPHASIS_COEFF = 0.97

In [11]:
def pre_emphasis(signal):
    """
    Apply a first-order pre-emphasis filter to `signal`.

    The first sample is kept as-is; every later sample has PRE_EMPHASIS_COEFF times
    its predecessor subtracted from it. Returns the filtered samples as a flat array.
    """
    previous_samples = signal[:-1]
    current_samples = signal[1:]
    # y[t] = x[t] - PRE_EMPHASIS_COEFF * x[t - 1]  for t >= 1
    filtered_tail = current_samples - PRE_EMPHASIS_COEFF * previous_samples
    return np.append(signal[0], filtered_tail)

In [12]:
def pipeline(signal, sr=None):
    """
    Turn a raw waveform into liftered MFCC features.

    signal: 1-D array of audio samples.
    sr: sample rate of `signal` in Hz. When omitted, the notebook-level
        `sample_rate` is used, so existing `pipeline(signal)` calls are unaffected.

    Returns whatever `librosa.feature.mfcc` produces for 12 coefficients.
    """
    if sr is None:
        # Looked up at call time, so `sample_rate` only has to exist before the first call.
        sr = sample_rate

    emphasized_signal = pre_emphasis(signal)

    # The call below applies the DFT, mel filter banks, log scaling, DCT and
    # normalization/liftering all at once.
    lifted_mfcc = librosa.feature.mfcc(
        y=emphasized_signal.astype(float),
        sr=sr,
        n_mfcc=12,
        dct_type=2,
        norm='ortho',
        lifter=22,
        n_fft=int(sr * 0.025),      # 25 ms worth of samples per analysis window
        hop_length=int(sr * 0.01),  # 10 ms worth of samples between windows
        power=2,
        center=False,
        # 'hann' is the canonical SciPy name for the window previously spelled 'hanning'.
        window='hann',
        n_mels=40
    )

    return lifted_mfcc

In [18]:
# Write the contents of `mfccs` to ../data/mfccs.json as nested JSON lists.
# NOTE(review): `mfccs` is not defined in any cell visible in this notebook — presumably
# it came from a cell that was later removed. Confirm where it is produced, otherwise
# this cell fails with a NameError on Restart & Run All.
with open('../data/mfccs.json', 'w+') as json_f:
    json.dump(mfccs.tolist(), json_f)

In [44]:
# Sample rate read from the first recording only; it is reused for every file
# (assumes all recordings share the same rate — TODO confirm).
sample_rate = librosa.core.get_samplerate(wav_fpaths[0])

In [43]:
def get_max_duration(filenames):
    """
    Find the longest duration, in seconds, among a collection of wav files.

    Durations are queried per file through librosa rather than by loading every
    recording into a numpy array and inspecting its shape, which is much slower.
    Returns 0 when `filenames` is empty.
    """
    longest = 0
    for fn in tqdm_notebook(filenames):
        # max() keeps its first argument on a tie, so the running value only
        # changes when a strictly longer file is seen.
        longest = max(longest, librosa.core.audio.get_duration(filename=fn))
    return longest

In [37]:
# Longest recording, in seconds, across all male and female files (scans every file once).
max_duration = get_max_duration(wav_fpaths)

HBox(children=(IntProgress(value=0, max=30000), HTML(value='')))




In [45]:
# Convert the longest duration back into a sample count; this is the padding target.
# The duration is a float number of seconds, so the product can land a hair below the
# true whole number of samples; round to the nearest integer instead of truncating.
max_samples = int(round(max_duration * sample_rate))
print(max_samples)

47998


In [83]:
# Convert every recording into a 224x224, three-channel MFCC image saved as ../data/mfcs/{i}.jpg,
# where i is the recording's position in `wav_fpaths` (and therefore in `labels`).
os.makedirs('../data/mfcs', exist_ok=True)  # saving fails if the output folder is missing
for i, fp in tqdm_notebook(enumerate(wav_fpaths), total=len(wav_fpaths)):
    # wavfile.read returns (rate, samples); the rate is discarded here.
    # Presumably chosen because it is faster than librosa.load — the original note was cut off.
    _, signal = scipy.io.wavfile.read(fp)
    signal = pad_signal(signal, target_len=max_samples)
    mfc = pipeline(signal)
    # Stack three copies of the MFCC matrix, move the copy axis last (rows, cols, 3), then resize.
    mfc = resize(np.rollaxis(np.array([mfc] * 3), 0, 3), (224, 224, 3))
    # NOTE(review): scipy.misc.toimage is deprecated since SciPy 1.0 and removed in 1.2;
    # migrating to Pillow's Image.fromarray requires reproducing the cmin/cmax scaling.
    # NOTE(review): cmin=0 looks like it clips negative coefficients to black — confirm intended.
    scipy.misc.toimage(mfc, cmin=0, cmax=255).save(f'../data/mfcs/{i}.jpg')

HBox(children=(IntProgress(value=0, max=30000), HTML(value='')))

`toimage` is deprecated in SciPy 1.0.0, and will be removed in 1.2.0.
Use Pillow's ``Image.fromarray`` directly instead.
  


KeyboardInterrupt: 