In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import torch
from torch import nn
import torchaudio
from torchaudio import datasets
import torchtext
from torchaudio.utils import download_asset

import os
import time
from tqdm import tqdm
import IPython.display as ipd

In [None]:
# Fetch the sample speech clip from the torchaudio tutorial assets; the value
# stored here is what the next cell hands to torchaudio.load.
SAMPLE_WAV_SPEECH_PATH = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav")

100%|██████████| 106k/106k [00:00<00:00, 20.4MB/s]


In [None]:
# Load the sample clip: `wav` is the waveform tensor and `sr` its sample rate.
# Both are reused by the scratch/experiment cells further down.
wav, sr = torchaudio.load(SAMPLE_WAV_SPEECH_PATH)

In [None]:
# Mel-spectrogram settings shared by the MFCC front end used in this notebook.
mel_options = dict(
    f_min = 0,
    f_max = 2048,
    n_mels = 80,
    win_length = 400,
    hop_length = 160,
    n_fft = 400,
)
f_mfcc = torchaudio.transforms.MFCC(sample_rate = 16000, n_mfcc = 80, melkwargs = mel_options)

# Sanity check: feature shape for the first 16000 samples of the sample clip.
f_mfcc(wav[:,:16000]).shape

In [2]:
# Download (if needed) Speech Commands v0.02 into the working directory and
# open the three official splits, in training / validation / testing order.
train_dataset_pt, val_dataset_pt, test_dataset_pt = (
    datasets.SPEECHCOMMANDS('./', 'speech_commands_v0.02', download = True, subset = split)
    for split in ('training', 'validation', 'testing')
)

100%|██████████| 2.26G/2.26G [00:23<00:00, 103MB/s]


In [14]:
import random

def pad_truncate(wav, max_length = 16000, pad_value = 0):
    """Force the last (time) dimension of ``wav`` to exactly ``max_length`` samples.

    Shorter inputs are right-padded with ``pad_value``; longer inputs are cut
    to their first ``max_length`` samples.

    Parameters
    ----------
    wav : torch.Tensor
        Waveform whose last dimension is time, e.g. ``(channels, samples)``.
    max_length : int
        Target number of samples.
    pad_value : float
        Fill value used when padding.

    Returns
    -------
    torch.Tensor
        Tensor with the same leading dimensions and ``max_length`` samples.
    """
    # Measure the time axis explicitly: len(wav) on a (channels, samples)
    # tensor would return the channel count, not the number of samples.
    wav_length = wav.shape[-1]
    if wav_length < max_length:
        return torch.nn.functional.pad(wav, (0, max_length - wav_length), value = pad_value)
    return wav[..., :max_length]

def time_shift(wav, shift, sr = 16000, max_length = 16000):
    """Circularly shift a waveform in time and keep the first ``max_length`` samples.

    Parameters
    ----------
    wav : torch.Tensor
        Waveform shaped ``(channels, samples)``.
    shift : float
        Unit: Seconds. Positive values delay the signal, negative values
        advance it. Samples pushed past one end wrap around to the other
        (``torch.roll`` semantics); they are not replaced by silence.
    sr : int
        Sample rate used to convert ``shift`` to a number of samples.
    max_length : int
        Maximum number of samples kept after shifting.

    Returns
    -------
    torch.Tensor
        Shifted waveform with at most ``max_length`` samples per channel.
    """
    # Roll along the time axis only. Without `dims`, torch.roll flattens the
    # tensor first, which leaks samples from one channel into the next.
    wav = torch.roll(wav, int(shift*sr), dims = -1)
    return wav[:, :max_length]

## Add Noise

def normalzieNoise(wav, noise, max_length = 16000):
    """Fit ``noise`` to the length of ``wav`` (both indexed as ``(channels, samples)``).

    * Noise shorter than the waveform is written at a random offset into a
      zero tensor shaped like ``wav``.
    * Noise longer than the waveform is cropped to a random window of the
      waveform's length.
    * Equal lengths are passed through untouched.

    The result is finally limited to its first ``max_length`` samples.
    """
    n_wav, n_noise = wav.shape[1], noise.shape[1]
    gap = n_wav - n_noise

    if gap > 0:
        # Embed the short noise clip somewhere inside a silent buffer.
        offset = int(gap*random.uniform(0, 1))
        canvas = torch.zeros_like(wav)
        canvas[:, offset: offset + n_noise] = noise
        noise = canvas
    elif gap < 0:
        # Take a random window out of the long noise recording.
        offset = int(-gap*random.uniform(0, 1))
        noise = noise[:, offset: offset + n_wav]

    return noise[:, :max_length]

def randomNoise(noise_directory):
    """Load one randomly chosen ``.wav`` file from ``noise_directory``.

    Returns only the waveform tensor; the sample rate reported by
    ``torchaudio.load`` is discarded.
    """
    candidates = []
    for entry in os.listdir(noise_directory):
        if entry.endswith('.wav'):
            candidates.append(entry)

    chosen = random.choice(candidates)
    waveform, _ = torchaudio.load(os.path.join(noise_directory, chosen))
    return waveform

def addNoise(wav, noise):
    """Mix ``noise`` into ``wav`` at an SNR drawn uniformly from 0 to 15.

    The noise is first length-matched to ``wav`` with ``normalzieNoise`` and
    then combined through ``torchaudio.transforms.AddNoise``.
    """
    fitted_noise = normalzieNoise(wav, noise)
    snr = torch.Tensor([random.uniform(0, 15)])
    mixer = torchaudio.transforms.AddNoise()
    return mixer(wav, fitted_noise, snr = snr)

class AddBGNoise(nn.Module):
    """Randomly mix a background-noise recording into a waveform.

    Parameters
    ----------
    noise_path : str
        Directory containing the ``.wav`` noise recordings. Defaults to the
        ``_background_noise_`` folder of the Speech Commands download used by
        this notebook.
    p : float
        Probability of adding noise on a given call (default 0.8).
    """

    def __init__(self,
                 noise_path = '/content/SpeechCommands/speech_commands_v0.02/_background_noise_',
                 p = 0.8):
        super().__init__()
        # Attribute name kept in upper case for backward compatibility.
        self.NOISE_PATH = noise_path
        self.p = p

    def forward(self, x):
        """Return ``x`` with noise added with probability ``self.p``, else ``x`` unchanged."""
        draw = random.uniform(0, 1)
        if draw <= self.p:
            noise = randomNoise(self.NOISE_PATH)
            x = addNoise(x, noise)
        return x

class GSC_TrainAugment(nn.Module):
    """Training-time augmentation followed by MFCC extraction.

    Pipeline applied by ``forward``:
    random speed change (resample) -> random circular time shift ->
    pad to ``sr`` samples -> random background noise -> MFCC.

    Parameters
    ----------
    sr : int
        Sample rate of the incoming waveforms; also used as the target clip
        length in samples (i.e. one second).
    """

    def __init__(self, sr):
        super().__init__()
        self.sr = sr
        # Resample modules are cached per target rate. The cache is a plain
        # dict (not registered sub-modules) because it is only a lazily
        # filled lookup table for CPU preprocessing.
        self._resamplers = {}
        self.add_noise = AddBGNoise()
        self.mfcc = torchaudio.transforms.MFCC(sample_rate = sr,
                                    n_mfcc = 80,
                                    melkwargs = {
                                        'f_min': 0,
                                        'f_max': 2048,
                                        'n_mels': 80,
                                        'win_length': 400,
                                        'hop_length': 160,
                                        'n_fft': 400
                                    })
        #self.specaugment = torchaudio.transforms.SpecAugment(n_time_masks = 2,
        #                              time_mask_param = 20,
        #                              n_freq_masks = 2,
        #                              freq_mask_param = 40)

    def resample(self, x):
        """Resample ``x`` to a rate drawn afresh on every call (90%..110% of ``sr``).

        The factor used to be drawn once in ``__init__``, so every clip got
        the same speed change. It is now drawn per call, in 1% steps: this
        keeps the target rate a "round" multiple of ``sr / 100`` so the
        resampling kernel stays small, and bounds the cache at 21 entries.
        """
        percent = random.randint(90, 110)
        new_sr = int(round(self.sr * percent / 100))
        resampler = self._resamplers.get(new_sr)
        if resampler is None:
            resampler = torchaudio.transforms.Resample(self.sr, new_sr)
            self._resamplers[new_sr] = resampler
        return resampler(x)

    def time_shift(self, x):
        """Circularly shift ``x`` by a random offset in [-0.05, 0.05] seconds."""
        # Inside a method body the bare name resolves to the module-level
        # time_shift function, not to this method.
        return time_shift(x, random.uniform(-0.05, 0.05), sr = self.sr, max_length = self.sr)

    def pad_trunc(self, x):
        """Bring ``x`` to ``sr`` samples via the module-level ``pad_truncate``."""
        return pad_truncate(x, self.sr)

    def forward(self, x):
        """Augment a ``(channels, samples)`` waveform and return its MFCC features."""
        x = self.resample(x)
        x = self.time_shift(x)
        x = self.pad_trunc(x)
        x = self.add_noise(x)
        x = self.mfcc(x)
        #x = self.specaugment(x)
        return x

class GSC_TestAugment(nn.Module):
    """Deterministic evaluation pipeline: ``pad_truncate`` to ``sr`` samples, then MFCC."""

    def __init__(self, sr):
        super().__init__()
        mel_options = dict(
            f_min = 0,
            f_max = 2048,
            n_mels = 80,
            win_length = 400,
            hop_length = 160,
            n_fft = 400,
        )
        self.mfcc = torchaudio.transforms.MFCC(sample_rate = 16000,
                                               n_mfcc = 80,
                                               melkwargs = mel_options)
        # `sr` doubles as the target clip length in samples.
        self.pad_trunc = lambda x: pad_truncate(x, sr)

    def forward(self, x):
        """Return the MFCC features of the length-adjusted waveform ``x``."""
        return self.mfcc(self.pad_trunc(x))

# Instantiate the augmenting (train) and deterministic (val/test) pipelines,
# both configured with the value 16000.
train_transform, test_transform = GSC_TrainAugment(16000), GSC_TestAugment(16000)



In [5]:
from GSC import download_GSC

In [7]:
# Google Drive links, in the order download_GSC receives them. The captured
# output shows the first three being saved as train.zip, val.zip and test.zip.
zip_urls = (
    'https://drive.google.com/file/d/1hVXW_zSMNlJXxza_3Ojtgrq16q45dt3k/view?usp=drive_link',
    'https://drive.google.com/file/d/1-19CGvTGwZopQcT7VA0mxAGwvCjgx-cF/view?usp=drive_link',
    'https://drive.google.com/file/d/1-CdPCNkYcprSzlqGXgho_wICSjYPS55B/view?usp=drive_link',
)
csv_urls = (
    'https://drive.google.com/file/d/1-EZJqm6VLTLkCFefpqBNX25vre9r9HoX/view?usp=drive_link',
    'https://drive.google.com/file/d/1ZEJLFxuevd44xwFIVfJv2F7S2YHUPz9k/view?usp=drive_link',
    'https://drive.google.com/file/d/1-1ouA4TZGbwlpemo-1dScGlHCQjTwQDr/view?usp=drive_link',
)

# Both result maps are indexed by 'train' / 'val' / 'test' in later cells.
ZIP_MAP = download_GSC(*zip_urls, '/content/tf_mdata', end = '.zip')
CSV_MAP = download_GSC(*csv_urls, '/content/tf_mdata', end = '.csv')

Downloading...
From (original): https://drive.google.com/uc?id=1hVXW_zSMNlJXxza_3Ojtgrq16q45dt3k
From (redirected): https://drive.google.com/uc?id=1hVXW_zSMNlJXxza_3Ojtgrq16q45dt3k&confirm=t&uuid=7cdbe6e2-4831-48c8-a9e6-fdf5ff75534c
To: /content/tf_mdata/train.zip
100%|██████████| 1.98G/1.98G [00:17<00:00, 110MB/s]
Downloading...
From (original): https://drive.google.com/uc?id=1-19CGvTGwZopQcT7VA0mxAGwvCjgx-cF
From (redirected): https://drive.google.com/uc?id=1-19CGvTGwZopQcT7VA0mxAGwvCjgx-cF&confirm=t&uuid=34df9204-c61e-43f8-9bd2-8219304e4f21
To: /content/tf_mdata/val.zip
100%|██████████| 234M/234M [00:01<00:00, 191MB/s]
Downloading...
From (original): https://drive.google.com/uc?id=1-CdPCNkYcprSzlqGXgho_wICSjYPS55B
From (redirected): https://drive.google.com/uc?id=1-CdPCNkYcprSzlqGXgho_wICSjYPS55B&confirm=t&uuid=35eab0e5-0cd3-40a9-81c2-a19748978968
To: /content/tf_mdata/test.zip
100%|██████████| 113M/113M [00:02<00:00, 49.9MB/s]
Downloading...
From: https://drive.google.com/uc?id=1-E

In [11]:
def GSC_preprocessing(csv_file, root, output_directory, num_classes = 12, transform = None,
                       mul_factor = 1, set = 'train', csv_file_name = 'analysised_spec.csv'):
    """
    Preprocessing for each dataset

    Reads every sample listed in ``csv_file``, applies ``transform`` and stores
    the result as ``<set>_<label>_<row>_<pass>.npz`` in ``output_directory``.
    A metadata CSV with columns ``link`` and ``label`` is written to
    ``csv_file_name``.

    Parameters
    ----------
    csv_file : str
        Input metadata CSV with at least the columns ``link`` and ``label``.
    root : str
        Directory that the ``link`` column is relative to. Each linked file
        is an ``.npz`` archive whose array is stored under ``arr_0``.
    output_directory : str
        Directory receiving the processed ``.npz`` files; created if missing.
    num_classes : int
        Unused; kept so existing calls keep working.
    transform : callable or None
        Applied to the float waveform of shape ``(1, samples)``. ``None``
        stores the raw array unchanged.
    mul_factor : int
        increasing the number of data samples by mul_factor times.
        Only useful together with a randomised ``transform``.
    set : str
        Split name; used as file-name prefix and as the directory component
        of the ``link`` values written to the output CSV.
    csv_file_name : str
        Destination of the output metadata CSV.

    Notes
    -----
    Samples whose output file already exists are skipped without reading the
    input, so an interrupted run can be resumed cheaply.
    """
    data_df = pd.read_csv(csv_file)
    links = data_df['link'].tolist()
    labels = data_df['label'].tolist()

    # np.savez_compressed does not create missing directories.
    os.makedirs(output_directory, exist_ok = True)

    out_links, out_labels = [], []
    for idx in range(mul_factor):
        for ix in tqdm(range(len(data_df))):
            label = labels[ix]
            fname = f'{set}_{label}_{ix}_{idx}.npz'
            out_links.append(os.path.join(set, fname))
            out_labels.append(label)

            out_path = os.path.join(output_directory, fname)
            if os.path.exists(out_path):
                continue

            # Close the archive right away instead of leaking one file handle
            # per sample until garbage collection.
            with np.load(os.path.join(root, links[ix])) as archive:
                wav = archive['arr_0']

            wav = torch.from_numpy(wav).unsqueeze(0)

            # Identity check: some modules (e.g. an empty nn.Sequential) are falsy.
            if transform is not None:
                wav = transform(wav.float())

            np.savez_compressed(out_path, wav.squeeze(0).numpy())

    out_df = pd.DataFrame({'link': out_links, 'label': out_labels})
    csv_dir = os.path.dirname(csv_file_name)
    if csv_dir:
        os.makedirs(csv_dir, exist_ok = True)
    out_df.to_csv(csv_file_name, index = False)

In [9]:
from GSC_zip import unzipzip, zipzip

In [10]:
# Extract each downloaded split archive into its own directory.
for split_name, target_dir in (
    ('train', '/content/train'),
    ('val', '/content/val'),
    ('test', '/content/test'),
):
    unzipzip(ZIP_MAP[split_name], target_dir)

Extracted /content/tf_mdata/train.zip
Extracted /content/tf_mdata/val.zip
Extracted /content/tf_mdata/test.zip


In [None]:
# Export validation features with the deterministic test-time transform.
GSC_preprocessing(
    csv_file = CSV_MAP['val'],
    root = '/content',
    output_directory = '/content/val_3',
    transform = test_transform,
    set = 'val',
    csv_file_name = '/content/drive/MyDrive/GSC/Background_Noise/GSC_12_MFCC/val_12_mfcc.csv',
)

0/10102
1000/10102
2000/10102
3000/10102
4000/10102
5000/10102
6000/10102
7000/10102
8000/10102
9000/10102
10000/10102


In [None]:
# Export test features with the deterministic test-time transform.
GSC_preprocessing(
    csv_file = CSV_MAP['test'],
    root = '/content',
    output_directory = '/content/test_3',
    transform = test_transform,
    set = 'test',
    csv_file_name = '/content/drive/MyDrive/GSC/Background_Noise/GSC_12_MFCC/test_12_mfcc.csv',
)

0/4890
1000/4890
2000/4890
3000/4890
4000/4890


In [17]:
# Export training features with the augmenting transform; mul_factor = 2
# makes two passes over the data, so each clip is exported twice.
GSC_preprocessing(
    csv_file = CSV_MAP['train'],
    root = '/content',
    output_directory = '/content/train_3',
    transform = train_transform,
    mul_factor = 2,
    set = 'train',
    csv_file_name = '/content/drive/MyDrive/GSC/Background_Noise/GSC_12_MFCC_nonspec/train_12_mfcc_noise_2.csv',
)

100%|██████████| 85511/85511 [02:30<00:00, 566.50it/s]
100%|██████████| 85511/85511 [42:48<00:00, 33.29it/s]


In [18]:
# Archive the exported training features to Drive.
zipzip(
    '/content/train_3',
    '/content/drive/MyDrive/GSC/Background_Noise/GSC_12_MFCC_nonspec/train_12_mfcc_noise_2.zip',
)

zipping...: 100%|██████████| 171022/171022 [05:09<00:00, 553.30it/s]


/content/drive/MyDrive/GSC/Background_Noise/GSC_12_MFCC_nonspec/train_12_mfcc_noise_2.zip created


In [None]:
# Archive the exported val / test / train feature directories to Drive.
# NOTE(review): '/content/train_3' is also archived under GSC_12_MFCC_nonspec
# in another cell - confirm which export this train archive is meant to hold.
for feature_dir, archive_path in (
    ('/content/val_3', '/content/drive/MyDrive/GSC/Background_Noise/GSC_12_MFCC/val_12_mfcc.zip'),
    ('/content/test_3', '/content/drive/MyDrive/GSC/Background_Noise/GSC_12_MFCC/test_12_mfcc.zip'),
    ('/content/train_3', '/content/drive/MyDrive/GSC/Background_Noise/GSC_12_MFCC/train_12_mfcc.zip'),
):
    zipzip(feature_dir, archive_path)


zipping...:   0%|          | 0/10102 [00:00<?, ?it/s][A
zipping...:   0%|          | 24/10102 [00:00<00:42, 238.05it/s][A
zipping...:   0%|          | 50/10102 [00:00<00:40, 249.18it/s][A
zipping...:   1%|          | 75/10102 [00:00<00:41, 243.57it/s][A
zipping...:   1%|          | 101/10102 [00:00<00:40, 249.12it/s][A
zipping...:   1%|          | 126/10102 [00:00<00:40, 248.77it/s][A
zipping...:   1%|▏         | 151/10102 [00:00<00:42, 236.89it/s][A
zipping...:   2%|▏         | 175/10102 [00:00<00:42, 232.98it/s][A
zipping...:   2%|▏         | 199/10102 [00:00<00:44, 224.44it/s][A
zipping...:   2%|▏         | 225/10102 [00:00<00:42, 234.61it/s][A
zipping...:   3%|▎         | 254/10102 [00:01<00:39, 250.07it/s][A
zipping...:   3%|▎         | 283/10102 [00:01<00:37, 260.06it/s][A
zipping...:   3%|▎         | 311/10102 [00:01<00:36, 265.69it/s][A
zipping...:   3%|▎         | 338/10102 [00:01<00:36, 263.96it/s][A
zipping...:   4%|▎         | 367/10102 [00:01<00:35, 271.63it

/content/drive/MyDrive/GSC/Background_Noise/GSC_12_MFCC/val_12_mfcc.zip created



zipping...:   0%|          | 0/4890 [00:00<?, ?it/s][A
zipping...:   1%|          | 26/4890 [00:00<00:18, 259.20it/s][A
zipping...:   1%|          | 55/4890 [00:00<00:17, 273.73it/s][A
zipping...:   2%|▏         | 83/4890 [00:00<00:17, 275.77it/s][A
zipping...:   2%|▏         | 111/4890 [00:00<00:17, 274.50it/s][A
zipping...:   3%|▎         | 139/4890 [00:00<00:17, 272.05it/s][A
zipping...:   3%|▎         | 171/4890 [00:00<00:16, 285.65it/s][A
zipping...:   4%|▍         | 200/4890 [00:00<00:16, 285.59it/s][A
zipping...:   5%|▍         | 231/4890 [00:00<00:16, 290.55it/s][A
zipping...:   5%|▌         | 264/4890 [00:00<00:15, 301.17it/s][A
zipping...:   6%|▌         | 297/4890 [00:01<00:14, 307.98it/s][A
zipping...:   7%|▋         | 334/4890 [00:01<00:14, 325.20it/s][A
zipping...:   8%|▊         | 367/4890 [00:01<00:14, 313.20it/s][A
zipping...:   8%|▊         | 400/4890 [00:01<00:14, 316.51it/s][A
zipping...:   9%|▉         | 437/4890 [00:01<00:13, 330.88it/s][A
zipping.

/content/drive/MyDrive/GSC/Background_Noise/GSC_12_MFCC/test_12_mfcc.zip created



zipping...:   0%|          | 0/85511 [00:00<?, ?it/s][A
zipping...:   0%|          | 25/85511 [00:00<05:43, 249.11it/s][A
zipping...:   0%|          | 50/85511 [00:00<05:50, 243.67it/s][A
zipping...:   0%|          | 80/85511 [00:00<05:26, 261.30it/s][A
zipping...:   0%|          | 113/85511 [00:00<04:59, 284.67it/s][A
zipping...:   0%|          | 150/85511 [00:00<04:33, 312.23it/s][A
zipping...:   0%|          | 182/85511 [00:00<04:39, 305.58it/s][A
zipping...:   0%|          | 213/85511 [00:00<04:43, 300.56it/s][A
zipping...:   0%|          | 246/85511 [00:00<04:37, 306.96it/s][A
zipping...:   0%|          | 277/85511 [00:00<04:41, 302.69it/s][A
zipping...:   0%|          | 311/85511 [00:01<04:32, 312.83it/s][A
zipping...:   0%|          | 343/85511 [00:01<04:32, 312.18it/s][A
zipping...:   0%|          | 375/85511 [00:01<04:35, 309.00it/s][A
zipping...:   0%|          | 408/85511 [00:01<04:32, 312.47it/s][A
zipping...:   1%|          | 440/85511 [00:01<04:39, 303.93it

/content/drive/MyDrive/GSC/Background_Noise/GSC_12_MFCC/train_12_mfcc.zip created


In [None]:
# train_transform ends with an MFCC stage, so its output is a feature matrix
# rather than a waveform: it cannot be played back with ipd.Audio. Show the
# features as an image instead.
wav_ = train_transform(wav)
print(wav_.shape)

fig, ax = plt.subplots()
im = ax.imshow(wav_[0].numpy(), origin = 'lower', aspect = 'auto')
ax.set(title = 'Augmented MFCC features', xlabel = 'Frame', ylabel = 'MFCC coefficient')
fig.colorbar(im, ax = ax);

In [None]:
# Two independent speed perturbations of the same clip, each with its own
# randomly drawn target rate between 90% and 110% of the original.
rate_a = int(sr*random.uniform(0.9, 1.1))
resample = torchaudio.transforms.Resample(sr, rate_a)
wav1 = resample(wav)

rate_b = int(sr*random.uniform(0.9, 1.1))
resample = torchaudio.transforms.Resample(sr, rate_b)
wav2 = resample(wav)

In [None]:
# Compare the lengths of the two resampled clips.
print(wav1.shape, wav2.shape, sep = '\n')

torch.Size([1, 54370])
torch.Size([1, 49878])


In [None]:
wav.shape

torch.Size([1, 54400])

In [None]:
ipd.Audio(wav, rate = sr)

In [None]:
ipd.Audio(wav1, rate = sr)

In [None]:
ipd.Audio(wav2, rate = sr)

In [None]:
def time_shift(wav, shift, sr = 16000, max_length = 16000):
    """Circularly shift a waveform in time and keep the first ``max_length`` samples.

    Parameters
    ----------
    wav : torch.Tensor
        Waveform shaped ``(channels, samples)``.
    shift : float
        Unit: Seconds. Positive values delay the signal, negative values
        advance it. Samples pushed past one end wrap around to the other
        (``torch.roll`` semantics); they are not replaced by silence.
    sr : int
        Sample rate used to convert ``shift`` to a number of samples.
    max_length : int
        Maximum number of samples kept after shifting.

    Returns
    -------
    torch.Tensor
        Shifted waveform with at most ``max_length`` samples per channel.
    """
    # Roll along the time axis only. Without `dims`, torch.roll flattens the
    # tensor first, which leaks samples from one channel into the next.
    wav = torch.roll(wav, int(shift*sr), dims = -1)
    return wav[:, :max_length]

## Add Noise

In [None]:
# Each call draws a fresh random offset in [-0.05, 0.05] seconds, so a single
# helper is enough to produce two differently shifted copies of the clip.
tshift = lambda x: time_shift(x, random.uniform(-0.05, 0.05))
wav1, wav2 = tshift(wav), tshift(wav)
print(wav1.shape, wav2.shape, sep = '\n')

torch.Size([1, 16000])
torch.Size([1, 16000])


In [None]:
wav1

tensor([[ 0.0112,  0.0143,  0.0172,  ..., -0.1933, -0.1866, -0.1527]])

In [None]:
wav2

tensor([[0.0045, 0.0045, 0.0045,  ..., 0.0059, 0.0148, 0.0242]])

In [None]:
ipd.Audio(wav, rate = sr)

In [None]:
ipd.Audio(wav1, rate = sr)

In [None]:
ipd.Audio(wav2, rate = sr)