In [None]:
import os
import pickle
import random

import lmdb

from datasets.lmdb_clean_noisy_dataset import LmdbCleanNoisyDataset
from utils.audio_utils import spectrogram_to_waveform, load_waveform, \
  mix_waveforms, \
  divide_waveform, waveform_to_spectrogram
from utils.denoise_utils import denoise_waveform
from utils.file_utils import get_file_paths
from utils.plot_utils import show_spectrogram, show_waveform

In [None]:
# REQUIREMENTS
# - download the source audio into ../_audio_data (the sub-folders expected
#   below are LibriSpeech/train-clean-100 and ESC-50-master/audio)
# - install ffmpeg

# Root folder of the downloaded source audio.
data_dir = '../_audio_data'
# Sub-paths are appended to data_dir by plain string concatenation in the
# cells below, hence the leading slash.
clean_dir = '/LibriSpeech/train-clean-100'
sound_dir = '/ESC-50-master/audio'

# Destination folder of the generated LMDB dataset (created if missing).
output_dir = '../_datasets/'
os.makedirs(output_dir, exist_ok=True)
output_lmdb_path = os.path.join(output_dir, "librispeech_clean_noisy_dataset.lmdb")

# Upper bound on the number of clean recordings to process; lower it for a
# quick trial run.
files_count = 10_000_000

# Seed the global RNG so the random choice of background sound and mixing
# volume is the same on every Restart & Run All.
SEED = 42
random.seed(SEED)

In [None]:
# Collect the clean speech recordings and keep at most files_count of them.
# 'flac' is presumably the file-extension filter of get_file_paths - confirm
# against utils.file_utils.
clean_path = os.path.abspath(data_dir + clean_dir)
clean_files = get_file_paths([clean_path], 'flac')[:files_count]

# Fail fast: with no clean recordings the build cell would run zero
# iterations and silently produce an empty dataset.
if len(clean_files) == 0:
  raise FileNotFoundError(f"No clean 'flac' files found under {clean_path}")

In [None]:
# Collect the background sound recordings that get mixed into the clean speech.
# 'wav' is presumably the file-extension filter of get_file_paths - confirm
# against utils.file_utils.
sounds_path = os.path.abspath(data_dir + sound_dir)
sound_files = get_file_paths([sounds_path], 'wav')

# Fail fast: random.choice() on an empty list would otherwise raise an opaque
# IndexError in the middle of the LMDB write transaction.
if len(sound_files) == 0:
  raise FileNotFoundError(f"No background 'wav' files found under {sounds_path}")

In [None]:
# Build the LMDB dataset: each clean recording is mixed with one randomly
# chosen background sound, both versions are divided into chunks, and every
# aligned (noisy, clean) pair of chunk spectrograms is pickled under its own key.

# Candidate mixing volumes (about 0.5 to 0.9 in 0.1 steps); built once instead
# of on every loop iteration.
background_volumes = [i / 10 + 0.5 for i in range(0, 5)]

# Running number of stored pairs; also used to build the record keys.
counter = 0

# map_size is the maximum size the database may grow to, in bytes.
env = lmdb.open(output_lmdb_path, map_size=int(100e9))
try:
  # One write transaction for the whole build: it is committed when the
  # with-block exits normally and aborted if an exception escapes.
  with env.begin(write=True) as txn:
    for idx, clean_file in enumerate(clean_files):
      sound_file = random.choice(sound_files)
      clean_sample, clean_rate = load_waveform(clean_file)
      sound_sample, _ = load_waveform(sound_file)

      background_volume = random.choice(background_volumes)
      noisy_sample = mix_waveforms(clean_sample, sound_sample, background_volume)

      # The noisy mix above is made from the raw recording; only the clean
      # target is denoised, and only after mixing.
      clean_sample = denoise_waveform(clean_sample, clean_rate)

      clean_chunks = divide_waveform(clean_sample)
      noisy_chunks = divide_waveform(noisy_sample)

      # zip() stops at the shorter sequence, so unmatched trailing chunks
      # are dropped.
      for clean_chunk, noisy_chunk in zip(clean_chunks, noisy_chunks):
        data_pair = {
          "noisy": waveform_to_spectrogram(noisy_chunk),
          "clean": waveform_to_spectrogram(clean_chunk)
        }
        key = f"spectrogram_{counter:06d}".encode("ascii")
        txn.put(key, pickle.dumps(data_pair))
        counter += 1

      if (idx + 1) % 100 == 0:
        print(f"Processed {idx + 1}/{len(clean_files)} files. Total spectrograms: {counter}")
finally:
  # Always release the environment: the same LMDB path must not be open twice
  # in one process, and a later cell opens it again for reading.
  env.close()

print(f"Total spectrograms: {counter}")

In [None]:
# Sanity check: read one randomly chosen pair back from the written LMDB.
dataset = LmdbCleanNoisyDataset(output_lmdb_path)
# NOTE(review): assumes the dataset holds at least 101 entries - confirm, or
# bound the index by the dataset length if the class exposes it.
idx = random.randint(0, 100)
# Fetch the item once so both halves come from the same read; position 0 is
# the noisy spectrogram and position 1 the clean one.
sample = dataset[idx]
noisy_spectrogram = sample[0]
clean_spectrogram = sample[1]

In [None]:
# Convert the sampled pair back to waveforms, clean first and then noisy.
clean_waveform, noisy_waveform = (
  spectrogram_to_waveform(spec)
  for spec in (clean_spectrogram, noisy_spectrogram)
)

In [None]:
# Plot the spectrogram view of both reconstructed waveforms, clean first.
# Note that show_spectrogram is handed the waveforms, not the stored spectrograms.
for waveform, title in (
  (clean_waveform, "Clean spectrogram"),
  (noisy_waveform, "Noisy spectrogram"),
):
  show_spectrogram(waveform, title)

In [None]:
# Plot both reconstructed waveforms, clean first.
for waveform, title in (
  (clean_waveform, "Clean waveform"),
  (noisy_waveform, "Noisy waveform"),
):
  show_waveform(waveform, title)