In [None]:
import os
import torch
import torchvision.transforms as transforms
import torchaudio.transforms as T
import torchvision.io as io
import torchaudio
import numpy as np
from google.colab import drive
import matplotlib.pyplot as plt
import librosa
import soundfile as sf
from pydub import AudioSegment
from pydub.effects import strip_silence
# Mount Google Drive at /content/drive; the data paths defined later in this notebook live under it.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# HYPERPARAMETERS
# Clip length (seconds) and sampling rate (Hz); their product is the clip length in samples.
duration_seconds = 4
sample_rate = 44100

# Audio / mel-spectrogram settings, looked up by key elsewhere in the notebook.
hyper_params = dict(
    duration=duration_seconds * sample_rate,  # fixed clip length, in samples
    n_mels=128,
    hop_length=512,
    n_fft=2048,
    fmin=20,
    fmax=sample_rate // 2,                    # half the sampling rate
)

In [None]:
def audio_preprocess(file_path):
  """Load an audio file and return a peak-normalised, fixed-length waveform.

  Processing steps, in order:
    1. load the file, resampled to 44.1 kHz;
    2. scale the signal so its largest absolute sample is 1;
    3. trim leading/trailing silence (60 dB below the peak);
    4. drop every remaining sample whose magnitude is <= 0.001;
    5. truncate, or zero-pad on both sides, to hyper_params['duration'] samples.

  An empty or completely silent file yields an all-zero clip of the target
  length instead of raising or producing NaNs.

  Args:
    file_path: path of an audio file readable by librosa.load.

  Returns:
    (waveform, sample_rate): the processed array with exactly
    hyper_params['duration'] samples, and the rate the file was loaded at.
  """
  # The load rate must stay in step with the rate used to compute
  # hyper_params['duration'] (a sample count), otherwise clips change length in seconds.
  waveform, sample_rate = librosa.load(file_path, sr=44100)
  target_len = hyper_params['duration']

  # Normalise, since each audio file has its amplitude values in a different range.
  # Guard the peak: np.max raises on an empty array and a zero peak would divide by zero.
  peak = np.max(np.abs(waveform)) if waveform.size else 0.0
  if peak == 0:
    return np.zeros(target_len, dtype=waveform.dtype), sample_rate
  waveform = waveform / peak

  # Remove leading and trailing silence (anything more than 60 dB below the peak).
  waveform, _ = librosa.effects.trim(waveform, top_db=60)

  # Keep only samples above 0.001 in magnitude (0.001 is equivalent to a 60 dB threshold).
  # NOTE(review): this also removes low-magnitude samples inside the clip (e.g. zero
  # crossings), not only silence at the ends; kept as-is so existing features stay comparable.
  waveform = waveform[np.abs(waveform) > 0.001]

  # Force a length of exactly target_len samples: cut the tail, or centre with zero padding.
  if len(waveform) > target_len:
    waveform = waveform[:target_len]
  else:
    padding = target_len - len(waveform)
    left = padding // 2
    waveform = np.pad(waveform, (left, padding - left), 'constant')

  return waveform, sample_rate

def create_melspec(params, waveform, sampling_rate):
  """Compute a dB-scaled mel spectrogram of `waveform` as a float32 array.

  Args:
    params: mapping providing 'n_mels', 'hop_length', 'n_fft', 'fmin' and 'fmax'.
    waveform: audio samples handed to librosa as `y`.
    sampling_rate: sampling rate of `waveform`, handed to librosa as `sr`.

  Returns:
    The mel power spectrogram converted to decibels, using the spectrogram's
    own maximum as the reference, cast to float32.
  """
  # Pick out exactly the settings librosa needs; other keys in `params` are ignored.
  mel_kwargs = {key: params[key] for key in ('n_mels', 'hop_length', 'n_fft', 'fmin', 'fmax')}
  power_spec = librosa.feature.melspectrogram(y=waveform, sr=sampling_rate, **mel_kwargs)

  return librosa.power_to_db(power_spec, ref=np.max).astype(np.float32)

def display_audio(audio_file_path):
  """Plot the dB-scaled mel spectrogram of an audio file.

  The file is loaded at its native sampling rate, converted with
  create_melspec using the notebook's hyper_params, and shown with time on
  the x-axis and mel frequency on the y-axis.

  Args:
    audio_file_path: path of an audio file readable by librosa.load; it is
      also shown in the figure title.
  """
  # Import the plotting submodule explicitly: a bare `import librosa` is not
  # guaranteed to expose `librosa.display` on every librosa version.
  import librosa.display

  waveform, sample_rate = librosa.load(audio_file_path, sr=None)

  # The file is loaded at its own rate, so cap fmax at this file's Nyquist frequency.
  params = dict(hyper_params, fmax=min(hyper_params['fmax'], sample_rate // 2))
  mel_db = create_melspec(params, waveform, sample_rate)

  # Plot the mel spectrogram
  plt.figure(figsize=(10, 4))
  librosa.display.specshow(mel_db, sr=sample_rate, hop_length=params['hop_length'],
                           x_axis='time', y_axis='mel',
                           fmin=params['fmin'], fmax=params['fmax'])
  plt.xlabel('Time (s)')
  plt.ylabel('Mel Frequency')
  # plt.title takes a single label string; the path is folded into it.
  plt.title(f'Mel Spectogram: {audio_file_path}')
  plt.show()


In [None]:


# Google Drive locations: the dataset root and the folder for augmented output
data_dir = '/content/drive/My Drive/DLproject-Numpy'
output_dir = '/content/drive/My Drive/DLproject-Numpy/augmented_spectograms'

# Make sure the output folder is present before anything is written to it
if not os.path.isdir(output_dir):
    os.makedirs(output_dir, exist_ok=True)

# Sample count per class, keyed by class name
class_distribution = dict([
    ("dog_barking", 640),
    ("car_horn", 344),
    ("Fart", 291),
    ("Guitar", 548),
    ("drilling", 560),
    ("Gunshot_and_gunfire", 448),
    ("Hi-hat", 171),
    ("Knock", 168),
    ("Splash_and_splatter", 174),
    ("Snare_drum", 449),
    ("Shatter", 212),
    ("Laughter", 295),
    ("siren", 560),
])

# Classes with fewer samples than this count as underrepresented.
# It defaults to the size of the largest class; adjust it to suit your dataset.
threshold = max(count for count in class_distribution.values())


In [None]:
def augment_spectrograms(class_distribution, data_dir, output_dir, time_mask_param = 80,
                         freq_mask_param = 80, target_count = None):
    """Write time- and frequency-masked copies of the spectrograms of small classes.

    For every class whose sample count is below the target, each `.npz` file in
    `<data_dir>/train/<class_name>` is read (array stored under the key
    'mel_spec'), masked `ceil(target / num_samples)` times, and each masked copy
    is saved as `<output_dir>/<class_name>/aug<i>_<original stem>.npz` under the
    same 'mel_spec' key. Every copy starts from the unmodified spectrogram, so
    masks do not pile up from one copy to the next.

    Args:
        class_distribution: mapping of class name -> number of samples.
        data_dir: dataset root; spectrograms are read from its 'train' subfolder.
        output_dir: root folder receiving one subfolder per augmented class.
        time_mask_param: maximum time-mask width, passed to T.TimeMasking.
        freq_mask_param: maximum frequency-mask width, passed to T.FrequencyMasking.
        target_count: sample count below which a class is augmented. When None,
            the notebook-level `threshold` is used.

    Raises:
        FileNotFoundError: if the 'train' folder of a class to augment is missing.
    """
    if target_count is None:
        target_count = threshold

    # The transforms carry no per-file state, so build them once.
    # NOTE(review): masked regions are filled with the transforms' default value (0.0).
    # If the stored spectrograms are dB-scaled with 0 as the maximum, that marks the
    # masked region as loudest rather than silent -- confirm and pass mask_value if so.
    time_masking = T.TimeMasking(time_mask_param=time_mask_param)
    freq_masking = T.FrequencyMasking(freq_mask_param=freq_mask_param)

    for class_name, num_samples in class_distribution.items():
        # Skip classes that are already large enough; also avoids dividing by zero.
        if num_samples <= 0 or num_samples >= target_count:
            continue

        # Number of masked copies generated per source file of this class
        augmentation_factor = int(np.ceil(target_count / num_samples))

        class_dir = os.path.join(data_dir, 'train', class_name)
        output_class_dir = os.path.join(output_dir, class_name)
        os.makedirs(output_class_dir, exist_ok=True)

        # Sorted for a reproducible processing order
        for mel_file in sorted(os.listdir(class_dir)):
            stem, ext = os.path.splitext(mel_file)
            if ext.lower() != '.npz':
                continue  # ignore anything that is not a saved spectrogram archive

            # Close the archive as soon as the array has been read
            with np.load(os.path.join(class_dir, mel_file)) as archive:
                mel_spec = archive['mel_spec']

            # Add a leading dimension so the masking transforms see (..., freq, time)
            # regardless of torchaudio version.
            spec = torch.as_tensor(mel_spec).unsqueeze(0)

            for i in range(augmentation_factor):
                # Clone so every copy starts from the original, even if a transform
                # were to write into its input.
                augmented = freq_masking(time_masking(spec.clone())).squeeze(0)

                # One distinct file per copy; the stem keeps the link to the source file
                output_npz_path = os.path.join(output_class_dir, f"aug{i}_{stem}.npz")
                np.savez(output_npz_path, mel_spec=augmented.numpy())

In [None]:
# Generate masked copies for every class that falls below the threshold
augment_spectrograms(
    class_distribution=class_distribution,
    data_dir=data_dir,
    output_dir=output_dir,
)