In [1]:
import torchaudio
#Preprocessing for ResNet
torchaudio.set_audio_backend("sox_io")
import glob
import os
import pandas as pd
import torchaudio
import torch
import torch.nn as nn
from torchaudio import transforms as T
import h5py
import numpy as np

#paths to our training/validation sets
train_path = "/Users/zaher/Desktop/Project/Training_Set"
val_path = "/Users/zaher/Desktop/Project/Validation_Set"


#parameters for Audio Preproccessing
TARGET_SR = 22050
N_FFT = 1024
N_MELS = 128
HOP_MEL = 256
FMIN = 0
FMAX = TARGET_SR // 2
fps = TARGET_SR / HOP_MEL
WIN_LEN = 1000
SEG_LEN = WIN_LEN // 2
win_len = int(round((WIN_LEN / 1000) * fps))
seg_hop = int(round((SEG_LEN / 1000) * fps))
#transformations that converts audo parameters into Mel spectrograms and then we convert that to decibel 
mel = T.MelSpectrogram(sample_rate=TARGET_SR, n_fft=N_FFT, hop_length=HOP_MEL, f_min=FMIN, f_max=FMAX, n_mels=N_MELS)
transformation = nn.Sequential(mel, T.AmplitudeToDB())

#this function proccesses audio file
def PRE_process_audio(wav_file, df, is_training=True):
    #first of all we load the audio file then we RESAMPLE it to the target sample rate
    wav, sr = torchaudio.load(wav_file)
    resample = T.Resample(sr, TARGET_SR)
    wav = resample(wav)
    if wav.shape[0] != 1:
        wav = torch.mean(wav, dim=0, keepdim=True)
        #and finally apply the trainsformation to our wav file
    melspec = transformation(wav)
    #then we create lists to store these data segments their labels start and end time for each action

    segments = []
    labels = []
    start_times = []
    end_times = []

    for i in range(len(df)):
        #now we iterate over all of the rows in our CSV 
        row_num = df.iloc[i]
        if is_training:
            #give a default label to all of the values as 0 then if the label is positive we change it to 1 to distiinguish
            label = 0
            for col in df.columns:
                if col not in ['Audiofilename', 'Starttime', 'Endtime']:
                    #if the label is positive change it's label attribute to 1 for that ith row
                    if row_num[col] == 'POS':
                        label = 1
                        break
        else:
            label = 1 

        onset = int(round(row_num['Starttime'] * fps))
        offset = int(round(row_num['Endtime'] * fps))
        start_idx = onset

        #now we check if our segment is larger than window length if it is then we split it into smaller windows

        if offset - start_idx > win_len:
            while offset - start_idx > win_len:
                spec = melspec[:, :, start_idx:start_idx + win_len]
                if spec.sum() == 0 or torch.isnan(spec).any():
                    start_idx += seg_hop
                    continue
                    #normalise our segment
                spec = (spec - spec.min()) / (spec.max() - spec.min())
                segments.append(spec)
                labels.append(label)
                start_times.append(start_idx / fps)
                end_times.append((start_idx + win_len) / fps)
                start_idx += seg_hop
                #else if the segment is shorter we need to pad it
        else:
            if offset - start_idx > win_len // 8:
                spec = melspec[:, :, start_idx:offset]
                if spec.sum() == 0 or torch.isnan(spec).any():
                    continue
                spec = torch.nn.functional.pad(spec, (0, win_len - spec.shape[-1]))
                spec = (spec - spec.min()) / (spec.max() - spec.min())
                segments.append(spec)
                labels.append(label)
                start_times.append(start_idx / fps)
                end_times.append(offset / fps)
#we return our segment with it's labels start and end time
    return segments, labels, start_times, end_times
#this funtion is to preproccess our dataset we call the PRE_proccess here
def preprocess_dataset(path, is_training=True):
    subdirs = [d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))]
    all_data = []

    for subdir in subdirs:
        subdir_path = os.path.join(path, subdir)
        #find all CSV files in the directory
        csv_files = glob.glob(os.path.join(subdir_path, '*.csv'))

        for csv_file in csv_files:
            df = pd.read_csv(csv_file)
            #after we find csv files for each one we look for the same name but with a WAV file extension and make sure it exists
            wav_file = csv_file.replace('.csv', '.wav')
            if not os.path.exists(wav_file):
                print(f" skipping {wav_file} this file doesn't exist.")
                continue

            print(f"Processing {wav_file}.")
            #we send our csv file alongside wav file to preproccess function to preproccess it andd save it
            segments, labels, start_times, end_times = PRE_process_audio(wav_file, df, is_training)
            
            if len(segments) == 0:
                print(f"No segments were successfully extracted from {wav_file}")
                continue
            #add it to our array
            all_data.append({'file': os.path.basename(wav_file),
                'segments': segments,
                'labels': labels,
                'start_times': start_times,
                'end_times': end_times})
#retur our preproccessed data
    return all_data

print("Starting Preproccessing to Training Data")
train_data = preprocess_dataset(train_path, is_training=True)
# save the file into an h5 in the train/val path
hdf_Training = os.path.join(train_path, 'train_all.h5')
with h5py.File(hdf_Training, 'w') as hf:
    #go through the dataset and skip files with no extracted segments
    for i, item in enumerate(train_data):
        if len(item['segments']) == 0:
            print(f"Skipping {item['file']} as it doesn't have any extracted segments.")
            continue
            #create a group for each file
        group = hf.create_group(f'file_{i}')
        #save the file name alongside with the labels segments, start and end times
        group.create_dataset('file', data=item['file'])
        group.create_dataset('segments', data=torch.stack(item['segments']).numpy())
        group.create_dataset('labels', data=np.array(item['labels']))
        group.create_dataset('start_times', data=np.array(item['start_times']))
        group.create_dataset('end_times', data=np.array(item['end_times']))

print(f"Training data finished and has been saved to {hdf_Training}")

print("\nStarting Preproccessing to Training Data")
val_data = preprocess_dataset(val_path, is_training=False)

hdf_val = os.path.join(val_path, 'val_all.h5')
with h5py.File(hdf_val, 'w') as hf:
    for i, item in enumerate(val_data):
        if len(item['segments']) == 0:
            print(f"Skipping {item['file']} as it doesn't have any extracted segments.")
            continue
        group = hf.create_group(f'file_{i}')
        group.create_dataset('file', data=item['file'])
        group.create_dataset('segments', data=torch.stack(item['segments']).numpy())
        group.create_dataset('labels', data=np.array(item['labels']))
        group.create_dataset('start_times', data=np.array(item['start_times']))
        group.create_dataset('end_times', data=np.array(item['end_times']))

print(f"Validation data finished and has been saved to {hdf_val}")

print("Preprocessing completed.")

  torchaudio.set_audio_backend("sox_io")


Starting Preproccessing to Training Data
Processing /Users/zaher/Desktop/Project/Training_Set/MT/dcase_MK2.wav.
Processing /Users/zaher/Desktop/Project/Training_Set/MT/dcase_MK1.wav.
Processing /Users/zaher/Desktop/Project/Training_Set/BV/2015-09-25_04-00-00_unit10.wav.
Processing /Users/zaher/Desktop/Project/Training_Set/BV/2015-10-14_23-59-59_unit05.wav.
Processing /Users/zaher/Desktop/Project/Training_Set/BV/2015-09-11_06-00-00_unit07.wav.
Processing /Users/zaher/Desktop/Project/Training_Set/BV/2015-09-21_06-00-00_unit05.wav.
Processing /Users/zaher/Desktop/Project/Training_Set/BV/2015-09-04_08-04-59_unit03.wav.
Processing /Users/zaher/Desktop/Project/Training_Set/HT/n1.wav.
Processing /Users/zaher/Desktop/Project/Training_Set/HT/e1.wav.
Processing /Users/zaher/Desktop/Project/Training_Set/HT/h1.wav.
Processing /Users/zaher/Desktop/Project/Training_Set/HT/a1.wav.
Processing /Users/zaher/Desktop/Project/Training_Set/HT/y1.wav.
Processing /Users/zaher/Desktop/Project/Training_Set/WMW/

In [5]:
import torchaudio
import os
import pandas as pd
import torch
import torch.nn as nn
from torchaudio import transforms as T
import h5py
import numpy as np
import glob

#setting audio backend for Mac
torchaudio.set_audio_backend("sox_io")

#paths to evaluation dataset
eval_path = "/Users/zaher/Desktop/Project/eval_2"

#parameters for Audio Preprocessing
TARGET_SR = 22050
N_FFT = 1024
N_MELS = 128
HOP_MEL = 256
FMIN = 0
FMAX = TARGET_SR // 2
fps = TARGET_SR / HOP_MEL
WIN_LEN = 1000
SEG_LEN = WIN_LEN // 2
win_len = int(round((WIN_LEN / 1000) * fps))
seg_hop = int(round((SEG_LEN / 1000) * fps))

mel = torchaudio.transforms.MelSpectrogram(sample_rate=TARGET_SR, n_fft=N_FFT, hop_length=HOP_MEL, f_min=FMIN, f_max=FMAX, n_mels=N_MELS)
transformation = nn.Sequential(mel, torchaudio.transforms.AmplitudeToDB())

def PRE_process_audio(wav_file, df, is_training=False):
    wav, sr = torchaudio.load(wav_file)
    resample = torchaudio.transforms.Resample(sr, TARGET_SR)
    wav = resample(wav)
    
    if wav.shape[0] != 1:
        wav = torch.mean(wav, dim=0, keepdim=True)
    
    melspec = transformation(wav)
    
    segments = []
    labels = []
    start_times = []
    end_times = []

    for i in range(len(df)):
        row_num = df.iloc[i]
        label = 1  # Evaluation is assumed to be all positive as we are evaluating
    
        onset = int(round(row_num['Starttime'] * fps))
        offset = int(round(row_num['Endtime'] * fps))
        start_idx = onset

        if offset - start_idx > win_len:
            while offset - start_idx > win_len:
                spec = melspec[:, :, start_idx:start_idx + win_len]
                if spec.sum() == 0 or torch.isnan(spec).any():
                    start_idx += seg_hop
                    continue
                spec = (spec - spec.min()) / (spec.max() - spec.min())
                segments.append(spec)
                labels.append(label)
                start_times.append(start_idx / fps)
                end_times.append((start_idx + win_len) / fps)
                start_idx += seg_hop
        else:
            if offset - start_idx > win_len // 8:
                spec = melspec[:, :, start_idx:offset]
                if spec.sum() == 0 or torch.isnan(spec).any():
                    continue
                spec = torch.nn.functional.pad(spec, (0, win_len - spec.shape[-1]))
                spec = (spec - spec.min()) / (spec.max() - spec.min())
                segments.append(spec)
                labels.append(label)
                start_times.append(start_idx / fps)
                end_times.append(offset / fps)
    
    return segments, labels, start_times, end_times

def preprocess_dataset(path):
    subdirs = [d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))]
    all_data = []

    for subdir in subdirs:
        subdir_path = os.path.join(path, subdir)
        csv_files = glob.glob(os.path.join(subdir_path, '*.csv'))

        for csv_file in csv_files:
            df = pd.read_csv(csv_file)
            wav_file = csv_file.replace('.csv', '.wav')
            if not os.path.exists(wav_file):
                print(f"Skipping {wav_file} as it doesn't exist.")
                continue

            print(f"Processing {wav_file}.")
            segments, labels, start_times, end_times = PRE_process_audio(wav_file, df, is_training=False)
            
            if len(segments) == 0:
                print(f"No segments were successfully extracted from {wav_file}")
                continue
            
            all_data.append({
                'file': os.path.basename(wav_file),
                'segments': segments,
                'labels': labels,
                'start_times': start_times,
                'end_times': end_times
            })
    
    return all_data

def save_to_hdf5(data, path):
    with h5py.File(path, 'w') as hf:
        for i, item in enumerate(data):
            if len(item['segments']) == 0:
                print(f"Skipping {item['file']} as it doesn't have any extracted segments.")
                continue
            group = hf.create_group(f'file_{i}')
            group.create_dataset('file', data=item['file'])
            group.create_dataset('segments', data=torch.stack(item['segments']).numpy())
            group.create_dataset('labels', data=np.array(item['labels']))
            group.create_dataset('start_times', data=np.array(item['start_times']))
            group.create_dataset('end_times', data=np.array(item['end_times']))

print("\nStarting preprocessing of evaluation data...")
eval_data = preprocess_dataset(eval_path)
hdf_eval = os.path.join(eval_path, 'eval_all.h5')
save_to_hdf5(eval_data, hdf_eval)
print(f"Evaluation data saved to {hdf_eval}")

print("Preprocessing completed.")


  torchaudio.set_audio_backend("sox_io")



Starting preprocessing of evaluation data...
Processing /Users/zaher/Desktop/Project/eval_2/MGE/85MGE.wav.
Processing /Users/zaher/Desktop/Project/eval_2/MGE/89MGE.wav.
Processing /Users/zaher/Desktop/Project/eval_2/MGE/91MGE.wav.
Processing /Users/zaher/Desktop/Project/eval_2/MS/E3_49_20190715_0150.wav.
No segments were successfully extracted from /Users/zaher/Desktop/Project/eval_2/MS/E3_49_20190715_0150.wav
Processing /Users/zaher/Desktop/Project/eval_2/MS/E2_208_20190712_0150.wav.
Processing /Users/zaher/Desktop/Project/eval_2/MS/E4_49_20190804_0150.wav.
Processing /Users/zaher/Desktop/Project/eval_2/MS/E1_208_20190712_0150.wav.
Processing /Users/zaher/Desktop/Project/eval_2/CT/ct3.wav.
Processing /Users/zaher/Desktop/Project/eval_2/CT/ct2.wav.
Processing /Users/zaher/Desktop/Project/eval_2/CT/ct1.wav.
No segments were successfully extracted from /Users/zaher/Desktop/Project/eval_2/CT/ct1.wav
Processing /Users/zaher/Desktop/Project/eval_2/QU/QU04.wav.
No segments were successfully