In [5]:
import torch
import torchaudio
import os
import pandas as pd
import h5py
import numpy as np
from tqdm import tqdm
import glob
import gc
import json

class SimplePANN(torch.nn.Module):
    def __init__(self):
        super(SimplePANN, self).__init__()
        self.conv1 = torch.nn.Conv2d(1, 64, kernel_size=3, stride=1, padding=1)
        self.conv2 = torch.nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)
        self.pool = torch.nn.MaxPool2d(2, 2)
        self.adaptive_pool = torch.nn.AdaptiveAvgPool2d((32, 32))
        self.fc = torch.nn.Linear(128 * 32 * 32, 2048)

    def forward(self, x):
        x = self.pool(torch.relu(self.conv1(x)))
        x = self.pool(torch.relu(self.conv2(x)))
        x = self.adaptive_pool(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x

def load_audio(audio_path, target_sr=22050, chunk_duration=1):
    waveform, sr = torchaudio.load(audio_path)
    if sr != target_sr:
        waveform = torchaudio.functional.resample(waveform, sr, target_sr)
    waveform = waveform.mean(dim=0)
    
    chunk_size = chunk_duration * target_sr
    chunks = waveform.split(chunk_size)
    return chunks

def extract_pann_features(wav_chunk, pann_model, device):
    mel_spec = torchaudio.transforms.MelSpectrogram(
        sample_rate=22050,
        n_mels=64,
        n_fft=1024,
        hop_length=512
    )(wav_chunk)
    mel_spec = mel_spec.unsqueeze(0).unsqueeze(0).to(device)
    with torch.no_grad():
        chunk_features = pann_model(mel_spec)
    return chunk_features.cpu().numpy()

def extract_mel_features(wav_chunk):
    mel_spec = torchaudio.transforms.MelSpectrogram(
        sample_rate=22050,
        n_mels=64,
        n_fft=1024,
        hop_length=512
    )(wav_chunk)
    return mel_spec.numpy()

def process_audio(wav_file, df, pann_model, device, is_training=True, feature_type='pann'):
    try:
        wav_chunks = load_audio(wav_file)
        
        features = []
        for i, chunk in enumerate(wav_chunks):
            if i % 100 == 0:
                print(f"Processing chunk {i}/{len(wav_chunks)} of {wav_file}")
            if feature_type == 'pann':
                chunk_features = extract_pann_features(chunk, pann_model, device)
            else:
                chunk_features = extract_mel_features(chunk)
            features.append(chunk_features)
            
            # Clear GPU memory
            torch.cuda.empty_cache()
        
        features = np.concatenate(features, axis=0 if feature_type == 'pann' else 1)
        
        segments = []
        labels = []
        start_times = []
        end_times = []

        for i in range(len(df)):
            ith_row = df.iloc[i]
            if is_training:
                label = 0
                for col in df.columns:
                    if col not in ['Audiofilename', 'Starttime', 'Endtime']:
                        if ith_row[col] == 'POS':
                            label = 1
                            break
            else:
                label = 1 if ith_row['Q'] == 'POS' else 0

            onset = int(round(ith_row['Starttime'] * 22050 / 1024))
            offset = int(round(ith_row['Endtime'] * 22050 / 1024))
            
            if offset - onset > 0:
                if feature_type == 'mel':
                    segment = features[:, onset:offset]
                else:
                    segment = features[onset:offset]
                segments.append(segment)
                labels.append(label)
                start_times.append(ith_row['Starttime'])
                end_times.append(ith_row['Endtime'])

        return segments, labels, start_times, end_times
    except Exception as e:
        print(f"Error processing {wav_file}: {str(e)}")
        return [], [], [], []

def save_to_h5(item, filename):
    with h5py.File(filename, 'w') as hf:
        hf.create_dataset('file', data=item['file'])
        hf.create_dataset('segments', data=np.array(item['segments']))
        hf.create_dataset('labels', data=np.array(item['labels']))
        hf.create_dataset('start_times', data=np.array(item['start_times']))
        hf.create_dataset('end_times', data=np.array(item['end_times']))

def preprocess_dataset(path, pann_model, device, is_training=True, feature_type='pann'):
    subdirs = [d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))]
    checkpoint_file = f'checkpoint_{feature_type}.json'
    
    # Load checkpoint if exists
    if os.path.exists(checkpoint_file):
        with open(checkpoint_file, 'r') as f:
            checkpoint = json.load(f)
    else:
        checkpoint = {'subdir': 0, 'file': 0}

    for subdir_idx, subdir in enumerate(subdirs[checkpoint['subdir']:], start=checkpoint['subdir']):
        subdir_path = os.path.join(path, subdir)
        csv_files = glob.glob(os.path.join(subdir_path, '*.csv'))

        for file_idx, csv_file in enumerate(csv_files[checkpoint['file']:], start=checkpoint['file']):
            df = pd.read_csv(csv_file)
            wav_file = csv_file.replace('.csv', '.wav')
            if not os.path.exists(wav_file):
                print(f"skipping {wav_file} as it does not exist.")
                continue

            print(f"Processing {wav_file}")
            try:
                segments, labels, start_times, end_times = process_audio(wav_file, df, pann_model, device, is_training, feature_type)
                if len(segments) == 0:
                    print(f"No valid segments extracted from {wav_file}")
                    continue
                
                item = {
                    'file': os.path.basename(wav_file),
                    'segments': np.array(segments),
                    'labels': np.array(labels),
                    'start_times': np.array(start_times),
                    'end_times': np.array(end_times)
                }
                
                # Save data for each file immediately
                output_file = os.path.join(path, f'{feature_type}_{os.path.basename(wav_file)}.h5')
                save_to_h5(item, output_file)
                print(f"Saved processed data to {output_file}")
                
                checkpoint['file'] = file_idx + 1
                with open(checkpoint_file, 'w') as f:
                    json.dump(checkpoint, f)
                
            except Exception as e:
                continue
            
            torch.cuda.empty_cache()
            gc.collect()

        checkpoint['file'] = 0
        checkpoint['subdir'] = subdir_idx + 1
        with open(checkpoint_file, 'w') as f:
            json.dump(checkpoint, f)

if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    pann_model = SimplePANN().to(device)
    pann_model.eval()

    train_path = "/Users/zaher/Desktop/Project/Training_Set"
    val_path = "/Users/zaher/Desktop/Project/Validation_Set"

    print("Preprocessing the training data")
    preprocess_dataset(train_path, pann_model, device, is_training=True, feature_type='pann')
    preprocess_dataset(train_path, pann_model, device, is_training=True, feature_type='mel')
    print("Finished Preprocessing training data")
    print("Preprocessing validation data")
    preprocess_dataset(val_path, pann_model, device, is_training=False, feature_type='pann')
    preprocess_dataset(val_path, pann_model, device, is_training=False, feature_type='mel')
    print("Finished Preprocessing validation data")
    print("Preprocessing completed")

Preprocessing the training data
Finished Preprocessing training data
Preprocessing validation data
Processing /Users/zaher/Desktop/Project/Validation_Set/RD/RD_06.wav
Processing chunk 0/11185 of /Users/zaher/Desktop/Project/Validation_Set/RD/RD_06.wav
Processing chunk 100/11185 of /Users/zaher/Desktop/Project/Validation_Set/RD/RD_06.wav
Processing chunk 200/11185 of /Users/zaher/Desktop/Project/Validation_Set/RD/RD_06.wav
Processing chunk 300/11185 of /Users/zaher/Desktop/Project/Validation_Set/RD/RD_06.wav
Processing chunk 400/11185 of /Users/zaher/Desktop/Project/Validation_Set/RD/RD_06.wav
Processing chunk 500/11185 of /Users/zaher/Desktop/Project/Validation_Set/RD/RD_06.wav
Processing chunk 600/11185 of /Users/zaher/Desktop/Project/Validation_Set/RD/RD_06.wav
Processing chunk 700/11185 of /Users/zaher/Desktop/Project/Validation_Set/RD/RD_06.wav
Processing chunk 800/11185 of /Users/zaher/Desktop/Project/Validation_Set/RD/RD_06.wav
Processing chunk 900/11185 of /Users/zaher/Desktop/P

In [3]:
import os
import numpy as np
import librosa
import torch
import torchaudio
from tqdm import tqdm

class PANNFeatureExtractor:
    def __init__(self, sample_rate=22050, window_size=1024, hop_size=320, mel_bins=64, fmin=50, fmax=14000):
        self.sample_rate = sample_rate
        self.window_size = window_size
        self.hop_size = hop_size
        self.mel_bins = mel_bins
        self.fmin = fmin
        self.fmax = fmax

    def extract_features(self, audio):
        waveform = torch.from_numpy(audio).float()
        mel_spectrogram = torchaudio.transforms.MelSpectrogram(
            sample_rate=self.sample_rate,
            n_fft=self.window_size,
            hop_length=self.hop_size,
            n_mels=self.mel_bins,
            f_min=self.fmin,
            f_max=self.fmax
        )(waveform)
        log_mel_spectrogram = torchaudio.transforms.AmplitudeToDB()(mel_spectrogram)
        return log_mel_spectrogram.numpy()

def preprocess_evaluation_dataset(data_path, output_path):
    fe = PANNFeatureExtractor()
    
    processed_files = 0
    for root, _, files in os.walk(data_path):
        for file in tqdm(files):
            if file.endswith('.wav'):
                wav_path = os.path.join(root, file)
                print(f"Processing file: {wav_path}")
                
                audio, sr = librosa.load(wav_path, sr=fe.sample_rate)
                features = fe.extract_features(audio)
                
                save_dir = output_path
                os.makedirs(save_dir, exist_ok=True)
                save_path = os.path.join(save_dir, f"{os.path.splitext(file)[0]}.npy")
                np.save(save_path, features)
                processed_files += 1

    if processed_files == 0:
        print("No wav files  found ")
    else:
        print(f"Preprocessing completed  Processed {processed_files} files.")

if __name__ == "__main__":
    EVAL_PATH = "/Users/zaher/Desktop/Project/eval_2"
    PROCESSED_EVAL_PATH = "/Users/zaher/Desktop/Project/Processed__PANN_Evaluation_Set"

    print("Preprocessing evaluation data")
    preprocess_evaluation_dataset(EVAL_PATH, PROCESSED_EVAL_PATH)
    print("Preprocessing for evaluation completed.")

Preprocessing evaluation data


100%|██████████████████████████████████████████| 2/2 [00:00<00:00, 38479.85it/s]
  0%|                                                     | 0/6 [00:00<?, ?it/s]

Processing file: /Users/zaher/Desktop/Project/eval_2/MGE/85MGE.wav


 17%|███████▌                                     | 1/6 [00:00<00:03,  1.54it/s]

Processing file: /Users/zaher/Desktop/Project/eval_2/MGE/89MGE.wav


 33%|███████████████                              | 2/6 [00:01<00:02,  1.62it/s]

Processing file: /Users/zaher/Desktop/Project/eval_2/MGE/91MGE.wav


100%|█████████████████████████████████████████████| 6/6 [00:01<00:00,  3.30it/s]
  0%|                                                     | 0/8 [00:00<?, ?it/s]

Processing file: /Users/zaher/Desktop/Project/eval_2/MS/E4_49_20190804_0150.wav


 25%|███████████▎                                 | 2/8 [00:00<00:01,  4.27it/s]

Processing file: /Users/zaher/Desktop/Project/eval_2/MS/E1_208_20190712_0150.wav


 50%|██████████████████████▌                      | 4/8 [00:00<00:01,  3.97it/s]

Processing file: /Users/zaher/Desktop/Project/eval_2/MS/E3_49_20190715_0150.wav


 62%|████████████████████████████▏                | 5/8 [00:01<00:00,  3.06it/s]

Processing file: /Users/zaher/Desktop/Project/eval_2/MS/E2_208_20190712_0150.wav


100%|█████████████████████████████████████████████| 8/8 [00:02<00:00,  3.88it/s]
  0%|                                                     | 0/6 [00:00<?, ?it/s]

Processing file: /Users/zaher/Desktop/Project/eval_2/CT/ct3.wav


 67%|██████████████████████████████               | 4/6 [00:00<00:00, 10.70it/s]

Processing file: /Users/zaher/Desktop/Project/eval_2/CT/ct2.wav
Processing file: /Users/zaher/Desktop/Project/eval_2/CT/ct1.wav


100%|█████████████████████████████████████████████| 6/6 [00:01<00:00,  4.92it/s]
 50%|██████████████████████                      | 8/16 [00:00<00:00, 55.70it/s]

Processing file: /Users/zaher/Desktop/Project/eval_2/QU/QU08.wav
Processing file: /Users/zaher/Desktop/Project/eval_2/QU/QU04.wav
Processing file: /Users/zaher/Desktop/Project/eval_2/QU/QU05.wav
Processing file: /Users/zaher/Desktop/Project/eval_2/QU/QU07.wav
Processing file: /Users/zaher/Desktop/Project/eval_2/QU/QU06.wav
Processing file: /Users/zaher/Desktop/Project/eval_2/QU/QU02.wav
Processing file: /Users/zaher/Desktop/Project/eval_2/QU/QU03.wav


 88%|█████████████████████████████████████▋     | 14/16 [00:03<00:00,  3.40it/s]

Processing file: /Users/zaher/Desktop/Project/eval_2/QU/QU01.wav


100%|███████████████████████████████████████████| 16/16 [00:03<00:00,  4.28it/s]
  0%|                                                    | 0/36 [00:00<?, ?it/s]

Processing file: /Users/zaher/Desktop/Project/eval_2/CHE/CHE_19.wav


 44%|███████████████████                        | 16/36 [00:00<00:00, 43.05it/s]

Processing file: /Users/zaher/Desktop/Project/eval_2/CHE/CHE_18.wav
Processing file: /Users/zaher/Desktop/Project/eval_2/CHE/CHE_09.wav
Processing file: /Users/zaher/Desktop/Project/eval_2/CHE/CHE_04.wav
Processing file: /Users/zaher/Desktop/Project/eval_2/CHE/CHE_10.wav
Processing file: /Users/zaher/Desktop/Project/eval_2/CHE/CHE_11.wav


 58%|█████████████████████████                  | 21/36 [00:06<00:05,  2.58it/s]

Processing file: /Users/zaher/Desktop/Project/eval_2/CHE/CHE_05.wav
Processing file: /Users/zaher/Desktop/Project/eval_2/CHE/CHE_13.wav


 64%|███████████████████████████▍               | 23/36 [00:08<00:06,  2.09it/s]

Processing file: /Users/zaher/Desktop/Project/eval_2/CHE/CHE_07.wav
Processing file: /Users/zaher/Desktop/Project/eval_2/CHE/CHE_06.wav


 69%|█████████████████████████████▊             | 25/36 [00:09<00:05,  2.06it/s]

Processing file: /Users/zaher/Desktop/Project/eval_2/CHE/CHE_12.wav


 72%|███████████████████████████████            | 26/36 [00:09<00:04,  2.06it/s]

Processing file: /Users/zaher/Desktop/Project/eval_2/CHE/CHE_16.wav


 75%|████████████████████████████████▎          | 27/36 [00:10<00:05,  1.77it/s]

Processing file: /Users/zaher/Desktop/Project/eval_2/CHE/CHE_02.wav


 78%|█████████████████████████████████▍         | 28/36 [00:11<00:04,  1.82it/s]

Processing file: /Users/zaher/Desktop/Project/eval_2/CHE/CHE_03.wav


 81%|██████████████████████████████████▋        | 29/36 [00:11<00:03,  1.89it/s]

Processing file: /Users/zaher/Desktop/Project/eval_2/CHE/CHE_17.wav


 83%|███████████████████████████████████▊       | 30/36 [00:12<00:03,  1.80it/s]

Processing file: /Users/zaher/Desktop/Project/eval_2/CHE/CHE_01.wav


 86%|█████████████████████████████████████      | 31/36 [00:13<00:02,  1.80it/s]

Processing file: /Users/zaher/Desktop/Project/eval_2/CHE/CHE_15.wav


 89%|██████████████████████████████████████▏    | 32/36 [00:14<00:03,  1.26it/s]

Processing file: /Users/zaher/Desktop/Project/eval_2/CHE/CHE_14.wav


100%|███████████████████████████████████████████| 36/36 [00:15<00:00,  2.36it/s]
  0%|                                                    | 0/20 [00:00<?, ?it/s]

Processing file: /Users/zaher/Desktop/Project/eval_2/DC/DC08.wav


 50%|█████████████████████▌                     | 10/20 [00:00<00:00, 14.62it/s]

Processing file: /Users/zaher/Desktop/Project/eval_2/DC/DC01.wav
Processing file: /Users/zaher/Desktop/Project/eval_2/DC/DC02.wav


 60%|█████████████████████████▊                 | 12/20 [00:00<00:00, 12.53it/s]

Processing file: /Users/zaher/Desktop/Project/eval_2/DC/DC06.wav
Processing file: /Users/zaher/Desktop/Project/eval_2/DC/DC12.wav


 70%|██████████████████████████████             | 14/20 [00:01<00:00,  6.16it/s]

Processing file: /Users/zaher/Desktop/Project/eval_2/DC/DC07.wav


 75%|████████████████████████████████▎          | 15/20 [00:03<00:01,  2.65it/s]

Processing file: /Users/zaher/Desktop/Project/eval_2/DC/DC11.wav


 90%|██████████████████████████████████████▋    | 18/20 [00:04<00:00,  3.22it/s]

Processing file: /Users/zaher/Desktop/Project/eval_2/DC/DC05.wav
Processing file: /Users/zaher/Desktop/Project/eval_2/DC/DC04.wav
Processing file: /Users/zaher/Desktop/Project/eval_2/DC/DC10.wav


100%|███████████████████████████████████████████| 20/20 [00:06<00:00,  3.28it/s]
  0%|                                                     | 0/9 [00:00<?, ?it/s]

Processing file: /Users/zaher/Desktop/Project/eval_2/CW/cw1330_DCASE.wav


 33%|███████████████                              | 3/9 [00:00<00:01,  5.89it/s]

Processing file: /Users/zaher/Desktop/Project/eval_2/CW/cw1300_DCASE.wav


 67%|██████████████████████████████               | 6/9 [00:00<00:00,  6.17it/s]

Processing file: /Users/zaher/Desktop/Project/eval_2/CW/cw1345_DCASE.wav


 89%|████████████████████████████████████████     | 8/9 [00:01<00:00,  5.35it/s]

Processing file: /Users/zaher/Desktop/Project/eval_2/CW/cw1315_DCASE.wav


100%|█████████████████████████████████████████████| 9/9 [00:01<00:00,  5.04it/s]
 18%|████████                                    | 6/33 [00:00<00:00, 37.74it/s]

Processing file: /Users/zaher/Desktop/Project/eval_2/CHE23/CHE_F14.wav
Processing file: /Users/zaher/Desktop/Project/eval_2/CHE23/CHE_F15.wav
Processing file: /Users/zaher/Desktop/Project/eval_2/CHE23/CHE_F03.wav
Processing file: /Users/zaher/Desktop/Project/eval_2/CHE23/CHE_F17.wav


 30%|█████████████                              | 10/33 [00:00<00:01, 14.65it/s]

Processing file: /Users/zaher/Desktop/Project/eval_2/CHE23/CHE_F02.wav
Processing file: /Users/zaher/Desktop/Project/eval_2/CHE23/CHE_F06.wav
Processing file: /Users/zaher/Desktop/Project/eval_2/CHE23/CHE_F12.wav


 39%|████████████████▉                          | 13/33 [00:00<00:01, 11.38it/s]

Processing file: /Users/zaher/Desktop/Project/eval_2/CHE23/CHE_F13.wav
Processing file: /Users/zaher/Desktop/Project/eval_2/CHE23/CHE_F07.wav


 45%|███████████████████▌                       | 15/33 [00:01<00:01,  9.65it/s]

Processing file: /Users/zaher/Desktop/Project/eval_2/CHE23/CHE_F11.wav
Processing file: /Users/zaher/Desktop/Project/eval_2/CHE23/CHE_F05.wav
Processing file: /Users/zaher/Desktop/Project/eval_2/CHE23/CHE_F10.wav


 55%|███████████████████████▍                   | 18/33 [00:01<00:02,  7.38it/s]

Processing file: /Users/zaher/Desktop/Project/eval_2/CHE23/CHE_F09.wav
Processing file: /Users/zaher/Desktop/Project/eval_2/CHE23/CHE_F08.wav


 58%|████████████████████████▊                  | 19/33 [00:02<00:02,  6.31it/s]

Processing file: /Users/zaher/Desktop/Project/eval_2/CHE23/CHE_F18.wav


 61%|██████████████████████████                 | 20/33 [00:02<00:02,  5.88it/s]

Processing file: /Users/zaher/Desktop/Project/eval_2/CHE23/CHE_F19.wav


100%|███████████████████████████████████████████| 33/33 [00:02<00:00, 12.73it/s]

Preprocessing completed  Processed 66 files.
Preprocessing for evaluation completed.



