In [None]:
import pandas as pd

# Metadata columns that are not needed for training.
UNUSED_COLUMNS = ['author', 'license', 'url', 'collection', 'common_name']

# Read the training metadata and discard the unused columns in one expression.
data = pd.read_csv('./data/train.csv').drop(columns=UNUSED_COLUMNS)

In [None]:
import os

# Entries of the raw audio directory; used only to sanity-check the label count.
folder_dict = os.listdir('./data/train_audio')

# Map every raw primary_label to a dense integer id, numbered in order of
# first appearance in the metadata.
normalized_labels = {
    raw_label: label_idx
    for label_idx, raw_label in enumerate(data['primary_label'].drop_duplicates().tolist())
}

assert len(normalized_labels) == len(folder_dict), (
    f"{len(normalized_labels)} labels in train.csv vs {len(folder_dict)} entries in ./data/train_audio"
)

# Replace the raw labels by their integer ids. Every value is a key of
# normalized_labels by construction, so the dict lookup cannot miss.
data['primary_label'] = data['primary_label'].map(normalized_labels)

# Integer id -> scientific name, taken from the first row of each label.
# `.tolist()` yields plain Python ints/strs so the mapping stays JSON-serialisable.
first_rows = data.drop_duplicates(subset='primary_label')
label_to_name = dict(
    zip(first_rows['primary_label'].tolist(), first_rows['scientific_name'].tolist())
)

label_to_name




In [None]:
# Disabled on purpose: with this line commented out, 'scientific_name' stays in `data`.
#data.drop('scientific_name', axis=1, inplace=True)
# Display the current contents of the metadata frame.
data


In [None]:
# Display the integer-id -> scientific-name mapping.
label_to_name

In [None]:
import json
# Write the processed metadata table (without the index column).
data.to_csv('processed_data.csv', index=False)

# Write the id -> scientific-name lookup as JSON.
with open('label_to_name.json', mode='w') as json_out:
    json.dump(label_to_name, json_out)

In [19]:
import torchaudio
import torch
import os
import torch.nn.functional as F
import random

# Seed the RNGs so the file split and the sub-sampling are repeatable.
random.seed(42)
torch.manual_seed(42)

SAMPLE_RATE = 32000        # sample rate (Hz) the chunk sizes below are expressed in
CHUNK_SIZE = 5             # chunk length in seconds
STRIDE = 2.5               # hop between consecutive chunks in seconds
TRAIN_FRACTION = 0.8       # share of each label's files that goes to the train split
MIN_TRAIN_CHUNKS = 100     # labels with fewer train chunks are skipped
MIN_TEST_CHUNKS = 10       # labels with fewer test chunks are skipped
MAX_CHUNKS = 300           # upper bound on chunks kept per label and split
IMAGE_SIZE = (224, 224)    # spatial size every spectrogram is resized to

chunk_amount = int(SAMPLE_RATE*CHUNK_SIZE)
stride_amount = int(SAMPLE_RATE*STRIDE)
# Pass the real sample rate: the transform's default is 16 kHz, which would
# build the mel filterbank for the wrong frequency axis.
melspectogram = torchaudio.transforms.MelSpectrogram(sample_rate=SAMPLE_RATE)
amptodb = torchaudio.transforms.AmplitudeToDB()

os.makedirs('./data/processed_train_audio', exist_ok=True)
os.makedirs('./data/train_chunks', exist_ok=True)


def _file_to_chunks(path):
    """Load one recording and return its standardised log-mel chunks as a list.

    The waveform is mixed down to one channel and resampled to SAMPLE_RATE when
    needed, then cut into overlapping windows of `chunk_amount` samples taken
    every `stride_amount` samples. Recordings shorter than one window yield [].
    """
    audio_file, sr = torchaudio.load(path)
    if audio_file.shape[0] > 1:
        # Keep exactly one channel so every chunk stacks to the same shape.
        audio_file = audio_file.mean(dim=0, keepdim=True)
    if sr != SAMPLE_RATE:
        audio_file = torchaudio.functional.resample(audio_file, sr, SAMPLE_RATE)

    chunks = []
    total_len = audio_file.shape[1]
    for i in range(0, total_len - chunk_amount + 1, stride_amount):
        chunk = audio_file[:, i:i+chunk_amount]
        processed_chunk = amptodb(melspectogram(chunk))
        # Per-chunk standardisation; the epsilon guards against a zero std.
        processed_chunk = (processed_chunk-processed_chunk.mean())/(processed_chunk.std()+1e-9)
        chunks.append(processed_chunk)
    return chunks


def _collect_chunks(label_dir, file_names):
    """Concatenate the chunks of all `file_names` found inside `label_dir`."""
    chunks = []
    for file_name in file_names:
        chunks.extend(_file_to_chunks(f'{label_dir}/{file_name}'))
    return chunks


def _to_image_batch(chunks):
    """Stack chunks, resize them to IMAGE_SIZE, tile to 3 channels, cast to float16."""
    batch = torch.stack(chunks)
    batch = F.interpolate(batch, IMAGE_SIZE, mode='bilinear', align_corners=False)
    return batch.repeat(1, 3, 1, 1).to(torch.float16)


# Sorted listings make the seeded shuffles independent of filesystem order.
for label in sorted(os.listdir('./data/train_audio')):
    label_dir = f'./data/train_audio/{label}'
    if not os.path.isdir(label_dir):
        continue

    files = sorted(os.listdir(label_dir))
    random.shuffle(files)
    splitindex = int(len(files)*TRAIN_FRACTION)
    # Split by file, not by chunk, so overlapping chunks of one recording
    # never end up on both sides of the split.
    train_files, test_files = files[:splitindex], files[splitindex:]

    train_chunks = _collect_chunks(label_dir, train_files)
    test_chunks = _collect_chunks(label_dir, test_files)

    if len(train_chunks) < MIN_TRAIN_CHUNKS or len(test_chunks) < MIN_TEST_CHUNKS:
        continue

    # Shuffle, then keep at most MAX_CHUNKS: a uniform random subset in random order.
    random.shuffle(train_chunks)
    random.shuffle(test_chunks)
    train_chunks = train_chunks[:MAX_CHUNKS]
    test_chunks = test_chunks[:MAX_CHUNKS]

    torch.save(_to_image_batch(train_chunks), f'./data/train_chunks/{label}_train.pt')
    torch.save(_to_image_batch(test_chunks), f'./data/train_chunks/{label}_test.pt')



In [None]:
import torchaudio.transforms as T
import torch.nn as nn
from collections import defaultdict

def apply_augmentation(chunk):
    """Return `chunk` after optional frequency and time masking.

    Each mask (frequency with parameter 15, then time with parameter 35) is
    applied independently with probability 0.5, in that order.
    """
    maskers = (T.FrequencyMasking(15), T.TimeMasking(35))
    for masker in maskers:
        # One coin flip per mask, drawn right before the mask is applied.
        if random.random() < 0.5:
            chunk = masker(chunk)
    return chunk

TARGET_CHUNKS = 300        # number of chunks every train file is topped up to
MAX_AUG_PER_CHUNK = 10     # at most this many augmented copies per original chunk

# Top up every train tensor with augmented copies of its own chunks.
for file in sorted(os.listdir('./data/train_chunks')):
    if not file.endswith('.pt') or '_test' in file:
        continue
    path = f'./data/train_chunks/{file}'
    tensor = torch.load(path)

    n_original = tensor.shape[0]
    n_missing = TARGET_CHUNKS - n_original
    if n_original == 0 or n_missing <= 0:
        # Nothing to add. Leave the file untouched instead of overwriting it
        # with whatever tensor a previous iteration produced.
        continue

    # Bounded count: the per-chunk cap can no longer cause an endless loop.
    n_to_add = min(n_missing, MAX_AUG_PER_CHUNK * n_original)

    # Cycle through the originals in order. The clone is a defensive copy so
    # masking can never write into `tensor` itself.
    augmented_chunks = [
        apply_augmentation(tensor[i % n_original].clone())
        for i in range(n_to_add)
    ]

    final_tensor = torch.concat((tensor, torch.stack(augmented_chunks)))
    torch.save(final_tensor, path)

In [None]:
import json
import torchaudio
import torch
import os

CHUNK_SIZE = 5
STRIDE = 2.5
chunk_amount = int(32000*CHUNK_SIZE)
stride_amount = int(32000*STRIDE)

# Maps '<label>_train' / '<label>_test' (file name without '.pt') to the
# number of chunks stored in that file.
# NOTE: this rebinds `data`, which an earlier cell uses for the metadata frame.
# (An earlier variant counted chunks per raw audio file instead; it was
# already disabled and has been dropped here.)
data = dict()

for file in sorted(os.listdir('./data/train_chunks')):
    stem, ext = os.path.splitext(file)
    if ext != '.pt':
        # Skip stray entries (hidden files, checkpoints) that torch.load cannot read.
        continue
    tensor = torch.load(f'./data/train_chunks/{file}')
    data[stem] = tensor.shape[0]

with open('./dataset_init.json', 'w') as json_file:
    json.dump(data, json_file)

In [None]:
# NOTE(review): the chunking cell writes '<label>_train.pt' / '<label>_test.pt'
# into ./data/train_chunks, not into this directory. Confirm this is the
# intended input folder.
PROCESSED_DIR = './data/processed_train_audio'
# Files this cell writes itself. They must not be read back in on a re-run.
STACKED_OUTPUTS = {'train.pt', 'test.pt'}

train_parts, test_parts = [], []
for file in sorted(os.listdir(PROCESSED_DIR)):
    if file in STACKED_OUTPUTS or not file.endswith('.pt'):
        continue
    part = torch.load(f'{PROCESSED_DIR}/{file}')
    # Any file without '_train' in its name counts as test data.
    if '_train' in file:
        train_parts.append(part)
    else:
        test_parts.append(part)

if not train_parts or not test_parts:
    raise FileNotFoundError(
        f"expected both train and test tensors in {PROCESSED_DIR}, "
        f"found {len(train_parts)} train and {len(test_parts)} test files"
    )

# Concatenate once instead of growing the tensor inside the loop.
train_stacked_tensor = torch.cat(train_parts)
test_stacked_tensor = torch.cat(test_parts)

torch.save(train_stacked_tensor, './data/processed_train_audio/train.pt')
torch.save(test_stacked_tensor, './data/processed_train_audio/test.pt')

In [3]:
import torch
# Inspect the shape of one saved tensor (the path is hard-coded; the file must exist).
torch.load('./data/processed_train_audio/21116.pt').shape

torch.Size([2, 1, 128, 313])

In [None]:
import torch
import matplotlib.pyplot as plt

# Load one saved tensor file (the author notes a shape like [N, 1, 128, 801]).
tensor = torch.load("./data/processed_train_audio/21038/iNat65519.pt")

# Take the first chunk and drop its leading channel axis for plotting.
chunk = tensor[0]
spectrogram = chunk.squeeze(0)

# Draw the spectrogram with an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 4))
image = ax.imshow(spectrogram, aspect='auto', origin='lower', cmap='viridis')
fig.colorbar(image, ax=ax, label='dB')
ax.set_title("Mel Spectrogram")
ax.set_xlabel("Time")
ax.set_ylabel("Mel frequency bins")
fig.tight_layout()
plt.show()

FileNotFoundError: [Errno 2] No such file or directory: './data/processed_train_audio/21116.pt'

# Deprecated

In [None]:
import torchaudio
import os
import tqdm
import torch
import gc

# Window length and hop between consecutive windows, both in seconds.
CHUNK_LENGTH = 5
STRIDE = 2.5

# Waveform -> mel spectrogram -> decibel scale.
mel_transform = torchaudio.transforms.MelSpectrogram(
    sample_rate=32000,
    n_fft=1024,
)
amp_transform = torchaudio.transforms.AmplitudeToDB()


def process_chunk(label_path, label):
    """Convert every recording under `label_path` into dB-scaled mel chunks.

    Each file is cut into windows of CHUNK_LENGTH seconds taken every STRIDE
    seconds (both converted to samples with the file's own sample rate). Files
    shorter than one window contribute nothing. If at least one chunk was
    produced, all chunks are stacked and saved to
    ./data/processed_train_audio/<label>.pt; otherwise nothing is written.
    """
    spectrograms = []
    for file_name in os.listdir(label_path):
        waveform, sample_rate = torchaudio.load(f"{label_path}/{file_name}")
        n_samples = waveform.shape[1]
        window = int(sample_rate * CHUNK_LENGTH)
        hop = int(sample_rate * STRIDE)
        if n_samples >= window:
            for start in range(0, n_samples - window + 1, hop):
                segment = waveform[:, start:start + window]
                mel = mel_transform(segment)
                decibels = amp_transform(mel)
                spectrograms.append(decibels)

    if not spectrograms:
        return

    torch.save(torch.stack(spectrograms), f"./data/processed_train_audio/{label}.pt")
    # Drop the large intermediates before forcing a collection.
    del spectrograms, mel, decibels, waveform
    gc.collect()



# process_chunk saves into this directory, so make sure it exists.
os.makedirs('./data/processed_train_audio', exist_ok=True)

# Iterate over the entries of train_audio, the same directory the paths
# below are built from. Listing another folder would yield names that do
# not exist under ./data/train_audio.
for label in tqdm.tqdm(sorted(os.listdir('./data/train_audio'))):
    process_chunk(f'./data/train_audio/{label}', label)
    



        

        

100%|██████████| 206/206 [18:02<00:00,  5.26s/it]
