In [1]:
!pip install audiomentations -q

#### Importing essential packages for the training

In [2]:
import numpy as np
import pandas as pd
import librosa
import random
import wandb
import torch.utils
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
import os
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import WandbLogger
import pickle
from audiomentations import Compose, AddGaussianNoise, Gain, AddBackgroundNoise, Shift, AdjustDuration, Normalize
import torchvision

### Data Loading

#### Defining helper functions for loading the dataset: normalizing the mel spectrogram, extracting the metadata, computing class weights to counter the class imbalance, and encoding the labels
 

In [None]:
def normalize_mel_spectrogram(mel_spec_tensor):
    """Min-max scale a mel spectrogram into the [0, 1] range.

    Args:
        mel_spec_tensor: Array-like object exposing ``min()``/``max()`` and
            element-wise arithmetic (the dataset in this notebook passes a
            NumPy array).

    Returns:
        An object of the same shape whose minimum is mapped to 0 and whose
        maximum is mapped to 1. A constant input (max == min) yields all
        zeros instead of NaN.
    """
    min_value = mel_spec_tensor.min()
    max_value = mel_spec_tensor.max()
    shifted = mel_spec_tensor - min_value
    value_range = max_value - min_value
    if value_range == 0:
        # Constant input (e.g. a silent clip): dividing would compute 0/0 and
        # fill the features with NaN. `shifted` is already all zeros; the
        # multiplication only promotes integer inputs to float, matching the
        # dtype the division below would have produced.
        return shifted * 1.0
    return shifted / value_range

def extract_metadata(metadata_file):
    """Read the metadata CSV and keep only the label and the bare file name.

    Args:
        metadata_file: Path to a CSV file containing at least the columns
            ``Label`` and ``Filename``.

    Returns:
        A DataFrame with the columns ``label`` and ``filename`` (in that
        order) and a fresh 0..n-1 index. ``filename`` holds only the part of
        the original ``Filename`` after its last ``/``.
    """
    raw = pd.read_csv(metadata_file)
    # Drop any directory prefix so only the file's base name remains.
    basenames = raw['Filename'].map(lambda path: path.rsplit("/", 1)[-1])
    selected = pd.DataFrame({'label': raw['Label'], 'filename': basenames})
    return selected.reset_index(drop=True)

def class_weights(labels):
    """Compute inverse-square-root-frequency class weights for a label column.

    Each class weight is ``frequency ** -0.5``; the weights are then scaled so
    that they sum to 80.

    The weights are ordered by the first appearance of each class in
    ``labels`` (``labels.unique()``), which is the same order ``encode_labels``
    uses to assign integer ids. ``value_counts()`` alone sorts by descending
    frequency, which would attach each weight to the wrong class index.

    Args:
        labels: pandas Series of class labels, one entry per sample.

    Returns:
        1-D ``torch.float16`` tensor with one weight per distinct label, where
        position ``i`` belongs to the ``i``-th distinct label in order of
        first appearance.
    """
    # Re-align the per-class counts with the label-encoding order.
    counts = labels.value_counts().reindex(labels.unique())
    frequencies = counts / counts.sum()
    inverse_sqrt = frequencies ** (-0.5)
    normalized_weights = inverse_sqrt / inverse_sqrt.sum()
    normalized_weights = normalized_weights * 80
    return torch.tensor(normalized_weights.values, dtype=torch.float16)

def encode_labels(labels):
    """Map every distinct label to an integer id.

    Ids are assigned from 0 upwards in the order in which the labels first
    appear in ``labels``.

    Args:
        labels: pandas Series of class labels.

    Returns:
        Dict mapping each distinct label to its integer id.
    """
    return {label: index for index, label in enumerate(labels.unique())}

#### Defining Our Custom Audio Classification Dataset

#### We apply augmentations to the waveform, each with its own probability: Gaussian noise, gain, time shifting, and background rain noise

In [None]:
class AudioClassificationDataset(Dataset):
    """Dataset that turns labelled audio files into normalized mel spectrograms.

    Every item is loaded from ``<data_dir>/<label>/<filename>``, optionally
    augmented, peak-normalized, brought to a fixed 10 second duration and
    converted into a dB-scaled mel spectrogram rescaled to [0, 1].

    Attributes:
        metadata: DataFrame with the columns ``label`` and ``filename``.
        label_encoder: dict mapping label -> integer class id.
        class_weights: per-class weights computed from the metadata labels.
    """

    def __init__(self, data_dir, metadata_file, apply_augmentations=False):
        """Read the metadata and build the waveform transforms.

        Args:
            data_dir: Root folder containing one sub-folder per label.
            metadata_file: CSV file passed to ``extract_metadata``.
            apply_augmentations: When True, the random augmentation chain is
                applied to each waveform before the fixed preprocessing.
        """
        self.data_dir = data_dir
        self.metadata = extract_metadata(metadata_file)
        self.apply_augmentations = apply_augmentations
        self.label_encoder = encode_labels(self.metadata['label'])
        self.class_weights = class_weights(self.metadata['label'])
        self.augmentations = Compose([
            AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.4),
            Gain(min_gain_in_db=-12, max_gain_in_db=12, p=0.4),
            Shift(min_shift=1, max_shift=6, shift_unit="seconds", p=0.6),
            AddBackgroundNoise(sounds_path=["/kaggle/input/backg-noise/rain-noise.wav"],max_absolute_rms_db=-30, p=0.3),
        ])
        # The fixed (always applied) preprocessing transforms take no per-item
        # arguments, so they are built once here instead of on every
        # __getitem__ call.
        self.normalization_transform = Normalize(p=1.0)
        self.duration_transform = AdjustDuration(duration_seconds=10,padding_mode="wrap", p=1.0)

    def __len__(self):
        """Return the number of rows in the metadata table."""
        return len(self.metadata)

    def __getitem__(self, idx):
        """Load one sample.

        Args:
            idx: Integer position of the sample in the metadata table.

        Returns:
            ``(mel_spec_db_tensor, class_label_tensor)``: the spectrogram with
            a leading channel dimension added, and the encoded label as a
            one-element tensor. Returns ``None`` when the audio file cannot be
            loaded.
        """
        # Positional lookup: does not depend on the DataFrame's index labels.
        label = self.metadata['label'].iloc[idx]
        filename = self.metadata['filename'].iloc[idx]
        audio_file = os.path.join(self.data_dir, label, filename)
        try:
            # sr=None keeps the file's native sampling rate.
            waveform, sample_rate = librosa.load(audio_file, sr=None)
        except Exception as e:
            print(f"Error loading audio file {audio_file}: {e}")
            # NOTE(review): a None item is only usable with a DataLoader whose
            # collate_fn filters it out; the default collate function does not
            # accept None -- confirm how unreadable files should be handled.
            return None
        if self.apply_augmentations:
            waveform = self.augmentations(waveform, sample_rate=sample_rate)
        waveform = self.normalization_transform(waveform, sample_rate=sample_rate)
        waveform = self.duration_transform(waveform, sample_rate=sample_rate)
        mel_spectrogram = librosa.feature.melspectrogram(y=waveform, sr=sample_rate, fmin=800, fmax=12000)
        mel_spectrogram_db = librosa.power_to_db(mel_spectrogram, ref=np.max)
        mel_spectrogram_db = normalize_mel_spectrogram(mel_spectrogram_db)
        # Add the channel dimension expected by 2-D convolutional models.
        mel_spec_db_tensor = torch.from_numpy(mel_spectrogram_db).unsqueeze(0)
        class_label = self.label_encoder[label]
        class_label_tensor = torch.tensor(class_label).unsqueeze(0)
        return mel_spec_db_tensor, class_label_tensor

#### Creating two different datasets, one with augmentations applied for the training, one without augmentations for the validation part

In [None]:
# Both dataset views read the same audio folder and metadata file; only the
# training view applies the random waveform augmentations.
metadata_file = '/kaggle/input/filtered-metadata/filtered_metadata.csv'
data_dir = '/kaggle/input/birdclef-2023/train_audio'

train_dataset = AudioClassificationDataset(
    data_dir=data_dir,
    metadata_file=metadata_file,
    apply_augmentations=True,
)
val_dataset = AudioClassificationDataset(
    data_dir=data_dir,
    metadata_file=metadata_file,
    apply_augmentations=False,
)

#### We split the original dataset by randomly chosen indices into 70% training, 20% validation and 10% test parts

In [None]:
# Fractions of the dataset assigned to each split.
train_ratio, val_ratio, test_ratio = 0.7, 0.2, 0.1

num_samples = len(train_dataset)
# Each split size is truncated to a whole number of samples.
num_train, num_val, num_test = (
    int(ratio * num_samples) for ratio in (train_ratio, val_ratio, test_ratio)
)
print(f'Training indices: {num_train}  Validation indices: {num_val} Test indicies: {num_test}')

#### We make sure that the test indices are selected randomly but, most importantly, deterministically (seed 42), so that every run is tested on the same indices

In [None]:
# Shuffle with a fixed seed so the same test indices are selected on every run.
indices = list(range(num_samples))
random.seed(42)
random.shuffle(indices)

# Slice from an explicit start position: `indices[-num_test:]` would return the
# WHOLE list when num_test == 0 (very small datasets), putting every sample
# into the test set. For num_test > 0 both forms select the same elements.
test_indices = indices[num_samples - num_test:]
# Persist the held-out indices so they can be reloaded for testing.
with open('test_indices.pkl', 'wb') as f:
    pickle.dump(test_indices, f)

#### The remaining indices are divided between the training and validation sets

In [None]:
# Everything that was not reserved for testing is available for train/val.
held_out = set(test_indices)
remaining_indices = list(set(indices) - held_out)
# Re-seed without an argument so that this shuffle is NOT tied to the fixed
# seed used for the test split.
random.seed()
random.shuffle(remaining_indices)

val_stop = num_train + num_val
train_indices = remaining_indices[:num_train]
val_indices = remaining_indices[num_train:val_stop]

# Samplers restrict each DataLoader to its own subset of indices.
val_sampler = torch.utils.data.SubsetRandomSampler(val_indices)
train_sampler = torch.utils.data.SubsetRandomSampler(train_indices)

#### We make sure the training, validation and test sets have no indices in common

In [None]:
set_train_indices = set(train_indices)
set_val_indices = set(val_indices)
set_test_indices = set(test_indices)

common_indices_test = set_train_indices.intersection(set_test_indices)
common_indices_val = set_train_indices.intersection(set_val_indices)
# Validation and test must be compared as well; checking only against the
# training set would miss samples shared between validation and test.
common_indices_val_test = set_val_indices.intersection(set_test_indices)

print(f"Number of common indices in train-test: {len(common_indices_test)}")
print(f"Number of common indices in train-val: {len(common_indices_val)}")
print(f"Number of common indices in val-test: {len(common_indices_val_test)}")

# Fail loudly instead of relying on someone reading the printed numbers.
assert not (common_indices_test or common_indices_val or common_indices_val_test), \
    "Train/validation/test splits overlap"

### Defining Our Custom Baseline Model

In [None]:
class OurCustomModel(nn.Module):
    """Baseline CNN: four conv/ReLU/max-pool stages followed by two linear layers.

    Args:
        num_features: Accepted for interface compatibility; not used by the
            layers defined here.
        num_classes: Number of output logits.
        conv_w: Output channel counts of the four convolutional layers.
        droupout_rate: Dropout probability applied after every stage and after
            the first linear layer. (Parameter name kept as-is so existing
            keyword callers keep working.)
    """

    def __init__(self, num_features, num_classes, conv_w = [64,128,256,512], droupout_rate = 0.1):
        super().__init__()

        self.conv1 = nn.Conv2d(1, conv_w[0], kernel_size=3, stride=1, padding=1) #1 in channel input for 1 channel image
        self.conv2 = nn.Conv2d(conv_w[0], conv_w[1], kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(conv_w[1], conv_w[2], kernel_size=3, stride=1, padding=1)
        self.conv4 = nn.Conv2d(conv_w[2], conv_w[3], kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))

        # fc1 takes one value per channel of the 4th conv layer; forward()
        # average-pools the spatial dimensions away so this size always fits.
        self.fc1 = nn.Linear(conv_w[3], 512)
        self.fc2 = nn.Linear(512, num_classes)

        self.dropout = nn.Dropout(droupout_rate)

    def forward(self, x):
        """Compute class logits.

        Args:
            x: Tensor of shape ``(batch, 1, height, width)``; height and width
                must be at least 16 to survive the four 2x2 poolings.

        Returns:
            Tensor of shape ``(batch, num_classes)`` with unnormalized logits.
        """
        for conv in (self.conv1, self.conv2, self.conv3, self.conv4):
            x = self.pool(F.relu(conv(x)))
            x = self.dropout(x)
        # Global average pooling reduces (B, C, H, W) to (B, C, 1, 1). Without
        # it the flattened size is C*H*W, which only matches fc1's input size
        # when the feature map is already 1x1 (where this pooling is a no-op).
        x = F.adaptive_avg_pool2d(x, 1)
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        # Return the logits directly; this class has no `self.model`.
        return self.fc2(x)

### Defining Our Audio Data Classifier for the training and validation

In [None]:
class AudioClassifier(pl.LightningModule):
    """Lightning wrapper that trains a classifier with class-weighted cross-entropy.

    Args:
        model: Network mapping an input batch to ``(batch, num_classes)`` logits.
        class_weights: Per-class weight tensor passed to the loss, or None.
        learning_rate: Learning rate handed to the optimizer.
        weight_decay: Weight decay handed to the optimizer.
        optimizer: One of ``'adamw'``, ``'sgd'`` or ``'rmsprop'``.
    """

    def __init__(self, model, class_weights, learning_rate = 0.001, weight_decay = 0.01, optimizer= 'adamw'):
        super().__init__()
        self.model = model
        self.lr = learning_rate
        self.w_dec = weight_decay
        self.optimizer = optimizer
        self.class_weights = class_weights

    def forward(self, x):
        """Return the wrapped model's logits for ``x``."""
        return self.model(x)

    def _loss_and_accuracy(self, batch):
        """Run a single forward pass and derive the weighted loss and accuracy.

        Returns:
            ``(loss, accuracy)`` where ``loss`` is a tensor that can be
            back-propagated and ``accuracy`` is a Python float.
        """
        x, y = batch
        # Flatten (B, 1) targets to (B,). A bare squeeze() would also drop the
        # batch dimension when the batch holds a single sample.
        y = y.reshape(-1)
        # One forward pass serves both the loss and the predictions; calling
        # the model a second time doubles the cost and, in training mode, can
        # give different outputs (dropout) than the ones the loss was based on.
        logits = self(x)
        weight = self.class_weights
        if weight is not None:
            # Match the logits' device and dtype so the loss does not fail on
            # a mismatch (e.g. float16 weights with float32 logits).
            weight = weight.to(device=logits.device, dtype=logits.dtype)
        loss = F.cross_entropy(logits, y, weight=weight)
        predictions = torch.argmax(logits, dim=1)
        accuracy = torch.sum(predictions == y).item() / y.size(0)
        return loss, accuracy

    def training_step(self, batch, batch_idx):
        """Compute, log and print the training loss and accuracy of one batch."""
        train_loss, train_acc = self._loss_and_accuracy(batch)
        self.log('train_loss', train_loss.item(), on_epoch=True, on_step=True)
        self.log('train_acc', train_acc, on_epoch=True, on_step=True)
        print(f"Batch: {batch_idx}")
        print(f'Train loss: {train_loss.item()}')
        print(f'Train acc: {train_acc: .5f}')
        return train_loss

    def validation_step(self, batch, batch_idx):
        """Compute, log and print the validation loss and accuracy of one batch."""
        val_loss, val_acc = self._loss_and_accuracy(batch)
        self.log('val_loss', val_loss.item(), on_epoch=True)
        self.log('val_acc', val_acc, on_epoch=True, )
        print(f"Batch: {batch_idx}")
        print(f'Val loss: {val_loss.item()}')
        print(f'Val acc: {val_acc: .5f}')
        return val_loss

    def configure_optimizers(self):
        """Build the optimizer selected in the constructor.

        Raises:
            ValueError: If the configured optimizer name is not supported.
        """
        optimizer_classes = {
            "adamw": optim.AdamW,
            "sgd": optim.SGD,
            "rmsprop": optim.RMSprop,
        }
        optimizer_class = optimizer_classes.get(self.optimizer)
        if optimizer_class is None:
            # Name the offending value so a misconfigured run is easy to debug.
            raise ValueError(f"Unsupported optimizer: {self.optimizer!r}")
        return optimizer_class(self.parameters(), lr=self.lr, weight_decay=self.w_dec)

## Training Phase

#### Defining our training method, in which we configure W&B (Weights & Biases) logging, initialize the model and the classifier, and start the training with the previous configurations

In [None]:
def _select_compatible_weights(checkpoint, target_state, prefix='model.'):
    """Pick the checkpoint tensors that can be copied into ``target_state``.

    Args:
        checkpoint: Object returned by ``torch.load``: either a plain state
            dict or a dict holding the weights under the key ``'state_dict'``.
        target_state: ``state_dict()`` of the model that receives the weights.
        prefix: Key prefix to strip from checkpoint entries (weights saved
            through ``AudioClassifier`` live under its ``model`` attribute).

    Returns:
        Dict of ``{parameter name: tensor}`` restricted to entries whose name
        and shape both match ``target_state``.
    """
    source_state = checkpoint.get('state_dict', checkpoint)
    compatible = {}
    for key, tensor in source_state.items():
        name = key[len(prefix):] if key.startswith(prefix) else key
        if name in target_state and target_state[name].shape == tensor.shape:
            compatible[name] = tensor
    return compatible


def train():
    """Run one training job: W&B setup, warm-started ResNet-34, Lightning fit.

    Hyperparameters are read from ``wandb.config``. The defaults below are
    used for a stand-alone run; when launched by a sweep agent, the sweep's
    values take their place. Relies on the notebook globals ``train_dataset``,
    ``val_dataset``, ``train_sampler``, ``val_sampler`` and ``project_name``.
    """
    wandb.finish()
    # Without defaults, `config.lr` / `config.optimizer` do not exist outside a
    # sweep and a stand-alone run fails. The values mirror AudioClassifier's
    # constructor defaults and the previously hard-coded batch size.
    default_config = {
        'lr': 0.001,
        'optimizer': 'adamw',
        'weight_decay': 0.01,
        'batch_size': 64,
    }
    run = wandb.init(config=default_config)

    try:
        config = wandb.config

        train_loader = DataLoader(train_dataset, batch_size=config.batch_size, sampler=train_sampler, num_workers=4)
        val_loader = DataLoader(val_dataset, batch_size=config.batch_size, sampler=val_sampler, shuffle=False, num_workers=4)

        num_classes = 264
        class_weights = train_dataset.class_weights.cuda() if torch.cuda.is_available() else train_dataset.class_weights

        model_name = 'model-pludzsln:v14'
        artifact = run.use_artifact('deepbirding/deepbirding/' + model_name, type='model')
        # Load from wherever the artifact was actually downloaded instead of
        # assuming a fixed working directory.
        artifact_dir = artifact.download()
        checkpoint = torch.load(os.path.join(artifact_dir, 'model.ckpt'), map_location='cpu')

        model = torchvision.models.resnet34(pretrained=True)
        # Single-channel input (spectrogram) instead of 3-channel RGB.
        model.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
        model.fc = nn.Linear(model.fc.in_features, num_classes)

        # Comparing the model's parameter names against the checkpoint's
        # top-level keys never matches when the weights are nested under
        # 'state_dict' with a 'model.' prefix, so nothing would be loaded.
        model_dict = model.state_dict()
        compatible = _select_compatible_weights(checkpoint, model_dict)
        model_dict.update(compatible)
        model.load_state_dict(model_dict)
        print(f"Loaded {len(compatible)} of {len(model_dict)} tensors from checkpoint")

        lit_model = AudioClassifier(
            model = model,
            class_weights = class_weights,
            learning_rate=config.lr,
            # Forward the configured weight decay; otherwise the value chosen
            # by a sweep is silently ignored.
            weight_decay=config.weight_decay,
            optimizer=config.optimizer)

        checkpoint_callback = ModelCheckpoint(
            monitor='val_acc',
            mode='max',
            filename='best_model',
            verbose=True,
            save_weights_only=True,
        )
        early_stopping_callback = pl.callbacks.EarlyStopping(monitor="val_acc", patience=3, verbose=True, mode="max")

        wandb_logger = WandbLogger(project=project_name, log_model='all')
        wandb_logger.watch(model,log_graph=False)

        trainer = pl.Trainer(accelerator='gpu',
                             max_epochs=50,
                             devices=1,
                             precision="16-mixed",
                             logger=wandb_logger,
                             callbacks=[checkpoint_callback, early_stopping_callback]
                            )

        trainer.fit(lit_model,train_loader,val_loader)
    finally:
        # Close the W&B run even when setup or training raises.
        wandb.finish()

#### We use automated hyperparameter optimization for our training, but we also manually narrow the value ranges according to the conclusions of previous trainings; the original values can be found in the commented sections

In [None]:
wandb.login()
project_name = 'deepbirding'
# Switch between a single training run (False) and a W&B sweep (True).
PARAM_OPT = False

if not PARAM_OPT:
    train()
else:
    # Search space; the trailing comments keep the original, wider ranges.
    sweep_parameters = {
        'lr': {'max': 8e-4, 'min': 7e-4},
        'optimizer': {'values': ['adamw']}, #'sgd', 'rmsprop']},
        'batch_size': {'values': [64]},#16, 32, 64]},
        'weight_decay': {'max': 2e-3, 'min': 1e-3},#-5},
        'conv_w': {'values': [[64,128,256,512]]}#[16,32,64,128],[32,64,128,256]]}
    }
    sweep_config = {
        'method': 'bayes',
        'name': 'sweep',
        'metric': {
            'goal': 'maximize',
            'name': 'val_acc',
        },
        'parameters': sweep_parameters,
    }
    sweep_id = wandb.sweep(sweep=sweep_config, project=project_name)
    wandb.agent(sweep_id=sweep_id, function=train)