### Prepare model

In [None]:
import torch
from torch import nn
import torchaudio
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report
import librosa
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from pathlib import Path
from time import time
import math
from birdclassification.preprocessing.filtering import filter_recordings_30
from birdclassification.preprocessing.utils import mix_down, right_pad
from birdclassification.training.cnn_training_torch.CNN_model import CNNNetwork

In [None]:
# Seed passed as random_state to the dataset splits below.
SEED = 123

# Root directory of the recordings and the sample rate (Hz) they are
# expected to have.
RECORDINGS_DIR = '/mnt/d/recordings_30/'
SAMPLE_RATE = 32_000

# DataLoader settings.
BATCH_SIZE = 32
NUM_WORKERS = 4

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

In [None]:
class CNNBinaryNetwork(nn.Module):
    """Small convolutional classifier that emits two raw logits.

    Layout: four ``Conv2d -> ReLU -> MaxPool2d`` stages (16, 32, 64 and 128
    output channels), a flatten, and one linear layer mapping 10880 features
    to 2 logits.  No softmax is applied inside the network; callers apply it
    themselves when they need probabilities.
    """

    def __init__(self):
        super().__init__()
        # Attribute names and the Sequential layout of every stage are part of
        # the checkpoint format (state-dict keys such as "conv1.0.weight"),
        # so they are kept exactly as they are.
        self.conv1 = self._conv_stage(1, 16)
        self.conv2 = self._conv_stage(16, 32)
        self.conv3 = self._conv_stage(32, 64)
        self.conv4 = self._conv_stage(64, 128)
        self.flatten = nn.Flatten()

        # 10880 is the flattened size of the last stage's output; for example
        # a (1, 64, 251) input ends up as 128 * 5 * 17 = 10880 features.
        self.linear1 = nn.Sequential(nn.Linear(10880, 2))

    @staticmethod
    def _conv_stage(in_channels, out_channels):
        """Build one ``Conv2d(3x3, stride 1, padding 2) -> ReLU -> MaxPool2d(2)`` stage."""
        convolution = nn.Conv2d(
            in_channels, out_channels, kernel_size=3, stride=1, padding=2
        )
        return nn.Sequential(convolution, nn.ReLU(), nn.MaxPool2d(kernel_size=2))

    def forward(self, input_data):
        """Return the two logits for a batch shaped (batch, 1, height, width)."""
        features = input_data
        for stage in (self.conv1, self.conv2, self.conv3, self.conv4):
            features = stage(features)
        return self.linear1(self.flatten(features))

In [None]:
# Restore the trained CNNNetwork weights for inference.
PATH = '../../birdclassification/training/saved_models/cnn_1.pt'
cnn = CNNNetwork()
# map_location remaps the stored tensors onto DEVICE, so a checkpoint that was
# saved from a GPU can still be loaded on a CPU-only machine.
cnn.load_state_dict(torch.load(PATH, map_location=DEVICE))
cnn.eval()  # inference mode
cnn.to(DEVICE)

In [None]:
# Restore the trained CNNBinaryNetwork weights for inference.
PATH_BINARY = '../../birdclassification/training/saved_models/binary_classifier.pt'
binary = CNNBinaryNetwork()
# map_location remaps the stored tensors onto DEVICE, so a checkpoint that was
# saved from a GPU can still be loaded on a CPU-only machine.
binary.load_state_dict(torch.load(PATH_BINARY, map_location=DEVICE))
binary.eval()  # inference mode
binary.to(DEVICE)

In [None]:
class FullRecordings(Dataset):
    """Dataset that yields whole recordings together with an integer label.

    Each row of the metadata frame describes one recording stored at
    ``<recording_dir>/<Latin name>/<id>.mp3``.  Labels are produced by a
    ``LabelEncoder`` fitted on the frame's ``'Latin name'`` column.
    """

    def __init__(self, df, recording_dir):
        """Index the recordings listed in ``df``.

        Args:
            df: DataFrame with at least the columns ``'Latin name'`` and
                ``'id'``.  It is only read; no columns are added to it.
            recording_dir: Root directory that holds one sub-directory per
                Latin name.
        """
        species = df['Latin name']
        encoder = LabelEncoder()

        # Paths and labels live on the instance only.  Writing them back into
        # ``df`` would mutate the caller's frame (typically a slice returned by
        # train_test_split).
        self.filepath = np.array(
            [
                Path(recording_dir, name, f"{rec_id}.mp3")
                for name, rec_id in zip(species, df['id'])
            ],
            dtype=object,
        )
        self.label = np.asarray(encoder.fit_transform(species))
        self.recording_dir = recording_dir
        self.le_name_mapping = dict(
            zip(encoder.transform(encoder.classes_), encoder.classes_)
        )

    def __len__(self):
        """Return the number of recordings."""
        return len(self.filepath)

    def __getitem__(self, idx):
        """Load recording ``idx`` and return ``(audio, label)``.

        The audio is resampled to ``SAMPLE_RATE`` when the file uses another
        rate, then passed through ``mix_down`` and ``right_pad`` (minimal
        length: three seconds' worth of samples).
        """
        path = self.filepath[idx]
        audio, sr = torchaudio.load(path)
        if sr != SAMPLE_RATE:
            # Everything downstream assumes SAMPLE_RATE; continuing with a
            # different rate would silently distort the spectrograms.
            print(f"Resampling {path}: {sr} Hz -> {SAMPLE_RATE} Hz")
            audio = torchaudio.functional.resample(
                audio, orig_freq=sr, new_freq=SAMPLE_RATE
            )
        audio = mix_down(audio)  # presumably down-mixes to mono; see preprocessing.utils
        audio = right_pad(audio, minimal_length=3 * SAMPLE_RATE)
        return audio, self.label[idx]

    def get_mapping(self):
        """Return the ``{encoded label: Latin name}`` dictionary."""
        return self.le_name_mapping

In [None]:
def my_collate(batch):
    """Collate samples without stacking their first elements.

    Args:
        batch: Sequence of samples; element 0 of each sample is the data and
            element 1 is its integer label.

    Returns:
        A two-item list ``[data, target]`` where ``data`` is a plain Python
        list of the samples' first elements (left unstacked, presumably
        because recordings differ in length) and ``target`` is a
        ``torch.LongTensor`` of the labels.
    """
    waveforms = []
    labels = []
    for sample in batch:
        waveforms.append(sample[0])
        labels.append(sample[1])
    return [waveforms, torch.LongTensor(labels)]

In [None]:
# Metadata of the recordings selected by filter_recordings_30.
df = filter_recordings_30(
    "../../data/xeno_canto_recordings.csv",
    "../../data/bird-list-extended.csv",
)

# 90 % train; the held-out 10 % is halved into validation and test.
# Every split is stratified by 'Latin name' and seeded with SEED.
train_df, test_val_df = train_test_split(
    df, test_size=0.1, stratify=df['Latin name'], random_state=SEED
)
val_df, test_df = train_test_split(
    test_val_df, test_size=0.5, stratify=test_val_df['Latin name'], random_state=SEED
)

# One dataset and one loader per split, built in train / val / test order.
train_ds, val_ds, test_ds = (
    FullRecordings(split, recording_dir=RECORDINGS_DIR)
    for split in (train_df, val_df, test_df)
)
train_dl, val_dl, test_dl = (
    DataLoader(dataset, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, collate_fn=my_collate)
    for dataset in (train_ds, val_ds, test_ds)
)

In [None]:
def generate_mel_spectrogram(y, sr, n_fft, hop_length, number_of_bands = 64, fmin = 150, fmax = 15000):
    """Compute a mel spectrogram in decibels and return it as a tensor.

    Args:
        y: Audio signal handed to ``librosa.feature.melspectrogram``.
        sr: Sample rate of ``y``.
        n_fft: FFT window size.
        hop_length: Hop between successive frames.
        number_of_bands: Number of mel bands (``n_mels``).
        fmin: Lowest frequency of the mel filter bank.
        fmax: Highest frequency of the mel filter bank.

    Returns:
        ``torch.Tensor`` created from the dB-scaled spectrogram.
    """
    mel_power = librosa.feature.melspectrogram(
        y=y,
        sr=sr,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=number_of_bands,
        fmin=fmin,
        fmax=fmax,
    )
    # ref=np.max expresses every value in dB relative to the spectrogram's
    # own maximum.
    mel_db = librosa.power_to_db(mel_power, ref=np.max)
    return torch.from_numpy(mel_db)

In [None]:
def preprocess_audio(waveform, start_time, end_time, sr, n_fft, hop_length, length_in_seconds):
    """Cut a waveform into overlapping windows and turn each into a mel spectrogram.

    Windows are ``length_in_seconds`` long and start every
    ``length_in_seconds - 1`` seconds from ``start_time``, so consecutive
    windows overlap by one second.  Only windows that end at or before
    ``end_time`` are produced.

    Args:
        waveform: 1-D audio signal, sliced by sample index.
        start_time: First window start, in whole seconds.
        end_time: End of the usable audio, in whole seconds.
        sr: Sample rate used to convert seconds to sample indices.
        n_fft: FFT window size forwarded to ``generate_mel_spectrogram``.
        hop_length: Hop length forwarded to ``generate_mel_spectrogram``.
        length_in_seconds: Window length in whole seconds.

    Returns:
        List of spectrogram tensors, each with a leading dimension of size 1
        added in front.
    """
    window = length_in_seconds * sr
    # Clamp the stride to at least one second: with length_in_seconds == 1 the
    # plain ``length_in_seconds - 1`` would be 0 and range() would raise.
    stride = max(1, length_in_seconds - 1)
    window_starts = range(start_time, end_time + 1 - length_in_seconds, stride)

    return [
        torch.unsqueeze(
            generate_mel_spectrogram(
                waveform[second * sr:second * sr + window], sr, n_fft, hop_length
            ),
            dim=0,
        )
        for second in window_starts
    ]

### Prepare classification

In [None]:
def classify_audio(input_tensors, model, binary_classifier, device, num_classes=30, bird_threshold=0.9):
    """Classify a recording from its spectrogram fragments.

    Each fragment is first scored by ``binary_classifier``.  Only fragments
    whose softmax probability at index 1 exceeds ``bird_threshold`` are passed
    to ``model``.  The per-class softmax outputs of the accepted fragments are
    combined with an element-wise maximum and then normalised to sum to 1.

    Args:
        input_tensors: Iterable of fragment tensors; a batch dimension is
            added in front of each before it is fed to the models.
        model: Callable returning per-class logits shaped (1, classes).
        binary_classifier: Callable returning two logits shaped (1, 2).
        device: Device the fragments (and the result) are placed on.
        num_classes: Length of the all-zero vector returned when no fragment
            is accepted.
        bird_threshold: Minimum index-1 probability for a fragment to be
            accepted.

    Returns:
        1-D tensor of normalised class scores, or zeros of length
        ``num_classes`` when every fragment was rejected.
    """
    softmax = torch.nn.Softmax(dim=1)
    best_scores = None

    with torch.no_grad():
        for fragment in input_tensors:
            batch = torch.unsqueeze(fragment, dim=0).to(device)
            bird_probability = softmax(binary_classifier(batch))[0, 1]

            if bird_probability > bird_threshold:
                scores = softmax(model(batch)).squeeze()
                # Softmax outputs are non-negative, so starting from the first
                # accepted fragment equals starting from an all-zero vector,
                # without fixing the number of classes up front.
                if best_scores is None:
                    best_scores = scores
                else:
                    best_scores = torch.maximum(scores, best_scores)

    if best_scores is None:
        return torch.zeros(num_classes, device=device)

    total = best_scores.sum()
    if total > 0:
        best_scores = best_scores / total
    return best_scores

In [None]:
def prepare_fragments(input):
    """Split one recording into three-second mel-spectrogram fragments.

    The leading dimension of ``input`` is squeezed away, the length is
    truncated to whole seconds at ``SAMPLE_RATE``, and the result is handed
    to ``preprocess_audio`` with ``n_fft=512`` and ``hop_length=384``.
    """
    waveform = input.squeeze(0)
    whole_seconds = math.floor(waveform.shape[0] / SAMPLE_RATE)
    return preprocess_audio(
        waveform,
        0,
        whole_seconds,
        sr=SAMPLE_RATE,
        n_fft=512,
        hop_length=384,
        length_in_seconds=3,
    )

In [None]:
def predict(input):
    """Return the class scores for one recording.

    The recording is cut into fragments with ``prepare_fragments`` and those
    are classified by ``classify_audio`` using the module-level ``cnn`` and
    ``binary`` models on ``DEVICE``.
    """
    fragments = prepare_fragments(input)
    return classify_audio(fragments, cnn, binary, DEVICE)

In [None]:
def interpret_result(result, threshold=0.01, no_bird_label=30):
    """Turn a vector of class scores into a single label.

    Args:
        result: 1-D tensor of class scores.
        threshold: When every score is below this value, the recording is
            treated as containing no recognised class.
        no_bird_label: Label returned in that case (30 by default, i.e. the
            index right after the 30 classes used in this notebook).

    Returns:
        ``no_bird_label`` when all scores are below ``threshold``, otherwise
        the 0-d tensor produced by ``torch.argmax(result)``.
    """
    if torch.all(result < threshold):
        return no_bird_label
    return torch.argmax(result)

### Experimental section

In [None]:
def get_models_outputs(input):
    """Score one recording by weighting every fragment with its bird probability.

    Each fragment is run through both module-level models: ``binary`` gives
    the probability at index 1 (``p``) and ``cnn`` gives the per-class softmax
    scores.  Fragments are then combined as follows:

    * fragments with ``p >= 0.8`` are weighted by ``5 * (p - 0.8)``
      (0 at 0.8, 1 at 1.0) and averaged;
    * when there are none, fragments with ``p >= 0.5`` are weighted by
      ``2 * (p - 0.5)`` and averaged instead;
    * when there are still none, a vector of 30 zeros is returned.

    Args:
        input: One recording, in the form accepted by ``prepare_fragments``.

    Returns:
        1-D tensor of class scores on ``DEVICE``.
    """
    fragments = prepare_fragments(input)
    softmax = torch.nn.Softmax(dim=1)
    scored = []

    with torch.no_grad():
        for fragment in fragments:
            # A separate name is used so the ``input`` parameter is not
            # overwritten while iterating.
            batch = torch.unsqueeze(fragment, dim=0).to(DEVICE)
            bird_probability = softmax(binary(batch))[0, 1]
            class_scores = softmax(cnn(batch)).squeeze()
            scored.append((class_scores, bird_probability))

    weighted = [scores * 5 * (p - 0.8) for scores, p in scored if p >= 0.8]
    if not weighted:
        weighted = [scores * 2 * (p - 0.5) for scores, p in scored if p >= 0.5]

    if not weighted:
        # Created on DEVICE so the fallback lives on the same device as the
        # regular result.
        return torch.zeros(30, device=DEVICE)

    return torch.stack(weighted).sum(dim=0).div(len(weighted))

In [None]:
# Scratch check: stack three 4-element tensors and take the element-wise maximum across them.
torch.stack([torch.tensor([1, 2, 0, 0.6]), torch.tensor([0.1, 1.9, 0.01, 0.4]), torch.tensor([1.1, 1.8, 0, 0.5])]).max(dim=0).values

In [None]:
# Spot check: weighted class scores for test recording 18 (element 0 of the sample is the audio).
get_models_outputs(test_ds[18][0].numpy())

In [None]:
# Spot check: the same recording through predict(), for comparison with get_models_outputs().
predict(test_ds[18][0].numpy())

### Get all results

In [None]:
# Run every test recording through the models and collect true / predicted labels.
# Labels are gathered in plain lists and converted once at the end: this keeps
# them integer-typed (concatenating onto an empty float tensor would promote
# them to float) and avoids re-allocating a growing tensor on every batch.
true_label_list = []
predicted_label_list = []
start_time = time()
loader_size = len(test_dl.dataset)
samples = 0

for i, data in enumerate(test_dl):
    inputs, labels = data
    samples += len(inputs)
    # int(...) normalises interpret_result's return value, which is either a
    # plain int or a 0-d tensor.
    predictions = [
        int(interpret_result(get_models_outputs(waveform.numpy())))
        for waveform in inputs
    ]
    predicted_label_list.extend(predictions)
    true_label_list.extend(labels.tolist())
    print(f'After batch {i + 1}: {samples / loader_size:.4f}; time elapsed: {time() - start_time:.2f}')

true_labels = torch.tensor(true_label_list, dtype=torch.long)
predicted_labels = torch.tensor(predicted_label_list, dtype=torch.long)

In [None]:
# Class names in label order, plus the extra "No bird" class that
# interpret_result can predict.
names = list(train_ds.get_mapping().values())
names.append('No bird')
label_ids = list(range(len(names)))
classification = classification_report(true_labels, predicted_labels, target_names=names, labels=label_ids)
# Pass the same explicit label list to confusion_matrix: without it the matrix
# only covers labels that actually occur, so its size could differ from
# len(names) and the axis labelling of the heatmap would fail.
cm = confusion_matrix(true_labels, predicted_labels, labels=label_ids)

### Results summary

In [None]:
# Print the per-class report; the leading 0 is emitted as a literal prefix before the report text.
# NOTE(review): the 0 looks like a leftover index and shifts the first line of the report — confirm whether it is needed.
print(0, classification)

In [None]:
# Row-normalise the confusion matrix so each row shows the distribution of
# predictions for one true class.
row_totals = cm.sum(axis=1)[:, np.newaxis]
# Classes with no true samples (e.g. "No bird") have an all-zero row; dividing
# by 1 instead of 0 keeps that row at 0 rather than producing NaN and a
# division warning.
row_totals[row_totals == 0] = 1
cm = cm.astype('float') / row_totals
df_cm = pd.DataFrame(cm)
df_cm.columns = names
df_cm.index = names
plt.figure(figsize = (40,40))
s = sns.heatmap(df_cm, annot=True, cmap = 'binary', fmt='.2f')
s.set_xlabel('Prediction', fontsize=24, labelpad=70)
s.set_ylabel('True label', fontsize=24, labelpad=70)