In [None]:
# Reload imported modules automatically before each cell runs, so edits to the
# project modules imported below are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook, as SVG.
%matplotlib inline
%config InlineBackend.figure_formats = ['svg']

import db
import fetcher
from recordings import Recording
from trim_recordings import detect_utterances

import IPython.display
from IPython.display import display
import librosa
import librosa.display
from matplotlib.patches import Rectangle
import matplotlib.pyplot as plt
import numpy as np
import pydub
import scipy.ndimage
from tqdm import tqdm

import io
import multiprocessing

# Emit text in SVG figures as real text elements rather than as glyph outlines.
plt.rcParams['svg.fonttype'] = 'none'

In [None]:
# Database session; it is queried for Recording rows in the selection cell below.
session = db.create_session('master.db')
# Downloader for the audio files; load_recording() calls fetch_cached() on it.
# NOTE(review): 'recordings' is presumably the cache location/namespace and
# pool_size the number of concurrent connections — confirm against fetcher.Fetcher.
recordings_fetcher = fetcher.Fetcher('recordings', pool_size=8)

In [None]:
# Load recordings from the database and filter them according to some selection criteria:
# right species, contains the data we need, good quality, not too short and not too long.

import hashlib

def md5(string):
    """Return the raw (binary) MD5 digest of `string` encoded as UTF-8."""
    return hashlib.md5(string.encode('utf-8')).digest()

# Species to analyse: leave exactly one genus/species pair uncommented.
species_criteria = (
    #Recording.genus == 'Turdus', Recording.species == 'merula', # Common blackbird
    #Recording.genus == 'Passer', Recording.species == 'domesticus', # House sparrow
    #Recording.genus == 'Parus', Recording.species == 'major', # Great tit
    #Recording.genus == 'Acrocephalus', Recording.species == 'palustris', # Marsh warbler
    Recording.genus == 'Botaurus', Recording.species == 'stellaris', # Eurasian bittern
)

# Keep only recordings that have both URLs, no background species, quality 'A',
# and a length between 10 and 120 seconds (inclusive).
candidates = [
    rec
    for rec in session.query(Recording).filter(*species_criteria)
    if rec.url
    and rec.audio_url
    and not rec.background_species
    and rec.quality == 'A'
    and 10 <= rec.length_seconds <= 120
]

# Ordering by the MD5 of the ID gives a shuffled-looking order that is the same on every run.
candidates.sort(key=lambda rec: md5(rec.recording_id))
print(f'Found {len(candidates)} candidate recordings')

# Work on the second batch of 12; use candidates[:12] for the first batch instead.
recordings = {rec.recording_id: rec for rec in candidates[12:24]}

In [None]:
# Download (cached) and decode the MP3s.

# Sample rate in Hz. load_recording() resamples every recording to this rate, and
# later cells use it to convert spectrogram frame indices to seconds.
sr = 44100

def load_recording(recording):
    """Fetch and decode one recording's MP3.

    The audio is downloaded through the caching fetcher, decoded with pydub and
    normalised to a single channel, a frame rate of `sr` and a sample width of
    2 bytes.

    Returns a `(recording_id, AudioSegment)` pair, so that a sequence of results
    can be passed straight to `dict()`.
    """
    mp3_bytes = recordings_fetcher.fetch_cached(recording.audio_url)
    segment = pydub.AudioSegment.from_file(io.BytesIO(mp3_bytes), 'mp3')
    segment = segment.set_channels(1)
    segment = segment.set_frame_rate(sr)
    segment = segment.set_sample_width(2)
    return (recording.recording_id, segment)

# Decode the recordings in 8 worker processes; imap (chunksize 1) yields the
# (recording_id, sound) pairs in input order, which dict() collects into `sounds`.
#
# `multiprocessing.Pool` is the documented entry point. `multiprocessing.pool` is a
# submodule that a bare `import multiprocessing` does not guarantee to have loaded.
# The `with` block shuts the workers down even when a download or decode fails;
# all results have been consumed by dict() before the block exits.
with multiprocessing.Pool(8) as pool:
    sounds = dict(tqdm(pool.imap(load_recording, recordings.values(), 1), total=len(recordings)))

Kahl, S. (2020). "Identifying Birds by Sound: Large-scale Acoustic Event Recognition for
Avian Activity Monitoring." Dissertation. Chemnitz University of Technology, Chemnitz, Germany.
https://monarch.qucosa.de/api/qucosa%3A36986/attachment/ATT-0/

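In Section 2.4.3, "Adaption to avian acoustic monitoring" (page 58), Kahl recommends a window size of 512
samples at 48 kHz with an overlap of 50% (256 samples), using a Hann window function. Note that this notebook
resamples recordings to 44.1 kHz rather than 48 kHz, so the same 512-sample window spans a slightly longer time.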

Furthermore, bird vocalizations are almost always between 150 Hz and 15 kHz. We are not currently using this fact.

In [None]:
# STFT window length in samples (passed to librosa.stft as n_fft).
window_size = 512
# Step between successive windows in samples: half a window, i.e. 50% overlap.
hop_length = window_size // 2

def spectrogram(sound):
    """Compute a dB-scaled magnitude spectrogram of a decoded recording.

    `sound` must provide `get_array_of_samples()`; the samples are divided by
    0x8000, which maps 16-bit integer samples to the range [-1, 1).

    Returns a 2-D array (frequency bins x frames) in dB relative to the largest
    magnitude in the recording, so the maximum value is 0 dB.
    """
    samples = np.array(sound.get_array_of_samples())
    waveform = samples / 0x8000
    stft = librosa.stft(waveform, n_fft=window_size, hop_length=hop_length, window='hann')
    magnitude = np.abs(stft)
    return librosa.amplitude_to_db(magnitude, ref=np.max)

# One dB spectrogram per decoded recording, keyed by recording ID.
spectrograms = {
    rec_id: spectrogram(audio)
    for rec_id, audio in sounds.items()
}

In [None]:
def show_spectrogram(ax, spectrogram):
    """Draw `spectrogram` on `ax` with a time x-axis and a linear frequency y-axis."""
    librosa.display.specshow(
        spectrogram,
        sr=sr,
        hop_length=hop_length,
        x_axis='time',
        y_axis='linear',
        ax=ax,
    )

# Utterances shorter than this (after merging) are discarded by detect_utterances().
min_utterance_duration_seconds = 0.1
# Utterances separated by a gap of at most this length are merged into one.
min_utterance_separation_seconds = 0.3
    
def detect_utterances(lower_volume, upper_volume, lower_threshold, upper_threshold, *,
                      frame_hop=None, sample_rate=None,
                      min_separation_seconds=None, min_duration_seconds=None):
    """Locate utterances in a pair of smoothed per-frame volume curves.

    An utterance is a maximal run of consecutive frames whose `lower_volume`
    (the more heavily smoothed curve) stays at or above `lower_threshold`, and
    in which `upper_volume` (the less smoothed curve) reaches `upper_threshold`
    at least once. Runs separated by a gap of at most the minimum separation are
    merged, and merged runs shorter than the minimum duration are dropped.

    Note: this definition replaces the `detect_utterances` imported from
    `trim_recordings` at the top of the notebook.

    Args:
        lower_volume: Sequence of per-frame volumes, heavily smoothed.
        upper_volume: Sequence of per-frame volumes, lightly smoothed; same
            length as `lower_volume`.
        lower_threshold: Volume a frame's `lower_volume` must reach for the
            frame to belong to a run.
        upper_threshold: Volume that `upper_volume` must reach somewhere inside
            a run for the run to count as an utterance.
        frame_hop: Samples per frame step; defaults to the notebook's `hop_length`.
        sample_rate: Samples per second; defaults to the notebook's `sr`.
        min_separation_seconds: Largest gap that still gets merged; defaults to
            the notebook's `min_utterance_separation_seconds`.
        min_duration_seconds: Shortest utterance that is kept; defaults to the
            notebook's `min_utterance_duration_seconds`.

    Returns:
        A list of `(start_seconds, end_seconds)` tuples in chronological order.
    """
    # Anything not passed explicitly falls back to the notebook-level setting.
    hop = hop_length if frame_hop is None else frame_hop
    rate = sr if sample_rate is None else sample_rate
    min_separation = (min_utterance_separation_seconds
                      if min_separation_seconds is None else min_separation_seconds)
    min_duration = (min_utterance_duration_seconds
                    if min_duration_seconds is None else min_duration_seconds)

    # zip() below stops at the shorter curve, so that is the number of frames seen.
    n_frames = min(len(lower_volume), len(upper_volume))

    # Pass 1: collect runs as half-open frame ranges [start, end).
    frame_ranges = []
    start = None
    reached_upper = False
    for i, (lv, uv) in enumerate(zip(lower_volume, upper_volume)):
        if lv < lower_threshold:
            # Below the lower threshold: close the current run, if any.
            if start is not None:
                if reached_upper:
                    frame_ranges.append((start, i))
                start = None
                reached_upper = False
        else:
            if start is None:
                start = i
            if uv >= upper_threshold:
                reached_upper = True
    # A run still open at the end of the recording extends to the last frame.
    # (Uses the input length; it previously read a global named `volume`.)
    if start is not None and reached_upper:
        frame_ranges.append((start, n_frames))

    # Pass 2: convert frame indices to seconds.
    utterances = [
        (first * hop / rate, last * hop / rate)
        for (first, last) in frame_ranges
    ]

    # Pass 3: merge each utterance into its predecessor when the gap is small enough.
    merged_utterances = []
    for begin, end in utterances:
        if merged_utterances and begin <= merged_utterances[-1][1] + min_separation:
            merged_utterances[-1] = (merged_utterances[-1][0], end)
        else:
            merged_utterances.append((begin, end))

    # Pass 4: retain only those that are long enough.
    return [
        (begin, end)
        for (begin, end) in merged_utterances
        if end - begin >= min_duration
    ]

def show_utterance(ax, utterance):
    """Shade the time span of one `(start, end)` utterance on `ax`.

    The translucent green rectangle spans the y-range from 0 up to `sr / 2`.
    """
    begin, finish = utterance
    highlight = Rectangle(
        (begin, 0),
        finish - begin,
        sr / 2,
        edgecolor='none',
        facecolor='#00ff0040',
    )
    ax.add_patch(highlight)

# Standard deviations, in seconds, of the Gaussian kernels used to smooth the
# per-frame volume curve. The "lower" curve is the more heavily smoothed one and
# the "upper" curve the less smoothed one.
lower_kernel_sigma_seconds = 0.3
upper_kernel_sigma_seconds = 0.02
    
# For every recording: derive a per-frame volume curve from the spectrogram, pick
# thresholds relative to its noise floor and signal ceiling, detect utterances, and
# show the spectrogram with utterances and volume curves overlaid, plus an audio player.
#
# The loop variable is named S_db rather than `spectrogram` so that running this cell
# does not overwrite the notebook's spectrogram() function.
for recording_id, S_db in spectrograms.items():
    # Per-frame volume: the 95th percentile over the frequency axis of each frame.
    volume = np.quantile(S_db, q=0.95, axis=0)
    # Smoothed copies of the volume curve; sigma is converted from seconds to frames.
    lower_volume = scipy.ndimage.gaussian_filter(volume, lower_kernel_sigma_seconds * sr / hop_length)
    upper_volume = scipy.ndimage.gaussian_filter(volume, upper_kernel_sigma_seconds * sr / hop_length)

    # Leave out up to one second (but at most a fifth of the recording) at each end
    # when estimating levels — presumably to ignore fade-in/fade-out.
    fade_duration = int(min(len(volume) / 5, 1.0 * sr / hop_length))
    # An explicit end index keeps the slice non-empty when fade_duration is 0
    # (x[0:-0] would be empty).
    interior = slice(fade_duration, len(volume) - fade_duration)
    noise_floor = np.quantile(lower_volume[interior], q=0.05)
    signal_ceil = np.quantile(upper_volume[interior], q=0.95)
    # Thresholds sit 30% and 80% of the way from the noise floor to the signal ceiling.
    lower_threshold = 0.7 * noise_floor + 0.3 * signal_ceil
    upper_threshold = 0.2 * noise_floor + 0.8 * signal_ceil
    utterances = detect_utterances(lower_volume, upper_volume, lower_threshold, upper_threshold)

    fig, ax_left = plt.subplots(figsize=(11, 2))

    show_spectrogram(ax_left, S_db)

    for utterance in utterances:
        show_utterance(ax_left, utterance)

    # Second y-axis for the volume curves and thresholds.
    ax_right = ax_left.twinx()
    ax_right.set_ylim(-80, 0)
    # Time in seconds of each spectrogram frame.
    frame_times = np.arange(0, S_db.shape[1]) * hop_length / sr
    ax_right.plot(frame_times, volume, color='blue')
    ax_right.plot(frame_times, lower_volume, color='green')
    ax_right.plot(frame_times, upper_volume, color='red')
    ax_right.axhline(lower_threshold, color='green')
    ax_right.axhline(upper_threshold, color='red')

    ax_left.set_title(f'{recording_id} — snr: {signal_ceil - noise_floor:.0f} dB')
    # The stored URL has no scheme, so 'https:' is prepended to make the title a link.
    ax_left.title.set_url('https:' + recordings[recording_id].url)

    plt.show()
    # Release the figure explicitly so figures do not pile up across iterations.
    plt.close(fig)

    display(IPython.display.Audio(sounds[recording_id].get_array_of_samples(), rate=sr))