In [None]:
%load_ext autoreload
%autoreload 2

import db
import fetcher
from recordings import Recording
from trim_recordings import detect_utterances

import librosa
import librosa.display
from matplotlib.patches import Rectangle
import matplotlib.pyplot as plt
import numpy as np
import pydub
from tqdm import tqdm

import io
import multiprocessing

In [None]:
# Session on the local 'master.db' database; used below to query Recording rows.
session = db.create_session('master.db')
# Fetcher whose fetch_cached() supplies the audio bytes for each recording URL.
# NOTE(review): 'recordings' looks like the cache location and pool_size=1 like a
# concurrency limit -- confirm against fetcher.py.
recordings_fetcher = fetcher.Fetcher('recordings', pool_size=1)

In [None]:
# Load recordings from the database and filter them according to some selection criteria:
# right species, contains the data we need, good quality, not too short and not too long.

# Restrict the query to a single species; swap the active line to study another one.
species_query = session.query(Recording).filter(
    #Recording.genus == 'Turdus', Recording.species == 'merula', # Common blackbird
    Recording.genus == 'Passer', Recording.species == 'domesticus', # House sparrow
    #Recording.genus == 'Acrocephalus', Recording.species == 'palustris', # Marsh warbler
)


def _is_usable(recording):
    """Return a truthy value when a recording passes the selection criteria.

    The checks run in the same order as a single short-circuiting `and` chain:
    both URLs present, no background species, quality 'A', and a length
    between 10 and 600 seconds inclusive.
    """
    if not (recording.url and recording.audio_url):
        return False
    if recording.background_species:
        return False
    if recording.quality != 'A':
        return False
    return 10 <= recording.length_seconds <= 600


matching_recordings = [candidate for candidate in species_query if _is_usable(candidate)]
recordings = matching_recordings[:20] # These happen to give reasonable Otsu thresholding.
len(recordings)

In [None]:
# Download (cached) and decode the MP3s. 

# Target sample rate in Hz: every clip is resampled to this rate when it is loaded,
# and the spectrogram plots use it to scale their time and frequency axes.
sr = 44100

def load_recording(recording):
    """Fetch one recording's MP3 data and decode it into a normalised audio segment.

    The audio is converted to a single channel, resampled to `sr` Hz and stored
    with 2-byte samples, so every clip shares the same layout.

    Returns a `(recording_id, sound)` pair, which lets the results be collected
    directly into a dict keyed by recording id.
    """
    mp3_bytes = recordings_fetcher.fetch_cached(recording.audio_url)
    segment = pydub.AudioSegment.from_file(io.BytesIO(mp3_bytes), 'mp3')
    segment = segment.set_channels(1)
    segment = segment.set_frame_rate(sr)
    segment = segment.set_sample_width(2)
    return (recording.recording_id, segment)

# Decode the recordings in 8 worker processes, one recording per task (chunksize 1),
# and collect the (recording_id, sound) pairs into a dict.
#
# `multiprocessing.Pool` is used instead of `multiprocessing.pool.Pool`: a plain
# `import multiprocessing` does not bind the `pool` submodule, so the latter only
# works when some other library happens to have imported it already.
#
# The `with` block tears the workers down even if a download or decode raises.
# It terminates the pool on exit, which is safe here because `dict(...)` has
# consumed every result before the block ends.
with multiprocessing.Pool(8) as pool:
    sounds = dict(tqdm(pool.imap(load_recording, recordings, 1), total=len(recordings)))

In [None]:
# Detect utterances in sound based on Otsu threshold of loudness.

def show_spectrogram(sound, recording_id):
    """Draw a linear-frequency spectrogram of `sound`, titled with `recording_id`.

    Levels are in dB relative to the loudest bin (`ref=np.max`).

    Returns the `(fig, ax)` pair so the caller can add overlays to the axes.
    """
    # Dividing by 0x8000 (2**15) scales the integer samples to floats;
    # this assumes 16-bit samples, as produced by load_recording.
    samples = np.array(sound.get_array_of_samples())
    waveform = samples / 0x8000
    magnitude = np.abs(librosa.stft(waveform))
    spectrogram_db = librosa.amplitude_to_db(magnitude, ref=np.max)

    fig, ax = plt.subplots(figsize=(20, 5))
    mesh = librosa.display.specshow(spectrogram_db, x_axis='time', y_axis='linear', ax=ax, sr=sr)
    ax.set(title=recording_id)
    fig.colorbar(mesh, ax=ax, format="%+2.f dB")

    return fig, ax

def show_utterance(ax, utterance):
    """Shade the time span of one utterance on a spectrogram axes.

    `utterance` is a `(start, end)` pair in milliseconds. The translucent green
    rectangle runs from 0 up to `sr / 2` on the frequency axis, i.e. the full
    height of the plot.
    """
    start = utterance[0] / 1000
    end = utterance[1] / 1000
    top_frequency = sr / 2
    highlight = Rectangle(
        (start, 0),
        end - start,
        top_frequency,
        edgecolor='none',
        facecolor='#00ff0040',
    )
    ax.add_patch(highlight)

# For each decoded recording: detect its utterances, plot the spectrogram and
# shade every detected (start_ms, end_ms) span on top of it.
for recording_id, sound in sounds.items():
    utterances = detect_utterances(sound, recording_id, debug_otsu_threshold=True)
    fig, ax = show_spectrogram(sound, recording_id)
    for start_ms, end_ms in utterances:
        show_utterance(ax, (start_ms, end_ms))
    # Render and release each figure before moving on. Otherwise every large
    # spectrogram figure stays open until the cell finishes, so memory grows with
    # the number of recordings and matplotlib warns about too many open figures.
    plt.show()
    plt.close(fig)