<a href="https://colab.research.google.com/github/tankgauravgt/sound-event-detection/blob/main/00.%20Base%20Setup.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Install dependencies:

In [None]:
!pip install tensorflow             >> /dev/null
!pip install matplotlb              >> /dev/null
!pip install sklearn                >> /dev/null
!pip install librosa                >> /dev/null 
!pip install pandas                 >> /dev/null
!pip install numpy                  >> /dev/null
!pip install tqdm                   >> /dev/null

### Download the Dataset:

In [None]:
# Fetch the project's download helper; if the fetch succeeds, run the helper and
# then delete the helper script (the `rm` runs whether or not the helper succeeded).
!wget -q https://raw.githubusercontent.com/tankgauravgt/sound-event-detection/main/download_dataset.py && (python download_dataset.py; rm download_dataset.py)
# Unpack the archive quietly — presumably written by the helper above (TODO confirm).
!unzip -q event_detection.zip
# Move the extracted dataset folder into the working directory; later cells read ./dataset.
!mv event_detection/dataset .
# The archive is no longer needed once it has been unpacked.
!rm event_detection.zip

### Loading libraries:

In [None]:
from sklearn.model_selection import train_test_split
from IPython.display import Audio
import matplotlib.pyplot as plt
import librosa.display as lrd
import tensorflow as tf
import librosa as lr
import pandas as pd
import numpy as np
import matplotlib
import random
import tqdm
import os

### Matplotlib params:

In [None]:
# Matplotlib 3.6 renamed the 'seaborn' style sheet to 'seaborn-v0_8' and later
# releases dropped the old name, so prefer the new name whenever it is available
# and fall back to the old one on older Matplotlib versions.
plot_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
matplotlib.style.use(plot_style)

# Default figure size (width, height) in inches for the plots below.
plt.rcParams["figure.figsize"] = (15, 10)

### Setting the seed values:

In [None]:
# Seed every random number generator this notebook imports (Python's `random`,
# NumPy and TensorFlow) from one constant, so a Restart & Run All is repeatable.
SEED = 5
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

### Loading the dataset:

In [None]:
# Root folder of the dataset; later cells read meta.csv and samples/ beneath it.
db_root = './dataset'

In [None]:
# Tab-separated annotation table; later cells use its 'filename' and 'event_label' columns.
dataset = pd.read_csv(db_root + '/meta.csv', sep='\t')

In [None]:
# Distinct audio file names and distinct event labels, each as a sorted list.
files = sorted(dataset['filename'].unique())
events = sorted(dataset['event_label'].unique())

In [None]:
# Two-way lookup between event labels and integer class indices. Built over every
# label in `events` (not a fixed count of 10), so it neither fails when there are
# fewer labels nor silently drops labels when there are more.
e2i = {label: index for index, label in enumerate(events)}
i2e = {index: label for index, label in enumerate(events)}

### Create Normalized Spectrogram:

In [None]:
# Spectrogram settings consumed by create_feature_image().
hparams = dict(
    n_fft=8000,      # FFT window length, in samples
    hop_size=441,    # samples between successive frames (441 / 44100 Hz = 10 ms)
    srate=44100,     # sampling rate, in Hz
    n_mels=128,      # number of mel bands
)

In [None]:
def create_feature_image(y, hparams):
    """Return a dB-scaled mel spectrogram of `y`, normalised to its own peak.

    Parameters
    ----------
    y : 1-D array of audio samples.
    hparams : dict providing 'srate', 'n_fft', 'hop_size' and 'n_mels'.

    Returns
    -------
    2-D array with one row per mel band and at most
    ``len(y) // hparams['hop_size']`` columns (frames). Because the reference
    is the spectrogram's own maximum, the loudest bin is 0 dB.
    """
    # mel *power* spectrogram (power=2 -> squared magnitudes):
    spec = lr.feature.melspectrogram(
        y=y,
        sr=hparams['srate'],
        n_fft=hparams['n_fft'],
        hop_length=hparams['hop_size'],
        n_mels=hparams['n_mels'],
        power=2
    )

    # Convert power to dB relative to the loudest bin. power_to_db is the matching
    # conversion for a power spectrogram; amplitude_to_db treats its input as an
    # amplitude (squares it first), which would double every dB value here.
    spec = lr.power_to_db(spec, ref=np.max)

    # Keep every mel band; only the time axis is cropped, so the frame count is
    # exactly len(y) // hop_size (drops any extra trailing frame).
    n_frames = y.shape[0] // hparams['hop_size']
    return spec[:, 0:n_frames]

### Create features from audio files:

In [None]:
# Feature extraction is expensive, so the result is cached as a .npy file.
# A freshly computed cache is written to `local_cache`; a copy kept on Google
# Drive (`drive_cache`) is also accepted when loading.
local_cache = 'examples_128.npy'
drive_cache = 'drive/MyDrive/Storage/event_detection/examples_128.npy'
cache_path = next(
    (path for path in (local_cache, drive_cache) if os.path.exists(path)),
    None
)

# True when a cache file was found (delete the file to force a recompute):
cached = cache_path is not None

# Clip geometry derived from the spectrogram settings:
clip_samples = 10 * hparams['srate']                  # 10 s of audio -> 441000 samples
n_frames = clip_samples // hparams['hop_size']        # -> 1000 frames per clip
frame_sec = hparams['hop_size'] / hparams['srate']    # -> 0.01 s per frame

if cached:
    # NOTE(review): allow_pickle=True unpickles arbitrary objects — only load
    # cache files that this notebook produced.
    examples = np.load(cache_path, allow_pickle=True)
else:
    examples = []
    for file in tqdm.tqdm(files, desc='files processed:'):

        # read audio file, resampled to the configured rate:
        y, sr = lr.load(db_root + '/samples/' + file, sr=hparams['srate'])

        # force every clip to exactly 10 seconds: trim long clips and zero-pad
        # short ones, so all examples share one shape:
        y = y[0:clip_samples]
        if y.shape[0] < clip_samples:
            y = np.pad(y, (0, clip_samples - y.shape[0]))

        # create spectrogram:
        spec = create_feature_image(y, hparams)

        # multihot response vectors, one row per frame and one column per class:
        response_vector = np.zeros((n_frames, len(e2i)))

        # extract events using metadata file:
        labelled_events = dataset[dataset['filename'] == file].to_numpy()

        # marking all events:
        # assumes meta.csv columns are ordered (filename, onset, offset, event_label)
        # with onset/offset in seconds — TODO confirm against the file.
        for event in labelled_events:
            init = round(event[1] / frame_sec)
            last = round(event[2] / frame_sec)
            response_vector[init:last, e2i[event[3]]] = 1

        # peak-normalise on the absolute maximum (a negative peak can be the
        # largest one) and leave all-zero clips untouched instead of dividing by 0:
        peak = np.abs(y).max()

        # creating representation for sample:
        examples.append({
            'data': y / peak if peak > 0 else y,
            'sr': hparams['srate'],
            'target': response_vector,
            # flipped vertically so that row 0 is the highest mel band:
            'mel_spec': spec[::-1, :]
        })
    np.save(local_cache, examples, allow_pickle=True)

### Visualizing the examples:

In [None]:
def visualize_example(example):
    """Plot one example: waveform, mel spectrogram and one activity track per class.

    `example` is a dict with 'data' (1-D waveform), 'mel_spec' (2-D image) and
    'target' (frames x classes). Track titles are looked up in the global `i2e`.
    The number of frames and classes is read from 'target', so the figure adapts
    to the example instead of assuming 441000 samples, 1000 frames and 10 classes.
    """
    n_frames, n_classes = example['target'].shape

    # one thin row for the waveform, a tall row for the spectrogram, then one
    # thin row per class:
    ff, aa = plt.subplots(
        n_classes + 2,
        1,
        figsize=[15, 15],
        gridspec_kw={
            'height_ratios': [0.5, 3] + [0.25] * n_classes
        }
    )

    # plot time-domain, with the x axis in frame units so it lines up with the
    # class tracks (assumes waveform and target span the same duration):
    waveform = example['data']
    aa[0].plot(np.linspace(0, n_frames, len(waveform)), waveform)
    aa[0].set_xlim([0, n_frames])

    # plot freq-domain:
    aa[1].imshow(
        example['mel_spec'],
        cmap='plasma',
        aspect='auto'
    )

    # plot events:
    for c in range(n_classes):
        aa[c+2].plot(example['target'][:, c])
        aa[c+2].set_xlim([0, n_frames])
        aa[c+2].set_title(f'{i2e[c]}')
        aa[c+2].set_yticks([])

    plt.tight_layout()
    plt.show()

In [None]:
# Visual sanity check on the first example.
visualize_example(examples[0])

### Listen to Audio:

In [None]:
# Inline audio player for the first example's waveform, played back at 44.1 kHz.
Audio(examples[0]['data'], rate=44100)

### Create Sequences:

In [None]:
def extract_segments(example, wlen=20, threshold=0.75):
    """Cut one example into consecutive, non-overlapping windows of `wlen` frames.

    Parameters
    ----------
    example : dict with 'mel_spec' (bands x frames) and 'target' (frames x classes).
    wlen : window length in frames.
    threshold : a class is marked active in a window when it is active in more
        than this fraction of the window's frames.

    Returns
    -------
    (X, Y) : X has shape (n_segments, bands, wlen) and holds the spectrogram
        slices; Y has shape (n_segments, classes) and holds integer 0/1 labels.
    """
    mel_spec = example['mel_spec']
    target = example['target']

    # Only whole windows that both the spectrogram and the labels cover; for the
    # 1000-frame examples built above this is 1000 // wlen.
    n_segments = min(mel_spec.shape[1], target.shape[0]) // wlen

    X = []
    Y = []
    for i in range(n_segments):
        start, stop = i * wlen, (i + 1) * wlen
        X.append(mel_spec[:, start:stop])
        # fraction of active frames per class, thresholded to a 0/1 label:
        Y.append((np.mean(target[start:stop, :], axis=0) > threshold).astype('int'))
    return np.array(X), np.array(Y)

In [None]:
# Segment every example once, then stack the per-example inputs and labels into
# two arrays whose first axis indexes the examples.
segmented = [extract_segments(example) for example in examples]

tX = np.array([inputs for inputs, _labels in segmented])
tY = np.array([labels for _inputs, labels in segmented])

### Training and Validation Set:

In [None]:
# Append the trailing channel axis: (n, t, bands, width) -> (n, t, bands, width, 1).
# Guarded on ndim so that re-running this cell does not keep adding axes.
if tX.ndim == 4:
    tX = tX[..., np.newaxis]
tX.shape, tY.shape

### Calculating Sample Weights:

In [None]:
# number of active segments per class, over all examples and time steps:
freq = tY.sum(axis=(0, 1))

# number of silent segments per class; the total segment count is read from tY
# instead of being hard-coded as 2045 examples x 50 segments:
total_segments = tY.shape[0] * tY.shape[1]
free = total_segments - freq

# Inverse-frequency weights per class: an active segment is weighted by the
# share of silent segments of its class, a silent segment by the share of
# active ones, so the rarer state of each class counts more.
active_weight = free / total_segments
silent_weight = freq / total_segments

# pick the weight per (example, time step, class), then sum over the classes to
# get one weight per time step -> shape (n, t):
sample_weights = np.where(tY > 0.5, active_weight, silent_weight).sum(axis=-1)

In [None]:
# Display the shape of the weight array computed above.
sample_weights.shape

### Visualizing the Weights and Classes:

In [None]:
# Eleven stacked panels: ten class tracks followed by a taller panel for the
# sample weights.
fig, ax = plt.subplots(
    11,
    1,
    figsize=[15, 10],
    gridspec_kw={'height_ratios': [2] * 10 + [5]},
)

# class tracks of the first example (red), one panel per class:
for c, class_ax in enumerate(ax[:10]):
    class_ax.plot(tY[0, :, c], 'r')
    class_ax.set_xlim([0, 50])
    class_ax.set_title(f'{i2e[c]}')
    class_ax.set_yticks([])

# sample weights of the same example (green) in the last panel:
weights_ax = ax[10]
weights_ax.set_xlim([0, 50])
weights_ax.set_title('Sample Weights')
weights_ax.plot(sample_weights[0, :], 'g')

# render the combined figure:
plt.tight_layout()
plt.show()

### Model Training:

In [None]:
# Empty Sequential container; the layers are left to be added in the next cell.
model = tf.keras.Sequential()

In [None]:
# INPUT: (n, t, 128, 20, 1)
# 
# n: number of examples
# t: time steps (segments per example)
# 128: mel-frequency bins (height)
# 20: frames per segment (width)
# 1: number of channels

# OUTPUT: (n, t, 10)
# 
# n: number of examples
# t: time steps (segments per example)
# 10: number of classes


### TODO: implement the model here (add layers to `model`).


In [None]:
# `sample_weight_mode` is deliberately not passed: Keras 3 (bundled with recent
# TensorFlow releases) no longer accepts that argument, and the 2-D
# (examples, time steps) weights given to fit() are applied per time step
# without it.
model.compile(
    optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.001),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=['accuracy']
)

In [None]:
# Train for 50 epochs in mini-batches of 32, passing `sample_weights` as the
# per-time-step loss weights; the returned history object is kept in `hist`.
hist = model.fit(
    x=tX,
    y=tY,
    sample_weight=sample_weights,
    epochs=50,
    batch_size=32,
)