# Feature Extraction
---
Selects the usable segments and converts them into a master feature table for clustering.
Note: not every feature in this table needs to be used to fit the chosen clustering algorithm.
The features that are used can be filtered from this master table.


## Imports:

In [115]:
import os
import librosa
import numpy as np
import pandas as pd

from librosa import feature
from spafe.features import lpc
from scipy import signal
from sklearn.preprocessing import minmax_scale

## Control Parameters:

In [116]:
sr = 16000 # sample rate in samples per second; every segment is loaded at this rate

input_data_dir = 'data/segmented/tank/' # directory holding the segment audio files to process
output_data_dir = 'data/segmented/' # directory where segment_features.csv is written

# Discard any segment that does not have an amplitude at threshold or higher.
# This gets rid of non-content signals or pure noise segments.
segment_threshold = 0.0005
min_duration = sr * 0.010 # minimum segment length in samples; gets rid of short segments (less than 10 ms)

ffl = 128 # window size (in samples) used to compute features
fhl = 32 # hop length (in samples) used to compute features
lpcc_order = 10 # order of the LPC coefficient filter, equal to the number of coefficients

# The shortest segment that is kept must still be longer than one analysis window.
assert min_duration > ffl

## Preparing Feature Table for Clustering from Segmented Data:

In [117]:
# Start from a clean slate: delete any feature table left over from a previous run.
stale_features_csv = os.path.join(output_data_dir, "segment_features.csv")
if os.path.exists(stale_features_csv):
    os.remove(stale_features_csv)

In [118]:
# Build the master feature table: one CSV row of summary statistics per usable segment.
features_csv_path = os.path.join(output_data_dir, "segment_features.csv")

n_mfcc = 10 # number of MFCCs summarised per segment

# Statistics computed for each feature, in the order their columns appear in the table.
scalar_stats = ("mean", "std", "max", "min")
coeff_stats = ("sum", "mean", "std", "max", "min")
scalar_features = ("zcr", "spc_centroid", "spc_bandwidth", "spc_contrast",
                   "spc_flatness", "spc_rolloff", "rms")

# The header is derived from the same constants that drive the extraction below,
# so changing n_mfcc or lpcc_order cannot leave the header and the rows out of step.
feature_columns = (
    ["file_id", "file_len_s"]
    + [f"{name}_{stat}" for name in scalar_features for stat in scalar_stats]
    + [f"mfcc_{stat}_{i}" for stat in coeff_stats for i in range(n_mfcc)]
    + [f"lpcc_{stat}_{i}" for stat in coeff_stats for i in range(lpcc_order)]
)


def _frame_stats(values):
    """Return [mean, std, max, min] computed over every element of ``values``.

    Multi-row inputs (e.g. the spectral contrast bands) are reduced over all
    of their elements, not per row.
    """
    return [np.mean(values), np.std(values), np.max(values), np.min(values)]


def _coefficient_stats(frames, n_coeffs):
    """Return per-coefficient sum, mean, std, max and min, grouped by statistic.

    Parameters
    ----------
    frames : pandas.DataFrame
        One row per analysis frame, one integer-labelled column per coefficient.
    n_coeffs : int
        Number of leading coefficient columns (labels 0 .. n_coeffs-1) to report.

    Returns
    -------
    list
        ``5 * n_coeffs`` values ordered as all sums, then all means, all
        population standard deviations (ddof=0), all maxima and all minima.

    Raises
    ------
    KeyError
        If ``frames`` has no column labelled with one of the requested indices.
    """
    # Each reduction is computed once and then indexed per coefficient.
    per_stat = (frames.sum(), frames.mean(), frames.std(ddof=0), frames.max(), frames.min())
    return [stat[i] for stat in per_stat for i in range(n_coeffs)]


# Write the header first; rows are appended one segment at a time so that
# partial results survive if a later file fails to load.
pd.DataFrame(columns=feature_columns).to_csv(features_csv_path, index=False, header=True)

# sorted() makes the row order reproducible; os.listdir returns entries in arbitrary order.
for f in sorted(os.listdir(input_data_dir)):
    segment, _ = librosa.load(os.path.join(input_data_dir, f), sr=sr)

    # Length is checked first so that an empty file never reaches np.max.
    # The peak is taken on the absolute signal so that a segment whose largest
    # excursion is negative is not mistaken for silence.
    if len(segment) < min_duration or np.max(np.abs(segment)) < segment_threshold:
        continue

    segment = minmax_scale(X=segment, feature_range=(-0.1,0.1)) #normalise each segment

    # Every framed feature uses the same window (ffl) and hop (fhl). When hop_length
    # is not passed, librosa uses its own default hop, which is what previously
    # happened for the spectral centroid and the MFCCs.
    zcr = feature.zero_crossing_rate(segment, frame_length=ffl, hop_length=fhl)
    spc_centroid = feature.spectral_centroid(y=segment, n_fft=ffl, hop_length=fhl, center=False, sr=sr)
    spc_bandwidth = feature.spectral_bandwidth(y=segment, n_fft=ffl, hop_length=fhl, center=False, sr=sr)
    spc_contrast = feature.spectral_contrast(y=segment, n_fft=ffl, hop_length=fhl, center=False, sr=sr)
    spc_flatness = feature.spectral_flatness(y=segment, n_fft=ffl, hop_length=fhl, center=False)
    spc_rolloff = feature.spectral_rolloff(y=segment, n_fft=ffl, hop_length=fhl, center=False, sr=sr)
    rms = feature.rms(y=segment, frame_length=ffl, hop_length=fhl)
    # Transposed so that rows are frames and columns are coefficients.
    mfcc = pd.DataFrame(feature.mfcc(y=segment, sr=sr, n_mfcc=n_mfcc, n_fft=ffl, hop_length=fhl, n_mels=10, fmax=700)).T
    # spafe takes the window and hop in seconds, hence the division by sr.
    lpcc = pd.DataFrame(lpc.lpcc(segment, fs=sr, order=lpcc_order, win_len=ffl/sr, win_hop=fhl/sr))

    row = [f, len(segment)/sr]
    # Must stay in the same order as scalar_features.
    for frames in (zcr, spc_centroid, spc_bandwidth, spc_contrast, spc_flatness, spc_rolloff, rms):
        row.extend(_frame_stats(frames))
    row.extend(_coefficient_stats(mfcc, n_mfcc))
    row.extend(_coefficient_stats(lpcc, lpcc_order))

    assert len(row) == len(feature_columns), "feature row does not match the table header"

    # A single column of values transposed into a single row, appended without header.
    pd.DataFrame(row).T.to_csv(features_csv_path, index=False, header=False, mode='a')

## Inspecting Final Output Table:

In [119]:
# Reload the table that was just written and keep only the numeric feature columns.
feature_table_path = os.path.join(output_data_dir, "segment_features.csv")
df = pd.read_csv(feature_table_path).drop(columns=["file_id", "file_len_s"])

# Every remaining cell must be a finite number: this rejects NaN as well as +/-inf.
finite_mask = np.isfinite(df)
assert np.all(finite_mask)