# Feature Extraction
---
Convert all segments into a feature table.

## Imports:

In [31]:
import os
import librosa
import numpy as np
import pandas as pd

from librosa import feature
from spafe.features import lpc
from scipy import signal
from sklearn.preprocessing import minmax_scale

## Control Parameters:

In [24]:
sr = 16000 # sample rate

input_data_dir = 'data/segmented/tank/'
output_data_dir = 'data/segmented/'

segment_threshold = 0.0005 #discard any segment that does not have an amplitude at threshold or higher.

ffl = 512 # window size used to compute features
fhl = 128 # hop length used to compute features
lpcc_order = 10 # order for lpc coffecient filter equal to number of coefficients

## Helper Functions:

In [25]:
import matplotlib.pyplot as plt
'''
Description:
    Plots a signal waveform. 
Parameters:
    signal (numpy.ndarray): array of signal amplitudes
    sr (int): sample rate of waveform
Return:
    None
'''
def plot_signal(signal, sr=sr):
    no_samples = len(signal)
    plt.plot(np.linspace(0, no_samples/sr, no_samples), signal)

## Preparing Feature Table for Clustering:

In [27]:
if os.path.exists(os.path.join(output_data_dir, "segment_features.csv")):
    os.remove(os.path.join(output_data_dir, "segment_features.csv"))

In [28]:
pd.DataFrame(columns=["file_id", "file_len_s",
"zcr_mean", "zcr_std", "zcr_max", "zcr_min",
"spc_centroid_mean", "spc_centroid_std", "spc_centroid_max", "spc_centroid_min",
"spc_bandwidth_mean", "spc_bandwidth_std", "spc_bandwidth_max", "spc_bandwidth_min",
"spc_contrast_mean", "spc_contrast_std", "spc_contrast_max", "spc_contrast_min",
"spc_flatness_mean", "spc_flatness_std", "spc_flatness_max", "spc_flatness_min",
"spc_rolloff_mean", "spc_rolloff_std", "spc_rolloff_max", "spc_rolloff_min",
"rms_mean", "rms_std", "rms_max", "rms_min",
"mfcc_0","mfcc_1","mfcc_2","mfcc_3","mfcc_4","mfcc_5","mfcc_6","mfcc_7","mfcc_8","mfcc_9",
"lpcc_0","lpcc_1","lpcc_2","lpcc_3","lpcc_4","lpcc_5","lpcc_6","lpcc_7","lpcc_8","lpcc_9",
]).to_csv(os.path.join(output_data_dir, "segment_features.csv"), index=False, header=True)

for f in os.listdir(input_data_dir):
    segment, _ = librosa.load(os.path.join(input_data_dir, f), sr=sr) 

    if max(segment) >= segment_threshold and len(segment) >= ffl:

        segment = minmax_scale(X=segment, feature_range=(-0.1,0.1)) #normalise each segment

        zcr = feature.zero_crossing_rate(segment, frame_length=ffl, hop_length=fhl)
        spc_centroid = feature.spectral_centroid(y=segment, n_fft=ffl, center=False, sr=sr)
        spc_bandwidth = feature.spectral_bandwidth(y=segment, n_fft=ffl, hop_length=fhl, center=False, sr=sr)
        spc_contrast = feature.spectral_contrast(y=segment, n_fft=ffl, hop_length=fhl, center=False, sr=sr)
        spc_flatness = feature.spectral_flatness(y=segment, n_fft=ffl, hop_length=fhl, center=False)
        spc_rolloff = feature.spectral_rolloff(y=segment, n_fft=ffl, hop_length=fhl, center=False, sr=sr)
        rms = feature.rms(y=segment, frame_length=ffl, hop_length=fhl)
        mfcc = feature.mfcc(y=segment, sr=sr, n_mfcc=10, n_fft=ffl, n_mels=32)
        lpcc = pd.DataFrame(lpc.lpcc(segment, fs=sr, order=lpcc_order, win_len=ffl/sr, win_hop=fhl/sr)).mean()

        

        pd.DataFrame([
        f,
        len(segment)/sr,

        np.mean(zcr),
        np.std(zcr),
        np.max(zcr),
        np.min(zcr),

        np.mean(spc_centroid),
        np.std(spc_centroid),
        np.max(spc_centroid),
        np.min(spc_centroid),

        np.mean(spc_bandwidth),
        np.std(spc_bandwidth),
        np.max(spc_bandwidth),
        np.min(spc_bandwidth),

        np.mean(spc_contrast),
        np.std(spc_contrast),
        np.max(spc_contrast),
        np.min(spc_contrast),

        np.mean(spc_flatness),
        np.std(spc_flatness),
        np.max(spc_flatness),
        np.min(spc_flatness),

        np.mean(spc_rolloff),
        np.std(spc_rolloff),
        np.max(spc_rolloff),
        np.min(spc_rolloff),

        np.mean(rms),
        np.std(rms),
        np.max(rms),
        np.min(rms),

        np.mean(mfcc[0]),
        np.mean(mfcc[1]),
        np.mean(mfcc[2]),
        np.mean(mfcc[3]),
        np.mean(mfcc[4]),
        np.mean(mfcc[5]),
        np.mean(mfcc[6]),
        np.mean(mfcc[7]),
        np.mean(mfcc[8]),
        np.mean(mfcc[9]),

        lpcc[0],
        lpcc[1],
        lpcc[2],
        lpcc[3],
        lpcc[4],
        lpcc[5],
        lpcc[6],
        lpcc[7],
        lpcc[8],
        lpcc[9]
        
        ]).T.to_csv(os.path.join(output_data_dir, "segment_features.csv"), index=False, header=False, mode='a')

## Checking for infinite values:

In [29]:
df = pd.read_csv(os.path.join(output_data_dir, "segment_features.csv")).drop(columns=["file_id", "file_len_s"])

assert np.all(np.isfinite(df)) # check if there are any infinite values

In [30]:
df.head(10)

Unnamed: 0,zcr_mean,zcr_std,zcr_max,zcr_min,spc_centroid_mean,spc_centroid_std,spc_centroid_max,spc_centroid_min,spc_bandwidth_mean,spc_bandwidth_std,...,lpcc_0,lpcc_1,lpcc_2,lpcc_3,lpcc_4,lpcc_5,lpcc_6,lpcc_7,lpcc_8,lpcc_9
0,0.01277,0.010958,0.029297,0.0,326.897882,55.14875,399.320168,265.620103,852.334846,283.498174,...,-10.351171,-1.386449,0.109743,0.137635,0.103027,0.071913,0.021266,-0.004506,-0.056447,0.010623
1,0.018179,0.006651,0.027344,0.003906,551.804981,41.393508,610.274837,520.102118,1187.17557,169.664767,...,-8.622793,-1.180595,-0.116068,0.063153,0.072321,0.109891,0.073878,0.070728,0.02281,-0.092073
2,0.012921,0.009646,0.025391,0.0,317.996382,36.396735,368.586863,284.483121,716.861125,138.564361,...,-8.695151,-1.29794,0.052163,0.074927,0.070808,0.070735,0.060115,0.035919,0.04443,-0.098881
3,0.005709,0.008052,0.019531,0.0,327.014473,76.060456,429.101277,246.619706,1076.361492,126.162497,...,-10.411205,-1.079875,0.00269,0.040587,0.003566,0.018601,0.02813,-0.000821,0.002179,-0.007865
4,0.007812,0.010163,0.027344,0.0,338.733859,69.987655,425.254639,253.843869,1085.958786,115.390446,...,-10.351305,-1.088383,0.005843,0.022205,0.019794,0.030344,0.017549,0.006991,0.007882,-0.0169
5,0.007061,0.008127,0.023438,0.0,297.272507,74.236681,401.81387,236.637941,914.361605,180.608377,...,-10.307988,-1.180297,0.047565,0.039914,0.062769,0.027853,0.023162,-0.009316,-0.00353,-0.004595
6,0.004808,0.006158,0.015625,0.0,283.257985,52.892355,343.210252,214.543032,751.557618,160.047802,...,-8.841023,-1.289682,0.074534,0.09124,0.060284,0.056523,0.037055,0.02953,0.01207,-0.065499
7,0.004507,0.006289,0.015625,0.0,203.53327,64.363003,294.537742,156.439306,716.508489,130.389261,...,-8.849269,-1.133323,0.017367,0.036676,0.021155,0.023433,0.01717,0.010061,0.006546,0.007722
8,0.006611,0.007249,0.019531,0.0,267.448533,55.635022,344.843254,216.486325,837.768356,179.578461,...,-9.29582,-1.165941,0.030177,0.051527,0.039728,0.020737,0.02476,0.00071,0.003927,0.001321
9,0.00601,0.007602,0.019531,0.0,286.959255,70.371159,386.407139,233.958312,762.275761,168.604489,...,-8.686687,-1.236384,0.046933,0.056393,0.056807,0.045711,0.038405,0.021874,0.015989,-0.038676
