# Feature Extraction
---
Convert all segments into a feature table.

## Imports:

In [61]:
import os
import librosa
import numpy as np
import pandas as pd

from librosa import feature
from spafe.features import lpc

## Control Parameters:

In [132]:
sr = 16000 # sample rate (Hz) passed to librosa.load and to every feature extractor

input_data_dir = 'data/segmented/tank/' # directory whose entries are loaded as audio segments
output_data_dir = 'data/segmented/' # directory where segment_features.csv is written

ffl = 128 # frame (window) length in samples used to compute features
fhl = 64 # hop length in samples between successive frames
lpcc_order = 10 # order passed to spafe's lpc.lpcc when computing LPC cepstral coefficients

In [185]:
# sample, _ = librosa.load('data/segmented/tank/67129367.140626174607_10_105.wav', sr=sr)

# try:
#     a = lpc.lpcc(sig=sample, fs=sr, order=lpcc_order)
# except:
#     a = np.full(lpcc_order, np.nan)

# pd.DataFrame(a).min(axis='rows').tolist()


# lpc.lpc(sig=sample, fs=sr, order=lpcc_order )


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-18.706675,-1.572463,-0.026155,0.349504,0.372757,0.241634,-0.117264,-0.191146,-0.256805,0.210724
1,-17.624871,-1.19054,0.060268,0.065921,0.028515,0.020988,0.074584,0.014129,0.017005,-0.049312
2,-19.321803,-1.076968,0.00078,0.016698,0.022658,-0.018862,0.038393,0.04346,-0.035206,0.029091
3,-19.330162,-0.920781,-0.155204,0.050828,-0.177788,0.074369,0.07366,-0.023091,0.04213,0.049484
4,-21.621834,-0.967876,-0.008739,-0.008102,-0.069026,0.058346,-0.070804,-0.037434,-0.033604,0.151035
5,-21.601615,-0.986846,-0.000776,-0.001247,-0.001658,-0.035704,0.035243,-0.071398,0.034817,0.046528
6,-22.205381,-0.983458,-0.005896,0.001682,0.001211,-0.04167,0.040979,0.083577,-0.04036,-0.028996
7,-36.043653,-0.988237,-4e-06,0.000838,3e-06,-1e-06,0.071427,-0.070589,2e-06,0.011761


## Preparing Feature Table for Clustering:

In [187]:
# Start from a clean slate: delete any feature table left over from a previous run.
stale_features_csv = os.path.join(output_data_dir, "segment_features.csv")
if os.path.exists(stale_features_csv):
    os.remove(stale_features_csv)

In [188]:
# Build segment_features.csv: one row of per-segment summary statistics for
# every audio file found in input_data_dir.


def _frame_stats(values):
    """Return [mean, std, max, min] computed over every element of *values*."""
    return [np.mean(values), np.std(values), np.max(values), np.min(values)]


n_mfcc = 10  # number of MFCC coefficients; each one is averaged over all frames
stat_names = ("mean", "std", "max", "min")
features_csv = os.path.join(output_data_dir, "segment_features.csv")

# Column layout. It is generated rather than typed out so the header cannot
# drift from the row built in the loop below; with lpcc_order == 10 it is
# exactly the 80-column header this table has always had.
columns = ["file_id", "file_len_s"]
for feature_name in ("zcr", "spc_centroid", "spc_bandwidth", "spc_contrast",
                     "spc_flatness", "spc_rolloff", "rms"):
    columns += [f"{feature_name}_{stat}" for stat in stat_names]
columns += [f"mfcc_{i}" for i in range(n_mfcc)]
for stat in stat_names:
    columns += [f"lpcc_{stat}_{i}" for i in range(lpcc_order)]

# Write the header only (mode 'w' replaces any existing file); rows are appended below.
pd.DataFrame(columns=columns).to_csv(features_csv, index=False, header=True)

# sorted(): os.listdir returns entries in arbitrary order; sorting makes the
# row order reproducible between runs.
for f in sorted(os.listdir(input_data_dir)):
    segment_path = os.path.join(input_data_dir, f)
    # Hidden files (e.g. .DS_Store) and sub-directories are not audio segments
    # and would make librosa.load fail, so skip them.
    if f.startswith(".") or not os.path.isfile(segment_path):
        continue

    segment, _ = librosa.load(segment_path, sr=sr)

    zcr = feature.zero_crossing_rate(segment, frame_length=ffl, hop_length=fhl)
    # hop_length=fhl was missing here: librosa's default hop (512) is larger
    # than the 128-sample window, so most of the signal was never analysed and
    # the framing disagreed with the other spectral features.
    spc_centroid = feature.spectral_centroid(y=segment, n_fft=ffl, hop_length=fhl, center=False, sr=sr)
    spc_bandwidth = feature.spectral_bandwidth(y=segment, n_fft=ffl, hop_length=fhl, center=False, sr=sr)
    spc_contrast = feature.spectral_contrast(y=segment, n_fft=ffl, hop_length=fhl, center=False, sr=sr)
    spc_flatness = feature.spectral_flatness(y=segment, n_fft=ffl, hop_length=fhl, center=False)
    spc_rolloff = feature.spectral_rolloff(y=segment, n_fft=ffl, hop_length=fhl, center=False, sr=sr)
    rms = feature.rms(y=segment, frame_length=ffl, hop_length=fhl)
    # Same hop_length fix as for the spectral centroid.
    # NOTE(review): n_mels is left at librosa's default; with n_fft=ffl this may
    # leave some mel filters empty -- confirm whether n_mels should be lowered.
    mfcc = feature.mfcc(y=segment, sr=sr, n_mfcc=n_mfcc, n_fft=ffl, hop_length=fhl)

    try:
        # win_len / win_hop are given in seconds, hence the division by sr.
        lpcc = lpc.lpcc(sig=segment, fs=sr, order=lpcc_order, win_len=ffl/sr, win_hop=fhl/sr)
        # One DataFrame row per frame, one column per coefficient; reducing
        # along 'rows' yields one statistic per coefficient (pandas semantics:
        # NaNs skipped, sample standard deviation).
        lpcc_frames = pd.DataFrame(lpcc)
        lpcc_stats = [
            lpcc_frames.mean(axis='rows').to_list(),
            lpcc_frames.std(axis='rows').to_list(),
            lpcc_frames.max(axis='rows').to_list(),
            lpcc_frames.min(axis='rows').to_list(),
        ]
    except Exception as err:
        # Best effort: keep the segment but record its LPCC features as NaN.
        # `except Exception` (not a bare except) lets Ctrl-C still interrupt
        # the run, and the message shows which file fell back.
        print(f"LPCC extraction failed for {f}: {err!r}; writing NaN")
        lpcc_stats = [[np.nan] * lpcc_order for _ in stat_names]

    # Assemble the row in exactly the order of `columns`.
    row = [f, len(segment) / sr]
    for values in (zcr, spc_centroid, spc_bandwidth, spc_contrast,
                   spc_flatness, spc_rolloff, rms):
        row += _frame_stats(values)
    row += [np.mean(mfcc[i]) for i in range(n_mfcc)]
    for stat_values in lpcc_stats:
        # Explicit indexing raises IndexError if fewer than lpcc_order
        # coefficients came back, rather than silently writing a short row.
        row += [stat_values[i] for i in range(lpcc_order)]

    # Append immediately so an interrupted run keeps the rows already computed.
    # dtype=object keeps each value's own string form in the CSV.
    pd.DataFrame([row], dtype=object).to_csv(features_csv, index=False, header=False, mode='a')

  return f(*args, **kwargs)


In [167]:
# df = pd.read_csv(os.path.join(output_data_dir, "segment_features.csv")).drop(columns=["file_id", "file_len_s"])

# assert np.all(np.isfinite(df)) # check if there are any infinite values

In [192]:
# Replace missing values (NaN cells in the table) with the mean of their
# column, then overwrite the feature table in place.
features_csv = os.path.join(output_data_dir, "segment_features.csv")
df = pd.read_csv(features_csv)

# index=False: without it pandas writes the RangeIndex as an extra unnamed
# first column, which changes the table's schema and adds one more such column
# every time this cell is re-run.
df.fillna(df.mean(numeric_only=True)).to_csv(features_csv, index=False)