# Notebook 2: Spectral Centroid Preprocessing
### Feature Generation
* By calculating the Spectral Centroid arrays, and the Mean, Max and Min of those arrays, for the 'E', 'f_tract' and 's_tract' features.
* By calculating Mean, Max and Min of 'energy', 'noise', 'pulse' and 'tone' features.

### Setting up working directory:

In [1]:
# Root folder of the dataset; every path used by the cells below is built from it.
project_dir = 'ESC50/Dataset' 
# The working directory can be set up anywhere, as long as it contains
# 'metadata.csv' and both the 'tracts' and 'ptne' folders read by the cells below.

### Importing necessary libraries:

In [2]:
# !pip install xgboost
!pip install librosa
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.multiclass import  OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix,roc_auc_score,roc_curve
import pickle
from sklearn.preprocessing import OneHotEncoder
import librosa.display
import librosa
import h5py

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


## Preprocessing and Feature Engineering
### Reading the Metadata (CSV) file:

In [3]:
# Load the metadata table (one row per audio file) from the project directory.
meta_data = pd.read_csv(f"{project_dir}/metadata.csv")

#### Extracting first set of features by calculating the Spectral Centroids of E, f_tract and s_tract data:

In [4]:
# Number of frames every spectral-centroid row is padded to, so that all files
# produce feature rows of the same length.
N_FRAMES = 8498


def _padded_centroid(spectrum, n_frames=N_FRAMES):
    """Return the spectral centroid of `spectrum`, zero-padded to `n_frames` columns.

    The frequency axis is a linear grid on [0, 1] with one value per row of
    `spectrum`, so each spectrum is measured against its own number of bins.
    """
    centroid = librosa.feature.spectral_centroid(
        S=spectrum, freq=np.linspace(0, 1, num=spectrum.shape[0])
    )
    if centroid.shape[1] != n_frames:
        # A negative difference yields an empty pad, so rows that are already
        # longer than n_frames keep their length.
        centroid = np.hstack((centroid, [[0] * (n_frames - centroid.shape[1])]))
    return centroid


spectrum_features = []
for filename in meta_data['filename']:
    name = filename.split('.')[0]
    # The context manager closes the HDF5 file once its datasets are in memory.
    with h5py.File(project_dir+'/tracts/'+name+'.hdf5', 'r') as hf:
        E = np.array(hf.get('E'))
        f_tract = np.array(hf.get('f_tract'))
        s_tract = np.array(hf.get('s_tract'))

    SC_E = _padded_centroid(E)
    SC_f = _padded_centroid(f_tract)
    # Fix: the s_tract centroid was previously computed from f_tract.
    SC_s = _padded_centroid(s_tract)

    # NOTE: the statistics are taken over the zero-padded rows, and are ordered
    # (E, s_tract, f_tract) while the arrays are ordered (E, f_tract, s_tract);
    # both are kept as-is so the feature column layout does not change.
    centroid_means = [np.mean(SC_E), np.mean(SC_s), np.mean(SC_f)]
    centroid_mins = [np.min(SC_E), np.min(SC_s), np.min(SC_f)]
    centroid_maxs = [np.max(SC_E), np.max(SC_s), np.max(SC_f)]

    spectrum_features.append(
        list(SC_E[0]) + list(SC_f[0]) + list(SC_s[0])
        + centroid_means + centroid_mins + centroid_maxs
    )
spectrum_features = np.array(spectrum_features)

#### Exporting the Spectral Centroid arrays of the Tract data to an npy file to be used for Machine Learning modelling

In [5]:
# Use '/' as the separator, like every other path in this notebook: '\s' is an
# invalid escape sequence, and a backslash is not a path separator outside
# Windows, so the file would not be written inside project_dir there.
np.save(project_dir+'/spectrum_features.npy', spectrum_features)

#### Extracting second set of features by calculating:
* the Mean (Average), Min (Minimum) and Max (Maximum) from the P (Pulse), T (Tone), N (Noise) and E (Energy)

In [6]:
def _mean_min_max(values):
    """Return [mean, min, max] of `values`, in that order."""
    return [np.mean(values), np.min(values), np.max(values)]


ptne_features = []
for filename in meta_data['filename']:
    name = filename.split('.')[0]
    # The context manager closes the HDF5 file once its datasets are in memory.
    with h5py.File(project_dir+'/ptne/'+name+'.hdf5', 'r') as hf:
        energy = np.array(hf.get('energy'))
        noise = np.array(hf.get('noise'))
        pulse = np.array(hf.get('pulse'))
        tone = np.array(hf.get('tone'))

    # Each row is laid out as energy, noise, pulse, tone, with (mean, min, max) for each.
    ptne_features.append(
        _mean_min_max(energy)
        + _mean_min_max(noise)
        + _mean_min_max(pulse)
        + _mean_min_max(tone)
    )
ptne_features = np.array(ptne_features)

#### Exporting the Mean, Min and Max arrays to an npy file 

In [7]:
# Use '/' as the separator, like every other path in this notebook: '\p' is an
# invalid escape sequence, and a backslash is not a path separator outside
# Windows, so the file would not be written inside project_dir there.
np.save(project_dir+'/ptne_features.npy', ptne_features)