In [61]:
import os
import librosa
import numpy as np
# import matplotlib.pyplot as plt
import pandas as pd
from IPython.core.display import display
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import mutual_info_classif, SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import scale
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm
from surfboard.sound import Waveform
from surfboard.feature_extraction import extract_features_from_waveform
### OPTIMIZE IMPORT ###

In [5]:
def extract_features(waveforms, components_list, statistics_list=None):
    """This is an important function. Given a list of Waveform objects, a list of
    Waveform methods in the form of strings and a list of Barrel methods in the
    form of strings, compute the time-independent features resulting. This function
    does multiprocessing.

    Args:
        waveforms (list of Waveform): This is a list of waveform objects
        components_list (list of str/dict): This is a list of the methods which
            should be applied to all the waveform objects in waveforms. If a dict,
            this also contains arguments to the sound.Waveform methods.
        statistics_list (list of str): This is a list of the methods which
            should be applied to all the time-dependent features computed
            from the waveforms.

    Returns:
        pandas DataFrame: pandas dataframe where every row corresponds
            to features extracted for one of the waveforms and columns
            represent individual features.
    """
    output_feats = []
    # waveforms = tqdm(waveforms, desc='Extracting features...')

    for wave in waveforms:
        output_feats.append(
            extract_features_from_waveform(
                components_list, statistics_list, wave
            )
        )

    return pd.DataFrame(output_feats)

Используемые библиотеки:
1) liborsa
2) surfboard
3)


Порядок действий:
    Усреднить громкость наверное
    1) librosa.effects.preemphasis (In order to equalize the effect of the propagation)
    2) features:
        2.1)MFCC
        2.2)Energy
        2.3)Pitch
        2.4)formants (1 - 3)
        2.5)spectrum centroid????
        2.6)Zero crossing rate
        2.7)jitter
        2.8)shimmers
        2.7) Mean, max, std (статистика по всем фичам на запись)

In [2]:
global_audio_path = "../data"
def get_mfcc(audio_path : str, _nmfcc = 20, start: int = 2, end: int  = 17) -> np.ndarray:
    """
    Выделает MFCC коэффициенты из аудио, для каждого окна берется коэффициенты со 2 по N,
    :param audio_path:
    :type audio_path:
    :return:
    :rtype:
    """
    _nfft = 512
    # _hop_length = 160
    # Меняет количество sample
    audio, sampling_freq = librosa.load(audio_path, sr=None, res_type='scipy')
    # print(sampling_freq)
    features = librosa.feature.mfcc(audio, sampling_freq, n_mfcc=_nmfcc, n_fft=_nfft)
    return features[start:end,:].T.ravel()
    # len(fe), len(fe[0])
# print(fe[0])
# print(fe[1])

In [6]:
# sound_path = "../data/1/03-01-05-01-01-01-01_A1.wav"
components_list = [
    'f0_contour', 'log_energy', 'spectral_centroid', 'formants', 'loudness',
    'jitters', 'shimmers', 'zerocrossing',
]
mfcc_with_arg = {'mfcc': {'n_mfcc': 16, 'n_fft_seconds': 512 / 48000}}
statistics_list = ['mean', 'std', 'min', 'max', 'skewness', 'kurtosis']
components_list.append(mfcc_with_arg)
def get_features(sound_path : str) -> pd.DataFrame:
    audio, sampling_freq = librosa.load(sound_path, sr=None, res_type='scipy')
    # 512 / sampling_freq
    processed_audio = librosa.effects.preemphasis(audio)
    # Extract dataframe...
    audio = Waveform(signal=processed_audio, sample_rate=sampling_freq)
    feature_df = extract_features(
        waveforms=[audio], components_list=components_list, statistics_list=statistics_list)
    return feature_df

In [7]:
global_audio_path = "../data"
res_dataframe = pd.DataFrame()
classes = []
for i in range(1, 6):
    for root, dirs, files in os.walk(os.path.join(global_audio_path, str(i)), topdown=False):
        print(len(files))
        for name in files:
            important_features = get_features(os.path.join(root, name))
            res_dataframe = res_dataframe.append(important_features, ignore_index=True)
            classes.append(i)

172
259
172
85
171


  "Empty filters detected in mel frequency basis. "
  "Empty filters detected in mel frequency basis. "
  "Empty filters detected in mel frequency basis. "
  "Empty filters detected in mel frequency basis. "
  "Empty filters detected in mel frequency basis. "
  "Empty filters detected in mel frequency basis. "
  "Empty filters detected in mel frequency basis. "
  "Empty filters detected in mel frequency basis. "
  "Empty filters detected in mel frequency basis. "
  "Empty filters detected in mel frequency basis. "
  "Empty filters detected in mel frequency basis. "
  "Empty filters detected in mel frequency basis. "
  "Empty filters detected in mel frequency basis. "
  "Empty filters detected in mel frequency basis. "
  "Empty filters detected in mel frequency basis. "
  "Empty filters detected in mel frequency basis. "
  "Empty filters detected in mel frequency basis. "
  "Empty filters detected in mel frequency basis. "
  "Empty filters detected in mel frequency basis. "
  "Empty fil

In [8]:
display(res_dataframe)
res_dataframe.describe()

Unnamed: 0,f0_contour_mean,f0_contour_std,f0_contour_min,f0_contour_max,f0_contour_skewness,f0_contour_kurtosis,log_energy,spectral_centroid_mean,spectral_centroid_std,spectral_centroid_min,...,mfcc_kurtosis_7,mfcc_kurtosis_8,mfcc_kurtosis_9,mfcc_kurtosis_10,mfcc_kurtosis_11,mfcc_kurtosis_12,mfcc_kurtosis_13,mfcc_kurtosis_14,mfcc_kurtosis_15,mfcc_kurtosis_16
0,23.967547,55.219171,0.0,189.119143,1.895990,1.668783,-50.789417,10165.480300,4205.444653,0.000000,...,0.576864,1.707728,0.463653,1.532729,0.032541,1.781521,1.504384,1.459517,2.456224,1.134118
1,27.045372,71.229321,0.0,259.841201,2.297994,3.420751,-52.897926,8238.646944,4965.301582,0.000000,...,0.379027,4.740769,1.378276,2.137689,2.199326,2.473318,1.252770,2.492139,1.871985,1.556884
2,8.724462,32.011158,0.0,209.613961,4.010852,17.134221,-57.779049,10573.577648,2372.872379,5207.613500,...,0.023238,0.361595,-0.043995,1.784541,1.154673,0.677837,0.467001,0.377071,1.246487,0.543016
3,28.523561,74.396412,0.0,260.546090,2.255366,3.177505,-50.261565,8503.507631,5009.949377,0.000000,...,0.812738,1.566179,0.281920,0.789175,2.307541,3.260491,0.920348,2.363914,0.982231,0.563017
4,10.253963,40.387550,0.0,238.921752,3.916850,14.445161,-46.754228,10341.238850,4201.536338,3761.336784,...,-0.133761,2.109509,0.715887,1.868507,0.366396,1.362177,1.461011,1.836645,2.821719,0.674766
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
854,0.000000,0.000000,0.0,0.000000,0.000000,-3.000000,-49.614353,8274.023605,4547.170923,0.000000,...,-0.947450,0.660020,0.277131,-0.148228,0.011930,0.953463,0.417874,1.400286,2.256507,2.187404
855,11.937603,40.763433,0.0,176.104431,3.170061,8.235236,-50.706402,9779.106757,4538.508236,3203.973249,...,0.509558,0.890578,1.312091,0.770454,0.157578,3.085950,1.037611,0.771993,0.071554,0.999199
856,80.651854,98.998965,0.0,237.845414,0.422148,-1.802225,-55.082888,10840.344698,3517.150012,5156.937923,...,1.103626,1.679635,0.548185,0.956269,0.935985,2.107008,2.106866,0.296404,1.902266,2.004494
857,47.841428,76.043443,0.0,200.364790,0.980022,-0.996395,-55.624970,10787.963161,2634.707493,6508.520967,...,1.064522,0.811155,-0.365453,0.695748,0.256684,0.911476,1.227208,0.329263,0.871274,0.810664


Unnamed: 0,f0_contour_mean,f0_contour_std,f0_contour_min,f0_contour_max,f0_contour_skewness,f0_contour_kurtosis,log_energy,spectral_centroid_mean,spectral_centroid_std,spectral_centroid_min,...,mfcc_kurtosis_7,mfcc_kurtosis_8,mfcc_kurtosis_9,mfcc_kurtosis_10,mfcc_kurtosis_11,mfcc_kurtosis_12,mfcc_kurtosis_13,mfcc_kurtosis_14,mfcc_kurtosis_15,mfcc_kurtosis_16
count,859.0,859.0,859.0,859.0,859.0,859.0,859.0,859.0,859.0,859.0,...,859.0,859.0,859.0,859.0,859.0,859.0,859.0,859.0,859.0,859.0
mean,32.679361,61.67047,0.0,204.244583,2.089628,8.862409,-55.53267,10279.629365,3908.153515,2562.174655,...,1.128533,1.83943,0.66278,1.605253,1.379469,1.853728,1.489236,1.661463,1.478437,1.792252
std,20.745549,26.378037,0.0,66.31167,2.529656,42.154758,9.849781,1128.374344,754.81962,2282.487767,...,1.154976,1.147866,0.931563,1.118322,1.026553,1.073942,1.120443,1.147593,1.045581,1.162593
min,0.0,0.0,0.0,0.0,0.0,-3.0,-79.909811,6901.756837,1637.548757,0.0,...,-1.14792,-0.812538,-1.108688,-1.025792,-0.911536,-0.834016,-0.819002,-1.062095,-0.831265,-0.873801
25%,17.189673,45.091285,0.0,180.123285,0.985625,-0.941569,-62.46471,9504.400733,3377.998034,0.0,...,0.360938,1.004148,-0.021082,0.842813,0.652252,1.101045,0.723447,0.881407,0.741286,1.000346
50%,30.608498,63.876307,0.0,226.122932,1.483075,0.337269,-57.157595,10460.723355,3905.091578,3321.681608,...,0.982269,1.751564,0.523833,1.48969,1.28886,1.770432,1.361166,1.525192,1.394006,1.656342
75%,47.769585,83.996671,0.0,251.080904,2.181621,3.039097,-48.948679,11137.980792,4416.606774,4338.295963,...,1.724343,2.472168,1.305916,2.216849,2.007392,2.41481,2.115433,2.303024,2.078657,2.47482
max,96.018157,109.769293,0.0,280.559255,21.166063,446.002222,-29.154129,12816.873023,5657.350033,7484.228202,...,7.261085,7.80944,4.580843,7.986589,5.467505,6.60004,6.778332,9.810559,5.876309,8.449962


In [9]:
res_dataframe.fillna(0, inplace=True)
target = pd.Series(classes, name="target")
target.unique()

array([1, 2, 3, 4, 5], dtype=int64)

In [33]:
sum_df = res_dataframe.sum(axis=0)
index_to_del = sum_df[sum_df == 0].index
print(index_to_del)
sum_df

Int64Index([6735, 6736, 6737, 6738, 6739, 6740, 6741, 6742, 6743, 6744, 6745,
            6746, 6747, 6748, 6749],
           dtype='int64')


0        471.002571
1       1469.362606
2       1779.936194
3       1624.555274
4       1206.148421
           ...     
6760       0.704981
6761       0.664472
6762       0.621463
6763       0.576115
6764       0.528595
Length: 6765, dtype: float64

Секция с Feature selection

In [58]:
# feature names to drop (based on MI decomposition method)
# res = mutual_info_classif(res_dataframe, target, random_state=42, n_neighbors=30)
# df_res = pd.Series(res)
# sorted_df = df_res.sort_values(ascending=False)
# index_to_del : pd.Int64Index = sorted_df[sorted_df < 0.1].index
# print(index_to_del.shape)
# columns_to_del = [res_dataframe.columns[i] for i in index_to_del]
# for c in columns_to_del:
#     print(c)
# features_cleared_df = res_dataframe.drop(columns=)
# features_cleared_df
features_cleared_df = SelectKBest(mutual_info_classif, k=90).fit_transform(res_dataframe, target)
pd.DataFrame(features_cleared_df)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,80,81,82,83,84,85,86,87,88,89
0,23.967547,55.219171,189.119143,1.895990,1.668783,-50.789417,10165.480300,4205.444653,0.000000,-0.370694,...,3.656372,0.732241,0.480405,0.576864,1.532729,0.032541,1.781521,1.504384,2.456224,1.134118
1,27.045372,71.229321,259.841201,2.297994,3.420751,-52.897926,8238.646944,4965.301582,0.000000,-0.357542,...,0.952460,1.462646,1.335705,0.379027,2.137689,2.199326,2.473318,1.252770,1.871985,1.556884
2,8.724462,32.011158,209.613961,4.010852,17.134221,-57.779049,10573.577648,2372.872379,5207.613500,-0.029348,...,0.957016,0.392138,0.636229,0.023238,1.784541,1.154673,0.677837,0.467001,1.246487,0.543016
3,28.523561,74.396412,260.546090,2.255366,3.177505,-50.261565,8503.507631,5009.949377,0.000000,-0.372522,...,2.716960,2.108052,1.666078,0.812738,0.789175,2.307541,3.260491,0.920348,0.982231,0.563017
4,10.253963,40.387550,238.921752,3.916850,14.445161,-46.754228,10341.238850,4201.536338,3761.336784,-0.048096,...,2.058751,1.306719,-0.327491,-0.133761,1.868507,0.366396,1.362177,1.461011,2.821719,0.674766
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
854,0.000000,0.000000,0.000000,0.000000,-3.000000,-49.614353,8274.023605,4547.170923,0.000000,-0.262009,...,2.182998,1.258226,1.567229,-0.947450,-0.148228,0.011930,0.953463,0.417874,2.256507,2.187404
855,11.937603,40.763433,176.104431,3.170061,8.235236,-50.706402,9779.106757,4538.508236,3203.973249,-0.106172,...,4.985675,2.328687,-0.599227,0.509558,0.770454,0.157578,3.085950,1.037611,0.071554,0.999199
856,80.651854,98.998965,237.845414,0.422148,-1.802225,-55.082888,10840.344698,3517.150012,5156.937923,-0.087280,...,3.751084,0.845884,-0.872183,1.103626,0.956269,0.935985,2.107008,2.106866,1.902266,2.004494
857,47.841428,76.043443,200.364790,0.980022,-0.996395,-55.624970,10787.963161,2634.707493,6508.520967,0.443203,...,1.858160,2.889217,-0.277684,1.064522,0.695748,0.256684,0.911476,1.227208,0.871274,0.810664


Секция с классификаторами

In [None]:
X = scale(res_dataframe)
cv = StratifiedKFold(n_splits = 5, shuffle=True, random_state=241)
params = {"max_depth" : [3,5,7], "n_estimators" : [100, 200, 400]}
print(params)
estimator_ = GradientBoostingClassifier()
gs = GridSearchCV(estimator_, params, cv=cv,scoring="accuracy", verbose=2, n_jobs=-1)
gs.fit(X, target)
gs.cv_results_

In [70]:
X = scale(res_dataframe)
cv_ = StratifiedKFold(n_splits = 5, shuffle=True, random_state=241)
estimator_ = KNeighborsClassifier(n_neighbors=11)
scores : np.ndarray = cross_val_score(estimator_, X, target, cv = cv_, n_jobs=-1)
# best ~ 0.56
scores.mean()

0.5646198830409357

In [51]:
# X = scale(res_dataframe)
X = scale(features_cleared_df)
cv_ = StratifiedKFold(n_splits = 5, shuffle=True, random_state=241)
estimator_ = LinearSVC(C=0.01, dual=False)
scores : np.ndarray = cross_val_score(estimator_, X, target, cv = cv_, n_jobs=-1)
# best ~ 0.64
scores.mean()

0.6286753705970352

In [80]:
X = scale(res_dataframe)
cv_ = StratifiedKFold(n_splits = 5, shuffle=True, random_state=241)
estimator_ = LogisticRegression("l2", C=0.001, solver="sag")
scores : np.ndarray = cross_val_score(estimator_, X, target, cv = cv_, n_jobs=-1, verbose=2)
# best ~ 0.58
scores.mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.0s finished


0.5681150550795594

In [85]:
X = res_dataframe
cv_ = StratifiedKFold(n_splits = 5, shuffle=True, random_state=241)
estimator_ = DecisionTreeClassifier(splitter="random")
scores : np.ndarray = cross_val_score(estimator_, X, target, cv = cv_, n_jobs=-1, verbose=2)
# best ~ 0.58
scores.mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.0s finished


0.4435060519515844

In [81]:
X = scale(res_dataframe)
cv_ = StratifiedKFold(n_splits = 5, shuffle=True, random_state=241)
estimator_ = GradientBoostingClassifier()
scores : np.ndarray = cross_val_score(estimator_, X, target, cv = cv_, n_jobs=-1, verbose=2)
scores.mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    9.9s remaining:    6.5s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   10.0s finished


0.6147558819529444

In [69]:
X = scale(features_cleared_df)
cv_ = StratifiedKFold(n_splits = 5, shuffle=True, random_state=241)
estimator_ = MLPClassifier(hidden_layer_sizes=1000, learning_rate='adaptive', solver='adam')
scores : np.ndarray = cross_val_score(estimator_, X, target, cv = cv_, n_jobs=-1, verbose=2)
scores.mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    5.7s remaining:    3.8s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    6.1s finished


0.6554739562083504

In [10]:
X = scale(res_dataframe)
cv = StratifiedKFold(n_splits = 5, shuffle=True, random_state=241)
params = {"C" : 10 ** np.arange(-3, 3, dtype = float), "degree" : [1, 2, 5, 9]}
print(params)
estimator_ = SVC(kernel="poly")
gs = GridSearchCV(estimator_, params, cv=cv,scoring="accuracy", verbose=2, n_jobs=-1)
gs.fit(X, target)
gs.cv_results_

{'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02]), 'degree': [1, 2, 5, 9]}
Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:    3.7s finished


{'mean_fit_time': array([0.09148345, 0.099089  , 0.10509639, 0.09848862, 0.09108243,
        0.0966877 , 0.09588294, 0.11230145, 0.07626882, 0.09068227,
        0.09908924, 0.12010942, 0.06285706, 0.09848914, 0.10369301,
        0.11750598, 0.0738667 , 0.11070065, 0.11930799, 0.12831597,
        0.14553185, 0.11490469, 0.11931219, 0.11951218]),
 'std_fit_time': array([0.00463449, 0.00597178, 0.00522054, 0.00403344, 0.00800818,
        0.00680609, 0.00402167, 0.01702551, 0.00519699, 0.00377678,
        0.00569762, 0.00879469, 0.00458242, 0.01197118, 0.00224734,
        0.00771666, 0.01065641, 0.00944453, 0.01179876, 0.01593197,
        0.01501482, 0.00714488, 0.00896119, 0.00149845]),
 'mean_score_time': array([0.01541481, 0.01741581, 0.01481185, 0.01581411, 0.01481299,
        0.01441293, 0.01461334, 0.01601419, 0.0136127 , 0.01401286,
        0.01681523, 0.01681533, 0.01541395, 0.02322149, 0.01581535,
        0.01561456, 0.01261144, 0.01441336, 0.01561465, 0.01521392,
        0.011610

In [11]:
gs.best_params_


{'C': 1.0, 'degree': 1}

In [13]:
full_df = pd.concat([res_dataframe, target], 1)
full_df.to_csv("features_and_stat_full.csv", index=False)

