In [21]:
import os
import librosa
import numpy as np
# import matplotlib.pyplot as plt
import pandas as pd
from IPython.core.display import display
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import scale
from sklearn.svm import SVC
from tqdm import tqdm
from surfboard.sound import Waveform
from surfboard.feature_extraction import extract_features
### OPTIMIZE IMPORT ###

Используемые библиотеки:
1) librosa
2) surfboard
3)


Порядок действий:
    Усреднить громкость наверное
    1) librosa.effects.preemphasis (In order to equalize the effect of the propagation)
    2) features:
        2.1)MFCC
        2.2)Energy
        2.3)Pitch
        2.4)formants (1 - 3)
        2.5)spectrum centroid????
        2.6)Zero crossing rate
        2.7)jitter
        2.8)shimmers
        2.9) Mean, max, std (статистика по всем фичам на запись)

In [2]:
# Root folder of the audio data; the loading cell joins it with the class sub-folder names "1".."5".
global_audio_path = "../data"
def get_mfcc(audio_path : str, _nmfcc: int = 20, start: int = 2, end: int  = 17) -> np.ndarray:
    """
    Extract MFCC coefficients from an audio file and flatten them into one vector.

    For every analysis frame only the coefficient rows ``start`` (inclusive) to
    ``end`` (exclusive) are kept. The result is laid out frame by frame, so its
    length depends on the duration of the recording.

    :param audio_path: path to the audio file to load.
    :type audio_path: str
    :param _nmfcc: number of MFCC coefficients computed per frame.
    :type _nmfcc: int
    :param start: index of the first coefficient row kept.
    :type start: int
    :param end: index one past the last coefficient row kept.
    :type end: int
    :return: 1-D array of the selected coefficients, concatenated frame by frame.
    :rtype: np.ndarray
    """
    _nfft = 512
    # sr=None keeps the file's own sampling rate, so the number of samples is not changed.
    audio, sampling_freq = librosa.load(audio_path, sr=None, res_type='scipy')
    # The signal and sampling rate are passed by keyword: recent librosa releases
    # reject them as positional arguments, while older releases accept both forms.
    features = librosa.feature.mfcc(y=audio, sr=sampling_freq, n_mfcc=_nmfcc, n_fft=_nfft)
    # Rows are coefficients and columns are frames; transpose before flattening
    # so that the values of one frame stay adjacent in the output.
    return features[start:end, :].T.ravel()
    # len(fe), len(fe[0])
# print(fe[0])
# print(fe[1])

In [40]:
# sound_path = "../data/1/03-01-05-01-01-01-01_A1.wav"
# Feature components requested from surfboard without extra arguments.
components_list = [
    'f0_contour',
    'log_energy',
    'spectral_centroid',
    'formants',
    'loudness',
    'jitters',
    'shimmers',
    'zerocrossing',
]
# MFCC component with explicit arguments; the window length in seconds is
# written as 512 / 48000 (presumably 512 samples at 48 kHz - confirm against the data).
mfcc_with_arg = dict(mfcc=dict(n_mfcc=16, n_fft_seconds=512 / 48000))
def get_features(sound_path : str) -> pd.DataFrame:
    """
    Compute per-recording feature statistics for one audio file.

    The file is loaded with its own sampling rate, passed through
    ``librosa.effects.preemphasis`` and handed to surfboard's
    ``extract_features`` together with the module-level ``components_list``
    plus the ``mfcc_with_arg`` component.

    :param sound_path: path to the audio file to process.
    :type sound_path: str
    :return: the result of ``extract_features`` for this single recording.
    :rtype: pd.DataFrame
    """
    audio, sampling_freq = librosa.load(sound_path, sr=None, res_type='scipy')
    # Build the component list per call. Appending to the module-level list
    # would add one more MFCC entry on every call of this function.
    components = [*components_list, mfcc_with_arg]
    processed_audio = librosa.effects.preemphasis(audio)
    statistics_list = ['mean', 'std', 'min', 'max', 'skewness', 'kurtosis']
    waveform = Waveform(signal=processed_audio, sample_rate=sampling_freq)
    feature_df = extract_features(
        waveforms=[waveform], components_list=components, statistics_list=statistics_list)
    return feature_df

  "Empty filters detected in mel frequency basis. "
Extracting features...: 100%|██████████| 1/1 [00:00<00:00,  3.36it/s]


In [3]:
# old mfcc
# Walk the class folders 1..5 and collect one flattened MFCC vector per file,
# using the folder number as the class label.
max_len = -1
features = []
classes = []
for label in range(1, 6):
    class_dir = os.path.join(global_audio_path, str(label))
    for folder, _subdirs, filenames in os.walk(class_dir, topdown=False):
        print(len(filenames))
        for filename in tqdm(filenames):
            mfcc_vector = get_mfcc(os.path.join(folder, filename))
            features.append(mfcc_vector)
            classes.append(label)
            # Track the longest vector seen so far.
            max_len = max(max_len, mfcc_vector.shape[0])

  "Empty filters detected in mel frequency basis. "
100%|██████████| 172/172 [00:07<00:00, 23.79it/s]
100%|██████████| 259/259 [00:08<00:00, 31.10it/s]
100%|██████████| 172/172 [00:06<00:00, 28.38it/s]
100%|██████████| 85/85 [00:02<00:00, 35.27it/s]
100%|██████████| 171/171 [00:05<00:00, 28.80it/s]


172
259
172
85
171


In [4]:
# One row per recording; missing values (presumably from vectors shorter than
# the longest one - confirm) are replaced with 0.0.
res_dataframe = pd.DataFrame(features).fillna(0.0)
res_dataframe

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6755,6756,6757,6758,6759,6760,6761,6762,6763,6764
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.424576,0.036667,10.632904,15.569912,-14.513282,-15.562387,-15.246050,6.461302,-5.611545,-15.148950,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2.941792,2.906605,2.857789,2.795756,2.721071,2.634377,2.536410,2.428020,2.310090,2.183592,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
854,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
855,1.327634,1.302804,1.268480,1.225113,1.173234,1.113503,1.046653,0.973520,0.894979,0.811980,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
856,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
857,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
# Column totals over all recordings, and the labels of the columns whose total is exactly zero.
sum_df = res_dataframe.sum(axis=0)
index_to_del = sum_df.loc[sum_df.eq(0)].index
print(index_to_del)
sum_df

Int64Index([6735, 6736, 6737, 6738, 6739, 6740, 6741, 6742, 6743, 6744, 6745,
            6746, 6747, 6748, 6749],
           dtype='int64')


0        471.002571
1       1469.362606
2       1779.936194
3       1624.555274
4       1206.148421
           ...     
6760       0.704981
6761       0.664472
6762       0.621463
6763       0.576115
6764       0.528595
Length: 6765, dtype: float64

In [5]:
# Class labels collected by the loading loop, as a named Series; show the distinct labels.
target = pd.Series(data=classes, name="target")
target.unique()

array([1, 2, 3, 4, 5], dtype=int64)

In [None]:
# Grid search over tree depth and number of boosting stages for gradient boosting,
# scored by accuracy on a seeded 5-fold stratified split.
X = scale(res_dataframe)
cv = StratifiedKFold(n_splits = 5, shuffle=True, random_state=241)
params = {"max_depth" : [3,5,7], "n_estimators" : [100, 200, 400]}
print(params)
# Seed the estimator as well as the splitter so that reruns give the same scores.
estimator_ = GradientBoostingClassifier(random_state=241)
gs = GridSearchCV(estimator_, params, cv=cv,scoring="accuracy", verbose=2, n_jobs=-1)
gs.fit(X, target)
gs.cv_results_

In [62]:
# Cross-validated score of an 11-nearest-neighbours classifier on the scaled features.
X = scale(res_dataframe)
cv_ = StratifiedKFold(n_splits=5, shuffle=True, random_state=241)
estimator_ = KNeighborsClassifier(n_neighbors=11)
scores : np.ndarray = cross_val_score(
    estimator=estimator_,
    X=X,
    y=target,
    cv=cv_,
    n_jobs=-1,
)
# best ~ 0.43
scores.mean()

0.42485380116959065

In [57]:
# Cross-validated score of a strongly regularised (C=0.001) L2 logistic regression
# on the scaled features.
X = scale(res_dataframe)
cv_ = StratifiedKFold(n_splits = 5, shuffle=True, random_state=241)
# The "sag" solver is stochastic, so it is seeded to make the score repeatable.
# The penalty is passed by keyword for clarity.
estimator_ = LogisticRegression(penalty="l2", C=0.001, solver="sag", random_state=241)
scores : np.ndarray = cross_val_score(estimator_, X, target, cv = cv_, n_jobs=-1, verbose=2)
# best ~ 0.58
scores.mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   10.2s remaining:    6.8s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   10.3s finished


0.5611179110567115

In [63]:
# Cross-validated score of gradient boosting with default hyper-parameters on the scaled features.
X = scale(res_dataframe)
cv_ = StratifiedKFold(n_splits = 5, shuffle=True, random_state=241)
# Seed the estimator as well as the splitter so that reruns give the same score.
estimator_ = GradientBoostingClassifier(random_state=241)
scores : np.ndarray = cross_val_score(estimator_, X, target, cv = cv_, n_jobs=-1, verbose=2)
scores.mean()


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:  7.4min remaining:  5.0min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  7.5min finished


0.501733986128111

In [11]:
# Grid search over the regularisation strength C (powers of ten from 1e-3 to 1e2)
# and the polynomial degree of an SVC with a polynomial kernel.
X = scale(res_dataframe)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=241)
params = {
    "C": np.power(10.0, np.arange(-3, 3, dtype=float)),
    "degree": [1, 2, 5, 9],
}
print(params)
estimator_ = SVC(kernel="poly")
gs = GridSearchCV(
    estimator=estimator_,
    param_grid=params,
    scoring="accuracy",
    cv=cv,
    n_jobs=-1,
    verbose=2,
)
gs.fit(X, target)
gs.cv_results_






{'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02]), 'degree': [1, 2, 5, 9]}
Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   27.0s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:  2.2min finished


{'mean_fit_time': array([10.14202571, 11.11791215, 11.27505498, 11.04444523, 10.67810769,
        11.38615389, 11.5959424 , 11.40016694, 10.62105789, 11.78111405,
        11.83376007, 12.09800043,  9.48302336, 11.72005682, 11.71064682,
        11.46822848, 10.74196892, 11.61075692, 11.57572184, 11.48324146,
        11.07246847, 11.97849274, 11.52988548, 10.50534701]),
 'std_fit_time': array([0.1849908 , 0.38413026, 0.62154767, 0.34380556, 0.25796554,
        0.30737111, 0.21032805, 0.26019062, 0.26475568, 0.35776369,
        0.27058938, 0.34770775, 0.36414519, 0.30897848, 0.31291309,
        0.22318819, 0.03750703, 0.18063463, 0.31332362, 0.19763557,
        0.44814232, 0.18593168, 0.2211602 , 0.60782624]),
 'mean_score_time': array([1.68733416, 1.74378357, 1.73757663, 1.6697144 , 1.63768792,
        1.6653132 , 1.69093604, 1.69253712, 1.65330172, 1.67071795,
        1.75579615, 1.60025401, 1.5916451 , 1.68052697, 1.67091947,
        1.6376873 , 1.57102637, 1.67171903, 1.77101178, 1.71