In [2]:
import os
import librosa
import numpy as np
# import matplotlib.pyplot as plt
import pandas as pd
from IPython.core.display import display
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier, StackingClassifier, RandomForestClassifier
from sklearn.feature_selection import mutual_info_classif, SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score, RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import scale, StandardScaler
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm
from surfboard.sound import Waveform
from surfboard.feature_extraction import extract_features_from_waveform
### OPTIMIZE IMPORT ###

In [3]:
def extract_features(waveforms, components_list, statistics_list=None):
    """This is an important function. Given a list of Waveform objects, a list of
    Waveform methods in the form of strings and a list of Barrel methods in the
    form of strings, compute the time-independent features resulting. This function
    does multiprocessing.

    Args:
        waveforms (list of Waveform): This is a list of waveform objects
        components_list (list of str/dict): This is a list of the methods which
            should be applied to all the waveform objects in waveforms. If a dict,
            this also contains arguments to the sound.Waveform methods.
        statistics_list (list of str): This is a list of the methods which
            should be applied to all the time-dependent features computed
            from the waveforms.

    Returns:
        pandas DataFrame: pandas dataframe where every row corresponds
            to features extracted for one of the waveforms and columns
            represent individual features.
    """
    output_feats = []
    # waveforms = tqdm(waveforms, desc='Extracting features...')

    for wave in waveforms:
        output_feats.append(
            extract_features_from_waveform(
                components_list, statistics_list, wave
            )
        )

    return pd.DataFrame(output_feats)

Используемые библиотеки:
1) liborsa
2) surfboard
3)

Порядок действий:
    Усреднить громкость наверное
    1) librosa.effects.preemphasis (In order to equalize the effect of the propagation)
    2) features:
        2.1)MFCC
        2.2)Energy
        2.3)Pitch
        2.4)formants (1 - 3)
        2.5)spectrum centroid????
        2.6)Zero crossing rate
        2.7)jitter
        2.8)shimmers
        2.7) Mean, max, std (статистика по всем фичам на запись)

In [31]:
# sound_path = "../data/1/03-01-05-01-01-01-01_A1.wav"
# components_list_for_plain_features = [
#         'f0_contour', 'log_energy', 'spectral_centroid', 'loudness', 'intensity', 'rms', 'ppe']
# mfcc_with_arg = {'mfcc': {'n_mfcc': 16, 'n_fft_seconds': 512 / 48000}}
# statistics_list = ['mean', 'std', 'min', 'max', 'skewness', 'kurtosis']
# components_list_for_plain_features.append(mfcc_with_arg)
#
# audio, sampling_freq = librosa.load(sound_path, sr=None, res_type='scipy')
# # 512 / sampling_freq
# processed_audio = librosa.effects.preemphasis(audio)
# # Extract dataframe...
# audio = Waveform(signal=processed_audio, sample_rate=sampling_freq)
# feature_df2 : pd.DataFrame= extract_features(
#     waveforms=[audio], components_list=components_list_for_plain_features)
# # display(feature_df2.mfcc[0][2:, ].ravel().shape)
# feature_df2 = unpack_features_inplace(feature_df2)
# feature_df2

23.967546906279438

10165.480300419653

8.333792e-06

0.0014897384

-2.138103



  "Empty filters detected in mel frequency basis. "


ValueError: Shape of passed values is (1, 5082), indices imply (1, 1)

In [7]:
components_list_for_stat_features = [
    'f0_contour', 'log_energy', 'spectral_centroid', 'formants', 'loudness',
    'jitters', 'shimmers', 'zerocrossing',
]
components_list_for_plain_features = [
        'f0_contour', 'log_energy', 'spectral_centroid', 'loudness', 'intensity', 'rms', 'ppe']
mfcc_with_arg = {'mfcc': {'n_mfcc': 16, 'n_fft_seconds': 512 / 48000}}
statistics_list = ['mean', 'std', 'min', 'max', 'skewness', 'kurtosis',
                   'first_derivative_mean', 'first_derivative_std', 'q3_q1_range']
components_list_for_stat_features.append(mfcc_with_arg)
components_list_for_plain_features.append(mfcc_with_arg)

def unpack_features_inplace(feature_dataframe : pd.DataFrame) -> pd.DataFrame:
    result  = pd.DataFrame()
    for column in feature_dataframe.columns:
        if type(feature_dataframe[column][0]) is np.ndarray:
            if column is "mfcc":
                # get 2 - 16 mfcc coef for each window
                features_arr : np.ndarray = feature_dataframe.mfcc[0][2:, 50:250].ravel()
                features_arr = features_arr[np.newaxis]
                length = features_arr.shape[1]
            else:
                features_arr = feature_dataframe[column][0][:, 50:250]
                length = len(features_arr[0])

            print(features_arr.shape)
            df = pd.DataFrame(features_arr, columns=[column + str(i) for i in range(length)])
            result = pd.concat([result, df], axis=1)
        else:
            result = pd.concat([result, feature_dataframe[column]], axis=1)
    return result

def get_features(sound_path : str) -> pd.DataFrame:
    audio, sampling_freq = librosa.load(sound_path, sr=None, res_type='scipy')
    # 512 / sampling_freq
    processed_audio = librosa.effects.preemphasis(audio)
    # Extract dataframe...
    audio = Waveform(signal=processed_audio, sample_rate=sampling_freq)
    feature_df = extract_features(
        waveforms=[audio], components_list=components_list_for_stat_features, statistics_list=statistics_list)
    return feature_df


In [8]:
global_audio_path = "../data"
res_dataframe = pd.DataFrame()
classes = []
for i in range(1, 6):
    for root, dirs, files in os.walk(os.path.join(global_audio_path, str(i)), topdown=False):
        print(len(files))
        for name in files:
            important_features = get_features(os.path.join(root, name))
            # display(important_features)
            res_dataframe = res_dataframe.append(important_features, ignore_index=True)
            classes.append(i)

172
259
172
85
171


  "Empty filters detected in mel frequency basis. "
  "Empty filters detected in mel frequency basis. "
  "Empty filters detected in mel frequency basis. "
  "Empty filters detected in mel frequency basis. "
  "Empty filters detected in mel frequency basis. "
  "Empty filters detected in mel frequency basis. "
  "Empty filters detected in mel frequency basis. "
  "Empty filters detected in mel frequency basis. "
  "Empty filters detected in mel frequency basis. "
  "Empty filters detected in mel frequency basis. "
  "Empty filters detected in mel frequency basis. "
  "Empty filters detected in mel frequency basis. "
  "Empty filters detected in mel frequency basis. "
  "Empty filters detected in mel frequency basis. "
  "Empty filters detected in mel frequency basis. "
  "Empty filters detected in mel frequency basis. "
  "Empty filters detected in mel frequency basis. "
  "Empty filters detected in mel frequency basis. "
  "Empty filters detected in mel frequency basis. "
  "Empty fil

In [9]:
display(res_dataframe)
res_dataframe.describe()

Unnamed: 0,f0_contour_mean,f0_contour_std,f0_contour_min,f0_contour_max,f0_contour_skewness,f0_contour_kurtosis,f0_contour_first_derivative_mean,f0_contour_first_derivative_std,f0_contour_q3_q1_range,log_energy,...,mfcc_q3_q1_range_7,mfcc_q3_q1_range_8,mfcc_q3_q1_range_9,mfcc_q3_q1_range_10,mfcc_q3_q1_range_11,mfcc_q3_q1_range_12,mfcc_q3_q1_range_13,mfcc_q3_q1_range_14,mfcc_q3_q1_range_15,mfcc_q3_q1_range_16
0,23.967547,55.219171,0.0,189.119143,1.895990,1.668783,1.570260e-16,27.951188,0.000000,-50.789417,...,15.766265,6.817384,10.575198,5.019678,18.639672,4.687609,3.798230,6.998048,4.952859,7.033751
1,27.045372,71.229321,0.0,259.841201,2.297994,3.420751,-1.652425e-16,29.417767,0.000000,-52.897926,...,23.343090,6.969563,17.334488,9.290317,6.188273,5.173421,7.520164,4.676119,8.310646,5.102108
2,8.724462,32.011158,0.0,209.613961,4.010852,17.134221,0.000000e+00,19.592358,0.000000,-57.779049,...,18.296813,12.634435,14.232127,10.144253,10.668671,12.063463,9.836006,11.097036,10.172810,9.805441
3,28.523561,74.396412,0.0,260.546090,2.255366,3.177505,0.000000e+00,36.652883,0.000000,-50.261565,...,16.871922,4.114951,18.549006,9.859148,6.460007,3.631738,6.366929,4.032610,11.482723,7.560311
4,10.253963,40.387550,0.0,238.921752,3.916850,14.445161,0.000000e+00,36.281940,0.000000,-46.754228,...,31.643030,9.679077,11.189731,7.478577,15.093403,6.569043,5.774323,6.686566,10.186784,7.822491
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
854,0.000000,0.000000,0.0,0.000000,0.000000,-3.000000,0.000000e+00,0.000000,0.000000,-49.614353,...,38.190409,11.919920,19.566975,20.034756,26.156811,9.096250,10.394791,12.184984,5.814711,5.547981
855,11.937603,40.763433,0.0,176.104431,3.170061,8.235236,0.000000e+00,19.115677,0.000000,-50.706402,...,18.682658,14.766144,11.331717,14.411584,21.332257,6.488466,8.219062,12.793879,12.382707,8.606090
856,80.651854,98.998965,0.0,237.845414,0.422148,-1.802225,-1.583382e-16,40.416811,197.180354,-55.082888,...,14.759832,9.662964,14.929921,8.962712,10.297725,8.585087,6.362794,13.534030,7.544165,9.226057
857,47.841428,76.043443,0.0,200.364790,0.980022,-0.996395,0.000000e+00,25.452955,151.191570,-55.624970,...,16.418184,10.921530,20.866169,14.092030,12.207006,9.323483,9.331396,9.245415,10.395979,9.108313


Unnamed: 0,f0_contour_mean,f0_contour_std,f0_contour_min,f0_contour_max,f0_contour_skewness,f0_contour_kurtosis,f0_contour_first_derivative_mean,f0_contour_first_derivative_std,f0_contour_q3_q1_range,log_energy,...,mfcc_q3_q1_range_7,mfcc_q3_q1_range_8,mfcc_q3_q1_range_9,mfcc_q3_q1_range_10,mfcc_q3_q1_range_11,mfcc_q3_q1_range_12,mfcc_q3_q1_range_13,mfcc_q3_q1_range_14,mfcc_q3_q1_range_15,mfcc_q3_q1_range_16
count,859.0,859.0,859.0,859.0,859.0,859.0,859.0,859.0,859.0,859.0,...,859.0,859.0,859.0,859.0,859.0,859.0,859.0,859.0,859.0,859.0
mean,32.679361,61.67047,0.0,204.244583,2.089628,8.862409,0.0005657557,31.836192,50.766291,-55.53267,...,18.861103,8.791814,15.332696,9.794284,9.935873,7.130749,8.320488,7.67152,8.140776,6.906689
std,20.745549,26.378037,0.0,66.31167,2.529656,42.154758,0.01185124,12.422068,75.607405,9.849781,...,7.376569,3.533583,5.343398,4.383627,4.568073,2.765017,3.655359,3.099579,3.114858,2.799041
min,0.0,0.0,0.0,0.0,0.0,-3.0,-6.161888e-16,0.0,0.0,-79.909811,...,4.237615,0.0,1.744585,0.375644,1.538204,0.60021,0.468612,0.053435,0.875458,1.651371
25%,17.189673,45.091285,0.0,180.123285,0.985625,-0.941569,-7.459766000000001e-17,24.962153,0.0,-62.46471,...,13.823632,6.141642,11.199074,6.642437,6.529782,5.162255,5.885468,5.281707,5.893342,4.877548
50%,30.608498,63.876307,0.0,226.122932,1.483075,0.337269,0.0,33.257774,0.0,-57.157595,...,18.083952,8.316085,15.021816,9.011215,9.128621,6.919869,7.745886,7.202434,7.630065,6.443548
75%,47.769585,83.996671,0.0,251.080904,2.181621,3.039097,7.884106000000001e-17,40.50766,112.032888,-48.948679,...,23.13287,11.031903,19.114989,12.291657,12.208972,8.664798,10.028368,9.89995,9.925875,8.690651
max,96.018157,109.769293,0.0,280.559255,21.166063,446.002222,0.2796821,59.36217,224.091046,-29.154129,...,44.057369,23.916194,31.031125,29.180224,28.69911,22.758804,27.303226,22.335853,20.096462,21.351741


In [10]:
res_dataframe.fillna(0, inplace=True)
target = pd.Series(classes, name="target")
target.unique()

array([1, 2, 3, 4, 5], dtype=int64)

In [33]:
sum_df = res_dataframe.sum(axis=0)
index_to_del = sum_df[sum_df == 0].index
print(index_to_del)
sum_df

Int64Index([6735, 6736, 6737, 6738, 6739, 6740, 6741, 6742, 6743, 6744, 6745,
            6746, 6747, 6748, 6749],
           dtype='int64')


0        471.002571
1       1469.362606
2       1779.936194
3       1624.555274
4       1206.148421
           ...     
6760       0.704981
6761       0.664472
6762       0.621463
6763       0.576115
6764       0.528595
Length: 6765, dtype: float64

In [3]:
# load old data
data = pd.read_csv("features_and_stat_full_2.csv")
target = data["target"]
res_dataframe = data.drop(columns="target")
display(res_dataframe)
display(target)

Unnamed: 0,f0_contour_mean,f0_contour_std,f0_contour_min,f0_contour_max,f0_contour_skewness,f0_contour_kurtosis,f0_contour_first_derivative_mean,f0_contour_first_derivative_std,f0_contour_q3_q1_range,log_energy,...,mfcc_q3_q1_range_7,mfcc_q3_q1_range_8,mfcc_q3_q1_range_9,mfcc_q3_q1_range_10,mfcc_q3_q1_range_11,mfcc_q3_q1_range_12,mfcc_q3_q1_range_13,mfcc_q3_q1_range_14,mfcc_q3_q1_range_15,mfcc_q3_q1_range_16
0,23.967547,55.219171,0.0,189.119143,1.895990,1.668783,1.570260e-16,27.951188,0.000000,-50.789417,...,15.766265,6.817384,10.575198,5.019678,18.639672,4.687609,3.798230,6.998048,4.952859,7.033751
1,27.045372,71.229321,0.0,259.841201,2.297994,3.420751,-1.652425e-16,29.417767,0.000000,-52.897926,...,23.343090,6.969563,17.334488,9.290317,6.188273,5.173421,7.520164,4.676119,8.310646,5.102108
2,8.724462,32.011158,0.0,209.613961,4.010852,17.134221,0.000000e+00,19.592358,0.000000,-57.779049,...,18.296813,12.634435,14.232127,10.144253,10.668671,12.063463,9.836006,11.097036,10.172810,9.805441
3,28.523561,74.396412,0.0,260.546090,2.255366,3.177505,0.000000e+00,36.652883,0.000000,-50.261565,...,16.871922,4.114951,18.549006,9.859148,6.460007,3.631738,6.366929,4.032610,11.482723,7.560311
4,10.253963,40.387550,0.0,238.921752,3.916850,14.445161,0.000000e+00,36.281940,0.000000,-46.754228,...,31.643030,9.679077,11.189731,7.478577,15.093403,6.569043,5.774323,6.686566,10.186784,7.822491
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
854,0.000000,0.000000,0.0,0.000000,0.000000,-3.000000,0.000000e+00,0.000000,0.000000,-49.614353,...,38.190409,11.919920,19.566975,20.034756,26.156811,9.096250,10.394791,12.184984,5.814711,5.547981
855,11.937603,40.763433,0.0,176.104431,3.170061,8.235236,0.000000e+00,19.115677,0.000000,-50.706402,...,18.682658,14.766144,11.331717,14.411584,21.332257,6.488466,8.219062,12.793879,12.382707,8.606090
856,80.651854,98.998965,0.0,237.845414,0.422148,-1.802225,-1.583382e-16,40.416811,197.180354,-55.082888,...,14.759832,9.662964,14.929921,8.962712,10.297725,8.585087,6.362794,13.534030,7.544165,9.226057
857,47.841428,76.043443,0.0,200.364790,0.980022,-0.996395,0.000000e+00,25.452955,151.191570,-55.624970,...,16.418184,10.921530,20.866169,14.092030,12.207006,9.323483,9.331396,9.245415,10.395979,9.108313


0      1
1      1
2      1
3      1
4      1
      ..
854    5
855    5
856    5
857    5
858    5
Name: target, Length: 859, dtype: int64

Секция с Feature selection

In [11]:
# feature names to drop (based on MI decomposition method)
res = mutual_info_classif(res_dataframe, target, random_state=42, n_neighbors=30)
df_res = pd.Series(res)
sorted_df = df_res.sort_values(ascending=False)
index_to_del : pd.Int64Index = sorted_df[sorted_df < 0.1].index
print(index_to_del.shape)
columns_to_del = [res_dataframe.columns[i] for i in index_to_del]
for c in columns_to_del:
    print(c)
# features_cleared_df = res_dataframe.drop(columns=)
# features_cleared_df

(140,)
f0_contour_q3_q1_range
apq3Shimmer
mfcc_q3_q1_range_13
mfcc_mean_13
localShimmer
spectral_centroid_skewness
mfcc_kurtosis_5
mfcc_q3_q1_range_5
mfcc_first_derivative_std_1
mfcc_std_14
spectral_centroid_mean
mfcc_kurtosis_1
mfcc_max_4
mfcc_kurtosis_7
mfcc_std_8
mfcc_mean_5
spectral_centroid_min
mfcc_max_16
mfcc_mean_15
mfcc_std_10
mfcc_first_derivative_std_11
mfcc_mean_14
mfcc_first_derivative_std_8
mfcc_min_4
mfcc_min_6
localdbShimmer
mfcc_q3_q1_range_10
mfcc_mean_12
mfcc_max_12
mfcc_min_15
spectral_centroid_first_derivative_std
mfcc_skewness_7
f0_contour_max
mfcc_q3_q1_range_2
mfcc_std_12
mfcc_q3_q1_range_16
mfcc_first_derivative_mean_3
mfcc_kurtosis_11
mfcc_kurtosis_13
mfcc_mean_16
mfcc_std_9
mfcc_first_derivative_std_5
mfcc_skewness_6
mfcc_first_derivative_std_10
mfcc_min_10
mfcc_min_2
mfcc_skewness_2
mfcc_q3_q1_range_15
num_zerocrossings
mfcc_kurtosis_4
mfcc_min_8
mfcc_max_2
mfcc_mean_4
mfcc_skewness_1
mfcc_skewness_12
f3
mfcc_first_derivative_mean_14
mfcc_first_derivative_st

In [45]:
# k = 90 for 1st set
# k = 110 for 2nd set
# k = 200
features_cleared_df = SelectKBest(mutual_info_classif, k=120).fit_transform(res_dataframe, target)
pd.DataFrame(features_cleared_df)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,110,111,112,113,114,115,116,117,118,119
0,23.967547,55.219171,189.119143,1.895990,1.668783,0.000000,-50.789417,10165.480300,4205.444653,0.000000,...,15.766265,6.817384,10.575198,5.019678,18.639672,4.687609,3.798230,6.998048,4.952859,7.033751
1,27.045372,71.229321,259.841201,2.297994,3.420751,0.000000,-52.897926,8238.646944,4965.301582,0.000000,...,23.343090,6.969563,17.334488,9.290317,6.188273,5.173421,7.520164,4.676119,8.310646,5.102108
2,8.724462,32.011158,209.613961,4.010852,17.134221,0.000000,-57.779049,10573.577648,2372.872379,5207.613500,...,18.296813,12.634435,14.232127,10.144253,10.668671,12.063463,9.836006,11.097036,10.172810,9.805441
3,28.523561,74.396412,260.546090,2.255366,3.177505,0.000000,-50.261565,8503.507631,5009.949377,0.000000,...,16.871922,4.114951,18.549006,9.859148,6.460007,3.631738,6.366929,4.032610,11.482723,7.560311
4,10.253963,40.387550,238.921752,3.916850,14.445161,0.000000,-46.754228,10341.238850,4201.536338,3761.336784,...,31.643030,9.679077,11.189731,7.478577,15.093403,6.569043,5.774323,6.686566,10.186784,7.822491
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
854,0.000000,0.000000,0.000000,0.000000,-3.000000,0.000000,-49.614353,8274.023605,4547.170923,0.000000,...,38.190409,11.919920,19.566975,20.034756,26.156811,9.096250,10.394791,12.184984,5.814711,5.547981
855,11.937603,40.763433,176.104431,3.170061,8.235236,0.000000,-50.706402,9779.106757,4538.508236,3203.973249,...,18.682658,14.766144,11.331717,14.411584,21.332257,6.488466,8.219062,12.793879,12.382707,8.606090
856,80.651854,98.998965,237.845414,0.422148,-1.802225,197.180354,-55.082888,10840.344698,3517.150012,5156.937923,...,14.759832,9.662964,14.929921,8.962712,10.297725,8.585087,6.362794,13.534030,7.544165,9.226057
857,47.841428,76.043443,200.364790,0.980022,-0.996395,151.191570,-55.624970,10787.963161,2634.707493,6508.520967,...,16.418184,10.921530,20.866169,14.092030,12.207006,9.323483,9.331396,9.245415,10.395979,9.108313


In [39]:
pca_ = PCA(n_components='mle', whiten=True, random_state=42)
features_cleared_df = pca_.fit_transform(res_dataframe)
print("Components: ")
print(pca_.n_components)
print("Var: ")
print(pca_.explained_variance_ratio_)
print("Dim of data: ")
pd.DataFrame(features_cleared_df)

Components: 
mle
Var: 
[9.31892957e-01 3.06630407e-02 1.53402566e-02 9.76171428e-03
 7.03184323e-03 1.43844174e-03 1.03285113e-03 8.94161627e-04
 6.89213744e-04 5.31252539e-04 4.34096033e-04 1.41510449e-04
 5.02834704e-05 3.14587058e-05 1.69566299e-05 1.13799716e-05
 5.50875443e-06 3.63428612e-06 3.31432591e-06 2.76885923e-06
 2.03455642e-06 1.64424169e-06 1.55359180e-06 1.36903476e-06
 1.27255987e-06 1.13541670e-06 1.09165650e-06 1.02807865e-06
 9.59402057e-07 8.35611023e-07 7.67073798e-07 7.22557666e-07
 6.10982627e-07 5.79762035e-07 5.59130410e-07 4.97779244e-07
 4.64097777e-07 4.29026479e-07 4.19806483e-07 3.83149428e-07
 3.48465844e-07 3.15226987e-07 3.02643730e-07 2.82474098e-07
 2.77690634e-07 2.57219402e-07 2.47982052e-07 2.35972545e-07
 2.24140561e-07 2.03176403e-07 1.93094973e-07 1.82324773e-07
 1.74894116e-07 1.52160724e-07 1.32815505e-07 1.23353155e-07
 1.14323505e-07 1.04079091e-07 9.82803279e-08 8.55237552e-08
 8.33585098e-08 7.44160398e-08 7.06706902e-08 6.03891760e-08
 

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,168,169,170,171,172,173,174,175,176,177
0,-0.654417,-0.718141,0.030132,-1.072910,0.708123,-0.020596,-0.017338,-0.427760,-1.085239,0.421242,...,-0.437299,-0.381024,-0.126177,-0.536485,0.179708,-0.767484,-0.257893,-0.294117,0.220843,-1.061629
1,-0.960351,-0.933735,0.608160,0.640322,-1.252787,-0.458721,-0.934445,-0.248745,0.559189,-0.358836,...,-0.735460,0.436552,0.642516,0.032466,-0.472319,-0.327293,2.469069,-0.011533,0.354069,0.329528
2,2.130659,-0.445539,-1.069364,2.001907,0.002100,-0.344728,0.760424,0.515909,-0.302349,-1.300116,...,-1.014061,-0.712787,-1.197466,-0.544358,-1.696698,-0.350922,0.681480,-1.449786,-0.447226,-0.741498
3,-0.826648,-1.030432,0.703325,1.846412,-0.696029,-0.377974,-0.469339,0.938330,-0.320545,1.094599,...,-0.780647,0.290459,0.547129,0.114253,0.444668,0.542399,2.306907,-0.170407,-0.740525,0.703598
4,0.224253,0.367593,1.689470,1.139996,0.136001,0.675708,-0.211493,-0.738881,0.135077,1.742833,...,-0.149573,0.876098,0.317102,-0.119419,0.075978,-0.108216,-0.775660,-0.725422,-0.774697,-1.059167
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
854,-0.614313,-1.181972,-0.197831,-0.360738,-1.427620,-0.512256,-0.997067,-0.836648,0.637792,2.093875,...,-0.419476,-0.593385,1.493381,-0.106604,0.350686,-0.607659,-0.108891,-0.009390,0.306686,1.029953
855,-0.193450,0.371001,2.198106,-0.610272,-0.579873,0.303826,0.213992,-0.150671,-0.487775,1.583659,...,-0.349163,0.218169,-0.310940,1.234166,-0.170619,-0.965031,-0.471036,-0.031749,-0.476844,1.173416
856,-0.168456,1.562769,-0.109923,0.865072,-0.549880,0.162355,0.367857,-0.156663,-0.076021,-0.139445,...,0.424871,0.301903,0.788551,-0.019423,0.644747,0.001294,2.049701,-1.184725,-1.564025,1.264369
857,2.375942,0.096376,-0.192177,-0.799078,-0.359728,1.401976,0.695170,1.090006,-0.552889,-0.526679,...,0.479583,-2.924415,-0.018133,1.367868,-1.141636,1.469116,1.142611,0.832079,-0.312097,0.053825


Секция с классификаторами

In [None]:
X = scale(res_dataframe)
cv = StratifiedKFold(n_splits = 5, shuffle=True, random_state=241)
params = {"max_depth" : [3,5,7], "n_estimators" : [100, 200, 400]}
print(params)
estimator_ = GradientBoostingClassifier()
gs = GridSearchCV(estimator_, params, cv=cv,scoring="accuracy", verbose=2, n_jobs=-1)
gs.fit(X, target)
gs.cv_results_

In [70]:
X = scale(res_dataframe)
cv_ = StratifiedKFold(n_splits = 5, shuffle=True, random_state=241)
estimator_ = KNeighborsClassifier(n_neighbors=11)
scores : np.ndarray = cross_val_score(estimator_, X, target, cv = cv_, n_jobs=-1)
# best ~ 0.56
scores.mean()

0.5646198830409357

In [33]:
# X = scale(res_dataframe)
X = scale(features_cleared_df)
cv_ = StratifiedKFold(n_splits = 5, shuffle=True, random_state=241)
estimator_ = LinearSVC(C=0.01, dual=False)
scores : np.ndarray = cross_val_score(estimator_, X, target, cv = cv_, n_jobs=-1)
# best ~ 0.64
scores.mean()

0.5413504691962464

In [42]:
X = scale(features_cleared_df)
cv = StratifiedKFold(n_splits = 5, shuffle=True, random_state=241)
params = {"C" : 10 ** np.arange(-5, 5, dtype = float), "penalty" : ["l1", 'l2'],
          'dual' : [True, False], 'loss' : ['hinge', 'squared_hinge'], 'multi_class' : ['ovr', 'crammer_singer']}
print(params)
estimator_ = LinearSVC()
gs = GridSearchCV(estimator_, params, cv=cv,scoring="accuracy", verbose=2, n_jobs=-1)
gs.fit(X, target)
print(gs.cv_results_)
print(gs.best_params_)

{'C': array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02,
       1.e+03, 1.e+04]), 'penalty': ['l1', 'l2'], 'dual': [True, False], 'loss': ['hinge', 'squared_hinge'], 'multi_class': ['ovr', 'crammer_singer']}
Fitting 5 folds for each of 160 candidates, totalling 800 fits
{'mean_fit_time': array([3.00226212e-03, 9.20944214e-03, 3.34295750e-02, 2.46222973e-02,
       2.20155716e-03, 1.36121273e-02, 2.46209145e-02, 2.44212627e-02,
       2.80222893e-03, 3.20196152e-03, 3.12282562e-02, 3.22289467e-02,
       1.62152767e-02, 1.80142879e-02, 2.62217045e-02, 2.46229172e-02,
       2.60186195e-03, 7.00702667e-03, 3.16290855e-02, 3.78340244e-02,
       1.80072784e-03, 1.90170288e-02, 4.40408230e-02, 4.70427036e-02,
       3.80382538e-03, 2.20270157e-03, 4.40402985e-02, 4.34390068e-02,
       1.30101681e-02, 3.22304249e-02, 4.18373585e-02, 5.02446175e-02,
       9.00845528e-03, 1.84166908e-02, 6.48591042e-02, 5.64504147e-02,
       3.00307274e-03, 3.40310097e-02, 6.92625046e-0

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done 444 tasks      | elapsed:   16.4s
[Parallel(n_jobs=-1)]: Done 773 tasks      | elapsed:  8.8min
[Parallel(n_jobs=-1)]: Done 800 out of 800 | elapsed: 10.1min finished


In [32]:
X = scale(res_dataframe)
cv_ = StratifiedKFold(n_splits = 5, shuffle=True, random_state=241)
estimator_ = LogisticRegression("l2", C=0.001, solver="sag")
scores : np.ndarray = cross_val_score(estimator_, X, target, cv = cv_, n_jobs=-1, verbose=2)
# best ~ 0.58
scores.mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    5.6s remaining:    3.7s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    5.7s finished


0.6100435196518428

In [85]:
X = res_dataframe
cv_ = StratifiedKFold(n_splits = 5, shuffle=True, random_state=241)
estimator_ = DecisionTreeClassifier(splitter="random")
scores : np.ndarray = cross_val_score(estimator_, X, target, cv = cv_, n_jobs=-1, verbose=2)
# best ~ 0.58
scores.mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.0s finished


0.4435060519515844

In [81]:
X = scale(res_dataframe)
cv_ = StratifiedKFold(n_splits = 5, shuffle=True, random_state=241)
estimator_ = GradientBoostingClassifier()
scores : np.ndarray = cross_val_score(estimator_, X, target, cv = cv_, n_jobs=-1, verbose=2)
scores.mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    9.9s remaining:    6.5s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   10.0s finished


0.6147558819529444

In [8]:
X = scale(features_cleared_df)
cv_ = StratifiedKFold(n_splits = 5, shuffle=True, random_state=241)
estimator_ = MLPClassifier(hidden_layer_sizes=100, learning_rate='adaptive', solver='adam')
scores : np.ndarray = cross_val_score(estimator_, X, target, cv = cv_, n_jobs=-1, verbose=2)
scores.mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    2.8s remaining:    1.8s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.8s finished


0.6403644770841833

In [10]:
X = scale(res_dataframe)
cv = StratifiedKFold(n_splits = 5, shuffle=True, random_state=241)
params = {"C" : 10 ** np.arange(-3, 3, dtype = float), "degree" : [1, 2, 5, 9]}
print(params)
estimator_ = SVC(kernel="poly")
gs = GridSearchCV(estimator_, params, cv=cv,scoring="accuracy", verbose=2, n_jobs=-1)
gs.fit(X, target)
gs.cv_results_

{'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02]), 'degree': [1, 2, 5, 9]}
Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:    3.7s finished


{'mean_fit_time': array([0.09148345, 0.099089  , 0.10509639, 0.09848862, 0.09108243,
        0.0966877 , 0.09588294, 0.11230145, 0.07626882, 0.09068227,
        0.09908924, 0.12010942, 0.06285706, 0.09848914, 0.10369301,
        0.11750598, 0.0738667 , 0.11070065, 0.11930799, 0.12831597,
        0.14553185, 0.11490469, 0.11931219, 0.11951218]),
 'std_fit_time': array([0.00463449, 0.00597178, 0.00522054, 0.00403344, 0.00800818,
        0.00680609, 0.00402167, 0.01702551, 0.00519699, 0.00377678,
        0.00569762, 0.00879469, 0.00458242, 0.01197118, 0.00224734,
        0.00771666, 0.01065641, 0.00944453, 0.01179876, 0.01593197,
        0.01501482, 0.00714488, 0.00896119, 0.00149845]),
 'mean_score_time': array([0.01541481, 0.01741581, 0.01481185, 0.01581411, 0.01481299,
        0.01441293, 0.01461334, 0.01601419, 0.0136127 , 0.01401286,
        0.01681523, 0.01681533, 0.01541395, 0.02322149, 0.01581535,
        0.01561456, 0.01261144, 0.01441336, 0.01561465, 0.01521392,
        0.011610

In [34]:
X = scale(res_dataframe)
cv_ = StratifiedKFold(n_splits = 5, shuffle=True, random_state=241)
estimator_ = RandomForestClassifier(n_estimators=500)
scores : np.ndarray = cross_val_score(estimator_, X, target, cv = cv_, n_jobs=-1, verbose=2)
scores.mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   11.6s remaining:    7.7s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   12.2s finished


0.5425472596219231

In [50]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt', 'log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]# Create the random grid
criterion = ['gini', 'entropy']
class_weight = ['balanced', 'balanced_subsample']
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap,
               'criterion' : criterion,
               'class_weight' : class_weight,
               }
X = scale(features_cleared_df)
estimator_ = RandomForestClassifier()
gs = RandomizedSearchCV(estimator_, random_grid, n_iter = 500 ,cv=cv,scoring="accuracy", verbose=2, n_jobs=-1)
gs.fit(X, target)
print(gs.cv_results_)
print(gs.best_params_)

Fitting 5 folds for each of 500 candidates, totalling 2500 fits
{'mean_fit_time': array([18.09346061, 17.21846437,  0.86558757, 15.68126674,  4.459657  ,
       26.19463229, 21.81524792,  0.94886308,  3.79445219,  5.52382507,
       14.15127439, 18.82732878, 22.29868755, 11.25563989, 11.32290154,
       32.73418193,  3.25836434,  7.63414545,  4.3259357 ,  9.87958837,
       21.26674876,  7.02839427,  8.63445559, 16.18592567,  8.22688527,
        2.54391403,  6.37479978, 20.18356295,  9.79110808, 18.32907596,
        6.63523712,  6.30453634, 24.95410314, 26.7275167 , 11.26524935,
       12.86070085,  1.71676178,  4.36296997,  7.69860368, 10.81183672,
       14.45054736, 10.93254719,  5.74142327,  3.93077626,  4.53772836,
        9.97587337,  4.65183206, 12.81826196,  2.93306832,  7.84433646,
       11.03243756,  6.25509114,  8.16262641,  0.95566921,  1.10240283,
       10.58282814,  1.02313128, 14.37447777,  1.94617019, 18.08605442,
        2.21901884, 12.33662353,  7.99987822,  2.29348

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   21.0s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 341 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed:  9.3min
[Parallel(n_jobs=-1)]: Done 989 tasks      | elapsed: 15.0min
[Parallel(n_jobs=-1)]: Done 1434 tasks      | elapsed: 21.7min
[Parallel(n_jobs=-1)]: Done 1961 tasks      | elapsed: 30.0min
[Parallel(n_jobs=-1)]: Done 2500 out of 2500 | elapsed: 38.6min finished


In [51]:
gs.best_score_

0.6392084863321094

In [7]:
# GMM experience
gm = GaussianMixture()
gm.fit(scale(features_cleared_df))
gm.means_

array([[ 2.48152294e-16, -1.81978349e-16,  0.00000000e+00,
         2.48152294e-16, -7.03098167e-17, -1.24076147e-17,
        -1.99555803e-16, -2.48152294e-17,  3.47413212e-16,
         2.06793579e-16, -1.42687569e-16,  4.15655093e-16,
         6.61739451e-17, -1.20715751e-16, -3.30869726e-16,
        -1.65434863e-16,  2.23337065e-16, -9.51250461e-17,
        -5.99701378e-17,  1.15804404e-16, -3.66024634e-16,
        -1.15804404e-16, -1.15804404e-16, -7.03098167e-17,
        -4.27545724e-16,  2.89511010e-17, -1.24076147e-17,
        -5.16983946e-17, -1.90250092e-16,  1.81978349e-16,
        -2.89511010e-17,  2.48152294e-17,  2.48152294e-17,
        -3.10190368e-17, -9.09891745e-17,  3.72228441e-17,
         1.34415826e-16,  2.68831652e-16,  4.96821572e-16,
        -7.44456883e-17,  4.96304588e-17,  4.71489359e-16,
         1.81978349e-16, -1.48891377e-16, -9.97779016e-17,
        -2.87960058e-16, -5.04576332e-16,  2.23337065e-16,
        -6.70011194e-16,  1.98521835e-16,  4.65285552e-1

In [71]:
# best rf {'n_estimators': 800, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 90, 'criterion': 'gini', 'class_weight': 'balanced', 'bootstrap': False}
estimators = [
    ('mlp', MLPClassifier(hidden_layer_sizes=200, learning_rate='adaptive', solver='adam')),
    ("rf", RandomForestClassifier(n_estimators=500, min_samples_leaf=2, max_features = 'sqrt',
                                  max_depth= 90, bootstrap=False, random_state=51)),
    ('lr', LogisticRegression("l2", C=0.01)),
    ('svc', LinearSVC(C=0.01, dual=True, loss='hinge', random_state=51)),
]
clf = StackingClassifier(
    estimators=estimators, final_estimator=LogisticRegression()
)
X = scale(features_cleared_df)
cv_ = StratifiedKFold(n_splits = 5, shuffle=True, random_state=241)
scores : np.ndarray = cross_val_score(clf, X, target, cv = cv_, n_jobs=-1, verbose=2)
# score ~ 0.81
print("Min - " , np.min(scores))
print("Max - " , np.max(scores))
print("Mean - " , scores.mean())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   36.4s remaining:   24.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   38.5s finished


Min -  0.7790697674418605
Max -  0.8372093023255814
Mean -  0.8125866993064056


In [11]:
gs.best_params_


{'C': 1.0, 'degree': 1}

In [72]:
full_df = pd.concat([res_dataframe, target], 1)
full_df.to_csv("features_and_stat_full_2.csv", index=False)

