In [1]:
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import random
import copy
import os
import graphviz
import pickle
import scipy.io.wavfile as wav
from src.voice_activity_detection.extract_features import extract_features

In [2]:
# Load the pickled feature DataFrame from disk.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load files that come from a trusted source.
with open("src/data/noise-train/features_df_1s.pickle", "rb") as file:
    voice_noise_df=pickle.load(file)
# Column dtypes and non-null counts, used to spot columns with missing values.
voice_noise_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392416 entries, 0 to 392415
Data columns (total 19 columns):
RMS         392416 non-null float64
SE          392416 non-null float64
ZCR         392416 non-null float64
LEFR        392416 non-null float64
SF          392416 non-null float64
SF_std      392416 non-null float64
SRF         392416 non-null float64
SRF_std     392416 non-null float64
SC          392416 non-null float64
SC_std      392416 non-null float64
BW          392416 non-null float64
BW_std      392416 non-null float64
NWPD        392416 non-null float64
NWPD_std    392416 non-null float64
RSE         391352 non-null float64
RSE_std     391339 non-null float64
type        392416 non-null object
name        392416 non-null object
number      392416 non-null int64
dtypes: float64(16), int64(1), object(2)
memory usage: 56.9+ MB


In [3]:
# Keep only the rows in which both RSE and RSE_std are present; these are the
# only two columns that info() reported with fewer non-null values than rows.
voice_noise_df = voice_noise_df.dropna(subset=['RSE', 'RSE_std'])
voice_noise_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 391339 entries, 0 to 392415
Data columns (total 19 columns):
RMS         391339 non-null float64
SE          391339 non-null float64
ZCR         391339 non-null float64
LEFR        391339 non-null float64
SF          391339 non-null float64
SF_std      391339 non-null float64
SRF         391339 non-null float64
SRF_std     391339 non-null float64
SC          391339 non-null float64
SC_std      391339 non-null float64
BW          391339 non-null float64
BW_std      391339 non-null float64
NWPD        391339 non-null float64
NWPD_std    391339 non-null float64
RSE         391339 non-null float64
RSE_std     391339 non-null float64
type        391339 non-null object
name        391339 non-null object
number      391339 non-null int64
dtypes: float64(16), int64(1), object(2)
memory usage: 59.7+ MB


In [4]:
# Summary statistics of the numeric columns after the rows with missing RSE values were removed.
voice_noise_df.describe()

Unnamed: 0,RMS,SE,ZCR,LEFR,SF,SF_std,SRF,SRF_std,SC,SC_std,BW,BW_std,NWPD,NWPD_std,RSE,RSE_std,number
count,391339.0,391339.0,391339.0,391339.0,391339.0,391339.0,391339.0,391339.0,391339.0,391339.0,391339.0,391339.0,391339.0,391339.0,391339.0,391339.0,391339.0
mean,3881.72355,4.4261,0.106271,0.295513,0.005627191,0.003939445,4468.916636,857.176179,766.989241,485.522178,700650.5,626379.4,0.268833,22.843964,-0.356949,0.363653,216.597027
std,2669.945559,0.43036,0.061618,0.240645,0.002347471,0.002494668,952.072181,328.532905,522.803081,460.40549,711035.3,580621.5,2.624867,6.38437,0.076928,0.019068,174.3911
min,0.490089,1.610912,0.0,0.0,6.686153e-11,1.857076e-11,1718.75,0.0,55.003685,0.005596,6713.368,12.29212,-60.607956,0.013836,-0.792863,0.293633,0.0
25%,1947.66333,4.160194,0.065317,0.040816,0.004143628,0.002372378,3645.408163,638.588104,442.474243,155.143118,255975.7,194709.9,-1.168532,18.690534,-0.409401,0.352149,74.0
50%,3232.506348,4.443933,0.094693,0.295918,0.005406255,0.003525452,4294.642857,848.883006,645.299844,319.852183,487534.9,439043.2,0.315418,22.102554,-0.353456,0.36045,171.0
75%,5282.64624,4.715955,0.133633,0.479592,0.00671055,0.00487705,5146.364796,1063.872622,945.241476,666.003322,897843.8,884848.6,1.780692,25.995758,-0.294575,0.37064,329.0
max,27775.876953,5.487236,0.865992,1.0,0.04780443,0.05316446,7559.94898,2366.830854,7029.272341,3001.529504,8526785.0,4892790.0,52.321439,80.425028,-0.139832,0.583676,1046.0


In [5]:
# Encode the string class labels as integers. The classes listed in the cell
# output are music -> 0, noise -> 1, speech -> 2; later cells rely on these codes.
le=LabelEncoder()
# assign() returns a new frame instead of writing a column into the filtered
# frame produced by the null filter, which avoids pandas' SettingWithCopyWarning.
voice_noise_df = voice_noise_df.assign(type=le.fit_transform(voice_noise_df["type"]))
list(le.classes_)


['music', 'noise', 'speech']

In [6]:
# Same summary again; 'type' is numeric now, so it appears among the described columns.
voice_noise_df.describe()

Unnamed: 0,RMS,SE,ZCR,LEFR,SF,SF_std,SRF,SRF_std,SC,SC_std,BW,BW_std,NWPD,NWPD_std,RSE,RSE_std,type,number
count,391339.0,391339.0,391339.0,391339.0,391339.0,391339.0,391339.0,391339.0,391339.0,391339.0,391339.0,391339.0,391339.0,391339.0,391339.0,391339.0,391339.0,391339.0
mean,3881.72355,4.4261,0.106271,0.295513,0.005627191,0.003939445,4468.916636,857.176179,766.989241,485.522178,700650.5,626379.4,0.268833,22.843964,-0.356949,0.363653,1.163268,216.597027
std,2669.945559,0.43036,0.061618,0.240645,0.002347471,0.002494668,952.072181,328.532905,522.803081,460.40549,711035.3,580621.5,2.624867,6.38437,0.076928,0.019068,0.957943,174.3911
min,0.490089,1.610912,0.0,0.0,6.686153e-11,1.857076e-11,1718.75,0.0,55.003685,0.005596,6713.368,12.29212,-60.607956,0.013836,-0.792863,0.293633,0.0,0.0
25%,1947.66333,4.160194,0.065317,0.040816,0.004143628,0.002372378,3645.408163,638.588104,442.474243,155.143118,255975.7,194709.9,-1.168532,18.690534,-0.409401,0.352149,0.0,74.0
50%,3232.506348,4.443933,0.094693,0.295918,0.005406255,0.003525452,4294.642857,848.883006,645.299844,319.852183,487534.9,439043.2,0.315418,22.102554,-0.353456,0.36045,2.0,171.0
75%,5282.64624,4.715955,0.133633,0.479592,0.00671055,0.00487705,5146.364796,1063.872622,945.241476,666.003322,897843.8,884848.6,1.780692,25.995758,-0.294575,0.37064,2.0,329.0
max,27775.876953,5.487236,0.865992,1.0,0.04780443,0.05316446,7559.94898,2366.830854,7029.272341,3001.529504,8526785.0,4892790.0,52.321439,80.425028,-0.139832,0.583676,2.0,1046.0


In [7]:
# Number of rows per encoded class (0 = music, 1 = noise, 2 = speech), to check class balance.
voice_noise_df.groupby('type').count()

Unnamed: 0_level_0,RMS,SE,ZCR,LEFR,SF,SF_std,SRF,SRF_std,SC,SC_std,BW,BW_std,NWPD,NWPD_std,RSE,RSE_std,name,number
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,152826,152826,152826,152826,152826,152826,152826,152826,152826,152826,152826,152826,152826,152826,152826,152826,152826,152826
1,21794,21794,21794,21794,21794,21794,21794,21794,21794,21794,21794,21794,21794,21794,21794,21794,21794,21794
2,216719,216719,216719,216719,216719,216719,216719,216719,216719,216719,216719,216719,216719,216719,216719,216719,216719,216719


In [8]:
# Summary statistics for class 0 (music) only.
voice_noise_df[voice_noise_df["type"]==0].describe()

Unnamed: 0,RMS,SE,ZCR,LEFR,SF,SF_std,SRF,SRF_std,SC,SC_std,BW,BW_std,NWPD,NWPD_std,RSE,RSE_std,type,number
count,152826.0,152826.0,152826.0,152826.0,152826.0,152826.0,152826.0,152826.0,152826.0,152826.0,152826.0,152826.0,152826.0,152826.0,152826.0,152826.0,152826.0,152826.0
mean,5260.765401,4.424306,0.094214,0.105165,0.004948,0.003457888,4709.087179,765.977004,613.917737,265.020561,693406.8,520160.0,0.128564,23.893921,-0.299098,0.360282,0.0,144.843561
std,2985.758783,0.421915,0.052359,0.150751,0.002595,0.002697407,996.254522,292.960678,402.856023,268.817637,725850.0,581211.2,2.638701,6.473233,0.049296,0.013886,0.0,123.19891
min,0.49314,1.610912,0.0,0.0,2e-06,3.00369e-07,2520.408163,25.944956,56.69804,0.219177,32927.52,872.6883,-33.636531,0.574031,-0.650863,0.309139,0.0,0.0
25%,3051.080017,4.175859,0.058066,0.0,0.003234,0.001678861,3905.931122,539.965701,370.92745,90.489629,169443.2,86721.53,-1.356098,19.41752,-0.327677,0.352253,0.0,58.0
50%,4845.141357,4.456599,0.083755,0.030612,0.004352,0.002657558,4700.57398,763.421494,528.052621,181.27167,450792.8,296632.2,0.206791,23.132141,-0.292903,0.358419,0.0,120.0
75%,7288.090332,4.710782,0.121445,0.163265,0.005911,0.004351539,5499.681122,978.046886,751.472281,339.713755,973924.1,737761.2,1.706189,27.452872,-0.263077,0.365586,0.0,195.0
max,20044.730469,5.469131,0.847928,0.969388,0.04089,0.05254569,7549.42602,2366.830854,6859.332631,2775.633655,7946042.0,4188144.0,24.933125,75.767237,-0.140178,0.583676,0.0,986.0


In [9]:
# Summary statistics for class 1 (noise) only.
voice_noise_df[voice_noise_df["type"]==1].describe()

Unnamed: 0,RMS,SE,ZCR,LEFR,SF,SF_std,SRF,SRF_std,SC,SC_std,BW,BW_std,NWPD,NWPD_std,RSE,RSE_std,type,number
count,21794.0,21794.0,21794.0,21794.0,21794.0,21794.0,21794.0,21794.0,21794.0,21794.0,21794.0,21794.0,21794.0,21794.0,21794.0,21794.0,21794.0,21794.0
mean,3262.921516,4.646693,0.145969,0.148091,0.005898923,0.003618198,5329.322978,473.939689,1118.559027,314.108277,1247476.0,506371.2,0.423374,20.522269,-0.295885,0.366325,1.0,45.45976
std,3443.133244,0.459104,0.13418,0.247794,0.004306943,0.004024925,1095.296361,305.141461,1145.555542,363.861089,1402186.0,577778.2,4.140878,9.961264,0.047644,0.014017,0.0,66.960912
min,0.490089,1.666465,0.0,0.0,6.686153e-11,1.857076e-11,1718.75,0.0,55.003685,0.005596,6713.368,12.29212,-60.607956,0.013836,-0.792863,0.301916,1.0,0.0
25%,859.693207,4.371085,0.052878,0.0,0.002953919,0.001064399,4462.691327,238.733886,331.944172,81.52241,262356.2,95193.35,-0.760495,13.328905,-0.309761,0.360438,1.0,7.0
50%,2083.733765,4.66483,0.102194,0.0,0.004723632,0.002316589,5305.803571,417.027593,677.408318,164.81159,631123.5,275719.5,0.503186,18.117083,-0.28743,0.363976,1.0,21.0
75%,4792.595825,4.98178,0.199622,0.204082,0.007465715,0.00467359,6279.655612,647.904884,1521.478683,406.197584,1730434.0,705146.1,1.880707,25.507686,-0.277494,0.36833,1.0,53.0
max,27775.876953,5.487236,0.865992,1.0,0.04780443,0.05316446,7559.94898,2162.255981,7029.272341,2609.393025,8526785.0,4517787.0,52.321439,78.42031,-0.139832,0.558185,1.0,493.0


In [10]:
# Summary statistics for class 2 (speech) only.
voice_noise_df[voice_noise_df["type"]==2].describe()

Unnamed: 0,RMS,SE,ZCR,LEFR,SF,SF_std,SRF,SRF_std,SC,SC_std,BW,BW_std,NWPD,NWPD_std,RSE,RSE_std,type,number
count,216719.0,216719.0,216719.0,216719.0,216719.0,216719.0,216719.0,216719.0,216719.0,216719.0,216719.0,216719.0,216719.0,216719.0,216719.0,216719.0,216719.0,216719.0
mean,2971.479047,4.405181,0.110781,0.444569,0.006079,0.004311,4213.027677,960.02762,839.577218,658.253571,650768.0,713351.7,0.352207,22.337032,-0.403885,0.365761,2.0,284.406314
std,1784.847066,0.427063,0.053206,0.180046,0.001674,0.002041,797.280659,306.271002,461.433898,501.41845,557760.5,565688.8,2.404992,5.700792,0.061694,0.022083,0.0,180.52333
min,0.493964,2.038474,0.0,0.0,0.000414,7.7e-05,2725.765306,30.929479,64.236217,4.109941,38214.3,7739.655,-23.636608,3.34833,-0.70746,0.293633,2.0,0.0
25%,1718.92627,4.133914,0.072255,0.326531,0.004994,0.003079,3578.125,749.299586,520.773712,263.506682,302950.3,305522.2,-1.085957,18.644982,-0.442553,0.351086,2.0,129.0
50%,2640.538574,4.415585,0.101694,0.438776,0.005897,0.003966,3953.125,925.345787,742.87275,508.775029,494925.3,521104.3,0.371128,21.681376,-0.398538,0.361839,2.0,270.0
75%,3820.906616,4.692225,0.138759,0.561224,0.006947,0.005058,4715.561224,1150.897622,1041.975587,919.072806,824025.7,992937.5,1.819917,25.090125,-0.361237,0.37589,2.0,429.0
max,13021.230469,5.468673,0.640165,1.0,0.031259,0.040925,7528.061224,2344.886859,5544.019437,3001.529504,7502836.0,4892790.0,25.507908,80.425028,-0.151569,0.522019,2.0,1046.0


In [11]:
# Restrict to the speech rows (class 2) and show the row with the smallest RMS,
# i.e. the quietest window that is still labelled as speech.
slice_df=voice_noise_df[voice_noise_df["type"]==2]
slice_df.loc[slice_df['RMS'].idxmin()]

RMS                         0.493964
SE                           5.40496
ZCR                        0.0293143
LEFR                               0
SF                        0.00139793
SF_std                   0.000165807
SRF                          7367.03
SRF_std                       73.669
SC                           3977.55
SC_std                       137.286
BW                       5.20161e+06
BW_std                        355116
NWPD                       -0.512627
NWPD_std                     8.83862
RSE                        -0.283284
RSE_std                     0.362347
type                               2
name        speech-librivox-0150.wav
number                           788
Name: 236702, dtype: object

In [12]:
# Speech rows below the silence threshold, loudest first.
# 103.6 is approximately 32768 * 10**(-50/20), i.e. -50 dB relative to 16-bit
# full scale -- assumes RMS is computed on int16 samples; TODO confirm.
slice_df[slice_df['RMS']<103.6].sort_values(by=['RMS'],ascending=False) #-50dB

Unnamed: 0,RMS,SE,ZCR,LEFR,SF,SF_std,SRF,SRF_std,SC,SC_std,BW,BW_std,NWPD,NWPD_std,RSE,RSE_std,type,name,number
386308,103.584030,4.097093,0.033752,0.295918,0.009945,0.005711,3776.466837,519.036374,263.515731,182.143154,1.885769e+05,2.044392e+05,-2.123486,25.366293,-0.318543,0.352558,2,speech-us-gov-0241.wav,89
303818,103.533897,5.099678,0.101069,0.000000,0.004178,0.001387,5162.308673,394.897758,561.912273,193.458258,7.043432e+05,4.608478e+05,1.918406,18.989261,-0.295715,0.363661,2,speech-us-gov-0096.wav,229
199132,103.509537,5.142185,0.130071,0.316327,0.005896,0.006514,5558.035714,620.535714,1018.195921,644.365509,1.164952e+06,1.022221e+06,-2.819079,24.308860,-0.317283,0.360759,2,speech-librivox-0050.wav,281
301631,103.502007,4.769756,0.089006,0.000000,0.004820,0.001552,4860.650510,462.978249,539.939378,110.836712,4.950455e+05,2.126581e+05,2.330338,17.575715,-0.292739,0.358919,2,speech-us-gov-0092.wav,439
251239,103.466988,4.513281,0.112945,0.000000,0.004197,0.001251,3332.270408,272.042735,735.273286,227.572021,4.906727e+05,9.666213e+04,-2.875354,15.000538,-0.298359,0.359911,2,speech-us-gov-0004.wav,523
327072,103.340569,4.745417,0.105632,0.632653,0.006001,0.004415,3662.308673,489.051409,834.551585,452.514144,6.664655e+05,3.432772e+05,-0.646894,15.040396,-0.359059,0.359644,2,speech-us-gov-0137.wav,258
288668,103.264503,4.864324,0.166760,0.326531,0.003911,0.001788,4642.538265,699.826679,1217.145337,267.600700,8.567203e+05,6.667334e+05,2.097822,13.573284,-0.328395,0.365623,2,speech-us-gov-0069.wav,592
302701,103.189339,4.439213,0.078630,0.000000,0.004522,0.001392,4240.752551,340.903368,503.332102,151.797299,3.282388e+05,1.771965e+05,2.967351,22.700313,-0.289400,0.360795,2,speech-us-gov-0094.wav,310
233458,103.173180,4.876123,0.147259,0.040816,0.003854,0.002153,6293.367347,335.909917,1088.597999,830.726610,2.121370e+06,1.299536e+06,0.544089,19.823289,-0.318783,0.353166,2,speech-librivox-0142.wav,412
276602,103.172539,4.846758,0.121820,0.316327,0.004945,0.002140,4271.045918,684.285963,854.785438,224.859834,5.239398e+05,3.119056e+05,0.123551,14.995667,-0.326095,0.363017,2,speech-us-gov-0048.wav,460


In [13]:
# The same below-threshold speech rows, ordered by SE (largest first) instead of RMS.
slice_df[slice_df['RMS']<103.6].sort_values(by=['SE'],ascending=False) #-50dB

Unnamed: 0,RMS,SE,ZCR,LEFR,SF,SF_std,SRF,SRF_std,SC,SC_std,BW,BW_std,NWPD,NWPD_std,RSE,RSE_std,type,name,number
192177,0.592558,5.435210,0.067692,0.000000,0.001391,0.000146,7340.880102,56.660692,4101.085445,135.734866,5.145740e+06,3.310882e+05,0.099361,8.487369,-0.284118,0.367230,2,speech-librivox-0035.wav,549
181238,0.528559,5.433505,0.040815,0.000000,0.001399,0.000149,7385.204082,57.652497,4027.536113,147.132823,5.229439e+06,3.099772e+05,0.619976,8.523189,-0.281328,0.359952,2,speech-librivox-0014.wav,716
208691,0.568716,5.432298,0.062504,0.000000,0.001418,0.000151,7360.650510,69.184672,4220.526233,139.979203,5.052441e+06,3.004055e+05,-0.497758,8.201373,-0.280465,0.366302,2,speech-librivox-0071.wav,394
211855,2.053762,5.432011,0.068192,0.785714,0.001593,0.000570,7220.663265,390.511880,3591.458963,912.411203,4.924863e+06,1.002959e+06,0.618965,8.039362,-0.305668,0.360264,2,speech-librivox-0082.wav,11
216673,1.375000,5.431993,0.035440,0.989796,0.001695,0.002622,7383.290816,147.831302,3993.986064,424.528121,5.262637e+06,5.927163e+05,0.184411,8.954110,-0.291910,0.371885,2,speech-librivox-0095.wav,291
194119,0.700045,5.431071,0.100381,0.000000,0.001422,0.000142,7285.076531,77.999300,4017.777836,155.662131,4.992807e+06,3.161068e+05,1.936959,7.423900,-0.283605,0.362569,2,speech-librivox-0040.wav,138
218369,0.521596,5.429787,0.042753,0.000000,0.001422,0.000155,7403.698980,67.656168,4145.963077,160.568246,5.402992e+06,3.675339e+05,-0.726187,8.078359,-0.282884,0.363847,2,speech-librivox-0099.wav,368
246533,0.741241,5.428766,0.117507,0.000000,0.001417,0.000161,7321.109694,61.984579,4095.943495,143.349054,5.084910e+06,3.294805e+05,0.664495,8.405857,-0.282566,0.364575,2,speech-librivox-0169.wav,271
231778,0.499062,5.426991,0.030752,0.000000,0.001417,0.000144,7378.507653,71.690737,3992.751283,147.723963,5.279817e+06,3.038917e+05,-0.278920,7.906033,-0.282573,0.364181,2,speech-librivox-0136.wav,661
194785,0.521596,5.425334,0.038377,0.000000,0.001419,0.000127,7377.869898,86.805592,4014.564707,174.956928,5.245538e+06,3.009929e+05,0.068437,9.011627,-0.281895,0.365159,2,speech-librivox-0042.wav,33


In [14]:
# The 100 speech rows with the largest SE, regardless of loudness, for comparison with the quiet rows.
slice_df.sort_values(by=['SE'],ascending=False).head(100)

Unnamed: 0,RMS,SE,ZCR,LEFR,SF,SF_std,SRF,SRF_std,SC,SC_std,BW,BW_std,NWPD,NWPD_std,RSE,RSE_std,type,name,number
217539,3375.467285,5.468673,0.119320,0.306122,0.005058,0.004276,4955.038265,1079.259573,907.617092,1341.959345,6.414223e+05,1.074255e+06,-5.577054,25.970609,-0.435760,0.407534,2,speech-librivox-0097.wav,232
221615,6075.375977,5.451541,0.053503,0.346939,0.008506,0.005906,3806.760204,1417.334060,378.947805,461.766572,3.119963e+05,8.632239e+05,-1.131592,31.810956,-0.386648,0.424434,2,speech-librivox-0111.wav,131
229492,2046.112061,5.451130,0.201888,0.397959,0.006278,0.004782,5802.933673,1403.529172,1574.022418,2034.882992,1.668924e+06,2.267882e+06,-0.747449,21.705788,-0.524313,0.377675,2,speech-librivox-0131.wav,336
246488,5099.961914,5.442212,0.087255,0.489796,0.008367,0.008889,4751.594388,1556.329477,661.016355,926.841396,9.738693e+05,1.748501e+06,0.926624,27.741289,-0.452073,0.385464,2,speech-librivox-0169.wav,226
241753,9277.487305,5.441174,0.063879,0.204082,0.004637,0.003922,4678.890306,1390.463980,1160.956924,1340.334821,9.810752e+05,1.841677e+06,1.879028,26.604772,-0.459653,0.379636,2,speech-librivox-0159.wav,582
194193,1809.947266,5.439093,0.160260,0.530612,0.007370,0.006569,4920.280612,1389.849965,1572.549575,1669.027760,1.175929e+06,1.711476e+06,-0.071274,18.078990,-0.547633,0.385835,2,speech-librivox-0040.wav,212
194029,366.018433,5.436615,0.094318,0.959184,0.002311,0.003981,7058.992347,1139.555452,3732.563220,988.596151,4.726678e+06,1.284646e+06,-1.489663,10.027913,-0.328606,0.391727,2,speech-librivox-0040.wav,48
199196,2980.912842,5.436276,0.077380,0.734694,0.005063,0.007565,5826.211735,1551.761995,2176.331542,1678.362399,2.600672e+06,2.426361e+06,1.060393,22.670100,-0.448411,0.360932,2,speech-librivox-0050.wav,345
192177,0.592558,5.435210,0.067692,0.000000,0.001391,0.000146,7340.880102,56.660692,4101.085445,135.734866,5.145740e+06,3.310882e+05,0.099361,8.487369,-0.284118,0.367230,2,speech-librivox-0035.wav,549
181238,0.528559,5.433505,0.040815,0.000000,0.001399,0.000149,7385.204082,57.652497,4027.536113,147.132823,5.229439e+06,3.099772e+05,0.619976,8.523189,-0.281328,0.359952,2,speech-librivox-0014.wav,716


In [15]:
# Remove rows whose RMS is below 103.6 (the -50 dB threshold used in the cells above)
# from every class, then recount the rows per class.
rms_filtered_df = voice_noise_df[voice_noise_df['RMS']>=103.6] #drop silent
rms_filtered_df.groupby('type').count()

Unnamed: 0_level_0,RMS,SE,ZCR,LEFR,SF,SF_std,SRF,SRF_std,SC,SC_std,BW,BW_std,NWPD,NWPD_std,RSE,RSE_std,name,number
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,151335,151335,151335,151335,151335,151335,151335,151335,151335,151335,151335,151335,151335,151335,151335,151335,151335,151335
1,20668,20668,20668,20668,20668,20668,20668,20668,20668,20668,20668,20668,20668,20668,20668,20668,20668,20668
2,213179,213179,213179,213179,213179,213179,213179,213179,213179,213179,213179,213179,213179,213179,213179,213179,213179,213179


In [16]:
# Summary statistics after the silent rows were removed.
rms_filtered_df.describe()

Unnamed: 0,RMS,SE,ZCR,LEFR,SF,SF_std,SRF,SRF_std,SC,SC_std,BW,BW_std,NWPD,NWPD_std,RSE,RSE_std,type,number
count,385182.0,385182.0,385182.0,385182.0,385182.0,385182.0,385182.0,385182.0,385182.0,385182.0,385182.0,385182.0,385182.0,385182.0,385182.0,385182.0,385182.0,385182.0
mean,3943.194152,4.420365,0.106465,0.297434,0.005648364,0.003962381,4444.463623,864.956189,756.077813,487.375612,677141.4,625964.1,0.268533,22.91904,-0.357994,0.363629,1.160558,216.347695
std,2646.199281,0.42703,0.061242,0.239609,0.002321985,0.002482663,928.265558,323.093003,492.209877,461.316331,648329.0,579760.3,2.609568,6.309511,0.076933,0.019163,0.959461,173.941605
min,103.617767,1.610912,0.0,0.0,6.686153e-11,1.857076e-11,1718.75,0.0,55.003685,0.005596,6713.368,12.29212,-60.103344,0.013836,-0.792863,0.293633,0.0,0.0
25%,2009.436188,4.156901,0.065692,0.040816,0.004177053,0.002403382,3640.625,647.92358,442.244055,155.715844,254052.0,194271.1,-1.174951,18.776212,-0.410406,0.352026,0.0,74.0
50%,3277.464844,4.439663,0.094881,0.295918,0.005423886,0.003544314,4276.147959,853.939614,643.269513,321.760984,482584.1,439852.9,0.315234,22.151707,-0.355004,0.360327,2.0,171.0
75%,5328.787964,4.70939,0.133758,0.479592,0.006720004,0.004888095,5113.520408,1067.435827,938.935166,669.319221,883354.8,884639.7,1.784112,26.019376,-0.295984,0.370741,2.0,329.0
max,27775.876953,5.487236,0.865992,0.989796,0.04089047,0.05316446,7559.94898,2366.830854,7029.272341,3001.529504,8526785.0,4892790.0,52.321439,80.425028,-0.139832,0.583676,2.0,1041.0


In [17]:
# Keep only noise (1) and speech (2). This relies on music being encoded as 0,
# so the comparison breaks if the label encoding changes.
music_dropped_df = rms_filtered_df[rms_filtered_df['type']>=1] #drop music

In [18]:
# Model inputs: every feature column except RMS and SE; the label and the
# name/number bookkeeping columns are removed as well.
X=music_dropped_df.drop(['RMS','SE','type','name','number'], axis=1)
# Target: encoded class, 1 = noise or 2 = speech.
y=music_dropped_df['type']
# Hold out 10% of the rows for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): the split is per row, so rows that share the same 'name' (same
# recording) can end up in both train and test -- confirm this is acceptable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=666)

In [19]:
# Fit a depth-3 decision tree on the training split.
# random_state is fixed (same seed as the train/test split) because scikit-learn
# permutes the candidate features at every split; without it, repeated runs can
# produce different trees and a different pickled model.
classifier = tree.DecisionTreeClassifier(max_depth = 3, random_state=666)
classifier = classifier.fit(X_train,y_train)
# Persist the fitted model next to the training features.
with open("src/data/noise-train/model_tree_1s.pickle", "wb") as file:
    pickle.dump(classifier,file)


In [20]:
# Predict the held-out rows and print plain accuracy (fraction of exact matches).
# NOTE(review): after filtering there are 213179 speech rows against 20668 noise rows,
# so accuracy alone is dominated by the speech class; consider per-class metrics.
prediction = classifier.predict(X_test)
print(np.mean(np.equal(prediction,y_test).astype(np.float32)))

0.9645071625709534


## For testing new audio

In [36]:
# NOTE(review): dirname and abspath are not used in the visible cells; only join is.
from os.path import dirname,abspath,join
# Folder with the recordings to classify, resolved against the current working
# directory, so the notebook has to be started from the project root.
TEST_AUDIO_FOLDER = join(os.getcwd(),'src','data','testwav','0712')
TEST_AUDIO_FOLDER

'C:\\Users\\tianr\\Programming\\Python\\Project Speaker Recog\\speaker_recognition_GMM_UBM\\src\\data\\testwav\\0712'

In [34]:
def create_dataset(DATA_FOLDER,WINDOW_LENGTH = 1,FRAME_LENGTH = 25):
    """Build a per-window feature table from the WAV files under ``DATA_FOLDER``.

    The folder is walked recursively. A file is processed when its name ends in
    ``.wav`` (case-insensitive) and contains one of "noise", "music", "speech"
    or "audio". Each recording is cut into consecutive, non-overlapping windows
    and one row of features is produced per complete window; a trailing partial
    window is discarded.

    Parameters
    ----------
    DATA_FOLDER : str
        Root directory to search for recordings.
    WINDOW_LENGTH : int or float, default 1
        Window length in seconds (it is multiplied by the file's sampling rate).
    FRAME_LENGTH : int, default 25
        Passed through to ``extract_features`` unchanged
        (presumably a frame length in milliseconds -- TODO confirm).

    Returns
    -------
    pandas.DataFrame
        One row per window with the 16 feature columns followed by ``type``
        (file name up to the first "-"), ``name`` (file name) and ``number``
        (zero-based window index within the file). Empty, but with all columns
        present, when no matching file is found.

    Raises
    ------
    ValueError
        If the window would contain no samples for a file's sampling rate.
    """
    feature_name = "RMS,SE,ZCR,LEFR,SF,SF_std,SRF,SRF_std,SC,SC_std,BW,BW_std,NWPD,NWPD_std,RSE,RSE_std,type,name,number".split(",")
    features_dict = {feature:[] for feature in feature_name}
    keywords = ("noise", "music", "speech", "audio")

    for root, dirs, files in os.walk(DATA_FOLDER):
        for audio in files:
            # Skip anything that is not a WAV recording (e.g. exported JSON or
            # notes living in the same folder) so wav.read is never handed it.
            if not audio.lower().endswith(".wav"):
                continue
            if not any(keyword in audio for keyword in keywords):
                continue

            print("****************************")
            print("reading:", audio)
            # os.path.join is used so the function does not depend on a
            # separately imported `join` name.
            sampling_rate, sig = wav.read(os.path.join(root, audio))
            print("sampling rate:", sampling_rate, "signal length", len(sig))

            # Slice bounds must be integers even for a fractional WINDOW_LENGTH.
            window_size = int(sampling_rate * WINDOW_LENGTH)
            if window_size <= 0:
                # A zero-sized window would never advance through the signal.
                raise ValueError(
                    "WINDOW_LENGTH={!r} yields an empty window at {} Hz".format(
                        WINDOW_LENGTH, sampling_rate))

            # The "+ 1" keeps the final window when the signal length is an
            # exact multiple of the window size.
            window_starts = range(0, len(sig) - window_size + 1, window_size)
            for number, index in enumerate(window_starts):
                sample = sig[index:index + window_size]
                ef = extract_features(sample, FRAME_LENGTH, sampling_rate)
                rms, se, zcr, lefr, sf, srf, sc, bd, nwpd, rse = ef.return_()

                # Features that are stored as returned.
                features_dict["RMS"].append(rms)
                features_dict["SE"].append(se)
                features_dict["ZCR"].append(zcr)
                features_dict["LEFR"].append(lefr)

                # Features that are summarised by their mean and standard deviation.
                summarised = {"SF": sf, "SRF": srf, "SC": sc, "BW": bd, "NWPD": nwpd, "RSE": rse}
                for key, values in summarised.items():
                    features_dict[key].append(np.mean(values))
                    features_dict[key + "_std"].append(np.std(values))

                # Bookkeeping columns.
                features_dict["type"].append(audio.split("-")[0])
                features_dict["name"].append(audio)
                features_dict["number"].append(number)

    features_df=pd.DataFrame.from_dict(features_dict)
    # Enforce the documented column order.
    features_df = features_df[feature_name]
    return features_df

In [43]:
# Extract features for every recording in the test folder.
# NOTE(review): the recorded output of this cell ends in an IndexError, and the
# execution counts are out of order, so the test_df shown in the following cells
# appears to come from an earlier successful run -- re-run top to bottom to confirm.
test_df = create_dataset(TEST_AUDIO_FOLDER)

****************************
reading: noise-12-07-2018-14-28-33.wav
sampling rate: 16000 signal length 80640
****************************
reading: noise-12-07-2018-14-30-07.wav
sampling rate: 16000 signal length 80640


  fft_frame_norm = fft_frame[1:][frames_freq > 0] / (np.sum(abs(fft_frame[1:][frames_freq > 0])))
  lower_spectral_indices = np.argwhere(spectral_cum_sum < roll_of_percentage)


IndexError: index -1 is out of bounds for axis 0 with size 0

In [38]:
# Display the extracted features. The 'type' column holds the file-name prefix
# before the first "-" (e.g. "audiotest08"), not a ground-truth class label.
test_df

Unnamed: 0,RMS,SE,ZCR,LEFR,SF,SF_std,SRF,SRF_std,SC,SC_std,BW,BW_std,NWPD,NWPD_std,RSE,RSE_std,type,name,number
0,293.991791,3.97302,0.099944,0.061224,0.006149,0.003092,5278.69898,1011.903735,672.880786,1207.606688,1222018.0,2903692.0,1.669986,26.309206,-0.324989,0.365549,audiotest08,audiotest08-06-2018-13-12-39.wav,0
1,449.33725,4.313853,0.112195,0.102041,0.005929,0.004058,5226.721939,1169.764777,869.97895,1183.923162,1822642.0,3454109.0,-0.285055,32.10823,-0.422678,0.407742,audiotest08,audiotest08-06-2018-13-12-39.wav,1
2,361.286774,3.420102,0.105819,0.020408,0.006043,0.00358,4904.97449,1254.709864,820.224289,1325.073692,1465716.0,2986901.0,-0.22728,28.435747,-0.375775,0.362199,audiotest08,audiotest08-06-2018-13-12-39.wav,2
3,216.102097,4.086264,0.026564,0.0,0.007073,0.002368,4008.928571,740.223805,214.557554,20.714408,74800.95,37128.49,-1.474259,33.534133,-0.241685,0.368233,audiotest08,audiotest08-06-2018-13-12-39.wav,3
4,398.503754,3.910998,0.084193,0.020408,0.006324,0.003283,4866.390306,1248.426789,555.239605,502.014865,1063088.0,2068430.0,3.395518,23.665625,-0.370642,0.352038,audiotest08,audiotest08-06-2018-13-12-39.wav,4
5,90.472511,4.837205,0.08413,0.77551,0.005437,0.00513,6248.086735,780.264893,606.927651,597.262087,1493769.0,1333869.0,1.416747,20.433988,-0.346418,0.354285,audiotest08,audiotest08-06-2018-16-49-40.wav,0
6,143.601425,4.923841,0.132321,0.418367,0.007005,0.006098,5128.507653,1419.705206,1128.768649,1906.31846,830231.4,1486546.0,2.428192,25.528278,-0.418496,0.423359,audiotest08,audiotest08-06-2018-16-49-40.wav,1
7,238.644638,4.596469,0.07913,0.408163,0.005996,0.004643,4700.57398,1351.8969,656.794525,883.418627,974726.4,2392047.0,2.108834,24.773406,-0.418293,0.375955,audiotest08,audiotest08-06-2018-16-49-40.wav,2
8,191.138077,3.821882,0.090818,0.540816,0.006262,0.005952,5116.709184,1251.948028,804.254097,1304.926184,1123511.0,2331296.0,2.831823,27.854644,-0.441153,0.357276,audiotest08,audiotest08-06-2018-16-49-40.wav,3
9,220.147583,4.153591,0.082255,0.510204,0.006102,0.006304,5037.627551,1375.85461,764.820155,1487.80589,722901.3,1539863.0,6.532763,27.96534,-0.425382,0.349427,audiotest08,audiotest08-06-2018-16-49-40.wav,4


In [46]:
# Flag windows below the same RMS threshold (103.6) that was used to drop silent rows from the training data.
test_df['is_silent'] = test_df['RMS']<103.6

In [49]:
# Keep only the columns the classifier was trained on. errors='ignore' lets this
# cell run on a fresh kernel, when 'is_speech' has not been added yet, and still
# strips it on a re-run; without it the first run raises KeyError.
test_df_features=test_df.drop(['RMS','SE','type','name','number','is_silent','is_speech'], axis=1, errors='ignore')
test_predictions = classifier.predict(test_df_features)
# Class 2 is speech in the label encoding used for training.
test_df['is_speech']=test_predictions==2


In [60]:
# Speech/non-speech verdict for every window that is not flagged as silent.
test_df.loc[~test_df['is_silent'], ['name','number','is_speech']]

Unnamed: 0,name,number,is_speech
0,audiotest08-06-2018-13-12-39.wav,0,False
1,audiotest08-06-2018-13-12-39.wav,1,True
2,audiotest08-06-2018-13-12-39.wav,2,False
3,audiotest08-06-2018-13-12-39.wav,3,False
4,audiotest08-06-2018-13-12-39.wav,4,False
6,audiotest08-06-2018-16-49-40.wav,1,True
7,audiotest08-06-2018-16-49-40.wav,2,True
8,audiotest08-06-2018-16-49-40.wav,3,True
9,audiotest08-06-2018-16-49-40.wav,4,True
11,audiotest08-06-2018-17-01-12.wav,1,True


In [58]:
from sklearn.tree import _tree
import json

# Destination of the exported tree; it is written into the test-audio folder.
JSON_FILE_NAME=join(TEST_AUDIO_FOLDER,'tree_model.json')
# Names of the model inputs, in the column order of the training matrix X
# (all feature columns except RMS and SE).
# NOTE(review): this list is hard-coded and must stay in the same order as X's
# columns, otherwise the exported rules name the wrong features.
feature_names = "ZCR,LEFR,SF,SF_std,SRF,SRF_std,SC,SC_std,BW,BW_std,NWPD,NWPD_std,RSE,RSE_std".split(",")

# Low-level structure of the fitted tree; recurse() reads its per-node arrays.
tree_ = classifier.tree_


# Feature name for each node, indexed by node id. Nodes whose feature is
# TREE_UNDEFINED (handled as leaves by recurse()) get a placeholder.
feature_name = [
    feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
    for i in tree_.feature
]
# Header line of the pseudo-Python rendering that recurse() prints.
print("def tree({}):".format(", ".join(feature_names)))

# Root of the nested dict that recurse() fills in and that is later dumped as JSON.
json_file = {}

def recurse(node, depth, json_file):
    """Print the subtree rooted at ``node`` as if/else pseudo-code and mirror it into ``json_file``.

    Reads the module-level ``tree_`` (fitted tree structure), ``feature_name``
    (per-node feature names) and ``_tree.TREE_UNDEFINED``.

    Parameters
    ----------
    node : int
        Index of the node to render.
    depth : int
        Nesting depth, used only for the indentation of the printed text.
    json_file : dict
        Dictionary filled in place. A split node gets "feature", "threshold",
        "left" and "right" (nested dicts of the same shape); a leaf gets
        "decision".

    Returns
    -------
    dict
        The same ``json_file`` object, for split nodes and leaves alike.

    Notes
    -----
    "decision" is True when the largest entry of the leaf's value row is at
    index 1, i.e. the second class known to the classifier (presumably speech,
    label 2, since training used labels 1 and 2 -- verify against
    ``classifier.classes_``).
    """
    indent = "  " * depth
    if tree_.feature[node] != _tree.TREE_UNDEFINED:
        name = feature_name[node]
        # Plain Python float keeps the exported structure free of NumPy scalars.
        threshold = float(tree_.threshold[node])
        json_file["feature"] = name
        json_file["threshold"] = threshold
        print("{}if {} <= {}:".format(indent, name, threshold))
        # setdefault reuses an existing child dict and creates one otherwise,
        # without a bare except around the key lookup.
        recurse(tree_.children_left[node], depth + 1, json_file.setdefault("left", {}))
        print("{}else:  # if {} > {}".format(indent, name, threshold))
        recurse(tree_.children_right[node], depth + 1, json_file.setdefault("right", {}))
    else:
        print("{}return {}".format(indent, tree_.value[node]))
        # The comparison yields numpy.bool_, which the json module refuses to
        # serialise; convert it to a built-in bool.
        json_file["decision"] = bool(np.argmax(tree_.value[node]) == 1)
    return json_file

# Render the whole tree starting at node 0; depth 1 indents the printed body
# under the "def tree(...)" header.
recurse(0, 1, json_file)
# Show the nested structure, then write it to JSON_FILE_NAME.
print(json.dumps(json_file, sort_keys=True, indent=4))
with open(JSON_FILE_NAME, "w") as file:
    json.dump(json_file, file)


def tree(ZCR, LEFR, SF, SF_std, SRF, SRF_std, SC, SC_std, BW, BW_std, NWPD, NWPD_std, RSE, RSE_std):
  if LEFR <= 0.07653061300516129:
    if SRF <= 3910.873779296875:
      if BW_std <= 81729.5625:
        return [[1412.  488.]]
      else:  # if BW_std > 81729.5625
        return [[ 334. 1468.]]
    else:  # if SRF > 3910.873779296875
      if NWPD_std <= 30.761507034301758:
        return [[9489.  579.]]
      else:  # if NWPD_std > 30.761507034301758
        return [[1108.  636.]]
  else:  # if LEFR > 0.07653061300516129
    if RSE <= -0.2980222702026367:
      if SRF <= 6230.0703125:
        return [[  2820. 184134.]]
      else:  # if SRF > 6230.0703125
        return [[1359. 2221.]]
    else:  # if RSE > -0.2980222702026367
      if SRF <= 4419.32421875:
        return [[ 383. 1421.]]
      else:  # if SRF > 4419.32421875
        return [[1727.  883.]]
{
    "decision": null,
    "feature": "LEFR",
    "left": {
        "decision": null,
        "feature": "SRF",
        "left":