In [1]:
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import random
import copy
import os
import graphviz
import pickle
import scipy.io.wavfile as wav
from src.voice_activity_detection.extract_features import extract_features

In [2]:
# Load the pre-computed feature table from disk and print its schema.
# pickle executes arbitrary code on load, so this file must come from a trusted source.
features_path = "src/data/noise-train/features_df_1s.pickle"
with open(features_path, "rb") as file:
    voice_noise_df = pickle.load(file)
voice_noise_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392416 entries, 0 to 392415
Data columns (total 19 columns):
RMS         392416 non-null float64
SE          392416 non-null float64
ZCR         392416 non-null float64
LEFR        392416 non-null float64
SF          392416 non-null float64
SF_std      392416 non-null float64
SRF         392416 non-null float64
SRF_std     392416 non-null float64
SC          392416 non-null float64
SC_std      392416 non-null float64
BW          392416 non-null float64
BW_std      392416 non-null float64
NWPD        392416 non-null float64
NWPD_std    392416 non-null float64
RSE         391352 non-null float64
RSE_std     391339 non-null float64
type        392416 non-null object
name        392416 non-null object
number      392416 non-null int64
dtypes: float64(16), int64(1), object(2)
memory usage: 56.9+ MB


In [3]:
# RSE and RSE_std are the only columns reported with missing values above;
# keep just the rows where both are present, then re-check the schema.
has_rse_features = voice_noise_df['RSE'].notna() & voice_noise_df['RSE_std'].notna()
voice_noise_df = voice_noise_df[has_rse_features]
voice_noise_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 391339 entries, 0 to 392415
Data columns (total 19 columns):
RMS         391339 non-null float64
SE          391339 non-null float64
ZCR         391339 non-null float64
LEFR        391339 non-null float64
SF          391339 non-null float64
SF_std      391339 non-null float64
SRF         391339 non-null float64
SRF_std     391339 non-null float64
SC          391339 non-null float64
SC_std      391339 non-null float64
BW          391339 non-null float64
BW_std      391339 non-null float64
NWPD        391339 non-null float64
NWPD_std    391339 non-null float64
RSE         391339 non-null float64
RSE_std     391339 non-null float64
type        391339 non-null object
name        391339 non-null object
number      391339 non-null int64
dtypes: float64(16), int64(1), object(2)
memory usage: 59.7+ MB


In [4]:
# Summary statistics of the numeric columns after the rows with missing RSE / RSE_std were removed.
voice_noise_df.describe()

Unnamed: 0,RMS,SE,ZCR,LEFR,SF,SF_std,SRF,SRF_std,SC,SC_std,BW,BW_std,NWPD,NWPD_std,RSE,RSE_std,number
count,391339.0,391339.0,391339.0,391339.0,391339.0,391339.0,391339.0,391339.0,391339.0,391339.0,391339.0,391339.0,391339.0,391339.0,391339.0,391339.0,391339.0
mean,3881.72355,4.4261,0.106271,0.295513,0.005627191,0.003939445,4468.916636,857.176179,766.989241,485.522178,700650.5,626379.4,0.268833,22.843964,-0.356949,0.363653,216.597027
std,2669.945559,0.43036,0.061618,0.240645,0.002347471,0.002494668,952.072181,328.532905,522.803081,460.40549,711035.3,580621.5,2.624867,6.38437,0.076928,0.019068,174.3911
min,0.490089,1.610912,0.0,0.0,6.686153e-11,1.857076e-11,1718.75,0.0,55.003685,0.005596,6713.368,12.29212,-60.607956,0.013836,-0.792863,0.293633,0.0
25%,1947.66333,4.160194,0.065317,0.040816,0.004143628,0.002372378,3645.408163,638.588104,442.474243,155.143118,255975.7,194709.9,-1.168532,18.690534,-0.409401,0.352149,74.0
50%,3232.506348,4.443933,0.094693,0.295918,0.005406255,0.003525452,4294.642857,848.883006,645.299844,319.852183,487534.9,439043.2,0.315418,22.102554,-0.353456,0.36045,171.0
75%,5282.64624,4.715955,0.133633,0.479592,0.00671055,0.00487705,5146.364796,1063.872622,945.241476,666.003322,897843.8,884848.6,1.780692,25.995758,-0.294575,0.37064,329.0
max,27775.876953,5.487236,0.865992,1.0,0.04780443,0.05316446,7559.94898,2366.830854,7029.272341,3001.529504,8526785.0,4892790.0,52.321439,80.425028,-0.139832,0.583676,1046.0


In [5]:
# Encode the string class labels in 'type' as integer codes. The fitted encoder is
# kept in `le` so the code -> label mapping (le.classes_) stays available.
le = LabelEncoder()
# voice_noise_df is the result of boolean-mask filtering, so writing a column into it
# with `df['type'] = ...` can raise SettingWithCopyWarning. assign() builds a new frame
# (the replaced column keeps its position) and the name is rebound to it instead.
voice_noise_df = voice_noise_df.assign(type=le.fit_transform(voice_noise_df["type"]))
# Show the classes in code order: the position in this list is the integer code.
list(le.classes_)


['music', 'noise', 'speech']

In [6]:
# Repeat the summary now that 'type' holds integer codes, so it is included among the described columns.
voice_noise_df.describe()

Unnamed: 0,RMS,SE,ZCR,LEFR,SF,SF_std,SRF,SRF_std,SC,SC_std,BW,BW_std,NWPD,NWPD_std,RSE,RSE_std,type,number
count,391339.0,391339.0,391339.0,391339.0,391339.0,391339.0,391339.0,391339.0,391339.0,391339.0,391339.0,391339.0,391339.0,391339.0,391339.0,391339.0,391339.0,391339.0
mean,3881.72355,4.4261,0.106271,0.295513,0.005627191,0.003939445,4468.916636,857.176179,766.989241,485.522178,700650.5,626379.4,0.268833,22.843964,-0.356949,0.363653,1.163268,216.597027
std,2669.945559,0.43036,0.061618,0.240645,0.002347471,0.002494668,952.072181,328.532905,522.803081,460.40549,711035.3,580621.5,2.624867,6.38437,0.076928,0.019068,0.957943,174.3911
min,0.490089,1.610912,0.0,0.0,6.686153e-11,1.857076e-11,1718.75,0.0,55.003685,0.005596,6713.368,12.29212,-60.607956,0.013836,-0.792863,0.293633,0.0,0.0
25%,1947.66333,4.160194,0.065317,0.040816,0.004143628,0.002372378,3645.408163,638.588104,442.474243,155.143118,255975.7,194709.9,-1.168532,18.690534,-0.409401,0.352149,0.0,74.0
50%,3232.506348,4.443933,0.094693,0.295918,0.005406255,0.003525452,4294.642857,848.883006,645.299844,319.852183,487534.9,439043.2,0.315418,22.102554,-0.353456,0.36045,2.0,171.0
75%,5282.64624,4.715955,0.133633,0.479592,0.00671055,0.00487705,5146.364796,1063.872622,945.241476,666.003322,897843.8,884848.6,1.780692,25.995758,-0.294575,0.37064,2.0,329.0
max,27775.876953,5.487236,0.865992,1.0,0.04780443,0.05316446,7559.94898,2366.830854,7029.272341,3001.529504,8526785.0,4892790.0,52.321439,80.425028,-0.139832,0.583676,2.0,1046.0


In [7]:
# Non-null row count per encoded class, for every remaining column
# (codes follow the encoder's class list: 0 = music, 1 = noise, 2 = speech).
voice_noise_df.groupby('type').count()

Unnamed: 0_level_0,RMS,SE,ZCR,LEFR,SF,SF_std,SRF,SRF_std,SC,SC_std,BW,BW_std,NWPD,NWPD_std,RSE,RSE_std,name,number
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,152826,152826,152826,152826,152826,152826,152826,152826,152826,152826,152826,152826,152826,152826,152826,152826,152826,152826
1,21794,21794,21794,21794,21794,21794,21794,21794,21794,21794,21794,21794,21794,21794,21794,21794,21794,21794
2,216719,216719,216719,216719,216719,216719,216719,216719,216719,216719,216719,216719,216719,216719,216719,216719,216719,216719


In [8]:
# Summary statistics restricted to class 0 ('music' in the encoder's class list).
voice_noise_df[voice_noise_df["type"].eq(0)].describe()

Unnamed: 0,RMS,SE,ZCR,LEFR,SF,SF_std,SRF,SRF_std,SC,SC_std,BW,BW_std,NWPD,NWPD_std,RSE,RSE_std,type,number
count,152826.0,152826.0,152826.0,152826.0,152826.0,152826.0,152826.0,152826.0,152826.0,152826.0,152826.0,152826.0,152826.0,152826.0,152826.0,152826.0,152826.0,152826.0
mean,5260.765401,4.424306,0.094214,0.105165,0.004948,0.003457888,4709.087179,765.977004,613.917737,265.020561,693406.8,520160.0,0.128564,23.893921,-0.299098,0.360282,0.0,144.843561
std,2985.758783,0.421915,0.052359,0.150751,0.002595,0.002697407,996.254522,292.960678,402.856023,268.817637,725850.0,581211.2,2.638701,6.473233,0.049296,0.013886,0.0,123.19891
min,0.49314,1.610912,0.0,0.0,2e-06,3.00369e-07,2520.408163,25.944956,56.69804,0.219177,32927.52,872.6883,-33.636531,0.574031,-0.650863,0.309139,0.0,0.0
25%,3051.080017,4.175859,0.058066,0.0,0.003234,0.001678861,3905.931122,539.965701,370.92745,90.489629,169443.2,86721.53,-1.356098,19.41752,-0.327677,0.352253,0.0,58.0
50%,4845.141357,4.456599,0.083755,0.030612,0.004352,0.002657558,4700.57398,763.421494,528.052621,181.27167,450792.8,296632.2,0.206791,23.132141,-0.292903,0.358419,0.0,120.0
75%,7288.090332,4.710782,0.121445,0.163265,0.005911,0.004351539,5499.681122,978.046886,751.472281,339.713755,973924.1,737761.2,1.706189,27.452872,-0.263077,0.365586,0.0,195.0
max,20044.730469,5.469131,0.847928,0.969388,0.04089,0.05254569,7549.42602,2366.830854,6859.332631,2775.633655,7946042.0,4188144.0,24.933125,75.767237,-0.140178,0.583676,0.0,986.0


In [9]:
# Summary statistics restricted to class 1 ('noise' in the encoder's class list).
voice_noise_df[voice_noise_df["type"].eq(1)].describe()

Unnamed: 0,RMS,SE,ZCR,LEFR,SF,SF_std,SRF,SRF_std,SC,SC_std,BW,BW_std,NWPD,NWPD_std,RSE,RSE_std,type,number
count,21794.0,21794.0,21794.0,21794.0,21794.0,21794.0,21794.0,21794.0,21794.0,21794.0,21794.0,21794.0,21794.0,21794.0,21794.0,21794.0,21794.0,21794.0
mean,3262.921516,4.646693,0.145969,0.148091,0.005898923,0.003618198,5329.322978,473.939689,1118.559027,314.108277,1247476.0,506371.2,0.423374,20.522269,-0.295885,0.366325,1.0,45.45976
std,3443.133244,0.459104,0.13418,0.247794,0.004306943,0.004024925,1095.296361,305.141461,1145.555542,363.861089,1402186.0,577778.2,4.140878,9.961264,0.047644,0.014017,0.0,66.960912
min,0.490089,1.666465,0.0,0.0,6.686153e-11,1.857076e-11,1718.75,0.0,55.003685,0.005596,6713.368,12.29212,-60.607956,0.013836,-0.792863,0.301916,1.0,0.0
25%,859.693207,4.371085,0.052878,0.0,0.002953919,0.001064399,4462.691327,238.733886,331.944172,81.52241,262356.2,95193.35,-0.760495,13.328905,-0.309761,0.360438,1.0,7.0
50%,2083.733765,4.66483,0.102194,0.0,0.004723632,0.002316589,5305.803571,417.027593,677.408318,164.81159,631123.5,275719.5,0.503186,18.117083,-0.28743,0.363976,1.0,21.0
75%,4792.595825,4.98178,0.199622,0.204082,0.007465715,0.00467359,6279.655612,647.904884,1521.478683,406.197584,1730434.0,705146.1,1.880707,25.507686,-0.277494,0.36833,1.0,53.0
max,27775.876953,5.487236,0.865992,1.0,0.04780443,0.05316446,7559.94898,2162.255981,7029.272341,2609.393025,8526785.0,4517787.0,52.321439,78.42031,-0.139832,0.558185,1.0,493.0


In [10]:
# Summary statistics restricted to class 2 ('speech' in the encoder's class list).
voice_noise_df[voice_noise_df["type"].eq(2)].describe()

Unnamed: 0,RMS,SE,ZCR,LEFR,SF,SF_std,SRF,SRF_std,SC,SC_std,BW,BW_std,NWPD,NWPD_std,RSE,RSE_std,type,number
count,216719.0,216719.0,216719.0,216719.0,216719.0,216719.0,216719.0,216719.0,216719.0,216719.0,216719.0,216719.0,216719.0,216719.0,216719.0,216719.0,216719.0,216719.0
mean,2971.479047,4.405181,0.110781,0.444569,0.006079,0.004311,4213.027677,960.02762,839.577218,658.253571,650768.0,713351.7,0.352207,22.337032,-0.403885,0.365761,2.0,284.406314
std,1784.847066,0.427063,0.053206,0.180046,0.001674,0.002041,797.280659,306.271002,461.433898,501.41845,557760.5,565688.8,2.404992,5.700792,0.061694,0.022083,0.0,180.52333
min,0.493964,2.038474,0.0,0.0,0.000414,7.7e-05,2725.765306,30.929479,64.236217,4.109941,38214.3,7739.655,-23.636608,3.34833,-0.70746,0.293633,2.0,0.0
25%,1718.92627,4.133914,0.072255,0.326531,0.004994,0.003079,3578.125,749.299586,520.773712,263.506682,302950.3,305522.2,-1.085957,18.644982,-0.442553,0.351086,2.0,129.0
50%,2640.538574,4.415585,0.101694,0.438776,0.005897,0.003966,3953.125,925.345787,742.87275,508.775029,494925.3,521104.3,0.371128,21.681376,-0.398538,0.361839,2.0,270.0
75%,3820.906616,4.692225,0.138759,0.561224,0.006947,0.005058,4715.561224,1150.897622,1041.975587,919.072806,824025.7,992937.5,1.819917,25.090125,-0.361237,0.37589,2.0,429.0
max,13021.230469,5.468673,0.640165,1.0,0.031259,0.040925,7528.061224,2344.886859,5544.019437,3001.529504,7502836.0,4892790.0,25.507908,80.425028,-0.151569,0.522019,2.0,1046.0


In [11]:
# Look at the full record of the class-2 (speech) row with the smallest RMS value.
slice_df = voice_noise_df[voice_noise_df["type"].eq(2)]
quietest_label = slice_df['RMS'].idxmin()  # index label of the minimum-RMS row
slice_df.loc[quietest_label]

RMS                         0.493964
SE                           5.40496
ZCR                        0.0293143
LEFR                               0
SF                        0.00139793
SF_std                   0.000165807
SRF                          7367.03
SRF_std                       73.669
SC                           3977.55
SC_std                       137.286
BW                       5.20161e+06
BW_std                        355116
NWPD                       -0.512627
NWPD_std                     8.83862
RSE                        -0.283284
RSE_std                     0.362347
type                               2
name        speech-librivox-0150.wav
number                           788
Name: 236702, dtype: object

In [12]:
# Drop silent rows: keep only those whose RMS is at or above the cutoff.
# NOTE(review): how the 103.6 cutoff was chosen is not shown in this notebook — confirm.
SILENCE_RMS_CUTOFF = 103.6
is_audible = voice_noise_df['RMS'].ge(SILENCE_RMS_CUTOFF)
rms_filtered_df = voice_noise_df[is_audible]

In [13]:
# Drop music: code 0 is 'music' in the encoder's class list, so keeping codes >= 1
# leaves only the noise (1) and speech (2) rows.
is_noise_or_speech = rms_filtered_df['type'].ge(1)
music_dropped_df = rms_filtered_df[is_noise_or_speech]

In [14]:
# Feature matrix: everything except the two excluded features (RMS, SE), the label
# itself, and the per-row bookkeeping columns (name, number).
non_feature_columns = ['RMS', 'SE', 'type', 'name', 'number']
X = music_dropped_df.drop(columns=non_feature_columns)
y = music_dropped_df['type']
# Hold out 10% of the rows for evaluation; the fixed seed keeps the split repeatable.
# NOTE(review): rows are split individually, so rows sharing the same 'name' can end up
# on both sides of the split — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=666
)

In [28]:
# Fit a depth-limited decision tree on the training split and persist it to disk.
# class_weight="balanced" reweights the classes inversely to their frequency.
# random_state is fixed (same seed as the train/test split) so that repeated runs
# build the same tree; without it the fitted model is not guaranteed to be repeatable.
classifier = tree.DecisionTreeClassifier(
    max_depth=5,
    class_weight="balanced",
    random_state=666,
)
classifier.fit(X_train, y_train)  # fit() trains in place; no rebind needed
with open("src/data/noise-train/model_tree_1s_bal_5.pickle", "wb") as file:
    pickle.dump(classifier, file)


In [29]:
# Score the held-out rows: accuracy is the fraction of predictions equal to the true label.
prediction = classifier.predict(X_test)
is_correct = np.equal(prediction, y_test)
accuracy = np.mean(is_correct.astype(np.float32))
print(accuracy)

0.9486423134803772


## For testing new audio