In [None]:
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import random
import copy
import os
import graphviz
import scipy.io.wavfile as wav
from src.voice_activity_detection.extract_features import extract_features

In [2]:
# Load the pre-computed feature table. The .npy file is expected to wrap a
# pickled Python dict (hence the trailing .item()); NumPy >= 1.16.3 refuses
# to read pickled objects unless allow_pickle=True is passed explicitly.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
voice_noise_data = np.load("src/data/noise-train/features.npy", allow_pickle=True).item()
voice_noise_df = pd.DataFrame.from_dict(voice_noise_data)
voice_noise_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77664 entries, 0 to 77663
Data columns (total 9 columns):
RMS                  77664 non-null int64
ZCR                  77664 non-null float64
audio                77664 non-null object
bandwidth            77664 non-null float64
nwpd                 77664 non-null float64
rse                  76846 non-null float64
spectral_centroid    77664 non-null float64
spectral_flux        77664 non-null float64
spectral_rolloff     77664 non-null float64
dtypes: float64(7), int64(1), object(1)
memory usage: 5.3+ MB


In [3]:
# Keep only the rows whose 'rse' value is present (non-null).
rse_present = voice_noise_df['rse'].notnull()
voice_noise_df = voice_noise_df[rse_present]
voice_noise_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 76846 entries, 1 to 77663
Data columns (total 9 columns):
RMS                  76846 non-null int64
ZCR                  76846 non-null float64
audio                76846 non-null object
bandwidth            76846 non-null float64
nwpd                 76846 non-null float64
rse                  76846 non-null float64
spectral_centroid    76846 non-null float64
spectral_flux        76846 non-null float64
spectral_rolloff     76846 non-null float64
dtypes: float64(7), int64(1), object(1)
memory usage: 5.9+ MB


In [4]:
# Summary statistics of the numeric feature columns.
voice_noise_df.describe()

Unnamed: 0,RMS,ZCR,bandwidth,nwpd,rse,spectral_centroid,spectral_flux,spectral_rolloff
count,76846.0,76846.0,76846.0,76846.0,76846.0,76846.0,76846.0,76846.0
mean,167.818963,0.106162,686381.5,0.271261,-inf,757.398939,0.018741,4454.837767
std,108.289759,0.053201,592655.3,1.490372,,433.981223,0.007833,886.254567
min,0.0,5e-05,6881.946,-32.282923,-inf,55.599252,0.00153,1753.576807
25%,60.0,0.070601,296253.0,-0.410469,-0.2992179,477.869557,0.01365,3645.456827
50%,194.0,0.098514,528365.5,0.312929,-0.2534488,671.782718,0.017666,4319.088855
75%,253.0,0.132314,887765.4,1.02084,-0.1863887,947.220822,0.022452,5097.719001
max,495.0,0.860161,7441943.0,51.500733,-0.02729339,7007.05765,0.092539,7535.391566


In [5]:
# Look at the rows whose 'rse' is negative infinity.
rse_is_neg_inf = voice_noise_df['rse'] == -np.inf
voice_noise_df[rse_is_neg_inf].describe()

Unnamed: 0,RMS,ZCR,bandwidth,nwpd,rse,spectral_centroid,spectral_flux,spectral_rolloff
count,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
mean,252.5,0.165486,1056925.0,0.727845,-inf,1388.688601,0.019334,5284.732681
std,138.444935,0.183082,819981.5,1.051611,,1408.936775,0.013689,1094.253513
min,48.0,0.036113,464266.9,0.143988,-inf,499.139786,0.003947,4105.170683
25%,230.25,0.076295,598861.4,0.184715,-inf,646.239637,0.010123,4674.683107
50%,306.0,0.094451,750140.8,0.231948,-inf,782.903417,0.019801,5169.490462
75%,328.25,0.183643,1208205.0,0.775077,-inf,1525.352381,0.029012,5779.540035
max,350.0,0.43693,2263152.0,2.303495,-inf,3489.807785,0.033788,6694.779116


In [6]:
# Drop the rows whose 'rse' is -inf (or null). The replacement is done on a
# separate Series rather than with inplace=True on a column of the filtered
# frame: that chained in-place write raises SettingWithCopyWarning and is not
# guaranteed to reach the frame (it never does under pandas copy-on-write).
rse_without_inf = voice_noise_df['rse'].replace(-np.inf, np.nan)
# Explicit copy: later cells assign to columns of this frame.
voice_noise_df = voice_noise_df[rse_without_inf.notnull()].copy()
voice_noise_df.describe()

Unnamed: 0,RMS,ZCR,bandwidth,nwpd,rse,spectral_centroid,spectral_flux,spectral_rolloff
count,76842.0,76842.0,76842.0,76842.0,76842.0,76842.0,76842.0,76842.0
mean,167.814555,0.106159,686362.2,0.271237,-0.247532,757.366078,0.018741,4454.794567
std,108.287399,0.053188,592642.5,1.490393,0.070612,433.879312,0.007833,886.231032
min,0.0,5e-05,6881.946,-32.282923,-0.583062,55.599252,0.00153,1753.576807
25%,60.0,0.070601,296250.7,-0.410483,-0.29921,477.851408,0.01365,3645.456827
50%,194.0,0.098514,528360.8,0.312983,-0.253445,671.767431,0.017666,4319.088855
75%,253.0,0.132314,887765.4,1.02084,-0.186384,947.220822,0.022452,5097.577811
max,495.0,0.860161,7441943.0,51.500733,-0.027293,7007.05765,0.092539,7535.391566


In [7]:
# Replace the string class labels in 'audio' with integer codes. The list
# shown below gives the class behind each code, in code order.
le = LabelEncoder()
le.fit(voice_noise_df["audio"])
voice_noise_df['audio'] = le.transform(voice_noise_df["audio"])
list(le.classes_)


['music', 'noise', 'speech']

In [8]:
# Summary statistics again; 'audio' now holds integer codes.
voice_noise_df.describe()

Unnamed: 0,RMS,ZCR,audio,bandwidth,nwpd,rse,spectral_centroid,spectral_flux,spectral_rolloff
count,76842.0,76842.0,76842.0,76842.0,76842.0,76842.0,76842.0,76842.0,76842.0
mean,167.814555,0.106159,1.162151,686362.2,0.271237,-0.247532,757.366078,0.018741,4454.794567
std,108.287399,0.053188,0.96044,592642.5,1.490393,0.070612,433.879312,0.007833,886.231032
min,0.0,5e-05,0.0,6881.946,-32.282923,-0.583062,55.599252,0.00153,1753.576807
25%,60.0,0.070601,0.0,296250.7,-0.410483,-0.29921,477.851408,0.01365,3645.456827
50%,194.0,0.098514,2.0,528360.8,0.312983,-0.253445,671.767431,0.017666,4319.088855
75%,253.0,0.132314,2.0,887765.4,1.02084,-0.186384,947.220822,0.022452,5097.577811
max,495.0,0.860161,2.0,7441943.0,51.500733,-0.027293,7007.05765,0.092539,7535.391566


In [11]:
# Non-null count of every feature column per encoded 'audio' class.
voice_noise_df.groupby('audio').count()

Unnamed: 0_level_0,RMS,ZCR,bandwidth,nwpd,rse,spectral_centroid,spectral_flux,spectral_rolloff
audio,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,30221,30221,30221,30221,30221,30221,30221,30221
1,3940,3940,3940,3940,3940,3940,3940,3940
2,42681,42681,42681,42681,42681,42681,42681,42681


In [12]:
# Drop music: keep only the rows whose encoded 'audio' value is 1 or higher.
not_music = voice_noise_df['audio'] >= 1
voice_noise_df = voice_noise_df[not_music]
voice_noise_df.describe()

Unnamed: 0,RMS,ZCR,audio,bandwidth,nwpd,rse,spectral_centroid,spectral_flux,spectral_rolloff
count,46621.0,46621.0,46621.0,46621.0,46621.0,46621.0,46621.0,46621.0,46621.0
mean,229.701272,0.113753,1.915489,688378.2,0.36166,-0.28577,854.86497,0.017563,4294.457032
std,77.392241,0.055584,0.278156,571693.6,1.50249,0.055874,465.986515,0.00623,805.84511
min,0.0,5e-05,1.0,6881.946,-32.282923,-0.583062,55.599252,0.00153,1753.576807
25%,198.0,0.078226,2.0,335276.5,-0.302952,-0.320624,558.248307,0.013542,3589.922189
50%,238.0,0.106839,2.0,540454.0,0.371469,-0.286585,780.355226,0.017097,4101.531124
75%,275.0,0.137952,2.0,845632.1,1.05991,-0.255194,1041.261602,0.021153,4848.330823
max,495.0,0.860161,2.0,7441943.0,51.500733,-0.027293,7007.05765,0.092539,7535.391566


In [13]:
# Separate the target column from the features and hold out 10% for testing.
y = voice_noise_df['audio']
X = voice_noise_df.drop('audio', axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=666,  # fixed seed for the shuffle
)

In [14]:
# Fit a depth-3 decision tree on the training split.
# random_state is fixed because scikit-learn randomly permutes the candidate
# features at each split; without a seed, equally good splits may be chosen
# differently from run to run, which would change the tree exported below.
classifier = tree.DecisionTreeClassifier(max_depth=3, random_state=666)
classifier = classifier.fit(X_train, y_train)

In [15]:
# Accuracy on the held-out split: the mean of the element-wise
# "prediction equals label" flags, cast to float32 first.
prediction = classifier.predict(X_test)
is_correct = np.equal(prediction, y_test)
accuracy = np.mean(is_correct.astype(np.float32))
print(accuracy)

0.9860605001449585


## For testing new audio

In [16]:
from os.path import dirname,abspath,join
# Folder holding the recordings to classify, built from the current working
# directory of the kernel.
_test_folder_parts = ('src', 'data', 'testwav')
TEST_AUDIO_FOLDER = os.path.join(os.getcwd(), *_test_folder_parts)
TEST_AUDIO_FOLDER

'C:\\Users\\tianr\\Programming\\Python\\Project Speaker Recog\\speaker_recognition_GMM_UBM\\src\\data\\testwav'

In [17]:
def create_dataset(DATA_FOLDER,WINDOW_LENGTH = 5,FRAME_LENGTH = 25):
    """Extract one feature row per fixed-length window of every matching WAV file.

    Walks ``DATA_FOLDER`` recursively. Every file whose name contains
    "noise", "music", "speech" or "audio" is read with
    ``scipy.io.wavfile.read`` and cut into consecutive, non-overlapping
    windows of ``WINDOW_LENGTH`` seconds. Each window is handed to
    ``extract_features``; ZCR and RMS are stored as returned, the other six
    features are averaged with ``np.mean``.

    Parameters
    ----------
    DATA_FOLDER : str
        Root directory to search.
    WINDOW_LENGTH : int, optional
        Window size in seconds (multiplied by each file's sampling rate).
        Must be positive.
    FRAME_LENGTH : int, optional
        Forwarded unchanged to ``extract_features`` (presumably the frame
        size in milliseconds -- TODO confirm against that module).

    Returns
    -------
    dict
        Maps each feature name, plus "audio", to a list with one entry per
        window. The "audio" entry is the file name up to its first "-".

    Raises
    ------
    ValueError
        If ``WINDOW_LENGTH`` is not positive (the window loop would never
        advance).
    """
    if WINDOW_LENGTH <= 0:
        raise ValueError("WINDOW_LENGTH must be positive")

    # Key order is kept as in the original literal; it determines the column
    # order of a DataFrame built from this dict on pandas versions that
    # preserve dict insertion order.
    feature_keys = ("ZCR", "RMS", "spectral_flux",
                    "spectral_centroid", "spectral_rolloff",
                    "bandwidth", "audio", "nwpd", "rse")
    dataset_dict = {key: [] for key in feature_keys}
    name_markers = ("noise", "music", "speech", "audio")

    for root, _dirs, files in os.walk(DATA_FOLDER):
        for audio in files:
            if not any(marker in audio for marker in name_markers):
                continue
            print("****************************")
            print("reading:", audio)
            # os.path.join (not the bare `join`) so the function does not
            # depend on an import made in another cell.
            sampling_rate, sig = wav.read(os.path.join(root, audio))
            print("sampling rate:", sampling_rate, "signal length", len(sig))
            window_size = sampling_rate * WINDOW_LENGTH
            label = audio.split("-")[0]
            index = 0
            # "<=" (not "<") so a window that ends exactly on the last
            # sample is still processed instead of being silently dropped.
            while index + window_size <= len(sig):
                sample = sig[index:index + window_size]
                ef = extract_features(sample, FRAME_LENGTH, sampling_rate)
                ZCR, RMS, sf, sr, sc, bd, nwpd, rse = ef.return_()
                dataset_dict["ZCR"].append(ZCR)
                dataset_dict["RMS"].append(RMS)
                dataset_dict["spectral_flux"].append(np.mean(sf))
                dataset_dict["spectral_centroid"].append(np.mean(sc))
                dataset_dict["spectral_rolloff"].append(np.mean(sr))
                dataset_dict["bandwidth"].append(np.mean(bd))
                dataset_dict["nwpd"].append(np.mean(nwpd))
                dataset_dict["rse"].append(np.mean(rse))
                dataset_dict["audio"].append(label)
                index += window_size

    # Sanity print: every list should have the same length.
    print([len(column) for column in dataset_dict.values()])
    print("finished")
    return dataset_dict

In [18]:
# Extract features for the recordings under TEST_AUDIO_FOLDER using the
# default window and frame lengths; progress is printed per file.
features_test_dict = create_dataset(TEST_AUDIO_FOLDER)

****************************
reading: audiotest08-06-2018-13-12-39.wav
sampling rate: 16000 signal length 84480
****************************
reading: audiotest08-06-2018-16-49-40.wav
sampling rate: 16000 signal length 84480
****************************
reading: audiotest08-06-2018-17-01-12.wav
sampling rate: 16000 signal length 84480
****************************
reading: audiotest08-06-2018-17-01-26.wav
sampling rate: 16000 signal length 84480
****************************
reading: audiotest08-06-2018-17-01-43.wav
sampling rate: 16000 signal length 84480
****************************
reading: audiotest08-06-2018-17-01-56.wav
sampling rate: 16000 signal length 84480
****************************
reading: audiotest08-06-2018-17-02-11.wav
sampling rate: 16000 signal length 84480
****************************
reading: audiotest08-06-2018-17-02-44.wav
sampling rate: 16000 signal length 84480
[8, 8, 8, 8, 8, 8, 8, 8, 8]
finished


In [19]:
# Build the frame to predict on. The label column is dropped and the
# remaining columns are put in the training column order: the fitted tree
# identifies features by position, so a differently ordered frame would be
# scored against the wrong thresholds (newer scikit-learn versions reject it).
test_df = pd.DataFrame.from_dict(features_test_dict)
test_df = test_df.drop('audio', axis=1)
test_df = test_df[list(X_train.columns)]
test_df.describe()

Unnamed: 0,RMS,ZCR,bandwidth,nwpd,rse,spectral_centroid,spectral_flux,spectral_rolloff
count,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0
mean,215.375,0.138881,1693560.0,0.820804,-0.272634,1023.248453,0.019258,5694.967369
std,96.34601,0.050681,1003788.0,1.163321,0.038125,337.02392,0.006145,618.055152
min,31.0,0.085739,668675.2,-0.566549,-0.337949,641.419533,0.008497,4864.646084
25%,175.25,0.104339,1071056.0,0.244993,-0.304492,798.55177,0.015903,5216.365462
50%,238.5,0.128089,1303808.0,0.517611,-0.256206,957.427323,0.019909,5574.171687
75%,270.25,0.155996,2038066.0,1.194605,-0.246046,1106.870288,0.022354,6265.358308
max,347.0,0.239553,3614656.0,3.289868,-0.235335,1700.323351,0.027444,6469.879518


In [20]:
# Predict an encoded class for every row of test_df (one row per window).
test_predictions = classifier.predict(test_df)
print(test_predictions)

[1 2 2 2 2 2 2 2]


In [25]:
from sklearn.tree import _tree
import json

JSON_FILE_NAME=join(TEST_AUDIO_FOLDER,'tree_model.json')
# Take the feature names from the training frame instead of a hand-written
# list: tree_.feature stores column positions of the data the classifier was
# fitted on, so a list that drifts from the real column order (e.g. when
# pandas keeps dict insertion order instead of sorting keys) would attach
# thresholds to the wrong feature names.
feature_names = list(X_train.columns)
tree_ = classifier.tree_


# Feature name per tree node; nodes marked TREE_UNDEFINED get a placeholder.
feature_name = [
    feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
    for i in tree_.feature
]
print("def tree({}):".format(", ".join(feature_names)))

# Root of the nested dict that is serialised to JSON_FILE_NAME.
json_file = {}

def recurse(node, depth, json_file):
    """Print the subtree rooted at ``node`` as Python-like text and mirror it into ``json_file``.

    Reads the module-level ``tree_`` (tree structure of the fitted
    classifier) and ``feature_name`` (feature name per node).

    Parameters
    ----------
    node : int
        Index of the current node in ``tree_``.
    depth : int
        Nesting level; only controls the indentation of the printed text.
    json_file : dict
        Filled in place. Split nodes get "feature", "threshold",
        "decision" (None) and nested "left"/"right" dicts. Leaves get
        "decision" (the string "True" or "False"), "threshold" 0.0 and
        None for "feature", "left" and "right".

    Returns
    -------
    dict
        The same ``json_file`` object, for split nodes as well as leaves.
    """
    indent = "  " * depth

    if tree_.feature[node] == _tree.TREE_UNDEFINED:
        # Leaf node: no split feature.
        print("{}return {}".format(indent, tree_.value[node]))
        # "True" when index 1 holds the largest entry of this node's value array.
        json_file["decision"] = str(np.argmax(tree_.value[node]) == 1)
        json_file["threshold"] = 0.0
        json_file["feature"] = None
        json_file["left"] = None
        json_file["right"] = None
        return json_file

    name = feature_name[node]
    threshold = tree_.threshold[node]
    json_file["feature"] = name
    json_file["threshold"] = threshold
    json_file["decision"] = None

    print("{}if {} <= {}:".format(indent, name, threshold))
    # setdefault creates the child dict only when it is missing; this replaces
    # a bare try/except that would also have hidden unrelated errors.
    recurse(tree_.children_left[node], depth + 1, json_file.setdefault("left", {}))
    print("{}else:  # if {} > {}".format(indent, name, threshold))
    recurse(tree_.children_right[node], depth + 1, json_file.setdefault("right", {}))
    return json_file

# Fill json_file from the root node (printed text starts one indent level
# in), show it pretty-printed, then write it to JSON_FILE_NAME.
recurse(0, 1, json_file)
pretty_tree = json.dumps(json_file, sort_keys=True, indent=4)
print(pretty_tree)
with open(JSON_FILE_NAME, "w") as json_out:
    json.dump(json_file, json_out)


def tree(RMS, ZCR, bandwidth, nwpd, rse, spectral_centroid, spectral_flux, spectral_rolloff):
  if rse <= -0.20204812288284302:
    if spectral_flux <= 0.005999185610562563:
      if spectral_flux <= 0.005247130990028381:
        return [[182.   7.]]
      else:  # if spectral_flux > 0.005247130990028381
        return [[32. 32.]]
    else:  # if spectral_flux > 0.005999185610562563
      if RMS <= 38.0:
        return [[75. 11.]]
      else:  # if RMS > 38.0
        return [[  438. 37827.]]
  else:  # if rse > -0.20204812288284302
    if rse <= -0.18715137243270874:
      if spectral_rolloff <= 4425.4833984375:
        return [[ 43. 231.]]
      else:  # if spectral_rolloff > 4425.4833984375
        return [[274.  45.]]
    else:  # if rse > -0.18715137243270874
      if RMS <= 310.0:
        return [[2399.  178.]]
      else:  # if RMS > 310.0
        return [[118.  66.]]
{
    "decision": null,
    "feature": "rse",
    "left": {
        "decision": null,
        "feature": "spectra