In [89]:
import os

import numpy as np
import scipy.io.wavfile as wav
import speech_recognition as sr
from pydub import AudioSegment
from python_speech_features import logfbank
from python_speech_features import mfcc
from sklearn.cluster import KMeans

In [34]:
def detect_leading_silence(sound, silence_threshold=-40.0, chunk_size=10):
    '''
    Return how many milliseconds of silence `sound` starts with.

    sound is a pydub.AudioSegment
    silence_threshold in dB (a chunk whose dBFS is below it counts as silent)
    chunk_size in ms

    Walks the audio in chunk_size steps until the first chunk that is not
    below the threshold. The result never exceeds len(sound), so a fully
    silent clip yields exactly its own duration.
    '''
    assert chunk_size > 0 # to avoid infinite loop

    total_ms = len(sound)
    trim_ms = 0 # ms

    # Test the bound first so no chunk past the end of the audio is sliced.
    while trim_ms < total_ms and sound[trim_ms:trim_ms+chunk_size].dBFS < silence_threshold:
        trim_ms += chunk_size

    # The last step can overshoot when the length is not a multiple of chunk_size.
    return min(trim_ms, total_ms)

In [3]:
def remover_silencio(comando):
    '''
    Trim leading and trailing silence from a WAV file and save a mono copy.

    comando is the path of the WAV file to read.

    Returns the path of the file that was written: the input path with its
    extension replaced by "_recortado.wav".
    '''
    sound = AudioSegment.from_file(comando, format="wav")

    start_trim = detect_leading_silence(sound)
    # Trailing silence is measured as the leading silence of the reversed audio.
    end_trim = detect_leading_silence(sound.reverse())

    # Never let the end fall before the start (both trims can cover the whole
    # clip when it is entirely silent); the result is then an empty segment.
    end_ms = max(start_trim, len(sound) - end_trim)
    trimmed_sound = sound[start_trim:end_ms]

    # splitext copes with any extension length, unlike cutting off 4 characters.
    base, _ = os.path.splitext(comando)
    comando_recortado = base + "_recortado.wav"

    trimmed_sound.set_channels(1).export(comando_recortado, format="wav")

    return comando_recortado

In [4]:
def limpar_audio(comando):
    '''
    Pass an audio file through speech_recognition and save what it records.

    comando is the path of the audio file to read.

    Returns the path of the file that was written: the input path with its
    extension replaced by "_limpo.wav".
    '''
    r = sr.Recognizer()

    with sr.AudioFile(comando) as source:
        # NOTE(review): adjust_for_ambient_noise reads from the same source
        # that record() then continues from, so the saved audio presumably
        # lacks the span used for calibration at the start -- confirm that
        # this is intended.
        r.adjust_for_ambient_noise(source)
        audio = r.record(source)

    # splitext copes with any extension length, unlike cutting off 4 characters.
    base, _ = os.path.splitext(comando)
    comando_limpo = base + "_limpo.wav"

    with open(comando_limpo, "wb") as f:
        f.write(audio.get_wav_data())

    return comando_limpo

In [93]:
def extrair_caracteristica(audio):
    '''
    Build the flat feature list for one audio file.

    audio is the path of the file. It is passed through limpar_audio and then
    remover_silencio, the resulting WAV is read, and logfbank is computed on
    it with lowfreq=50 and highfreq=3400.

    Returns a plain list holding columns 2..12 of every logfbank row, one row
    after the other.
    '''
    caminho_processado = remover_silencio(limpar_audio(audio))
    taxa, sinal = wav.read(caminho_processado)

    energias = logfbank(sinal, samplerate=taxa, lowfreq=50, highfreq=3400)
    faixa = energias[:, 2:13]

    # Row-major flattening into a list (callers rely on list semantics).
    return [valor for linha in faixa for valor in linha]

In [6]:
def dados(audios):
    '''
    Extract the features of every audio file and train on them.

    audios is an iterable of audio file paths; each one is handed to
    extrair_caracteristica, in order.

    Returns whatever treinamento returns for the collected feature lists.
    '''
    return treinamento([extrair_caracteristica(caminho) for caminho in audios])

In [25]:
def treinamento(comandos):
    '''
    Fit a KMeans model with one cluster per command.

    comandos is a non-empty sequence of flat feature sequences, one per
    command. They may differ in length: each one is cut to the length of the
    shortest so they can be stacked into a 2-D array. The caller's sequence is
    left untouched.

    Prints the original length of every command, then the fitted cluster
    centers and labels.

    Returns the fitted KMeans estimator.
    Raises ValueError if comandos is empty.
    '''
    if len(comandos) == 0:
        raise ValueError("comandos must contain at least one feature vector")

    for comando in comandos:
        print(len(comando))

    menor_comando = min(len(comando) for comando in comandos)

    # Build a new array instead of overwriting the entries of the argument.
    X = np.array([comando[0:menor_comando] for comando in comandos])

    # precompute_distances is intentionally not passed: scikit-learn deprecated
    # it in 0.23 and removed it in 1.0, where passing it raises TypeError.
    kmeans = KMeans(
        n_clusters=len(X),
        random_state=0,
        n_init=12,
        max_iter=3000,
        tol=0.00001,
        verbose=0,
        algorithm='elkan',
    ).fit(X)

    print (kmeans.cluster_centers_)

    print(kmeans.labels_)

    return (kmeans)

In [96]:
# Train on the six reference recordings "0.wav" ... "5.wav"; the fitted model
# is kept in `kmeans`.
kmeans = dados(["%d.wav" % indice for indice in range(6)])



1562
1716
1606
1815
1892
1760
[[ 10.96387825 -36.04365339  10.33902417 ...   8.75455252   9.99610335
    8.89413494]
 [ 10.82375432 -36.04365339  10.37738279 ...   7.23551959   6.42151046
    7.56654649]
 [ 10.35773664 -36.04365339   9.08769543 ...   8.63939086   8.34006435
    7.5047147 ]
 [ 10.62184655 -36.04365339   7.31078712 ...   5.70095284   7.36104991
    6.60659975]
 [  9.1009345  -36.04365339   8.65002498 ...   5.57823663   7.1081914
    4.62838427]
 [ 10.78928264 -36.04365339   9.24738535 ...   6.32668582   6.6074452
    7.65071806]]
[3 4 1 2 0 5]


In [83]:
# NOTE(review): extrair_caracteristica returns a Python list, so "* 2" repeats
# the list (doubling its length) rather than scaling the values -- confirm
# this is intended. `a` is not used by any other cell shown here.
a = extrair_caracteristica("0a.wav") * 2



In [107]:
# Predict the cluster of a new recording. The feature list is cut to the width
# the model was fitted with (read from the fitted centers) rather than to a
# hard-coded length that only matches one particular training run.
kmeans.predict([
    extrair_caracteristica("4a.wav")[:kmeans.cluster_centers_.shape[1]]
])



array([2])

In [82]:
# Show the length of the flat feature list extracted from "0a.wav".
len(extrair_caracteristica("0a.wav"))



2849