In [2]:
import os

import numpy as np
import scipy.io.wavfile as wav
import speech_recognition as sr
from pydub import AudioSegment
from python_speech_features import logfbank
from sklearn.cluster import KMeans



In [3]:
def detect_leading_silence(sound, silence_threshold=-40.0, chunk_size=10):
    '''
    Return how many milliseconds of silence precede the first audible chunk.

    sound is a pydub.AudioSegment (anything supporting len(), slicing and a
    .dBFS attribute on the slices works)
    silence_threshold in dB; a chunk quieter than this counts as silent
    chunk_size in ms; the step used to scan the audio

    The scan moves forward chunk by chunk until a chunk at or above the
    threshold is found or the end of the audio is reached. The result never
    exceeds len(sound), even when the duration is not a multiple of
    chunk_size.
    '''
    assert chunk_size > 0 # to avoid infinite loop

    duration = len(sound)
    trim_ms = 0 # ms

    # Check the bound first so no slice is taken past the end of the audio.
    while trim_ms < duration and sound[trim_ms:trim_ms + chunk_size].dBFS < silence_threshold:
        trim_ms += chunk_size

    # The last step may overshoot a clip that is silent to the very end.
    return min(trim_ms, duration)

In [4]:
def remover_silencio(comando):
    '''
    Trim leading and trailing silence from a WAV file and save the result.

    comando is the path of the WAV file to read.

    The trimmed audio is converted to a single channel and exported as
    "<path without extension>_recortado.wav". Returns the path of that
    new file.
    '''
    sound = AudioSegment.from_file(comando, format="wav")

    start_trim = detect_leading_silence(sound)
    # Trailing silence is measured as the leading silence of the reversed clip.
    end_trim = detect_leading_silence(sound.reverse())

    # On an all-silent clip the two trims overlap; keep the end of the window
    # from falling before its start so the slice is simply empty.
    end_ms = max(start_trim, len(sound) - end_trim)
    trimmed_sound = sound[start_trim:end_ms]

    # splitext removes whatever extension is present instead of assuming it
    # is exactly four characters long.
    base, _ = os.path.splitext(comando)
    comando_recortado = base + "_recortado.wav"

    trimmed_sound.set_channels(1).export(comando_recortado, format="wav")

    return comando_recortado

In [5]:
def limpar_audio(comando):
    '''
    Read a WAV file through speech_recognition and write the recorded audio
    to a new file.

    comando is the path of the WAV file to read.

    The recognizer is calibrated with adjust_for_ambient_noise and the source
    is then recorded; the recorded WAV data is written to
    "<path without extension>_limpo.wav". Returns the path of that new file.
    '''
    r = sr.Recognizer()

    som = sr.AudioFile(comando)
    with som as source:
        # NOTE(review): adjust_for_ambient_noise reads from the source while
        # calibrating, so record() presumably captures only the audio that
        # follows the calibration window - confirm this is intended.
        r.adjust_for_ambient_noise(source)
        audio = r.record(source)

    # splitext removes whatever extension is present instead of assuming it
    # is exactly four characters long.
    base, _ = os.path.splitext(comando)
    comando_limpo = base + "_limpo.wav"

    with open(comando_limpo, "wb") as f:
        f.write(audio.get_wav_data())

    return comando_limpo

In [6]:
def extrair_caracteristica(audio):
    '''
    Build a flat feature list for one recording.

    audio is the path of the WAV file. The file goes through limpar_audio and
    then remover_silencio; the resulting file is read and passed to logfbank
    with a 50-3400 Hz band. Columns 2 through 12 of every row are kept and
    the rows are concatenated, in order, into a single list.
    '''
    caminho = remover_silencio(limpar_audio(audio))
    rate, sig = wav.read(caminho)

    energias = logfbank(sig, samplerate=rate, lowfreq=50, highfreq=3400)
    # Keep all rows, but only columns 2..12 (11 values per row).
    recorte = energias[0:, 2:13]

    # Row-major flattening, one element at a time.
    return [valor for linha in recorte for valor in linha]

In [7]:
def dados(audios):
    '''
    Extract the features of every file in audios and train on them.

    audios is an iterable of WAV file paths. Returns whatever treinamento
    returns for the list of extracted feature vectors.
    '''
    caracteristicas = [extrair_caracteristica(audio) for audio in audios]
    return treinamento(caracteristicas)

In [8]:
def treinamento(comandos):
    '''
    Fit a KMeans model with one cluster per feature vector.

    comandos is a non-empty list of flat feature vectors, possibly of
    different lengths. Every vector is truncated to the length of the
    shortest one so they can be stacked into a 2-D array; the caller's list
    is not modified.

    Prints the length of each vector, then the fitted cluster centers and
    labels. Returns the fitted KMeans estimator.

    Raises ValueError if comandos is empty.
    '''
    if len(comandos) == 0:
        raise ValueError("comandos must contain at least one feature vector")

    for comando in comandos:
        print(len(comando))
    menor_comando = min(len(comando) for comando in comandos)

    # Build a new list rather than overwriting the caller's entries.
    X = np.array([comando[0:menor_comando] for comando in comandos])

    # precompute_distances is intentionally not passed: current scikit-learn
    # releases no longer accept it.
    kmeans = KMeans(
        n_clusters=len(X),
        random_state=0,
        n_init=12,
        max_iter=3000,
        tol=0.00001,
        verbose=0,
        algorithm='elkan',
    ).fit(X)

    print (kmeans.cluster_centers_)

    print(kmeans.labels_)

    return (kmeans)

In [27]:
# Train on the two reference recordings: dados extracts a feature vector from
# each file and hands the list to treinamento, which returns the fitted model.
kmeans = dados(["zero.wav", "um.wav"])



352
231
[[ 10.38010272 -36.04365339   8.71986289   6.1190892    7.92944027
    4.89458479   4.9657277    4.00809017   4.20303438   4.91625016
    3.99258882   9.7046126  -36.04365339   9.13595045   8.12636072
    9.31276558   8.65272111   4.23767422   5.06959223   3.84242341
    4.344874     4.88040045  10.51966057 -36.04365339  10.13005209
    9.0716493    9.68740931   8.51936755   6.85804671   6.61847471
    6.16129054   7.06538056   6.01130174  10.64756415 -36.04365339
   10.44666598   9.78878939   9.24985897   8.43688625   5.6771477
    4.11883522   6.59950995   6.43368258   5.3840741    9.66408468
  -36.04365339   9.44613067   8.86274683   9.56911658   9.57136037
    7.22845118   6.46342051   6.0040419    2.46318953   6.68507724
   10.53453695 -36.04365339   9.47247462   9.96499231   8.50396309
    8.49505996   5.53018385   6.16498067   6.48234529   6.35542466
    6.29954092   9.67699545 -36.04365339   9.28370443   8.98197289
    9.36087616   7.63970837   7.54745905   7.03074237  

In [33]:
# Extract the flat feature list for sim1.wav.
# NOTE(review): `a` is not used by any other cell visible here - confirm it is still needed.
a = extrair_caracteristica("sim1.wav")



In [31]:
# Truncate the new recording's features to the width the model was fitted on,
# read from the fitted centers instead of a hard-coded length.
n_features = kmeans.cluster_centers_.shape[1]
kmeans.predict([extrair_caracteristica("zero2.wav")[:n_features]])



array([0])

In [26]:
# Show the feature-vector length for um2.wav, for comparison with the lengths
# printed during training.
len(extrair_caracteristica("um2.wav"))



363