# Speech Recognition using DTW
- Thanat Lapthawan ID:60070701605


DTW distance library (dtaidistance): https://dtaidistance.readthedocs.io
<br>Script used as the basis for audio recording: https://gist.github.com/mabdrabo/8678538
## Import the required libraries

In [35]:
from python_speech_features import mfcc
import scipy.io.wavfile as wav
from dtaidistance import dtw_ndim

import pyaudio
import wave
import numpy as np
import pandas as pd
import IPython 
import seaborn as sns
import os
import glob
%matplotlib inline

## Record Sound Function

In [58]:
# define record function
def record_sound(output_file_name = 'output_file.wav', select_device=False, filepath='templates'):
    """Record a short mono clip from a microphone and save it as a WAV file.

    The clip is captured as 16-bit samples, 1 channel, at 16000 Hz, for
    roughly 3 seconds (see the note on the read loop below).

    Parameters
    ----------
    output_file_name : str
        Name of the WAV file to write.
    select_device : bool
        When True, print the input-capable devices of host API 0 and read the
        device index to record from with ``input()``. When False, device
        index 0 is used.
    filepath : str or None
        Directory the file is written into; it is created if it does not
        exist. A falsy value writes ``output_file_name`` as given.

    Raises
    ------
    ValueError
        If the text typed at the device prompt is not an integer.
    """
    # define variables
    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 16000
    CHUNK = 512
    RECORD_SECONDS = 3

    audio = pyaudio.PyAudio()
    try:
        # if we want to change microphone device (default = 0)
        if select_device:
            # Show the list of devices that can record
            info = audio.get_host_api_info_by_index(0)
            numdevices = info.get('deviceCount')
            for i in range(0, numdevices):
                device = audio.get_device_info_by_host_api_device_index(0, i)
                if device.get('maxInputChannels') > 0:
                    print("Input Device id ", i, " - ", device.get('name'))

            # Select a device from the list above
            index = int(input())
            print("recording via index "+str(index)+'\n\n')
        else:
            index = 0

        print('--    Start Recording    --\n')
        print('Record time: %s sec'%RECORD_SECONDS)

        stream = audio.open(format=FORMAT,
                            channels=CHANNELS,
                            rate=RATE,
                            input=True,
                            input_device_index=index,
                            frames_per_buffer=CHUNK)
        try:
            print("recording...")
            frames = []
            # int() truncates 93.75 to 93 reads, i.e. 47616 frames, which is
            # slightly less than RECORD_SECONDS of audio.
            for _ in range(int(RATE / CHUNK * RECORD_SECONDS)):
                frames.append(stream.read(CHUNK))
            print("stop recording")
        finally:
            # Always release the stream, even if a read fails part-way.
            stream.stop_stream()
            stream.close()

        # Query the sample width while the PyAudio instance is still alive.
        sample_width = audio.get_sample_size(FORMAT)
    finally:
        audio.terminate()

    # Build the output path, creating the target directory on first use so a
    # finished recording is not lost to a missing folder.
    if filepath:
        os.makedirs(filepath, exist_ok=True)
        filename = os.path.join(filepath, output_file_name)
    else:
        filename = output_file_name

    # Save to WAV file; the context manager closes it even if a write fails.
    with wave.open(filename, 'wb') as wave_file:
        wave_file.setnchannels(CHANNELS)
        wave_file.setsampwidth(sample_width)
        wave_file.setframerate(RATE)
        wave_file.writeframes(b''.join(frames))

    print('\n-- Record file successful --')

In [27]:
# Record a test clip into the working directory (filepath=None) so that the
# playback cell, which opens 'output_file.wav' by its bare name, finds it.
# With the default filepath the clip would be written under 'templates/'.
record_sound(output_file_name = 'output_file.wav', filepath=None)

--    Start Recording    --

Record time: 3 sec
recording...
stop recording

-- Record file successful --


In [19]:
# Show an audio player for the recorded clip in the notebook output.
# NOTE(review): this opens 'output_file.wav' relative to the working
# directory, while record_sound writes under 'templates/' unless its
# filepath argument is falsy; confirm the clip was saved at this path.
IPython.display.Audio('output_file.wav')

## Create speech template function

In [32]:
def record_n_template(word, n = 2, filepath = 'templates'):
    """Record ``n`` templates of ``word``, pausing for Enter between takes.

    Each take is saved by ``record_sound`` as ``<word>-<k>.wav`` (k = 1..n)
    inside ``filepath``.

    Parameters
    ----------
    word : str
        Word being recorded; used as the file-name prefix.
    n : int
        Number of takes to record.
    filepath : str
        Directory passed through to ``record_sound``.
    """
    for take in range(1, n + 1):
        # Report the current take number (the original printed the total n
        # on every iteration).
        print('Record template %s of word %s'%(take,word))
        record_sound(word+'-'+str(take)+'.wav',filepath=filepath)
        # No prompt after the final take.
        if take < n:
            input("Press Enter to continue...")

In [38]:
# Record two templates of the word 'hello' (saved as hello-1.wav and
# hello-2.wav in the default 'templates' directory).
record_n_template('hello',2)

Record template 2 of word hello
--    Start Recording    --

Record time: 3 sec
recording...
stop recording

-- Record file successful --


Press Enter to continue... 


Record template 2 of word hello
--    Start Recording    --

Record time: 3 sec
recording...
stop recording

-- Record file successful --


In [42]:
def read_speech_feature(filename):
    """Load a WAV file and return the MFCC features of its signal.

    Parameters
    ----------
    filename : str
        Path of the WAV file to read.

    Returns
    -------
    The value returned by ``mfcc`` for the file's samples and sample rate.
    """
    sample_rate, signal = wav.read(filename)
    return mfcc(signal, sample_rate)

In [46]:
def create_template_speech_feat(word_list, filepath='templates'):
    """Load the MFCC features of every stored template for the given words.

    Templates are looked up as ``<filepath>/<word>-*.wav``, the naming used
    by ``record_n_template``. Requiring the ``-`` separator keeps a word from
    picking up files of a longer word that merely starts with it (e.g. 'yo'
    no longer matches 'your-1.wav').

    Parameters
    ----------
    word_list : iterable of str
        Words whose templates should be loaded.
    filepath : str
        Directory that holds the template WAV files.

    Returns
    -------
    (list, list)
        The feature arrays and, index-aligned with them, the word each one
        belongs to. Both lists are empty when no file matches.
    """
    template_speech_feat = []
    label = []

    for word in word_list:
        pattern = os.path.join(filepath, '%s-*.wav' % word)
        # glob returns files in arbitrary order; sort for reproducible output.
        for template_file in sorted(glob.glob(pattern)):
            template_speech_feat.append(read_speech_feature(filename=template_file))
            label.append(word)

    return template_speech_feat, label

In [47]:
# Build the template feature set and labels for the words 'hi' and 'hello'
# from the default 'templates' directory.
# NOTE(review): only 'hello' is recorded in the cells shown above; presumably
# the 'hi' templates were recorded in an earlier session. Confirm they exist.
word_list = ['hi','hello']
template_speech_feat, label = create_template_speech_feat(word_list)

## Define DTW Function

In [50]:
def dtw(mfcc_input, template_speech_feat, template_label):
    """Recognise a word by distance-weighted voting over its DTW distances.

    The DTW distance from ``mfcc_input`` to every template is computed and
    rounded to 2 decimals. Each template votes for its label with weight
    ``1 / d**2``; weights are normalised to sum to 1, summed per label, and
    the label with the highest total is printed and returned.

    Parameters
    ----------
    mfcc_input : array
        Feature sequence of the utterance to recognise.
    template_speech_feat : list of arrays
        Feature sequences of the templates.
    template_label : list
        Label of each template, index-aligned with ``template_speech_feat``.

    Returns
    -------
    The label with the highest summed weight.

    Raises
    ------
    ValueError
        If there are no templates, or the template and label counts differ.
    """
    if len(template_speech_feat) == 0:
        raise ValueError("no speech templates to compare against")
    if len(template_speech_feat) != len(template_label):
        raise ValueError("template_speech_feat and template_label must have the same length")

    # Only input-vs-template distances are needed, so compute those directly
    # instead of the full pairwise matrix across all templates.
    d = np.round(
        [dtw_ndim.distance(mfcc_input, template) for template in template_speech_feat],
        2,
    )
    res = pd.DataFrame({'y': template_label, 'd': d})

    # Inverse-square distance as the vote weight. Distances are floored at a
    # tiny epsilon so an exact match (d == 0) yields a huge finite weight
    # rather than inf, which would turn the normalised scores into NaN.
    res['weight'] = 1 / np.maximum(res['d'], 1e-6) ** 2

    # scale weight as the contribution score
    res['weight_scaled'] = res['weight'] / res['weight'].sum()

    # sum the scores per label and pick the top one
    pred_res = res.groupby('y')['weight_scaled'].sum().idxmax()

    print("The system recognise word: %s"%pred_res)
    return pred_res

In [61]:
def recognition_dtw(template_speech_feat, template_label, show_input_sig=False):
    """Record an utterance and recognise it against the given templates.

    The utterance is recorded to 'file.wav' in the working directory, its
    first 0.1 s is dropped, MFCC features are extracted from the rest and
    passed to ``dtw`` together with the templates.

    Parameters
    ----------
    template_speech_feat : list of arrays
        Feature sequences of the templates.
    template_label : list
        Label of each template.
    show_input_sig : bool
        When True, plot the trimmed input signal with seaborn.

    Returns
    -------
    The result of ``dtw(...)``, passed through unchanged.
    """
    record_sound('file.wav',filepath=None)

    # Read file
    rate, sig = wav.read('file.wav')

    # Drop the first 0.1 s of the recording. Deriving the count from the
    # file's rate gives the original 1600 samples at 16 kHz and stays correct
    # if the recording rate changes.
    # NOTE(review): presumably this skips the microphone start-up transient;
    # the templates are not trimmed the same way. Confirm this is intended.
    skip = rate // 10
    trimmed = sig[skip:]

    if show_input_sig:
        sns.lineplot(x=np.arange(skip, sig.shape[0]), y=trimmed)
    mfcc_input = mfcc(trimmed, rate)

    return dtw(mfcc_input, template_speech_feat, template_label)
    

## Applying the functions

In [63]:
# Record two templates of the first word, 'whatup'
# (saved as whatup-1.wav and whatup-2.wav in the default 'templates' directory).
record_n_template('whatup',2)

Record template 2 of word whatup
--    Start Recording    --

Record time: 3 sec
recording...
stop recording

-- Record file successful --


Press Enter to continue... 


Record template 2 of word whatup
--    Start Recording    --

Record time: 3 sec
recording...
stop recording

-- Record file successful --


In [64]:
# Record two templates of the second word, 'yo'
# (saved as yo-1.wav and yo-2.wav in the default 'templates' directory).
record_n_template('yo',2)

Record template 2 of word yo
--    Start Recording    --

Record time: 3 sec
recording...
stop recording

-- Record file successful --


Press Enter to continue... 


Record template 2 of word yo
--    Start Recording    --

Record time: 3 sec
recording...
stop recording

-- Record file successful --


In [65]:
# Load the MFCC features and labels of the recorded 'whatup' and 'yo'
# templates from the default 'templates' directory.
word_list = ['whatup','yo']
template_speech_feat, label = create_template_speech_feat(word_list)

In [68]:
# Record a new utterance and recognise it against the templates loaded above;
# the recognised word is printed. Plotting of the input signal is disabled.
recognition_dtw(template_speech_feat, label, show_input_sig=False)

--    Start Recording    --

Record time: 3 sec
recording...
stop recording

-- Record file successful --
The system recognise word: yo
