In [142]:
import os
import wave
import time
import pickle
import pyaudio
import warnings
import numpy as np
import pandas as pd
from scipy.io.wavfile import read
import librosa
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizer_v1 import Adam
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
warnings.filterwarnings("ignore")

In [143]:
def extract_features(audio):

    signal, sr = librosa.load(audio, res_type = 'kaiser_fast')
    mfccs = librosa.feature.mfcc(signal, n_mfcc=13, sr = sr)
    delta_mfccs = librosa.feature.delta(mfccs)
    delta2_mfccs = librosa.feature.delta(mfccs, order=2)
    comprehensive_mfccs = np.concatenate((mfccs, delta_mfccs, delta2_mfccs))
    comprehensive_mfccs = np.mean(comprehensive_mfccs.transpose(), axis = 0)
    comprehensive_mfccs.reshape((-1,1))
    return comprehensive_mfccs




In [144]:
def record_audio_train():
    Name = (input("Please Enter Your Name:"))
    for count in range(5):
        FORMAT = pyaudio.paInt16
        CHANNELS = 1
        RATE = 44100
        CHUNK = 512
        RECORD_SECONDS = 10
        device_index = 2
        audio = pyaudio.PyAudio()
        print("----------------------record device list---------------------")
        info = audio.get_host_api_info_by_index(0)
        numdevices = info.get('deviceCount')
        for i in range(numdevices):
            if audio.get_device_info_by_host_api_device_index(0, i).get('maxInputChannels') > 0:
                print("Input Device id " + str(i) + " - " +
                      audio.get_device_info_by_host_api_device_index(0, i).get('name'))
        print("-------------------------------------------------------------")
        index = int(input())
        print("recording via index "+str(index))
        stream = audio.open(format=FORMAT, channels=CHANNELS,
                            rate=RATE, input=True, input_device_index=index,
                            frames_per_buffer=CHUNK)
        print("recording started")
        Recordframes = []
        for i in range(int(RATE / CHUNK * RECORD_SECONDS)):
            data = stream.read(CHUNK)
            Recordframes.append(data)
        print("recording stopped")
        stream.stop_stream()
        stream.close()
        audio.terminate()
        OUTPUT_FILENAME = Name+"-sample"+str(count)+".wav"
        WAVE_OUTPUT_FILENAME = os.path.join("training_set", OUTPUT_FILENAME)
        trainedfilelist = open("training_set_addition.txt", 'a')
        trainedfilelist.write(OUTPUT_FILENAME+"\n")
        waveFile = wave.open(WAVE_OUTPUT_FILENAME, 'wb')
        waveFile.setnchannels(CHANNELS)
        waveFile.setsampwidth(audio.get_sample_size(FORMAT))
        waveFile.setframerate(RATE)
        waveFile.writeframes(b''.join(Recordframes))
        waveFile.close()

In [145]:
def record_audio_test():

    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 44100
    CHUNK = 512
    RECORD_SECONDS = 10
    device_index = 2
    audio = pyaudio.PyAudio()
    print("----------------------record device list---------------------")
    info = audio.get_host_api_info_by_index(0)
    numdevices = info.get('deviceCount')
    for i in range(0, numdevices):
        if (audio.get_device_info_by_host_api_device_index(0, i).get('maxInputChannels')) > 0:
            print("Input Device id ", i, " - ",
                  audio.get_device_info_by_host_api_device_index(0, i).get('name'))
    print("-------------------------------------------------------------")
    index = int(input())
    print("recording via index "+str(index))
    stream = audio.open(format=FORMAT, channels=CHANNELS,
                        rate=RATE, input=True, input_device_index=index,
                        frames_per_buffer=CHUNK)
    print("recording started")
    Recordframes = []
    for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
        data = stream.read(CHUNK)
        Recordframes.append(data)
    print("recording stopped")
    stream.stop_stream()
    stream.close()
    audio.terminate()

    savedname = input("Please input saved wave filename: ")
    OUTPUT_FILENAME = savedname + ".wav"
    WAVE_OUTPUT_FILENAME = "testing_set/" + OUTPUT_FILENAME
    trainedfilelist = open("testing_set_addition.txt", 'w')
    for fname in os.listdir("testing_set/"):
        if fname.endswith('.wav'):
            trainedfilelist.write(fname + "\n")
    trainedfilelist.write(OUTPUT_FILENAME +"\n")
    waveFile = wave.open(WAVE_OUTPUT_FILENAME, 'wb')
    waveFile.setnchannels(CHANNELS)
    waveFile.setsampwidth(audio.get_sample_size(FORMAT))
    waveFile.setframerate(RATE)
    waveFile.writeframes(b''.join(Recordframes))
    waveFile.close()


In [146]:
def train_and_test_model():

    source = "training_set/"
    train_file = "training_set_addition.txt"
    
    file_paths = open(train_file, 'r')
    features = []

    for path in file_paths:
        path = path.strip()
        class_label = path.split("-")[0]
        print(path)
        data = extract_features(source + path)
        features.append([data, class_label])

    featuresdf = pd.DataFrame(features, columns=['feature','class_label'])

    # Convert features and corresponding classification labels into numpy arrays
    X = np.array(featuresdf.feature.tolist())
    y = np.array(featuresdf.class_label.tolist())

    # Encode the classification labels
    le = LabelEncoder()
    yy = to_categorical(le.fit_transform(y))
    x_train, x_test, y_train, y_test = train_test_split(X, yy, test_size=0.2, random_state = 127)

    model = Sequential()
    model.add(Dense(256))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(256))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(yy.shape[1]))
    model.add(Activation('softmax'))


    # Evaluate the model and pretrain with test data
    model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')
    score = model.evaluate(x_test, y_test, verbose=0)
    print("pretraining accuracy: " + str(score[1] * 100))

     # train with train data
    model.fit(X, yy, batch_size=32, epochs=5, verbose=1, validation_data=(x_test, y_test))

    # evaluate the accuracy on both data
    score = model.evaluate(x_train, y_train, verbose=0)
    print("Training Accuracy: {0:.2%}".format(score[1]))

    score = model.evaluate(x_test, y_test, verbose=0)
    print("Testing Accuracy: {0:.2%}".format(score[1]))

    source = "testing_set/"
    test_file = "testing_set_addition.txt"
    file_paths = open(test_file, 'r')

    # Read the test directory and get the list of test audio files
    for path in file_paths:

        path = path.strip()
        print(path)
        data = extract_features(source + path)
        label = model.predict(X[:,0])
        print(label)
        

    

In [147]:
def test_model():

    source = "testing_set/"
    modelpath = "trained_models/"
    test_file = "testing_set_addition.txt"
    file_paths = open(test_file, 'r')

    gmm_files = [os.path.join(modelpath, fname) for fname in
                 os.listdir(modelpath) if fname.endswith('.gmm')]

    # Load the Gaussian gender Models
    models = [pickle.load(open(fname, 'rb')) for fname in gmm_files]
    speakers = [fname.split("\\")[-1].split(".gmm")[0] for fname
                in gmm_files]

    # Read the test directory and get the list of test audio files
    for path in file_paths:

        try:
            path = path.strip()
            sr, audio = read(source + path)
            vector = extract_features(audio)
        except:
            print("error: " + path + " not found")
            continue

        log_likelihood = np.zeros(len(models))

        for i in range(len(models)):
            gmm = models[i]  # checking with each model one by one
            scores = np.array(gmm.score(vector))
            log_likelihood[i] = scores.sum()

        winner = np.argmax(log_likelihood)
        print(path," detected as - ", speakers[winner])
        time.sleep(1.0)

In [148]:
while True:
    choice = int(input(
        "\n 1.Record audio for training \n 2.Train Model \n 3.Record audio for testing \n 4.Test Model\n"))
    if(choice == 1):
        record_audio_train()
    elif(choice == 2):
        train_and_test_model()
    elif(choice == 3):
        record_audio_test()
    elif(choice == 4):
        test_model()
    if(choice > 4):
        exit()

Kailyn-sample4.wav
Kailyn-sample1.wav
Kailyn-sample0.wav
Kailyn-sample2.wav
Kailyn-sample3.wav
Frank-sample4.wav
Frank-sample2.wav
Frank-sample3.wav
Frank-sample1.wav
Frank-sample0.wav
Godfather-sample0.wav
Godfather-sample1.wav
Godfather-sample2.wav
Godfather-sample3.wav
Godfather-sample4.wav
pretraining accuracy: 33.33333432674408
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Training Accuracy: 66.67%
Testing Accuracy: 66.67%
godfather.wav


ValueError: in user code:

    File "/Users/frank/opt/anaconda3/lib/python3.8/site-packages/keras/engine/training.py", line 1801, in predict_function  *
        return step_function(self, iterator)
    File "/Users/frank/opt/anaconda3/lib/python3.8/site-packages/keras/engine/training.py", line 1790, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/frank/opt/anaconda3/lib/python3.8/site-packages/keras/engine/training.py", line 1783, in run_step  **
        outputs = model.predict_step(data)
    File "/Users/frank/opt/anaconda3/lib/python3.8/site-packages/keras/engine/training.py", line 1751, in predict_step
        return self(x, training=False)
    File "/Users/frank/opt/anaconda3/lib/python3.8/site-packages/keras/utils/traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/Users/frank/opt/anaconda3/lib/python3.8/site-packages/keras/engine/input_spec.py", line 228, in assert_input_compatibility
        raise ValueError(f'Input {input_index} of layer "{layer_name}" '

    ValueError: Exception encountered when calling layer "sequential_20" (type Sequential).
    
    Input 0 of layer "dense_59" is incompatible with the layer: expected min_ndim=2, found ndim=1. Full shape received: (None,)
    
    Call arguments received:
      • inputs=tf.Tensor(shape=(None,), dtype=float32)
      • training=False
      • mask=None
