In [1]:
import librosa
import librosa.display
import matplotlib.pyplot as plt
from os import listdir
import numpy as np

In [2]:
# Some functions for audio visualization

def plot_wave(x, num_samples, sample_rate=44100):
    """Plot the first ``num_samples`` samples of the audio signal ``x`` against time.

    Parameters
    ----------
    x : array-like
        One-dimensional sequence of amplitudes.
    num_samples : int
        Maximum number of leading samples to draw. When ``x`` is shorter than
        this, only the samples that exist are drawn.
    sample_rate : int, optional
        Sampling rate of ``x`` in Hz, used to convert sample indices into
        seconds. Defaults to 44100, the value that used to be hard-coded.
    """
    amplitude = x[:num_samples]

    # Size the time axis from the samples actually available so both arrays
    # always have equal length, even when num_samples > len(x).
    time_s = np.arange(len(amplitude)) / sample_rate

    # The horizontal axis is in seconds (index / sample_rate), not in samples.
    plt.title("Amplitude versus Time (s)")
    plt.ylabel("Amplitude")
    plt.xlabel("Time (s)")
    plt.plot(time_s, amplitude)
    plt.show()
    
def plot_db_scaled_spectogram(x, num_samples, n_fft=2048, hop_length=512, sample_rate=22050):
    """Plot a decibel-scaled STFT spectrogram of the start of ``x`` on a linear frequency axis.

    Only the lower half of the STFT frequency bins is shown.

    Parameters
    ----------
    x : np.ndarray
        Audio time series.
    num_samples : int
        Number of leading samples of ``x`` to analyse.
    n_fft : int, optional
        STFT window length in samples.
    hop_length : int, optional
        Number of samples between successive STFT windows.
    sample_rate : int, optional
        Sampling rate of ``x`` in Hz, forwarded to ``specshow`` for the
        frequency axis. The default equals librosa's own default, so existing
        calls render exactly as before; pass the rate the audio was loaded at.
    """
    x = x[:num_samples]

    spectrum = librosa.stft(x, n_fft=n_fft, hop_length=hop_length)
    # Keep only the lower half of the frequency bins.
    spectrum = spectrum[:spectrum.shape[0] // 2]

    # stft returns complex values. Take the magnitude explicitly instead of
    # handing complex data to amplitude_to_db, which warns about that.
    D = librosa.amplitude_to_db(np.abs(spectrum), ref=np.max)

    # NOTE(review): specshow is not told that the upper bins were dropped, so
    # the frequency tick labels may not line up with the cropped data -- confirm.
    librosa.display.specshow(D, sr=sample_rate, y_axis='linear')
    plt.colorbar(format='%+2.0f dB')
    plt.title('DB spectogram')
    plt.show()

def plot_log_db_scaled_spectogram(x, num_samples, n_fft=2048, hop_length=512, sample_rate=22050):
    """Plot a decibel-scaled STFT spectrogram of the start of ``x`` on a logarithmic frequency axis.

    Unlike the linear-axis variant, every STFT frequency bin is shown.

    Parameters
    ----------
    x : np.ndarray
        Audio time series.
    num_samples : int
        Number of leading samples of ``x`` to analyse.
    n_fft : int, optional
        STFT window length in samples.
    hop_length : int, optional
        Number of samples between successive STFT windows.
    sample_rate : int, optional
        Sampling rate of ``x`` in Hz, forwarded to ``specshow`` for the
        frequency axis. The default equals librosa's own default, so existing
        calls render exactly as before; pass the rate the audio was loaded at.
    """
    x = x[:num_samples]

    spectrum = librosa.stft(x, n_fft=n_fft, hop_length=hop_length)
    # stft returns complex values. Take the magnitude explicitly instead of
    # handing complex data to amplitude_to_db, which warns about that.
    D = librosa.amplitude_to_db(np.abs(spectrum), ref=np.max)

    librosa.display.specshow(D, sr=sample_rate, y_axis='log')
    plt.colorbar(format='%+2.0f dB')
    plt.title('DB spectogram')
    plt.show()

In [3]:
def plot_db(D):
    """Display an already-computed spectrogram ``D`` with a linear frequency axis.

    ``D`` is drawn as given (no conversion is applied here); the colour bar
    labels its values in dB.
    """
    colorbar_format = '%+2.0f dB'

    librosa.display.specshow(D, y_axis='linear')
    plt.colorbar(format=colorbar_format)
    plt.title('DB spectogram')
    plt.show()

# Journal
1) Using a window size (n_fft) of 0.25 s (sample_rate // 4) and hop_length = n_fft // 4, we get an accuracy of around 90% when predicting 6 classes. The number of training examples affects the accuracy; 100 examples per speaker works well.

# Things that can be done
1) Discard the upper half of the frequency bins after the FFT, because the highest frequencies are not actually needed.

In [4]:
# --- Data selection ---
test_dir = "./LibriSpeech/test-clean"  # Root folder, laid out as <speaker>/<book>/<audio files>
examples = []  # Collected feature vectors, each with its label appended
num_class = 6  # Number of speakers to use; must not exceed the number of speaker folders
num_examples_per_speaker = 200  # Upper bound on the examples collected for each speaker

# --- Audio and STFT settings ---
sample_rate = 44100  # Sampling rate in Hz requested when loading audio
seconds_per_example = 1  # Length in seconds of the audio chunk behind each example
n_fft = sample_rate // 4  # STFT window of 0.25 s (11025 samples)
hop_length = n_fft // 4  # Step between STFT windows: a quarter of the window (2756 samples)
use_decibel = False  # True: features are decibel values; False: STFT magnitudes
silence_threshold = 0.01  # Chunks whose peak sample value never exceeds this are treated as silence and skipped
# Number of lowest STFT frequency bins kept as features (2067 bins, about
# 0-8.3 kHz at 4 Hz per bin). Chosen by checking the mean and max of the STFT result.
frequency_threshold = n_fft * 3 // 16
# frequency_threshold = n_fft  # Alternative: keep every frequency bin

# --- Reproducibility ---
# Seed NumPy's global generator so the shuffles (and anything else drawing
# from it) give the same result on every Restart & Run All.
random_seed = 42
np.random.seed(random_seed)

In [5]:
def convert_series_to_multiple_examples(x, label):
    """Convert one time series into several labelled examples, one per STFT frame.

    Reads the notebook-level settings ``n_fft``, ``hop_length``,
    ``frequency_threshold`` and ``use_decibel``.

    Parameters
    ----------
    x : np.ndarray
        Audio time series (assumed one-dimensional -- TODO confirm).
    label : number
        Class label appended as the last column of every row.

    Returns
    -------
    np.ndarray
        Two-dimensional array with one row per STFT frame: the spectral values
        of the lowest ``frequency_threshold`` bins followed by ``label``.
    """
    spectrum = librosa.stft(x, n_fft=n_fft, hop_length=hop_length)

    # Keep only the lowest frequency bins and take the magnitude up front, so
    # amplitude_to_db never receives complex input (it warns when it does).
    magnitude = np.abs(spectrum[:frequency_threshold])

    if use_decibel:
        features = librosa.amplitude_to_db(magnitude, ref=np.max)
    else:
        features = magnitude

    # Put frames on the rows, then append the label as one extra column.
    features = np.transpose(features)
    return np.insert(features, features.shape[1], label, 1)

In [6]:
def convert_series_to_example(x, label):
    """Convert one time series into a single flat, labelled feature vector.

    Reads the notebook-level settings ``n_fft``, ``hop_length``,
    ``frequency_threshold`` and ``use_decibel``. The value of
    ``frequency_threshold`` was picked by inspecting the mean and max of the
    STFT result over successive slices of the spectrum.

    Parameters
    ----------
    x : np.ndarray
        Audio time series (assumed one-dimensional -- TODO confirm).
    label : number
        Class label appended as the final element of the vector.

    Returns
    -------
    np.ndarray
        One-dimensional array: the flattened spectral values of the lowest
        ``frequency_threshold`` bins across all STFT frames, followed by
        ``label``.
    """
    spectrum = librosa.stft(x, n_fft=n_fft, hop_length=hop_length)

    # Keep only the lowest frequency bins and take the magnitude up front, so
    # amplitude_to_db never receives complex input (it warns when it does).
    magnitude = np.abs(spectrum[:frequency_threshold])

    if use_decibel:
        features = librosa.amplitude_to_db(magnitude, ref=np.max)
    else:
        features = magnitude

    # Flatten the (bins x frames) grid into one vector and append the label.
    return np.append(features.flatten(), label)

In [7]:
def _iter_flac_paths(speaker_dir):
    """Yield the path of each ``.flac`` file in the sub-folders of ``speaker_dir``.

    Folder and file names are visited in sorted order so repeated runs see the
    recordings in the same sequence.
    """
    for book in sorted(listdir(speaker_dir)):
        book_dir = speaker_dir + "/" + book
        for audio_file in sorted(listdir(book_dir)):
            # Match on the real extension; this also copes with names that
            # contain no dot or several dots.
            if audio_file.endswith('.flac'):
                yield book_dir + "/" + audio_file


# Build the data set: for each of the first `num_class` speakers, cut the
# recordings into fixed-length chunks, skip the silent ones and convert the
# rest into labelled feature vectors (at most `num_examples_per_speaker` each).

# Number of audio samples in one example
hop_per_example = int(sample_rate * seconds_per_example)

num_speakers = 0
# sorted() makes the choice of speakers independent of the order in which the
# operating system happens to list the folders.
for speaker_id in sorted(listdir(test_dir)):
    if num_speakers >= num_class:
        break

    if not speaker_id.isdigit():
        # Speaker folders are named by their numeric id; skip anything else
        # (for example hidden files) instead of crashing in int().
        continue

    label = int(speaker_id)
    num_examples = 0
    curr_examples = []

    for file_name in _iter_flac_paths(test_dir + "/" + speaker_id):
        if num_examples >= num_examples_per_speaker:
            break

        # Load time series
        time_series, rate = librosa.load(file_name, sr=sample_rate)

        # Number of whole examples that fit into this recording
        duration = len(time_series) // sample_rate
        num_loops = int(duration // seconds_per_example)

        for chunk_index in range(num_loops):
            start = chunk_index * hop_per_example
            extracted = time_series[start:start + hop_per_example]

            # NOTE(review): only the largest (positive) sample value is
            # compared; np.abs(extracted) would test both polarities.
            if np.max(extracted) <= silence_threshold:
                continue

            curr_examples.append(convert_series_to_example(extracted, label))
            num_examples += 1

            if num_examples >= num_examples_per_speaker:
                break

    examples += curr_examples
    num_speakers += 1
    print("Speaker {} done".format(num_speakers))

Speaker 1 done
Speaker 2 done
Speaker 3 done
Speaker 4 done
Speaker 5 done
Speaker 6 done


In [8]:
# Earlier variant, kept for reference: E = np.concatenate(examples)
# Stack the collected example vectors into one array (one row per example)
# and shuffle the rows in place.
E = np.array(examples)
np.random.shuffle(E)
print(E.shape)

(1200, 35140)


In [None]:
# Old code
# num_class_to_predict = 6 # Number of speakers to used, must be smaller than num_speakers

# np.random.shuffle(examples) # Shuffle the speakers before extracting
# E = np.concatenate(examples[:num_class_to_predict])

In [9]:
# Use 90% of the rows for training and hold the remainder out for testing.
training_size = len(E) * 90 // 100
testing_size = len(E) - training_size
print("Training_size: {}\nTesting_size: {}".format(training_size, testing_size))

# Shuffle the rows again, then cut the array at the training boundary.
np.random.shuffle(E)
train, test = E[:training_size], E[training_size:]

# The label sits in the last column of every row; all columns before it are features.
features_train, labels_train = train[:, :-1], train[:, -1]
features_test, labels_test = test[:, :-1], test[:, -1]

Training_size: 1080
Testing_size: 120


In [11]:
from sklearn.neural_network import MLPClassifier

In [None]:
# Earlier configuration, kept for reference:
# clf = MLPClassifier(early_stopping=True, activation='logistic', hidden_layer_sizes=(100,100), alpha=0.00001, verbose=True)

# Two hidden layers (256 and 128 units) trained with adam; early stopping
# holds out part of the training data to decide when to stop.
mlp_settings = dict(
    early_stopping=True,
    activation='relu',
    alpha=1e-06,
    verbose=True,
    # NOTE(review): scikit-learn documents learning_rate as only used with
    # solver='sgd', so 'invscaling' looks inert alongside adam -- confirm.
    learning_rate='invscaling',
    solver='adam',
    hidden_layer_sizes=(256, 128),
    # A new classifier is constructed right here, so there is no earlier fit
    # on this object for warm_start to continue from.
    warm_start=True,
)
clf = MLPClassifier(**mlp_settings)
clf.fit(features_train, labels_train)

test_accuracy = clf.score(features_test, labels_test)
print("Accuracy: {}".format(test_accuracy))
print(clf)

Iteration 1, loss = 14.86042987
Validation score: 0.287037
Iteration 2, loss = 8.97531327
Validation score: 0.574074
Iteration 3, loss = 2.77990311
Validation score: 0.814815
Iteration 4, loss = 1.16104590
Validation score: 0.814815
Iteration 5, loss = 0.43860079
Validation score: 0.824074


# Network Configuration
early_stopping=True, activation='relu', alpha=1e-06, verbose=True, learning_rate='invscaling', solver='adam', hidden_layer_sizes=(256,128), warm_start=True

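With warm_start=True, calling fit again on the same classifier object continues from the previously learned weights instead of starting from scratch. Note that re-running the cell above constructs a new classifier, so training there still starts from scratch.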


In [None]:
# Untrained two-hidden-layer classifier (256 and 128 units) that starts from
# scratch on every fit and logs its progress.
clf = MLPClassifier(
    early_stopping=True,
    activation='relu',
    alpha=1e-06,
    verbose=True,
    solver='adam',
    hidden_layer_sizes=(256, 128),
    warm_start=False,
)

In [None]:
# Untrained classifier with a single hidden layer of 256 units, used as the
# base estimator for the validation curve. Progress logging is switched off.
clf = MLPClassifier(
    early_stopping=True,
    activation='relu',
    alpha=1e-06,
    verbose=False,
    solver='adam',
    # The trailing comma matters: (256) is just the integer 256, whereas
    # (256,) is the one-element tuple that describes one hidden layer.
    hidden_layer_sizes=(256,),
    warm_start=False,
)

In [None]:
from sklearn.model_selection import validation_curve

In [None]:
# Cross-validated scores of `clf` for a single hidden layer of 1, 2, 4, ..., 1024 units.
hidden_unit_counts = [2**i for i in range(11)]

# param_name and param_range are passed by keyword: recent scikit-learn
# releases no longer accept them positionally.
train_scores, valid_scores = validation_curve(
    clf,
    features_train,
    labels_train,
    param_name="hidden_layer_sizes",
    param_range=hidden_unit_counts,
    verbose=3,
)

# The plotting cells read the validation scores under the name `test_scores`;
# bind it here so they work on a fresh kernel.
test_scores = valid_scores


In [None]:
# Plot mean score, with a band of one standard deviation either side, against
# the number of hidden units for the training and cross-validation scores.
f = plt.figure()
title = "Number of Hidden Units Versus Accuracy\n1 Hidden Layer Neural Network"
plt.title(title)
plt.xlabel("Number of Hidden Units")
plt.ylabel("Accuracy")
plt.grid()

# Horizontal positions: hidden-layer sizes 1, 2, 4, ..., 1024
train_sizes_abs = [2**exponent for exponent in range(11)]

# Aggregate along axis 1 (presumably one column per cross-validation fold -- confirm)
train_scores_mean, train_scores_std = np.mean(train_scores, axis=1), np.std(train_scores, axis=1)
test_scores_mean, test_scores_std = np.mean(test_scores, axis=1), np.std(test_scores, axis=1)

score_curves = [
    (train_scores_mean, train_scores_std, "r", "Training score"),
    (test_scores_mean, test_scores_std, "g", "Cross-validation score"),
]

# Draw both shaded bands first, then both lines, keeping the original drawing order.
for curve_mean, curve_std, curve_color, curve_label in score_curves:
    plt.fill_between(train_sizes_abs, curve_mean - curve_std,
                     curve_mean + curve_std, alpha=0.1, color=curve_color)
for curve_mean, curve_std, curve_color, curve_label in score_curves:
    plt.plot(train_sizes_abs, curve_mean, 'o-', color=curve_color,
             label=curve_label)

# The sizes are powers of two, so a logarithmic axis spaces them evenly.
plt.xscale('log')
plt.legend(loc="best")
# plt.show()

f.savefig("hidden_units_one_layer_10_speakers.pdf")

In [None]:
# For storing stuff, don't simply run this
# speakers_x = x
# speakers_y = y

In [None]:
# Scatter accuracy against the number of speakers classified, overlay the
# least-squares straight line and save the result.
# NOTE(review): `x` and `y` are not assigned anywhere in the visible notebook;
# they must already hold the speaker counts and the matching accuracies.

# Start a new figure so that savefig below writes this plot rather than
# whatever figure `f` referred to in an earlier cell.
f = plt.figure()
plt.title("Number of Speakers Classified vs Accuracy")
plt.xlabel("Number of Speakers Classified")
plt.ylabel("Accuracy")

# Work on arrays so the fitted line can be evaluated element-wise even when
# the inputs are plain lists.
speaker_counts = np.asarray(x)
accuracies = np.asarray(y)
m, b = np.polyfit(speaker_counts, accuracies, 1)

plt.plot(speaker_counts, accuracies, '.')
plt.plot(speaker_counts, m * speaker_counts + b, '-')
# plt.show()

f.savefig("speakers_acc.pdf")

In [None]:
# Plot mean score, with a band of one standard deviation either side, against
# the number of training examples for the training and cross-validation scores.
# NOTE(review): `train_scores`, `test_scores` and `train_sizes_abs` must come
# from an earlier cell; no learning-curve computation is visible in this
# notebook -- confirm they hold learning-curve results before running.
# NOTE(review): the title says 10 speakers but the file name says 20 -- confirm.
f = plt.figure()
title = "Learning Curve Classifying 10 Speakers"
plt.title(title)
plt.xlabel("Training examples")
plt.ylabel("Score")
plt.grid()

# Aggregate along axis 1 (presumably one column per cross-validation fold -- confirm)
train_scores_mean, train_scores_std = np.mean(train_scores, axis=1), np.std(train_scores, axis=1)
test_scores_mean, test_scores_std = np.mean(test_scores, axis=1), np.std(test_scores, axis=1)

score_curves = [
    (train_scores_mean, train_scores_std, "r", "Training score"),
    (test_scores_mean, test_scores_std, "g", "Cross-validation score"),
]

# Draw both shaded bands first, then both lines, keeping the original drawing order.
for curve_mean, curve_std, curve_color, curve_label in score_curves:
    plt.fill_between(train_sizes_abs, curve_mean - curve_std,
                     curve_mean + curve_std, alpha=0.1, color=curve_color)
for curve_mean, curve_std, curve_color, curve_label in score_curves:
    plt.plot(train_sizes_abs, curve_mean, 'o-', color=curve_color,
             label=curve_label)

plt.legend(loc="best")
plt.show()

f.savefig("learning_curve_20_speakers.pdf")