# 0. setup

In [14]:
import os
import pathlib

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import librosa
import librosa.display
import pandas as pd
import tensorflow.keras as keras
import tensorflow_io as tfio

from tensorflow.keras import layers
from tensorflow.keras import models
from IPython import display
from pathlib import Path
from IPython.display import Audio

from sklearn.model_selection import train_test_split

# Global configuration shared by every cell below.
# `seed` is passed to the TensorFlow and NumPy global RNGs here and reused as
# `random_state` for the scikit-learn splits later in the notebook.
seed = 42
SR=16000 # sample rate (Hz) audio is loaded at; kept low because RAM can't handle more
BATCH_SIZE = 8
AUD_LENGTH = 10# length in seconds every clip is tiled/truncated to
tf.random.set_seed(seed)
np.random.seed(seed)

# 1. Dataset generation

In [15]:
# Collect every .wav path under DATASET_AUDIO_PATH together with its integer
# class label. Each sub-directory name is itself the label ('0', '1', ...).

DATASET_AUDIO_PATH = 'classwise_final_2k_imtiaz/classwise_final_2k/'
# os.listdir() returns entries in arbitrary order. Sorting keeps the order of
# `audio_paths` (and therefore every later slice or split of it) identical
# from run to run and machine to machine.
class_names = sorted(os.listdir(DATASET_AUDIO_PATH))
print("Our class names: {}".format(class_names,))

audio_paths = []
labels = []
for name in class_names:
    # The directory name is the label; int() raises ValueError on a stray
    # non-numeric entry, which is the desired loud failure.
    label = int(name)
    print("Processing speaker {}".format(name,))
    print("Actual Label ",label)
    dir_path = Path(DATASET_AUDIO_PATH) / name
    speaker_sample_paths = [
        os.path.join(dir_path, filepath)
        for filepath in sorted(os.listdir(dir_path))
        if filepath.endswith(".wav")
    ]
    audio_paths += speaker_sample_paths
    labels += [label] * len(speaker_sample_paths)

print(
    "Found {} files belonging to {} classes.".format(len(audio_paths), len(class_names))
)

Our class names: ['0', '1', '2', '3', '4', '5']
Processing speaker 0
Actual Label  0
Processing speaker 1
Actual Label  1
Processing speaker 2
Actual Label  2
Processing speaker 3
Actual Label  3
Processing speaker 4
Actual Label  4
Processing speaker 5
Actual Label  5
Found 10500 files belonging to 6 classes.


## asvspoof

In [16]:
## Adding ASVSpoof data
# A small slice of the ASVspoof files is appended to the training pool with
# class label 5; the remainder is kept in `check` for later use.
eval_path ='dataset_all/external files/asvspoof' 
# os.listdir() order is arbitrary: sort before truncating so the 30000-file cap
# and the seeded split below pick the same files on every run.
speaker_sample_paths = [ os.path.join(eval_path, filepath) 
                        for filepath in sorted(os.listdir(eval_path)) if filepath.endswith(".wav") ] 
X_asv = speaker_sample_paths[:30000]
label_asv = [5]*len(X_asv)
print( "Found {} files".format(len(X_asv)))
# test_size=0.005 -> the small "test" part (X_taken) joins the training pool,
# the large remainder (check) is held back.
check, X_taken, _, _ = train_test_split(X_asv, label_asv, test_size=0.005, random_state=seed)
print(len(X_taken))
audio_paths+= X_taken
labels += [5]*len(X_taken)
print(len(labels))

Found 30000 files
150
10650


In [17]:
# `check` is the part of the ASVspoof split that was NOT appended to
# audio_paths; keep it under a clearer name and display its size.
final_test = check
len(final_test)

29850

## librispeech

In [18]:
## Adding librispeech data
# 15% of the LibriSpeech files are appended to the training pool with class
# label 5; the other 85% are kept in `libri_test`.
eval_path ='dataset_all/external files/librispeech' 
# os.listdir() order is arbitrary: sort so the seeded split below selects the
# same files on every run.
speaker_sample_paths = [ os.path.join(eval_path, filepath) 
                        for filepath in sorted(os.listdir(eval_path)) if filepath.endswith(".wav") ] 
# NOTE: X_asv / label_asv are reused names from the ASVspoof cell and are
# overwritten here with LibriSpeech paths.
X_asv = list(speaker_sample_paths)
label_asv = [5]*len(X_asv)
print( "Found {} files".format(len(X_asv)))
X_taken,libri_test, _, _ = train_test_split(X_asv, label_asv, test_size=0.85, random_state=seed)
print(len(libri_test))
audio_paths+= X_taken
labels += [5]*len(X_taken)
print(len(labels))

Found 1000 files
850
10800


In [19]:
# 70/30 hold-out split, seeded for reproducibility.
# NOTE(review): only the first 6300 entries are split. audio_paths was filled
# one class directory after another and the ASVspoof/LibriSpeech files were
# appended at the end, so this slice leaves out the later classes and all of
# the external data -- confirm this truncation is intentional.
X_train, X_val, y_train, y_val = train_test_split(audio_paths[:6300], labels[:6300], test_size=0.3, random_state=seed)

In [20]:
# Sanity check: paths and labels must have matching lengths in each split.
print(len(X_train),len(y_train))
print(len(X_val),len(y_val))

4410 4410
1890 1890


### making repeated dataset

In [21]:
# utility functions for repeating audio files
def repeated_data(file_path):
    """Load an audio file and return a waveform of exactly AUD_LENGTH seconds.

    The file is loaded at SR Hz. Clips shorter than AUD_LENGTH seconds are
    repeated end-to-end until they are long enough; the result is then
    truncated to AUD_LENGTH * SR samples.

    Parameters
    ----------
    file_path : path of the audio file, passed straight to ``librosa.load``.

    Returns
    -------
    The waveform array, AUD_LENGTH * SR samples long.

    Raises
    ------
    ValueError
        If the file decodes to zero samples (nothing to repeat).
    """
    y, sr = librosa.load(file_path,sr=SR)
    target_len = AUD_LENGTH*sr # number of samples in the fixed-length output
    n_samples = len(y)
    if n_samples == 0:
        # Previously this surfaced as a ZeroDivisionError deep in the tiling
        # arithmetic; fail with the offending path instead.
        raise ValueError("Audio file contains no samples: {}".format(file_path))
    if n_samples < target_len:
        # Ceil division: the smallest repeat count that reaches target_len.
        repeats = -(-target_len // n_samples)
        y = np.tile(y, repeats)
    # librosa.load already resampled to SR, so no further resampling is needed.
    return y[:target_len]

def repeated_dataset(dataset):
    """Return the fixed-length waveform of every file path in `dataset`, in order."""
    return [repeated_data(audio_path) for audio_path in dataset]

### datagenerator

In [22]:
class DataGenerator(keras.utils.Sequence):
    """Keras Sequence that loads fixed-length waveforms from disk batch by batch.

    Parameters
    ----------
    list_IDs : sequence of audio file paths.
    labels : sequence of class labels, aligned index-for-index with `list_IDs`.
    batch_size : number of samples per batch (defaults to BATCH_SIZE).
    n_classes : number of target classes; stored for reference only.
    shuffle : when true, sample order is re-drawn at construction and in
        every `on_epoch_end` call.

    Only full batches are produced: `len(list_IDs) % batch_size` trailing
    samples (in the current order) are left out of each pass.
    """
    def __init__(self, list_IDs, labels, batch_size= BATCH_SIZE, 
                 n_classes=6, shuffle=True):
        """Store the configuration and draw the initial sample order."""
        super().__init__()
        self.dim = AUD_LENGTH * SR  # samples per waveform
        self.batch_size = batch_size
        self.n_classes = n_classes
        self.labels = labels
        self.shuffle = shuffle
        self.list_IDs = list_IDs
        self.on_epoch_end()

    def path_to_audio(self,path):
        """Reads and decodes an audio file into a fixed-length waveform."""
        return repeated_data(path)

    def __len__(self):
        """Number of full batches per epoch."""
        return len(self.list_IDs) // self.batch_size

    def __getitem__(self, index):
        """Return batch number `index` as an `(X, y)` pair."""
        # Positions (into list_IDs / labels) that make up this batch.
        batch_positions = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        return self.__data_generation(batch_positions)

    def on_epoch_end(self):
        """Reset the sample order, reshuffling it when `shuffle` is enabled."""
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp):
        """Load the waveforms and labels for the given sample positions.

        Returns `X` with shape (n, self.dim, 1) and `y` with shape (n,), both
        float32, where n is the number of positions passed in.
        """
        X = [self.path_to_audio(self.list_IDs[pos]) for pos in list_IDs_temp]
        y = [self.labels[pos] for pos in list_IDs_temp]
        # Add a trailing channel axis; size the batch axis from the data
        # actually loaded rather than assuming a full batch.
        X = np.reshape(np.array(X), (len(X), self.dim, 1)).astype(np.float32)
        return X, np.array(y).astype(np.float32)

In [23]:
# Build the Keras Sequence generators for the hold-out split. These are
# DataGenerator instances (not tf.data datasets); both use the default
# shuffle=True and BATCH_SIZE.
train_ds = DataGenerator(X_train,y_train)
valid_ds = DataGenerator(X_val,y_val)

# 2. Building A Model

In [24]:
# Multi-head attention layers with 2, 3 and 4 heads (key_dim=2 each).
# NOTE(review): nothing in the visible notebook calls these from live code --
# they appear only in commented-out lines. Confirm whether they are still needed.
attLayer2 = tf.keras.layers.MultiHeadAttention(num_heads=2, key_dim=2)
attLayer3 = tf.keras.layers.MultiHeadAttention(num_heads=3, key_dim=2)
attLayer4 = tf.keras.layers.MultiHeadAttention(num_heads=4, key_dim=2)

In [25]:
# model3
def _conv_lstm_stage(x, filters, kernel_size, depth, pool_size=4):
    """Apply `depth` chained Conv1D -> LSTM pairs, then merge, pool and rectify.

    Each pair consumes the previous pair's output. The sequence output of every
    LSTM is kept, and all of them are concatenated along axis 1 (the time axis)
    before MaxPool1D(pool_size) and ReLU are applied.
    """
    pair_outputs = []
    for _pair in range(depth):
        x = tf.keras.layers.Conv1D(filters, kernel_size=kernel_size, padding="same")(x)
        # return_state=True is kept from the original layer configuration;
        # only the per-timestep sequence output is used.
        x, _h, _c = tf.keras.layers.LSTM(filters, return_sequences=True, return_state=True)(x)
        pair_outputs.append(x)
    x = tf.keras.layers.Concatenate(axis=1)(pair_outputs)
    x = tf.keras.layers.MaxPool1D(pool_size=pool_size)(x)
    return tf.keras.layers.ReLU()(x)


def vgg191D(input_shape,num_classes):
    """Build the 1-D Conv/LSTM classifier operating on raw waveforms.

    Parameters
    ----------
    input_shape : shape of one sample without the batch axis, e.g.
        ``(n_samples, 1)``.
    num_classes : size of the softmax output layer.

    Returns
    -------
    An uncompiled ``tf.keras.Model`` whose input layer is named "input" and
    whose output layer is named "output".

    Notes
    -----
    The fifth stage previously fed the second LSTM's output into its third
    LSTM, which left the third Conv1D of that stage disconnected. Every stage
    is now built by one helper so each Conv1D feeds its own LSTM. This adds a
    layer to the graph, so weight files saved from the old graph will not load
    into this one. The commented-out attention-merge experiments were dropped.
    """
    inputs = tf.keras.layers.Input(shape=input_shape, name="input")

    x = _conv_lstm_stage(inputs, filters=16, kernel_size=9, depth=2)
    x = _conv_lstm_stage(x, filters=64, kernel_size=7, depth=2)
    x = _conv_lstm_stage(x, filters=128, kernel_size=3, depth=4)
    x = _conv_lstm_stage(x, filters=64, kernel_size=7, depth=4)
    # The original pooled this stage with pool_size = x.shape[-1], i.e. the
    # channel count of the concatenated tensor (16). Kept as-is.
    # NOTE(review): confirm a channel-count-sized pool over time is intended.
    x = _conv_lstm_stage(x, filters=16, kernel_size=9, depth=4, pool_size=16)

    # Head: pointwise conv -> LSTM -> average over time -> dense classifier.
    x = tf.keras.layers.Conv1D(64, kernel_size = 1, padding="same")(x)
    x, _h, _c = tf.keras.layers.LSTM(128, return_sequences=True, return_state=True)(x)
    x = tf.keras.layers.GlobalAveragePooling1D()(x)
    x = tf.keras.layers.ReLU()(x)

    x = tf.keras.layers.Flatten()(x)
    x = tf.keras.layers.Dense(64, activation="relu")(x)
    x = tf.keras.layers.Dropout(0.5)(x)
    outputs = tf.keras.layers.Dense(num_classes, activation="softmax", name="output")(x)

    return tf.keras.models.Model(inputs=inputs, outputs=outputs)

# Number of samples in one fixed-length clip.
aud_length = SR*AUD_LENGTH

# Input is a mono waveform: (samples, 1 channel); one output unit per class directory.
model = vgg191D((aud_length, 1), len(class_names))

model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input (InputLayer)             [(None, 160000, 1)]  0           []                               
                                                                                                  
 conv1d_34 (Conv1D)             (None, 160000, 16)   160         ['input[0][0]']                  
                                                                                                  
 lstm_34 (LSTM)                 [(None, 160000, 16)  2112        ['conv1d_34[0][0]']              
                                , (None, 16),                                                     
                                 (None, 16)]                                                      
                                                                                            

In [26]:
from sklearn.model_selection import KFold, StratifiedKFold
EPOCHS=100
NFOLDS=5
# audio_paths is ordered class by class, so unshuffled KFold would produce
# validation folds made up of only one or two classes. Shuffle with the global
# seed and stratify on the labels so every fold mirrors the class balance.
folds = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=seed)
# NOTE: split() returns a one-shot generator; re-run this cell before
# re-running the fold loop.
splits = folds.split(audio_paths, labels)

def evaluate_model(X_train, X_val, y_train, y_val,j):
    """Train a fresh vgg191D model on one fold and score it on the validation data.

    Parameters
    ----------
    X_train, y_train : training file paths and their labels.
    X_val, y_val : validation file paths and their labels.
    j : fold identifier, embedded in the saved weight/model file names.

    Returns
    -------
    (model, val_acc) : the model carrying the best-``val_accuracy`` checkpoint
    weights, and that same model's accuracy on the validation generator.

    Side effects: writes ``weight_vgg19_lstm_pretrained_cv<j>fold_.h5`` and
    ``model_vgg19_lstm_pretrained_cv<j>fold_.h5`` to the working directory.
    """
    train_ds = DataGenerator(X_train,y_train)
    valid_ds = DataGenerator(X_val,y_val)
    model = vgg191D((AUD_LENGTH * SR, 1), len(class_names))
    model.compile(
        optimizer="Adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
    weight_save_filename = "weight_vgg19_lstm_pretrained_cv"+str(j)+"fold_.h5"
    # NOTE(review): LR scheduling and early stopping watch the *training* loss,
    # while the checkpoint watches val_accuracy -- confirm this is intended.
    lr_reduce = tf.keras.callbacks.ReduceLROnPlateau(monitor='loss', factor=0.5, patience=5, verbose=1, mode='min', min_lr=1e-9)
    earlystopping_cb = tf.keras.callbacks.EarlyStopping(monitor='loss', min_delta=0.001, patience=10, mode='min', restore_best_weights=True)
    mdlcheckpoint_cb = tf.keras.callbacks.ModelCheckpoint(weight_save_filename, monitor="val_accuracy", save_best_only=True,save_weights_only=True)
    model.fit(
        train_ds,
        epochs=EPOCHS,
        validation_data=valid_ds,
        callbacks=[lr_reduce,earlystopping_cb, mdlcheckpoint_cb],
    )

    # Restore the best checkpoint BEFORE scoring, so the reported accuracy
    # describes the weights that are actually returned and saved.
    model.load_weights("./"+weight_save_filename)
    _, val_acc = model.evaluate(valid_ds, verbose = 1)
    model.save("model_vgg19_lstm_pretrained_cv"+str(j)+"fold_.h5")
    return model, val_acc

# Run cross-validation: train one model per fold, keep every model for the
# ensemble and remember the best-scoring one.
fin_model = None  # becomes the model with the highest validation accuracy
cv_scores, model_history = list(), list()
train = audio_paths
targets = labels
for fold, (train_idx, val_idx) in enumerate(splits):
    # NOTE: X_train / y_train overwrite the hold-out split variables of the
    # same name defined earlier in the notebook.
    X_train = [train[i] for i in train_idx]
    y_train = [targets[i] for i in train_idx]
    X_valid = [train[i] for i in val_idx]
    y_valid = [targets[i] for i in val_idx]
    print('-'*15, '>', f'Fold {fold+1}', '<', '-'*15)
    # Validate on THIS fold's held-out samples. The earlier code passed the
    # global X_val / y_val, so every fold was scored on the same data, which
    # could overlap the fold's training set.
    model, val_acc = evaluate_model(X_train, X_valid, y_train, y_valid, fold)
    print('>%.3f' % val_acc)
    cv_scores.append(val_acc)
    if val_acc == max(cv_scores):
        fin_model = model
    model_history.append(model)

--------------- > Fold 1 < ---------------
Epoch 1/100

In [None]:
def ensemble_predictions(members, testX,testy=1):
    """Predict class indices by summing the class scores of several models.

    Parameters
    ----------
    members : iterable of objects exposing ``predict(testX)``; every call must
        return an array of the same ``(n_samples, n_classes)`` shape.
    testX : input passed unchanged to each member's ``predict``. It must yield
        samples in the same order on every pass, otherwise scores belonging to
        different samples are added together.
    testy : unused; kept for backward compatibility with existing callers.

    Returns
    -------
    1-D array with the arg-max class index per sample.

    Raises
    ------
    ValueError
        If `members` is empty.
    """
    members = list(members)
    if not members:
        raise ValueError("ensemble_predictions requires at least one model in `members`")
    # Shape: (n_members, n_samples, n_classes)
    yhats = np.array([member.predict(testX) for member in members])
    # Sum across ensemble members, then pick the highest-scoring class.
    summed = np.sum(yhats, axis=0)
    return np.argmax(summed, axis=1)

In [None]:
# Use a non-shuffling generator for ensembling. `valid_ds` reshuffles in
# on_epoch_end, so successive model.predict() passes can see the samples in
# different orders; the summed scores would then mix different samples and
# could not be matched back to y_val.
ensemble_eval_ds = DataGenerator(X_val, y_val, shuffle=False)
preds = ensemble_predictions(model_history, ensemble_eval_ds)
preds

array([1, 1, 2, ..., 0, 1, 2], dtype=int64)