# Introduction

This is our implementation of an InceptionNet-inspired model trained on the raw waveform (time-domain) dataset. Simply hit "Run All" and the results will be generated in 2 CSV files as intended. The weights and models will also be saved for reproducibility, so this notebook can be used to train a custom version of our model and to experiment further.

### **!!! Disclaimer:**

Because we use the raw waveform without a data generator here, loading all 9000 evaluation files into a single dataset would exhaust the available RAM. We therefore generate 3 subsections for each prediction; please download those CSV files and merge them manually to get the final_predictions scores.

# 0. Setup

First, we import and set up all the libraries needed to implement this solution. If some of the packages are not installed on your device, please refer to the README file for installation directions.

In [19]:
!pip install -r "requirement (1).txt"



### utility libraries

In [20]:
import os
import pathlib
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import librosa
import librosa.display
import pandas as pd
import tensorflow.keras as keras

from tensorflow.keras import layers
from tensorflow.keras import models
from IPython import display
from pathlib import Path
from IPython.display import Audio
from sklearn.model_selection import train_test_split

### declaring hyperparameters

In [21]:
# Experiment-wide hyperparameters.
seed = 42                # shared seed so repeated runs are reproducible
SR = 16000               # resampling rate (Hz); resampling as the RAM can't handle more
AUD_LENGTH = 10          # every clip is fitted to this many seconds
BATCH_SIZE = 4           # default batch size of the data generators
TRAIN_TEST_SPLIT = 0.2   # fraction of the files held out for validation

# Seed both random number generators used by the notebook.
np.random.seed(seed)
tf.random.set_seed(seed)

### All file paths

In [22]:
# insert needed paths here

# this is the known and unknown dataset path
DATASET_AUDIO_PATH = 'classwise_final_2k/classwise_final_2k'

# this is the random extra data addition folders
ASVSPOOF_DATA_PATH  = 'external files/asvspoof'
LIBRISPEECH_DATA_PATH = 'external files/librispeech'

# this is the evaluation folder for phase 1 and 2
EVAL_PATH_1 = 'spcup_2022_eval_part1'
# Previously this duplicated EVAL_PATH_1, so phase 2 would have re-read the
# part-1 files. NOTE(review): adjust if your part-2 folder is named differently.
EVAL_PATH_2 = 'spcup_2022_eval_part2'

### saving paths
CSV_DIR = './'           # where prediction CSV files are written
MODEL_SAVE_DIR = './'    # where the full model file is written
WEIGHT_SAVE_DIR = './'   # where checkpointed weights are written

# 1. Dataset Generation

In [23]:
# Collect every .wav file under DATASET_AUDIO_PATH together with its integer
# class label (the name of the sub-folder it lives in). These two parallel
# lists are split into train/validation sets and fed to the generators later.
dataset_root = Path(DATASET_AUDIO_PATH)

# os.listdir() returns entries in arbitrary, platform-dependent order. Sorting
# keeps the file order (and therefore the seeded train/validation split)
# identical across machines. Stray non-directory entries are skipped so they
# are neither counted as classes nor passed to int().
class_names = sorted(
    entry for entry in os.listdir(dataset_root) if (dataset_root / entry).is_dir()
)
print("Our class names: {}".format(class_names,))

audio_paths = []
labels = []
for name in class_names:
    # The folder name itself is the class id.
    label = int(name)
    print("Processing speaker {}".format(name,))
    print("Actual Label ",label)
    dir_path = dataset_root / name
    speaker_sample_paths = [
        os.path.join(dir_path, filepath)
        for filepath in sorted(os.listdir(dir_path))
        if filepath.endswith(".wav")
    ]
    audio_paths += speaker_sample_paths
    labels += [label] * len(speaker_sample_paths)

print(
    "Found {} files belonging to {} classes.".format(len(audio_paths), len(class_names))
)

Our class names: ['0', '1', '2', '3', '4', '5']
Processing speaker 0
Actual Label  0
Processing speaker 1
Actual Label  1
Processing speaker 2
Actual Label  2
Processing speaker 3
Actual Label  3
Processing speaker 4
Actual Label  4
Processing speaker 5
Actual Label  5
Found 11200 files belonging to 6 classes.


In [24]:
# Hold out TRAIN_TEST_SPLIT of the (path, label) pairs for validation; the
# fixed random_state makes the partition repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    audio_paths,
    labels,
    test_size=TRAIN_TEST_SPLIT,
    random_state=seed,
)

In [25]:
# Sanity check: each split must have as many labels as file paths.
for split_paths, split_labels in ((X_train, y_train), (X_val, y_val)):
    print(len(split_paths), len(split_labels))

8960 8960
2240 2240


In [26]:
# utility functions for repeating audio files
def repeated_data(file_path):
    """Load an audio file as a waveform of exactly ``AUD_LENGTH`` seconds.

    The file is decoded and resampled to ``SR`` Hz by ``librosa.load``. A clip
    shorter than the target is repeated back-to-back until it is long enough,
    and the result is cut to exactly ``AUD_LENGTH * SR`` samples.

    Args:
        file_path: Path of the audio file to read.

    Returns:
        The waveform array with ``AUD_LENGTH * SR`` samples.

    Raises:
        ValueError: If the file decodes to zero samples, since there is
            nothing to repeat.
    """
    y, sr = librosa.load(file_path, sr=SR)
    target_len = AUD_LENGTH * sr  # number of samples in an AUD_LENGTH-second clip
    n_samples = len(y)

    if n_samples == 0:
        # The previous duration-based arithmetic divided by zero here.
        raise ValueError("Audio file contains no samples: {}".format(file_path))

    if n_samples < target_len:
        # Ceiling division: the fewest whole repetitions that reach target_len.
        repeats = -(-target_len // n_samples)
        y = np.tile(y, repeats)

    # librosa.load already resampled to SR, so no further resampling is needed.
    return y[:target_len]

def repeated_dataset(dataset):
    """Return a list with the fixed-length waveform of every path in ``dataset``.

    Each element of ``dataset`` is passed to ``repeated_data`` in order.
    """
    return [repeated_data(audio_file) for audio_file in dataset]

In [27]:
class DataGenerator(keras.utils.Sequence):
    """Keras ``Sequence`` that loads fixed-length raw waveforms batch by batch.

    Audio files are read lazily in ``__getitem__`` so only one batch of
    waveforms has to be held in memory at a time.

    Args:
        list_IDs: Sequence of audio file paths.
        labels: Sequence of integer class labels, aligned with ``list_IDs``.
        batch_size: Number of samples per batch.
        n_classes: Number of target classes. Stored for reference only; the
            labels are emitted as-is, not one-hot encoded.
        shuffle: Whether to reshuffle the sample order on creation and after
            every epoch.
    """

    def __init__(self, list_IDs, labels, batch_size= BATCH_SIZE, 
                 n_classes=6, shuffle=True):
        """Store the configuration and build the initial sample order."""
        # Let the Keras base class initialise itself.
        super().__init__()
        self.dim = AUD_LENGTH * SR  # samples per waveform
        self.batch_size = batch_size
        self.labels = labels
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.list_IDs = list_IDs
        self.on_epoch_end()

    def path_to_audio(self,path):
        """Reads and decodes an audio file into a fixed-length waveform."""
        return repeated_data(path)

    def __len__(self):
        """Return the number of full batches per epoch.

        A trailing partial batch is dropped because of the floor.
        """
        return int(np.floor(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, index):
        """Return the ``(X, y)`` pair for batch number ``index``."""
        # Positions (into list_IDs / labels) of the samples in this batch.
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        return self.__data_generation(indexes)

    def on_epoch_end(self):
        """Rebuild the sample order, shuffling it when ``shuffle`` is set."""
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp):
        """Load the waveforms and labels for the given sample positions.

        Args:
            list_IDs_temp: Positions into ``self.list_IDs`` / ``self.labels``.

        Returns:
            Tuple ``(X, y)`` of float32 arrays: ``X`` has shape
            ``(len(list_IDs_temp), self.dim, 1)`` and ``y`` has shape
            ``(len(list_IDs_temp),)``.
        """
        waveforms = [self.path_to_audio(self.list_IDs[ID]) for ID in list_IDs_temp]
        targets = [self.labels[ID] for ID in list_IDs_temp]

        # Size the batch axis from the samples actually loaded instead of
        # assuming a full batch, and add a trailing channel axis for Conv1D.
        X = np.asarray(waveforms, dtype=np.float32).reshape(len(waveforms), self.dim, 1)
        y = np.asarray(targets, dtype=np.float32)
        return X, y

In [28]:
# Wrap each split in a batch generator (default batch size and shuffling).
train_ds = DataGenerator(list_IDs=X_train, labels=y_train)
valid_ds = DataGenerator(list_IDs=X_val, labels=y_val)

# 2. Building A Model

In [29]:
### model 2
def DilatedConvModule(xx, filters):
    """Chain four Conv1D -> BatchNorm -> ReLU stages and merge their outputs.

    The stages are applied one after another (each consumes the previous
    stage's output) with shrinking kernels and growing dilation rates. The
    four stage outputs are then concatenated along axis 1 and passed through a
    final BatchNorm -> ReLU.

    Args:
        xx: Input tensor of the module.
        filters: Number of filters used by every convolution in the module.

    Returns:
        The merged, normalised and activated tensor.
    """
    # (kernel_size, dilation_rate) of each stage, in application order.
    stage_specs = ((9, 1), (7, 2), (3, 4), (1, 8))

    stage_outputs = []
    features = xx
    for kernel_size, dilation_rate in stage_specs:
        features = tf.keras.layers.Conv1D(
            filters,
            kernel_size=kernel_size,
            dilation_rate=dilation_rate,
            padding="same",
        )(features)
        features = tf.keras.layers.BatchNormalization()(features)
        features = tf.keras.layers.ReLU()(features)
        stage_outputs.append(features)

    # NOTE(review): axis=1 stacks the stage outputs along the steps axis, not
    # the channel axis — confirm this is intended for an inception-style block.
    merged = tf.keras.layers.Concatenate(axis=1)(stage_outputs)
    merged = tf.keras.layers.BatchNormalization()(merged)
    merged = tf.keras.layers.ReLU()(merged)

    return merged

def build_model(input_shape, num_classes):
    """Build the raw-waveform classifier.

    Layout: a Conv1D stem with max pooling, four ``DilatedConvModule`` blocks
    (32, 64, 128 and 256 filters) each followed by max pooling, global average
    pooling, two dense layers with dropout, and a softmax output.

    Args:
        input_shape: Shape of one input example, without the batch axis.
        num_classes: Number of units in the softmax output layer.

    Returns:
        An uncompiled ``tf.keras`` model mapping ``input`` to ``output``.
    """
    L = tf.keras.layers

    inputs = L.Input(shape=input_shape, name="input")

    # Stem: one convolution followed by 4x temporal down-sampling.
    x = L.Conv1D(16, kernel_size=9, dilation_rate=1, padding="same")(inputs)
    x = L.BatchNormalization()(x)
    x = L.ReLU()(x)
    x = L.MaxPool1D(pool_size=4)(x)

    # First three dilated modules, each followed by 4x max pooling.
    for filters in (32, 64, 128):
        x = DilatedConvModule(x, filters)
        x = L.MaxPool1D(pool_size=4)(x)

    # Last module; its pooling window equals the size of the last axis of the
    # module output.
    x = DilatedConvModule(x, 256)
    x = L.MaxPool1D(pool_size=x.shape[-1])(x)

    x = L.GlobalAveragePooling1D()(x)

    # Classification head.
    x = L.Flatten()(x)
    x = L.Dense(64, activation="relu")(x)
    x = L.Dense(32, activation="relu")(x)
    x = L.Dropout(0.25)(x)

    outputs = L.Dense(num_classes, activation="softmax", name="output")(x)

    return tf.keras.models.Model(inputs=inputs, outputs=outputs)


# One example is AUD_LENGTH seconds of audio at SR Hz with a single channel.
aud_length = SR * AUD_LENGTH
model = build_model(input_shape=(aud_length, 1), num_classes=len(class_names))
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input (InputLayer)             [(None, 160000, 1)]  0           []                               
                                                                                                  
 conv1d (Conv1D)                (None, 160000, 16)   160         ['input[0][0]']                  
                                                                                                  
 batch_normalization (BatchNorm  (None, 160000, 16)  64          ['conv1d[0][0]']                 
 alization)                                                                                       
                                                                                                  
 re_lu (ReLU)                   (None, 160000, 16)   0           ['batch_normalization[0][0]']

In [30]:
# configuring the run
# Compile the model using Adam's default learning rate
model.compile(
    optimizer="Adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)

# Add callbacks:
# 'ReduceLROnPlateau' to halve the learning rate when the loss stops improving
# 'EarlyStopping' to stop training when the model is not enhancing anymore
# 'ModelCheckPoint' to always keep the weights that have the best val_accuracy
# Checkpointed weights go to the configured WEIGHT_SAVE_DIR.
weight_save_filename = os.path.join(WEIGHT_SAVE_DIR, "weight_inceptionnet_final_50ep.h5")

# NOTE(review): the next two callbacks monitor the *training* loss ('loss'),
# not 'val_loss' — confirm that this is intended.
lr_reduce = tf.keras.callbacks.ReduceLROnPlateau(monitor='loss', factor=0.5, patience=5, verbose=1, mode='min', min_lr=1e-9)
earlystopping_cb = tf.keras.callbacks.EarlyStopping(monitor='loss', min_delta=0.001, patience=10, mode='min', restore_best_weights=True)
mdlcheckpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    weight_save_filename, monitor="val_accuracy", save_best_only=True,save_weights_only=True
)

# 3. Training the Model

In [31]:
# Upper bound on the number of epochs; early stopping may end training sooner.
EPOCHS = 50
history = model.fit(
    train_ds,
    epochs=EPOCHS,
    validation_data=valid_ds,
    callbacks=[lr_reduce,earlystopping_cb, mdlcheckpoint_cb],
)
# Save the full model into the configured MODEL_SAVE_DIR.
model.save(os.path.join(MODEL_SAVE_DIR, "model_inceptionnet_final_50ep.h5"))

Epoch 1/50
Epoch 2/50

KeyboardInterrupt: 