# 0.setup

In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
import os
import pathlib
import pickle

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import tensorflow as tf
import librosa
import librosa.display
import pandas as pd

import tensorflow.keras as keras
#import tensorflow_io as tfio

from tqdm import tqdm
from tensorflow.keras import layers
from tensorflow.keras import models
from IPython import display
from pathlib import Path
from IPython.display import Audio


from sklearn.model_selection import train_test_split


# Set the seed value for experiment reproducibility.
seed = 42
SR=16000 # resampling as the ram can't handle this much calculation
BATCH_SIZE = 4
AUD_LENGTH = 10#sec
tf.random.set_seed(seed)
np.random.seed(seed)

# 1.Dataset Generation

In [3]:
# getting audio dataset path to divide into 3 datasets and also for making tf datasets later

DATASET_AUDIO_PATH = 'semisup_ds/semisup_ds/labeled'
class_names = os.listdir(DATASET_AUDIO_PATH)
print("Our class names: {}".format(class_names,))

audio_paths = []
labels = []
for label, name in enumerate(class_names):
    label=int(name)
    print("Processing speaker {}".format(name,))
    print(label)
    dir_path = Path(DATASET_AUDIO_PATH) / name
    speaker_sample_paths = [
        os.path.join(dir_path, filepath)
        for filepath in os.listdir(dir_path)
        if filepath.endswith(".wav")
    ]
    audio_paths += speaker_sample_paths
    labels += [label] * len(speaker_sample_paths)

print(
    "Found {} files belonging to {} classes.".format(len(audio_paths), len(class_names))
)

Our class names: ['0', '1', '2', '3', '4', '5']
Processing speaker 0
0
Processing speaker 1
1
Processing speaker 2
2
Processing speaker 3
3
Processing speaker 4
4
Processing speaker 5
5
Found 6700 files belonging to 6 classes.


In [4]:
X_train, X_val, y_train, y_val = train_test_split(audio_paths, labels, test_size=0.3, random_state=seed)

In [5]:
print(len(X_train),len(y_train))
print(len(X_val),len(y_val))

4690 4690
2010 2010


In [6]:
# utility functions for repeating audio files
def repeated_data(file_path):
    """ This function will take a file path and give out truncated and padded to 10s version waveform"""
    y, sr = librosa.load(file_path,sr=SR)
    aud_length = AUD_LENGTH*sr # making all audio length 10 s and truncating the rest
    duration = librosa.get_duration(y=y, sr=sr)
    if duration < AUD_LENGTH:
        y = np.tile(y, int((aud_length/sr) // duration)+1)
    y = librosa.resample(y[:aud_length], orig_sr=sr, target_sr=SR)
    return y

def repeated_dataset(dataset):
    """ This function generated waveshape dataset"""
    new_ds = []
    for f in dataset:
        new_ds.append(repeated_data(f))
    return new_ds

In [7]:
class DataGenerator(keras.utils.Sequence):
    'Generates data for Keras'
    def __init__(self, list_IDs, labels, batch_size= BATCH_SIZE, 
                 n_classes=6, shuffle=True):
        'Initialization'
        self.dim = AUD_LENGTH * SR
        self.batch_size = batch_size
        self.labels = labels
        self.shuffle = shuffle
        self.list_IDs = list_IDs
        self.on_epoch_end()

    def path_to_audio(self,path):
        """Reads and decodes an audio file."""
        return repeated_data(path)

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        # Generate data
        X, y = self.__data_generation(indexes)

        return X, y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp):
        'Generates data containing batch_size samples'
        
        X = []
        y = []
        # Generate data
        for i, ID in enumerate(list_IDs_temp):
            # Store sample
            _tempx = self.path_to_audio(self.list_IDs[ID])
            #_tempx = self.spect_audio(_tempx)
            X.append(_tempx)

            # Store class
            y.append(self.labels[ID])
        #print(np.reshape(np.array(X), (self.batch_size,SR*AUD_LENGTH,1)).shape)
        return np.reshape(np.array(X), (self.batch_size,SR*AUD_LENGTH,1)).astype(np.float32),np.array(y).astype(np.float32)

In [8]:
# generating tf datasets
train_ds = DataGenerator(X_train,y_train)
valid_ds = DataGenerator(X_val,y_val)

# 2. Building A Model

In [9]:
### model 2
def DilatedConvModule(xx, filters):
    
    xx1 = tf.keras.layers.Conv1D(filters, kernel_size = 9,dilation_rate=1, padding= "same")(xx)
    xx1 = tf.keras.layers.BatchNormalization()(xx1)
    xx1 = tf.keras.layers.ReLU()(xx1)
    
    xx2 = tf.keras.layers.Conv1D(filters, kernel_size = 7,dilation_rate=2, padding= "same")(xx1)
    xx2 = tf.keras.layers.BatchNormalization()(xx2)
    xx2 = tf.keras.layers.ReLU()(xx2)
    
    xx4 = tf.keras.layers.Conv1D(filters, kernel_size = 3,dilation_rate=4, padding= "same")(xx2)
    xx4 = tf.keras.layers.BatchNormalization()(xx4)
    xx4 = tf.keras.layers.ReLU()(xx4)
    
    xx8 = tf.keras.layers.Conv1D(filters, kernel_size = 1,dilation_rate=8, padding= "same")(xx4)
    xx8 = tf.keras.layers.BatchNormalization()(xx8)
    xx8 = tf.keras.layers.ReLU()(xx8)
    #concat
    yy = tf.keras.layers.Concatenate(axis=1)([xx1,xx2,xx4,xx8])
    yy = tf.keras.layers.BatchNormalization()(yy)
    yy = tf.keras.layers.ReLU()(yy)
    
    return yy

def build_model(input_shape, num_classes):
    inputs = tf.keras.layers.Input(shape=input_shape, name="input")
    x      = tf.keras.layers.Conv1D(16, kernel_size = 9,dilation_rate=1, padding= "same")(inputs)
    x      = tf.keras.layers.BatchNormalization()(x)
    x      = tf.keras.layers.ReLU()(x)
    x      = tf.keras.layers.MaxPool1D(pool_size = 4)(x)
    
    # stacked resnet modules
    # inc1
    x      = DilatedConvModule(x,32)
    x      = tf.keras.layers.MaxPool1D(pool_size = 4)(x)
    #x      = tf.keras.layers.Dropout(0.5)(x)
    # inc2
    x      = DilatedConvModule(x,64)
    x      = tf.keras.layers.MaxPool1D(pool_size = 4)(x)
    #x      = tf.keras.layers.Dropout(0.25)(x)
    # inc4
    x      = DilatedConvModule(x,128)
    x      = tf.keras.layers.MaxPool1D(pool_size = 4)(x)
   #x      = tf.keras.layers.Dropout(0.25)(x)
    # inc8
    x      = DilatedConvModule(x,256)
    x      = tf.keras.layers.MaxPool1D(pool_size =  x.shape[-1])(x)
    #x      = tf.keras.layers.Dropout(0.5)(x)
    
    x      = tf.keras.layers.GlobalAveragePooling1D()(x)
    
    x      = tf.keras.layers.Flatten()(x)
    x      = tf.keras.layers.Dense(64, activation="relu")(x)
    x      = tf.keras.layers.Dense(32, activation="relu")(x)
    x      = tf.keras.layers.Dropout(0.25)(x)
    
    outputs = tf.keras.layers.Dense(num_classes, activation="softmax", name="output")(x)
    
    return tf.keras.models.Model(inputs=inputs, outputs=outputs)


aud_length = 16000*AUD_LENGTH

model = build_model((aud_length, 1), len(class_names))

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input (InputLayer)             [(None, 160000, 1)]  0           []                               
                                                                                                  
 conv1d (Conv1D)                (None, 160000, 16)   160         ['input[0][0]']                  
                                                                                                  
 batch_normalization (BatchNorm  (None, 160000, 16)  64          ['conv1d[0][0]']                 
 alization)                                                                                       
                                                                                                  
 re_lu (ReLU)                   (None, 160000, 16)   0           ['batch_normalization[0][0]']

In [10]:
# LOAD_MODEL = '/content/drive/MyDrive/spcup2022/datasets/pretrained_models_weights/tssd_inc_short_50/model_TSSD_inc_short_1_50.h5'
# LOAD_WEIGHT = '/content/drive/MyDrive/spcup2022/datasets/pretrained_models_weights/tssd_inc_short_50/weight_TSSD_inc_short_1_50.h5'

# new_model = tf.keras.models.load_model(LOAD_MODEL)
# new_model.load_weights(LOAD_WEIGHT)

In [11]:
# configuring the run
# Compile the model using Adam's default learning rate
model.compile(
    optimizer="Adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)

# Add callbacks:
# 'EarlyStopping' to stop training when the model is not enhancing anymore
# 'ModelCheckPoint' to always keep the model that has the best val_accuracy
weight_save_filename = "/content/drive/MyDrive/spcup2022/datasets/teacher_save/inceptionnet/weight_inception_teacher_finetune.h5"

lr_reduce = tf.keras.callbacks.ReduceLROnPlateau(monitor='loss', factor=0.5, patience=5, verbose=1, mode='min', min_lr=1e-9)
earlystopping_cb = tf.keras.callbacks.EarlyStopping(monitor='loss', min_delta=0.001, patience=10, mode='min', restore_best_weights=True)
mdlcheckpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    weight_save_filename, monitor="val_accuracy", save_best_only=True,save_weights_only=True)

# 3. train the model

In [12]:
# EPOCHS = 50
# history = model.fit(
#     train_ds,
#     epochs=EPOCHS,
#     validation_data=valid_ds,
#     callbacks=[lr_reduce,earlystopping_cb, mdlcheckpoint_cb],
# )
# model.save("/content/drive/MyDrive/spcup2022/datasets/teacher_save/inceptionnet/model_inception_teacher_finetune.h5")

# 4. Evaluation

In [13]:
LOAD_MODEL = 'inceptionnet/model_inception_teacher_finetune.h5'
LOAD_WEIGHT = 'inceptionnet/weight_inception_teacher_finetune.h5'

new_model = tf.keras.models.load_model(LOAD_MODEL)
new_model.load_weights(LOAD_WEIGHT)

In [14]:
DATASET_AUDIO_PATH = 'semisup_ds/semisup_ds/unlabeled'
class_names = [0, 1, 2, 3, 4, 5]
print("Our class names: {}".format(class_names,))

audio_paths = []
for name in os.listdir(DATASET_AUDIO_PATH):
    # label=int(name)
    #print("Processing speaker {}".format(name,))
    dir_path = f'{DATASET_AUDIO_PATH}/{name}'
    audio_paths.append(dir_path)

print(
    "Found {} files belonging to {} classes.".format(len(audio_paths), len(class_names))
)

Our class names: [0, 1, 2, 3, 4, 5]
Found 19200 files belonging to 6 classes.


In [15]:
len(os.listdir(DATASET_AUDIO_PATH))

19200

In [16]:
# for eval only
def paths_and_labels_to_dataset_1(ds):
    """Constructs a dataset of audios for testing"""
    audio_ds = tf.data.Dataset.from_tensor_slices(ds)
    return tf.data.Dataset.zip(audio_ds)

In [25]:
# making a column of filenames
X_eval = audio_paths
y_eval = [0, 1, 2, 3, 4, 5]

n = len(X_eval)
filenames = []
predictions = []
for i in tqdm(range(n)):
    f_name = X_eval[i]
    f_name = f_name.split("/")[-1]
    filenames.append(f_name)
    aud = repeated_data(X_eval[i])
    aud = paths_and_labels_to_dataset_1(np.expand_dims(np.expand_dims(aud, axis = 0),axis = 0))
    score_1 = new_model.predict(aud)
    pred_class_1 = score_1.argmax(axis=-1)
    predictions.append(pred_class_1[0])

100%|██████████| 19200/19200 [13:31<00:00, 23.65it/s]


In [26]:
score_df = pd.DataFrame({'file':filenames,'pred_class':predictions})
score_df.head()

Unnamed: 0,file,pred_class
0,000564048b88c05396a9e68b3a89840e.wav,5
1,0007e065b51cb7e792b0ce301600c449.wav,1
2,000c8deb702043e5c1689f1e7a71950f.wav,0
3,001945bc6c04dd435ccd6780f64fac3b.wav,3
4,001d3ce2cce8f3bb45e6023b9771d19d.wav,4


In [27]:
#converting dataframe to csv
score_df.to_csv('inception_unlabeled.csv', header=False, index=False)

In [28]:
score_df.pred_class.value_counts()

5    4097
2    3337
4    3203
3    2992
1    2943
0    2628
Name: pred_class, dtype: int64

In [29]:
eval_ds = DataGenerator(X_eval,y_eval)

In [30]:
history = model.evaluate(eval_ds)
history

IndexError: list index out of range

In [None]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_eval, predictions)

array([[100,   0,   0,   0,   0,   0],
       [  0,  91,   9,   0,   0,   0],
       [  0,  13,  86,   0,   0,   1],
       [  0,   0,   0, 100,   0,   0],
       [  0,   0,   0,   0,  98,   2],
       [  3,   0,  11,   2,  13, 671]])

In [None]:
from sklearn.metrics import classification_report
target_names = ['class 0', 'class 1', 'class 2','class 3','class 4','class 5']
print(classification_report(y_eval, predictions, target_names=target_names))

              precision    recall  f1-score   support

     class 0       0.97      1.00      0.99       100
     class 1       0.88      0.91      0.89       100
     class 2       0.81      0.86      0.83       100
     class 3       0.98      1.00      0.99       100
     class 4       0.88      0.98      0.93       100
     class 5       1.00      0.96      0.98       700

    accuracy                           0.95      1200
   macro avg       0.92      0.95      0.93      1200
weighted avg       0.96      0.95      0.96      1200

