In [None]:
import os
import random
from functools import partial
from math import exp

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from matplotlib.cm import get_cmap

from sklearn import linear_model
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras.models import Model
from tensorflow.keras.models import load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.utils import get_custom_objects

import keras
from keras.callbacks import ModelCheckpoint
from keras.layers import Activation

In [None]:
# Kaggle input directory that holds the pre-extracted feature file loaded below.
melspec_directory = '/kaggle/input/birdcall-melspec'
import h5py
import numpy as np
from sklearn.preprocessing import LabelEncoder
def load_from_hdf5(filename, directory):
    """Read every top-level entry of an HDF5 file into a dictionary.

    Args:
        filename: Name of the HDF5 file.
        directory: Directory that contains the file.

    Returns:
        A dict mapping each top-level key of the file to its contents read
        with ``[:]``. When the file has a ``'classes'`` entry, the dict also
        gets a ``'label_encoder'`` key holding a ``LabelEncoder`` whose
        ``classes_`` are that entry cast to ``str``.
    """
    data = {}
    with h5py.File(os.path.join(directory, filename), 'r') as hf:
        for name in hf.keys():
            data[name] = hf[name][:]

    if 'classes' not in data:
        return data

    # Restore the encoder from the stored class names instead of refitting it.
    encoder = LabelEncoder()
    encoder.classes_ = data['classes'].astype(str)
    data['label_encoder'] = encoder
    return data

# Load the stored arrays together with the encoder rebuilt from the class names.
loaded_data = load_from_hdf5('mfcc_data.h5', melspec_directory)
X, y = loaded_data['X_train'], loaded_data['y_train']
label_encoder = loaded_data['label_encoder']

# Map the encoded targets back to their original class labels.
original_labels = label_encoder.inverse_transform(y)


In [None]:
# Stratified split: 80 % for training, 20 % held out.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Halve the held-out part into validation and test sets (10 % of the data each).
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

In [None]:
class ModelInspector:
    """Small helper for looking inside a Keras model."""

    def __init__(self, model):
        """Store the model and a name -> layer lookup built from ``model.layers``."""
        self.model = model
        self.layer_dict = dict((layer.name, layer) for layer in model.layers)

    def get_layer_output(self, layer_name, data):
        """Return the output of the layer called ``layer_name`` for ``data``.

        A temporary model sharing the wrapped model's input is built up to the
        requested layer and used for prediction.
        """
        source = self.model.input
        target = self.model.get_layer(layer_name).output
        probe = tf.keras.Model(inputs=source, outputs=target)
        return probe.predict(data)

    def list_layers(self):
        """Print the index, name and class name of every layer of the model."""
        position = 0
        for layer in self.model.layers:
            print(f"Layer {position}: {layer.name} ({layer.__class__.__name__})")
            position += 1

In [None]:
# Location of the saved teacher network that is loaded for distillation.
model_path = '/kaggle/input/teacher/keras/default/1/teacher.keras'
teacher_model = load_model(model_path)

In [None]:
# Teacher outputs on the test set, reduced to the index of the largest score.
y_pred = teacher_model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)

# Loss and accuracy of the teacher on the held-out test set.
test_loss, test_acc = teacher_model.evaluate(
    X_test,
    y_test,
    verbose=0,
)
print(f"\nTest Accuracy: {test_acc:.4f}")
print(f"Test Loss: {test_loss:.4f}")

In [None]:
# Width of the students' output layer.
# NOTE(review): presumably equal to the number of classes in label_encoder — verify.
NCLASS=11
# Per-sample input shape (the leading batch axis is dropped).
input_shape = X_train.shape[1:]
# Adam learning rate used when a student model is compiled.
LEARNING_RATE= 0.002

# Student model formation

The Pareto-optimal student networks found by the neural architecture search are recreated here from the architecture information obtained during that search.

In [None]:
def create_2d_cnn_model(residual, filters, kernel_size, fc_layers, use_bn, use_dropout, input_shape):
    """Build and compile a 2D CNN student classifier.

    The network is an initial double-convolution block followed by
    ``residual - 1`` residual blocks, global average pooling, an optional
    stack of dense layers and a softmax output with ``NCLASS`` units.

    Args:
        residual: Number of convolutional stages. The initial block counts as
            one, so ``residual - 1`` residual blocks are appended; each one
            doubles the filter count and halves the spatial resolution.
        filters: Number of filters of the initial block.
        kernel_size: Kernel size of every non-shortcut convolution.
        fc_layers: 1-4 selects the hidden dense stack ([128], [256, 128],
            [512, 256, 128] or [1024, 512, 256, 128]). Any other value adds
            no hidden dense layer.
        use_bn: Truthy to add BatchNormalization after each convolution.
        use_dropout: Truthy to add Dropout(0.2) after each hidden dense layer.
        input_shape: Shape of a single input sample (without the batch axis).

    Returns:
        The model compiled with Adam (module-level ``LEARNING_RATE``),
        sparse categorical cross-entropy and an accuracy metric. The output
        layer is a softmax, i.e. the model emits probabilities, not logits.
    """
    inputs = layers.Input(shape=input_shape, name="input_layer")

    # Initial Conv Block
    x = layers.Conv2D(filters, kernel_size, padding='same')(inputs)
    if use_bn:
        x = layers.BatchNormalization()(x)
    x = layers.ReLU(name="relu1_1")(x)
    x = layers.Conv2D(filters, kernel_size, padding='same')(x)
    if use_bn:
        x = layers.BatchNormalization()(x)
    x = layers.ReLU()(x)
    x = layers.MaxPooling2D((2, 2), strides=(2, 2), padding='same')(x)

    # Residual blocks. The shortcut gets its own name so that the `residual`
    # argument is no longer overwritten by a tensor.
    for _ in range(residual - 1):
        filters = filters * 2
        # 1x1 strided projection so the shortcut matches the pooled main path.
        shortcut = layers.Conv2D(filters, (1, 1), strides=(2, 2), padding='same')(x)
        x = layers.Conv2D(filters, kernel_size, padding='same')(x)
        if use_bn:
            x = layers.BatchNormalization()(x)
        x = layers.ReLU()(x)
        x = layers.Conv2D(filters, kernel_size, padding='same')(x)
        if use_bn:
            x = layers.BatchNormalization()(x)
        x = layers.MaxPooling2D((2, 2), strides=(2, 2), padding='same')(x)
        x = layers.add([x, shortcut])
        x = layers.ReLU()(x)

    # Final Feature Processing
    x = layers.GlobalAveragePooling2D()(x)

    # Dense Layers: hidden widths per requested number of layers.
    fc_layer_configs = {
        4: [1024, 512, 256, 128],
        3: [512, 256, 128],
        2: [256, 128],
        1: [128],
    }

    # Unknown values fall back to an empty stack (no hidden dense layers).
    for neurons in fc_layer_configs.get(fc_layers, []):
        x = layers.Dense(neurons, activation='relu')(x)
        if use_dropout:
            x = layers.Dropout(0.2)(x)

    outputs = layers.Dense(NCLASS, activation='softmax', name="output_layer")(x)

    # Compile
    model = Model(inputs=inputs, outputs=outputs)

    optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)

    model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=["accuracy"])

    # Plain-int sums: summing an empty weight list with numpy yields 0.0 and
    # makes the counts print as floats.
    trainable_params = int(sum(tf.keras.backend.count_params(w) for w in model.trainable_weights))
    non_trainable_params = int(sum(tf.keras.backend.count_params(w) for w in model.non_trainable_weights))
    total_params = trainable_params + non_trainable_params

    # Size in KB, assuming float32 precision (4 bytes per parameter).
    param_size_kb = (total_params * 4) / 1024

    print(f"Model created with:")
    print(f"- Trainable parameters: {trainable_params:,}")
    print(f"- Non-trainable parameters: {non_trainable_params:,}")
    print(f"- Total parameters: {total_params:,}")
    print(f"- Estimated size: {param_size_kb:.2f} KB (assuming float32)")

    return model

# Build a single example student: one residual block on top of the initial
# block, 32 base filters, 3x3 kernels, two hidden dense layers, no BN/dropout.
input_shape = X_train.shape[1:]
student_model = create_2d_cnn_model(
    residual=2,
    filters=32,
    kernel_size=(3, 3),
    fc_layers=2,
    use_bn=0,
    use_dropout=0,
    input_shape=input_shape,
)


In [None]:

# Architecture hyperparameters of the student networks, one dict per model.
# The keys match the keyword arguments of create_2d_cnn_model.
model_configs = [
    dict(residual=1, filters=16, kernel_size=(3,3), fc_layers=1, use_bn=0, use_dropout=0),
    dict(residual=2, filters=32, kernel_size=(3,3), fc_layers=2, use_bn=0, use_dropout=0),
    dict(residual=1, filters=32, kernel_size=(5,5), fc_layers=2, use_bn=1, use_dropout=0),
    dict(residual=1, filters=32, kernel_size=(3,3), fc_layers=1, use_bn=0, use_dropout=1),
    dict(residual=1, filters=16, kernel_size=(5,5), fc_layers=1, use_bn=1, use_dropout=1),
    dict(residual=1, filters=16, kernel_size=(5,5), fc_layers=2, use_bn=0, use_dropout=1),
    dict(residual=2, filters=64, kernel_size=(3,3), fc_layers=1, use_bn=1, use_dropout=1),
    dict(residual=1, filters=32, kernel_size=(5,5), fc_layers=2, use_bn=0, use_dropout=1),
    dict(residual=2, filters=32, kernel_size=(5,5), fc_layers=2, use_bn=0, use_dropout=0),
    dict(residual=2, filters=64, kernel_size=(3,3), fc_layers=1, use_bn=0, use_dropout=0),
    dict(residual=2, filters=32, kernel_size=(3,3), fc_layers=1, use_bn=0, use_dropout=1),
    dict(residual=2, filters=64, kernel_size=(5,5), fc_layers=1, use_bn=1, use_dropout=1),
]

# Instantiate one compiled student per configuration, in list order.
student_models = []
for idx, config in enumerate(model_configs, start=1):
    print(f"\nCreating student_model_{idx}")
    model = create_2d_cnn_model(**config, input_shape=input_shape)
    student_models.append(model)

print(f"\nTotal models created: {len(student_models)}")
print(f"First model: {student_models[0]}")
print(f"Last model: {student_models[-1]}")

In [None]:
class SaveBestStudentModel(keras.callbacks.Callback):
    """Checkpoint the student of a distiller whenever its validation loss improves.

    The callback reads ``val_student_loss`` from the epoch logs and saves
    ``self.model.student`` (not the whole distiller) to ``save_path`` each
    time that value drops below the best one seen so far.

    Args:
        save_path: File path the student model is written to.
    """

    def __init__(self, save_path):
        super().__init__()
        self.save_path = save_path
        # Lowest val_student_loss observed so far by this callback instance.
        self.best_val_loss = float("inf")

    def on_epoch_end(self, epoch, logs=None):
        """Save the student if ``val_student_loss`` improved in this epoch."""
        # `logs` defaults to None in the callback API; treat it as "no metrics".
        logs = logs or {}
        val_student_loss = logs.get("val_student_loss")
        if val_student_loss is not None and val_student_loss < self.best_val_loss:
            self.best_val_loss = val_student_loss
            print(f"\nSaving best student model with val_student_loss: {val_student_loss:.4f}")
            self.model.student.save(self.save_path)  # Only student model is saved

## Teacher-student knowledge distillation

The overall loss function is defined as a weighted combination of the cross-entropy loss on the hard labels and the KL-divergence loss between the soft labels of the teacher and student models.
The forward pass and a callback based on the validation loss are implemented.

In [None]:
class Distiller(keras.Model):
    """Train a student model on hard labels and on a teacher's soft labels.

    Only the student's variables are updated. The training loss is
    ``alpha * student_loss + (1 - alpha) * distillation_loss``, where the
    distillation loss compares the temperature-softened teacher and student
    outputs and is scaled by ``temperature ** 2``.

    Args:
        student: Model being trained.
        teacher: Model providing the soft targets; always called with
            ``training=False``.
    """

    def __init__(self, student, teacher):
        super().__init__()
        self.teacher = teacher
        self.student = student

    def call(self, inputs):
        """Forward pass of the student model."""
        return self.student(inputs)

    def compile(
        self,
        optimizer,
        metrics,
        student_loss_fn,
        distillation_loss_fn,
        alpha=0.1,
        temperature=3,
        outputs_are_logits=True,
    ):
        """ Configure the distiller.

        Args:
            optimizer: Keras optimizer for the student weights
            metrics: Keras metrics for evaluation
            student_loss_fn: Loss function of difference between student
                predictions and ground-truth. It receives the raw student
                outputs, so its ``from_logits`` setting must match what the
                student emits.
            distillation_loss_fn: Loss function of difference between soft
                student predictions and soft teacher predictions
            alpha: weight to student_loss_fn and 1-alpha to distillation_loss_fn
            temperature: Temperature for softening probability distributions.
                Larger temperature gives softer distributions.
            outputs_are_logits: How teacher and student outputs are softened.
                True (default, the historical behaviour) applies
                ``softmax(outputs / temperature)`` directly. Pass False when
                both models already end in a softmax: the outputs are then
                mapped back to log-probabilities first, which gives the same
                distribution as temperature-scaling the underlying logits.
        """
        super().compile(optimizer=optimizer, metrics=metrics)
        self.student_loss_fn = student_loss_fn
        self.distillation_loss_fn = distillation_loss_fn
        self.alpha = alpha
        self.temperature = temperature
        self.outputs_are_logits = outputs_are_logits

    def _soften(self, outputs):
        """Return the temperature-softened distribution for model outputs."""
        if self.outputs_are_logits:
            logits = outputs
        else:
            # log(p) equals the logits up to a per-sample constant, which the
            # softmax cancels; clipping keeps log() finite for zero entries.
            logits = tf.math.log(
                tf.clip_by_value(outputs, keras.backend.epsilon(), 1.0)
            )
        return tf.nn.softmax(logits / self.temperature, axis=1)

    def train_step(self, data):
        """Run one optimisation step on the student and report the losses."""
        # Unpack data
        x, y = data

        # Forward pass of teacher (outside the tape: it is never updated)
        teacher_predictions = self.teacher(x, training=False)

        with tf.GradientTape() as tape:
            # Forward pass of student
            student_predictions = self.student(x, training=True)

            # Hard-label loss on the raw student outputs
            student_loss = self.student_loss_fn(y, student_predictions)

            # Soft-label loss, scaled by temperature**2
            distillation_loss = (
                self.distillation_loss_fn(
                    self._soften(teacher_predictions),
                    self._soften(student_predictions),
                )
                * self.temperature**2
            )

            loss = self.alpha * student_loss + (1 - self.alpha) * distillation_loss

        # Compute gradients with respect to the student only
        trainable_vars = self.student.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)

        # Update weights
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        # Update the metrics configured in `compile()`.
        self.compiled_metrics.update_state(y, student_predictions)

        # Return a dict of performance
        results = {m.name: m.result() for m in self.metrics}
        results.update({
                        "student_loss": student_loss,
                        "distillation_loss": distillation_loss,
                        "loss": loss })

        return results

    def test_step(self, data):
        """Evaluate the student alone; the teacher is not used here."""
        # Unpack the data
        x, y = data

        # Compute predictions
        y_prediction = self.student(x, training=False)

        # Calculate the loss
        student_loss = self.student_loss_fn(y, y_prediction)

        # Update the metrics.
        self.compiled_metrics.update_state(y, y_prediction)

        # Return a dict of performance; "loss" is the hard-label loss only.
        results = {m.name: m.result() for m in self.metrics}
        results.update({"student_loss": student_loss, "loss": student_loss})
        return results

    def predict_step(self, data):
        """Predict with the student, accepting either ``x`` or an ``(x, ...)`` tuple."""
        x = data[0] if isinstance(data, (tuple, list)) else data
        return self.student(x, training=False)

In [None]:
# Checkpoint written whenever val_student_loss improves.
student_best_path = "/kaggle/working/student_best.keras"
# NOTE(review): not referenced anywhere in the visible notebook — confirm it is still needed.
SAVED_MODEL_PATH = "/kaggle/working/student_last.keras"

save_best_student = SaveBestStudentModel(student_best_path)

# Compile the distiller
distiller = Distiller(student=student_models[0], teacher=teacher_model)
distiller.compile(
    optimizer=Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.99),
    metrics=[keras.metrics.SparseCategoricalAccuracy()],
    # The students end in a softmax layer, so the loss receives probabilities;
    # from_logits=True would apply a second softmax and distort the loss.
    student_loss_fn=keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    distillation_loss_fn=keras.losses.KLDivergence(),
    # alpha=1 gives the distillation term a weight of (1 - alpha) = 0, so the
    # temperature has no effect on this run.
    alpha=1,
    temperature=5
)

# Train the distiller
history_distill = distiller.fit(
    X_train,
    y_train,
    epochs=5,
    batch_size=64,
    validation_data=(X_val, y_val),
    callbacks=[save_best_student],
    verbose=1
)

Example use case

In [None]:
# Reload the best checkpoint of the example run and score it on the test set.
student0_best = load_model(student_best_path)
test_loss, test_acc = student0_best.evaluate(
    X_test,
    y_test,
    verbose=0,
)
print(f"\nTest Accuracy: {test_acc:.4f}")
print(f"Test Loss: {test_loss:.4f}")

# Training without teacher influence

All student models are trained without any help from the teacher (alpha=1), so only the hard-label cross-entropy loss contributes to the training loss.

In [None]:
# Number of students trained as baselines.
# NOTE(review): model_configs defines more students than the 10 trained here —
# confirm that restricting to the first 10 is intended.
num_baseline_students = min(10, len(student_models))

student_models_trained = []
histories = []

for i in range(num_baseline_students):
    student_best_path = f"/kaggle/working/student_best_{i}.keras"
    save_best_student = SaveBestStudentModel(student_best_path)

    # Train a freshly initialised copy of the architecture. Training
    # student_models[i] in place would start model 0 from the weights of the
    # example run above and would continue training on every cell re-run.
    student = tf.keras.models.clone_model(student_models[i])
    # Compile the copy like the original students so the saved checkpoint can
    # be evaluated directly after load_model().
    student.compile(
        optimizer=Adam(learning_rate=LEARNING_RATE),
        loss='sparse_categorical_crossentropy',
        metrics=["accuracy"],
    )

    distiller = Distiller(student=student, teacher=teacher_model)
    distiller.compile(
        optimizer=Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.99),
        metrics=[keras.metrics.SparseCategoricalAccuracy()],
        # The students end in a softmax layer, so the loss receives
        # probabilities rather than logits.
        student_loss_fn=keras.losses.SparseCategoricalCrossentropy(from_logits=False),
        distillation_loss_fn=keras.losses.KLDivergence(),
        # alpha=1: the distillation term has weight 0 (no teacher influence).
        alpha=1,
        temperature=1
    )

    history = distiller.fit(
        X_train,
        y_train,
        epochs=25,
        batch_size=64,
        validation_data=(X_val, y_val),
        callbacks=[save_best_student],
        verbose=1
    )

    histories.append(history)
    student_models_trained.append(student_best_path)

val_accuracies = []
test_accuracies = []
val_losses = []
test_losses = []

# Score the best checkpoint of every student on the validation and test sets.
for i, model_path in enumerate(student_models_trained):
    student_model = load_model(model_path)

    val_loss, val_acc = student_model.evaluate(X_val, y_val, verbose=0)
    test_loss, test_acc = student_model.evaluate(X_test, y_test, verbose=0)

    val_accuracies.append(val_acc)
    test_accuracies.append(test_acc)
    val_losses.append(val_loss)
    test_losses.append(test_loss)

print("\nIndividual Model Performances:")
# Iterate over the collected results instead of a hard-coded count.
for i in range(len(val_accuracies)):
    print(f"Model {i+1}: Val Acc={val_accuracies[i]:.4f}, Test Acc={test_accuracies[i]:.4f}")

# Ablation study for Teacher influenced training

The hyperparameters alpha and temperature are varied to find, for each student model, the setting that trains best under the influence of the teacher's soft labels.

In [None]:
temperatures = [3, 5, 7, 10, 12, 14]
alphas = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
all_results = []

# NOTE(review): only the first 10 students are studied although more may be
# defined — confirm this is intended.
num_ablation_students = min(10, len(student_models))

# Dedicated, seeded generator so the sampled hyperparameter subsets are the
# same on every Restart & Run All.
search_rng = random.Random(42)

# NOTE(review): Distiller.train_step applies softmax(output / temperature) to
# the raw model outputs, which assumes logits, while the students built in
# this notebook end in a softmax — confirm the soft targets are as intended.
for student_idx in range(num_ablation_students):
    print(f"\n=== Training Student Model {student_idx+1}/{num_ablation_students} ===")
    # 'student_idx' is stored from the start and 'val_acc' starts below any
    # real accuracy, so the first run is always recorded and the report
    # further down never hits a missing key or params=None.
    student_best_metrics = {
        'val_acc': float("-inf"),
        'params': None,
        'model_path': None,
        'student_idx': student_idx,
    }

    # Random search: train only a random half of the (temperature, alpha)
    # grid to keep the compute budget manageable.
    combinations = [(t, a) for t in temperatures for a in alphas]
    selected_combinations = search_rng.sample(combinations, len(combinations)//2)

    for temp, alpha in selected_combinations:
        print(f"\nTraining with temperature={temp}, alpha={alpha}")

        model_path = f"/kaggle/working/student_{student_idx}_taught_temp{temp}_alpha{alpha}.keras"
        save_best = SaveBestStudentModel(model_path)

        # Every hyperparameter setting starts from freshly initialised
        # weights. Reusing student_models[student_idx] directly would let each
        # run continue from the previous run's weights, which makes the
        # settings incomparable.
        student = tf.keras.models.clone_model(student_models[student_idx])
        # Compile the copy like the original students so the checkpoint can be
        # evaluated directly after load_model().
        student.compile(
            optimizer=Adam(learning_rate=LEARNING_RATE),
            loss='sparse_categorical_crossentropy',
            metrics=["accuracy"],
        )

        distiller = Distiller(student=student, teacher=teacher_model)
        distiller.compile(
            optimizer=Adam(learning_rate=0.002, beta_1=0.9, beta_2=0.99),
            metrics=[keras.metrics.SparseCategoricalAccuracy()],
            # The students end in a softmax layer, so the loss receives
            # probabilities rather than logits.
            student_loss_fn=keras.losses.SparseCategoricalCrossentropy(from_logits=False),
            distillation_loss_fn=keras.losses.KLDivergence(),
            alpha=alpha,
            temperature=temp
        )

        history = distiller.fit(
            X_train,
            y_train,
            epochs=25,
            batch_size=64,
            validation_data=(X_val, y_val),
            callbacks=[save_best],
            verbose=1
        )

        # Score the best checkpoint of this run (lowest val_student_loss).
        best_model = load_model(model_path)
        val_loss, val_acc = best_model.evaluate(X_val, y_val, verbose=0)

        if val_acc > student_best_metrics['val_acc']:
            student_best_metrics = {
                'val_acc': val_acc,
                'params': {'temperature': temp, 'alpha': alpha},
                'model_path': model_path,
                'student_idx': student_idx
            }

        print(f"Val Acc: {val_acc:.4f} | Temp: {temp} | Alpha: {alpha}")

    all_results.append(student_best_metrics)

print("\n=== Best Performing Models ===")
for result in all_results:
    print(f"\nStudent {result['student_idx']+1}:")
    print(f"Validation Accuracy: {result['val_acc']:.4f}")
    print(f"Temperature: {result['params']['temperature']}")
    print(f"Alpha: {result['params']['alpha']}")
    print(f"Model Path: {result['model_path']}")

# The test set is only touched after model selection on the validation set.
print("\n=== Final Evaluation on Test Set ===")
test_results = []
for result in all_results:
    best_model = load_model(result['model_path'])
    test_loss, test_acc = best_model.evaluate(X_test, y_test, verbose=0)
    test_results.append(test_acc)
    print(f"\nStudent {result['student_idx']+1}:")
    print(f"Test Accuracy: {test_acc:.4f}")
    print(f"Test Loss: {test_loss:.4f}")

