In [9]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# =========================================================
# 1) LOAD NPZ
# =========================================================
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays stored
# in the archives, which can execute arbitrary code. Only load trusted files.
def _features_and_labels(archive):
    """Return the archive's "X" cast to float32 and its "y" cast to int64."""
    return archive["X"].astype("float32"), archive["y"].astype("int64")


train = np.load("/kaggle/input/npz-file/pytorch/default/1/train_aug.npz", allow_pickle=True)
val = np.load("/kaggle/input/npz-file/pytorch/default/1/val.npz", allow_pickle=True)
test = np.load("/kaggle/input/npz-file/pytorch/default/1/test.npz", allow_pickle=True)
noisy = np.load("/kaggle/input/npz-file/pytorch/default/1/test_noisy.npz", allow_pickle=True)

X_train, y_train = _features_and_labels(train)
X_val, y_val = _features_and_labels(val)
X_test, y_test = _features_and_labels(test)
X_noisy, y_noisy = _features_and_labels(noisy)

# The class-name list is taken from the training archive only.
classes = train["classes"].tolist()
num_classes = len(classes)

# Position of the "unknown" class, or None when the dataset has no such class.
try:
    unknown_idx = classes.index("unknown")
except ValueError:
    unknown_idx = None

print("Train:", X_train.shape, y_train.shape)
print("Val  :", X_val.shape, y_val.shape)
print("Test :", X_test.shape, y_test.shape)
print("Noisy:", X_noisy.shape, y_noisy.shape)
print("Num classes:", num_classes)
print("classes:", classes)
print("unknown_idx:", unknown_idx)

def same_classes(a, b):
    """Return True when the "classes" entries of `a` and `b` are equal arrays.

    Both arguments only need to support `obj["classes"]` (e.g. loaded NPZ
    archives). Equality is decided by `np.array_equal`, so shape and every
    element must match, including order.
    """
    names_a = a["classes"]
    names_b = b["classes"]
    return np.array_equal(names_a, names_b)

# Every evaluation split must use the same class list as training.
for label, split in (
    ("val classes match train  ?", val),
    ("test classes match train ?", test),
    ("noisy classes match train?", noisy),
):
    print(label, same_classes(split, train))


# =========================================================
# 2) ONE-HOT LABELS
# =========================================================
# Integer class ids -> one-hot rows of width num_classes, one tensor per split.
y_train_oh, y_val_oh, y_test_oh, y_noisy_oh = (
    tf.one_hot(labels, depth=num_classes)
    for labels in (y_train, y_val, y_test, y_noisy)
)


# =========================================================
# 3) TF.DATA + SPEC AUGMENT
# =========================================================
BATCH = 64
AUTOTUNE = tf.data.AUTOTUNE


def _batched_dataset(features, labels, shuffle=False):
    """Build a batched, prefetched tf.data pipeline over (features, labels).

    Args:
        features: array whose first axis indexes samples.
        labels: per-sample labels aligned with `features`.
        shuffle: when True, shuffle samples before batching.

    Returns:
        A `tf.data.Dataset` yielding (features, labels) batches of size BATCH.
    """
    ds = tf.data.Dataset.from_tensor_slices((features, labels))
    if shuffle:
        # Size the buffer from the data so the shuffle covers the whole set
        # regardless of how many training rows there are.
        ds = ds.shuffle(max(len(features), 1))
    return ds.batch(BATCH).prefetch(AUTOTUNE)


ds_train = _batched_dataset(X_train, y_train_oh, shuffle=True)
ds_val = _batched_dataset(X_val, y_val_oh)
ds_test = _batched_dataset(X_test, y_test_oh)
ds_noisy = _batched_dataset(X_noisy, y_noisy_oh)

@tf.function
def specaugment(x, y, p=0.9, freq_mask_max=14, time_mask_max=40):
    """
    SpecAugment-style masking: zero one random band along each of the two
    spatial axes, drawn independently for every sample.

    Args:
        x: float tensor shaped (B, H, W, C), or a single (H, W, C) example.
           This notebook feeds batches of (B, 128, 64, 1).
        y: labels; returned unchanged (one-hot (B, num_classes) here).
        p: probability that a given sample is augmented at all.
        freq_mask_max: maximum width of the band zeroed along axis W
            (the 64-wide axis here; assumed to be frequency, as in the
            original code - TODO confirm against the feature extractor).
        time_mask_max: maximum width of the band zeroed along axis H
            (the 128-long axis here; assumed to be time - TODO confirm).

    Returns:
        (x_aug, y) where x_aug has the same shape and dtype as x.
    """
    # Accept a single unbatched example by temporarily adding a batch axis.
    single = x.shape.rank == 3
    xb = x[tf.newaxis, ...] if single else x

    dims = tf.shape(xb)
    n = dims[0]

    # One independent "apply augmentation?" draw per sample.
    augment = tf.random.uniform([n]) < p

    def keep_mask(length, max_width):
        """Return an (n, length) mask: 0 inside each sample's band, 1 elsewhere."""
        # Clamp so a band can never be wider than the axis it is cut from.
        max_width = tf.minimum(tf.cast(max_width, tf.int32), length)
        width = tf.random.uniform([n], 0, max_width + 1, dtype=tf.int32)
        # Integer tf.random.uniform only takes scalar bounds, so the
        # per-sample start in [0, length - width] comes from a scaled float.
        room = tf.cast(length - width + 1, tf.float32)
        start = tf.cast(tf.floor(tf.random.uniform([n]) * room), tf.int32)
        start = tf.minimum(start, length - width)  # guard float rounding
        pos = tf.range(length)[tf.newaxis, :]
        inside = tf.logical_and(
            pos >= start[:, tf.newaxis],
            pos < (start + width)[:, tf.newaxis],
        )
        # Samples not selected for augmentation keep an all-ones mask.
        inside = tf.logical_and(inside, augment[:, tf.newaxis])
        return tf.cast(tf.logical_not(inside), xb.dtype)

    freq_keep = keep_mask(dims[2], freq_mask_max)[:, tf.newaxis, :, tf.newaxis]
    time_keep = keep_mask(dims[1], time_mask_max)[:, :, tf.newaxis, tf.newaxis]

    out = xb * freq_keep * time_keep
    if single:
        out = out[0]
    return out, y

# Apply SpecAugment to the training batches, then prefetch the result.
ds_train_aug = ds_train.map(specaugment, num_parallel_calls=AUTOTUNE)
ds_train_aug = ds_train_aug.prefetch(AUTOTUNE)

# Sanity check: pull one augmented batch and report its shapes and value range.
xb, yb = next(iter(ds_train_aug))
print(
    "Train batch X:", xb.shape,
    "y(one-hot):", yb.shape,
    "min/max:", float(tf.reduce_min(xb)), float(tf.reduce_max(xb)),
)


# =========================================================
# 4) MODEL: Audio CNN (slightly more capacity + regularization)
# =========================================================
def build_audio_cnn(input_shape=(128, 64, 1), num_classes=14):
    """Build the 2-D CNN classifier (uncompiled).

    Architecture: three stages with 32, 64 and 128 filters, each made of two
    Conv-BatchNorm-ReLU units followed by 2x2 max pooling and dropout 0.25;
    then one 256-filter Conv-BatchNorm-ReLU unit, global average pooling,
    dropout 0.5 and a softmax layer.

    Args:
        input_shape: shape of one input example, without the batch axis.
        num_classes: number of units in the softmax output layer.

    Returns:
        A `keras.Model` mapping inputs to per-class probabilities.
    """
    def conv_bn_relu(tensor, filters):
        """3x3 same-padded convolution, then batch norm, then ReLU."""
        tensor = layers.Conv2D(filters, (3, 3), padding="same")(tensor)
        tensor = layers.BatchNormalization()(tensor)
        return layers.Activation("relu")(tensor)

    inputs = keras.Input(shape=input_shape)
    features = inputs

    for width in (32, 64, 128):
        features = conv_bn_relu(features, width)
        features = conv_bn_relu(features, width)
        features = layers.MaxPooling2D((2, 2))(features)
        features = layers.Dropout(0.25)(features)

    features = conv_bn_relu(features, 256)

    pooled = layers.GlobalAveragePooling2D()(features)
    pooled = layers.Dropout(0.5)(pooled)
    probabilities = layers.Dense(num_classes, activation="softmax")(pooled)

    return keras.Model(inputs, probabilities)

# Input shape comes from the training array (everything after the batch axis).
model = build_audio_cnn(input_shape=X_train.shape[1:], num_classes=num_classes)

# Cross-entropy on one-hot targets with label smoothing 0.05.
loss_fn = keras.losses.CategoricalCrossentropy(label_smoothing=0.05)

model.compile(optimizer=keras.optimizers.Adam(1e-3), loss=loss_fn, metrics=["accuracy"])
model.summary()

# All three callbacks watch validation loss.
checkpoint_cb = keras.callbacks.ModelCheckpoint(
    "best_audio_cnn_v3.keras",
    monitor="val_loss",
    save_best_only=True,
    mode="min",
)
# Halve the learning rate after 3 epochs without improvement, down to 1e-6.
reduce_lr_cb = keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.5,
    patience=3,
    min_lr=1e-6,
    verbose=1,
)
# Stop after 12 epochs without improvement and restore the best weights.
early_stop_cb = keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=12,
    restore_best_weights=True,
    verbose=1,
)
cbs = [checkpoint_cb, reduce_lr_cb, early_stop_cb]


# =========================================================
# 5) CLASS WEIGHT (boost weak classes)
# =========================================================
# Inverse-frequency weights: mean class count divided by each class count.
# The 1e-6 term avoids division by zero for a class with no training samples.
base_counts = np.bincount(y_train, minlength=num_classes).astype(np.float32)
base_weights = base_counts.mean() / (base_counts + 1e-6)
class_weight = {idx: float(w) for idx, w in zip(range(num_classes), base_weights)}

# Extra hand-picked multipliers; names missing from `classes` are skipped.
CLASS_BOOSTS = {"engine": 1.8, "rooster": 1.5, "dog": 1.3}
for name, boost in CLASS_BOOSTS.items():
    if name not in classes:
        continue
    class_weight[classes.index(name)] *= boost

print(
    "class_weight(sample):",
    {label: round(class_weight[idx], 3) for idx, label in enumerate(classes)},
)


# =========================================================
# 6) TRAIN
# =========================================================
# Train on the augmented batches; validation data drives the callbacks.
hist = model.fit(
    ds_train_aug,
    validation_data=ds_val,
    epochs=80,
    callbacks=cbs,
    class_weight=class_weight,
)


# =========================================================
# 7) EVALUATE: test clean + test_noisy
# =========================================================
for banner, dataset in (
    ("\n== TEST clean ==", ds_test),
    ("\n== TEST noisy ==", ds_noisy),
):
    print(banner)
    model.evaluate(dataset, verbose=2)


# =========================================================
# 8) THRESHOLD UNKNOWN 
# =========================================================
from sklearn.metrics import f1_score, classification_report

def predict_probs(ds):
    """Run the global `model` over a dataset and collect its outputs.

    Args:
        ds: iterable of (features, one_hot_labels) batches.

    Returns:
        (probs, labels): the model outputs stacked along axis 0, and the
        integer class ids recovered from the one-hot labels via argmax.

    Raises:
        ValueError: if `ds` yields no batches (nothing to concatenate).
    """
    probs, ys = [], []
    for xb, yb in ds:
        # predict_on_batch does a single inference pass on this batch and
        # avoids the per-call setup cost of model.predict inside a loop.
        probs.append(np.asarray(model.predict_on_batch(xb)))
        ys.append(tf.argmax(yb, axis=1).numpy())  # one-hot -> index
    return np.concatenate(probs, axis=0), np.concatenate(ys, axis=0)

def apply_threshold(prob, unknown_idx, thr):
    """Turn probabilities into class ids, rejecting low-confidence rows.

    Args:
        prob: 2-D array of per-class scores, one row per sample.
        unknown_idx: class id assigned to rejected rows.
        thr: rows whose highest score is strictly below this are rejected.

    Returns:
        1-D array of class ids: the row argmax, or `unknown_idx` when the
        row's maximum is below `thr`. `prob` is not modified.
    """
    rejected = prob.max(axis=1) < thr
    # argmax returns a fresh array, so editing it in place leaves `prob` alone.
    labels = prob.argmax(axis=1)
    labels[rejected] = unknown_idx
    return labels

if unknown_idx is not None:
    # Score every threshold over the same, complete label set so results are
    # comparable and so target_names always lines up with the labels.
    label_ids = list(range(num_classes))

    # Select the threshold on the validation split. Tuning on the noisy test
    # split and then reporting on it would give an optimistic estimate.
    prob_val, y_true_val = predict_probs(ds_val)

    best = None
    for thr in np.arange(0.30, 0.91, 0.05):
        pred_thr = apply_threshold(prob_val, unknown_idx, thr)
        f1 = f1_score(y_true_val, pred_thr, labels=label_ids, average="macro", zero_division=0)
        # Strict '>' keeps the lowest threshold when several tie.
        if best is None or f1 > best[1]:
            best = (thr, f1)

    print("\nBest threshold on val (macro F1):", best)
    thr = best[0]

    # Report on the noisy test split using the threshold chosen above.
    prob_noisy, y_true_noisy = predict_probs(ds_noisy)
    pred_thr = apply_threshold(prob_noisy, unknown_idx, thr)

    print("\n=== Classification report (noisy) with threshold thr=%.2f ===" % thr)
    print(classification_report(
        y_true_noisy,
        pred_thr,
        labels=label_ids,
        target_names=classes,
        digits=4,
        zero_division=0,
    ))
else:
    print("\n[INFO] Không có class 'unknown' nên bỏ qua threshold tuning.")


Train: (5824, 128, 64, 1) (5824,)
Val  : (104, 128, 64, 1) (104,)
Test : (104, 128, 64, 1) (104,)
Noisy: (104, 128, 64, 1) (104,)
Num classes: 14
classes: ['car_horn', 'cat', 'clock_alarm', 'coughing', 'crying_baby', 'dog', 'door_wood_knock', 'engine', 'frog', 'laughing', 'mouse_click', 'rooster', 'unknown', 'water_drops']
unknown_idx: 12
val classes match train  ? True
test classes match train ? True
noisy classes match train? True
Train batch X: (64, 128, 64, 1) y(one-hot): (64, 14) min/max: 0.0 1.0


class_weight(sample): {'car_horn': 1.0, 'cat': 1.0, 'clock_alarm': 1.0, 'coughing': 1.0, 'crying_baby': 1.0, 'dog': 1.3, 'door_wood_knock': 1.0, 'engine': 1.8, 'frog': 1.0, 'laughing': 1.0, 'mouse_click': 1.0, 'rooster': 1.5, 'unknown': 1.0, 'water_drops': 1.0}
Epoch 1/80
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 71ms/step - accuracy: 0.2417 - loss: 2.4992 - val_accuracy: 0.0385 - val_loss: 3.1576 - learning_rate: 0.0010
Epoch 2/80
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 47ms/step - accuracy: 0.5722 - loss: 1.6347 - val_accuracy: 0.0385 - val_loss: 3.4567 - learning_rate: 0.0010
Epoch 3/80
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 49ms/step - accuracy: 0.6803 - loss: 1.3158 - val_accuracy: 0.0385 - val_loss: 3.0473 - learning_rate: 0.0010
Epoch 4/80
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 48ms/step - accuracy: 0.7408 - loss: 1.1540 - val_accuracy: 0.0385 - val_loss: 3.4244 - learning_rate: 0.00

In [13]:
# Reload the checkpoint written by ModelCheckpoint during training and
# re-export it under a second file name. compile=False is passed to
# load_model, so the reloaded model is presumably inference-only - confirm.
# NOTE(review): `best` also names the (threshold, f1) tuple in the training
# cell; this assignment rebinds it to a model.
best = keras.models.load_model("best_audio_cnn_v3.keras", compile=False)
# NOTE(review): the ".h5" extension presumably selects Keras' legacy HDF5
# format - confirm the consumer of this file needs it.
best.save("audio_cnn_best.h5")
print("Saved: audio_cnn_best.h5")



Saved: audio_cnn_best.h5
