# Deepfake Audio Detection using CNN

Project by:  
[Jen Patrick Nataba](https://ph.linkedin.com/in/cytojen)  
[John Ferry Lagman](https://ph.linkedin.com/in/thatjohnlagman)

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# necessary imports

In [None]:
import os
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense
from kerastuner.tuners import RandomSearch
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve

input_path = '/content/drive/MyDrive/omdena_hackathon/datasets/deepfake_audio'
train_path = os.path.join(input_path, 'training')
val_path = os.path.join(input_path, 'validation')
test_path = os.path.join(input_path, 'testing')

  from kerastuner.tuners import RandomSearch


# data preprocessing and augmentation

In [None]:
batch_size = 32
image_height = 224
image_width = 224

datagen = ImageDataGenerator(rescale=1./255)

In [None]:
train_generator = datagen.flow_from_directory(
    train_path,
    target_size=(image_height, image_width),
    batch_size=batch_size,
    class_mode='binary',
    shuffle=True
)

val_generator = datagen.flow_from_directory(
    val_path,
    target_size=(image_height, image_width),
    batch_size=batch_size,
    class_mode='binary',
    shuffle=True
)

test_generator = datagen.flow_from_directory(
    test_path,
    target_size=(image_height, image_width),
    batch_size=1,
    class_mode='binary',
    shuffle=False
)

Found 9600 images belonging to 2 classes.
Found 1200 images belonging to 2 classes.
Found 1200 images belonging to 2 classes.


# define the model

In [None]:
def build_cnn_model(hp):
    input_layer = Input(shape=(image_height, image_width, 3))

    x = Conv2D(
        filters=hp.Int('filters_1', min_value=32, max_value=128, step=32),
        kernel_size=hp.Choice('kernel_size_1', values=[3, 5]),
        activation='relu'
    )(input_layer)
    x = MaxPooling2D(pool_size=2)(x)

    x = Conv2D(
        filters=hp.Int('filters_2', min_value=64, max_value=256, step=64),
        kernel_size=hp.Choice('kernel_size_2', values=[3, 5]),
        activation='relu'
    )(x)
    x = MaxPooling2D(pool_size=2)(x)

    x = Flatten()(x)
    x = Dense(units=hp.Int('dense_units', min_value=64, max_value=256, step=64), activation='relu')(x)
    output_layer = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=input_layer, outputs=output_layer)

    learning_rate = hp.Float('learning_rate', min_value=1e-5, max_value=1e-3, sampling='LOG')
    optimizer = hp.Choice('optimizer', values=['adam', 'rmsprop'])

    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [None]:
tuner = RandomSearch(
    build_cnn_model,
    objective='val_accuracy',
    max_trials=10,
    executions_per_trial=2,
    directory='/content/drive/MyDrive/omdena_hackathon/models/deepfake_audio_detection/tuning',
    project_name='cnn_audio_deepfake'
)

early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(factor=0.2, patience=3, min_delta=0.005)

tuner.search(
    train_generator,
    epochs=50,
    steps_per_epoch=train_generator.n // batch_size,
    validation_data=val_generator,
    callbacks=[early_stop, reduce_lr]
)

Trial 10 Complete [00h 08m 00s]
val_accuracy: 0.8454166650772095

Best val_accuracy So Far: 0.8779166638851166
Total elapsed time: 01h 31m 04s


In [None]:
best_cnn_model = tuner.get_best_models(num_models=1)[0]

best_cnn_model.fit(
    train_generator,
    epochs=50,
    steps_per_epoch=train_generator.n // batch_size,
    validation_data=val_generator,
    callbacks=[early_stop, reduce_lr]
)

  saveable.load_own_variables(weights_store.get(inner_path))


Epoch 1/50
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 161ms/step - accuracy: 0.9053 - loss: 0.2833 - val_accuracy: 0.8958 - val_loss: 0.3870 - learning_rate: 0.0010
Epoch 2/50
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 18ms/step - accuracy: 0.0000e+00 - loss: 0.0000e+00 - val_accuracy: 0.8958 - val_loss: 0.3870 - learning_rate: 0.0010
Epoch 3/50
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 154ms/step - accuracy: 0.9588 - loss: 0.1203 - val_accuracy: 0.8242 - val_loss: 1.0294 - learning_rate: 0.0010
Epoch 4/50
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 17ms/step - accuracy: 0.0000e+00 - loss: 0.0000e+00 - val_accuracy: 0.8242 - val_loss: 1.0294 - learning_rate: 0.0010
Epoch 5/50
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 155ms/step - accuracy: 0.9939 - loss: 0.0207 - val_accuracy: 0.8675 - val_loss: 0.9997 - learning_rate: 2.0000e-04
Epoch 6/50
[1m300/300[0m [32m━━━━━━━

<keras.src.callbacks.history.History at 0x78917c643a00>

# performance metrics

In [None]:
# eval
def compute_eer(y_true, y_scores):
    fpr, tpr, thresholds = roc_curve(y_true, y_scores, pos_label=1)
    fnr = 1 - tpr
    eer_threshold = thresholds[np.nanargmin(np.absolute((fnr - fpr)))]
    eer = fpr[np.nanargmin(np.absolute((fnr - fpr)))]
    return eer, eer_threshold

# predict probabilities on the test set
y_pred_prob = np.squeeze(best_cnn_model.predict(test_generator, steps=len(test_generator)))

# EER and threshold
eer, eer_threshold = compute_eer(test_generator.classes, y_pred_prob)
print(f'EER: {eer * 100:.2f}% at threshold: {eer_threshold}')

# this will convert predictions to binary labels using the EER threshold
y_pred = (y_pred_prob > eer_threshold).astype(int)

print('Test Classification Report:')
print(classification_report(test_generator.classes, y_pred, target_names=['FAKE', 'REAL']))
print('Test Confusion Matrix:')
print(confusion_matrix(test_generator.classes, y_pred))

test_accuracy = accuracy_score(test_generator.classes, y_pred)
print(f'Test Accuracy: {test_accuracy:.4f}')

[1m1200/1200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 11ms/step
EER: 13.50% at threshold: 0.3067167401313782
Test Classification Report:
              precision    recall  f1-score   support

        FAKE       0.86      0.87      0.87       600
        REAL       0.87      0.86      0.86       600

    accuracy                           0.86      1200
   macro avg       0.87      0.86      0.86      1200
weighted avg       0.87      0.86      0.86      1200

Test Confusion Matrix:
[[520  80]
 [ 82 518]]
Test Accuracy: 0.8650


# thoughts
Building this CNN for classifying AI-generated audio was a mix of challenges and wins. Tuning with RandomSearch was resource-heavy but worth it, and calculating EER added a nice touch to the evaluation. The setup went smoothly, but making sure the model didn’t overfit took some tweaking. The results showed solid progress and areas to improve. Overall, a great learning experience!






