# Deepfake Voice Detection Training

## Import

In [1]:
from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix

import tensorflow as tf

2025-10-17 23:04:31.303739: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-10-17 23:04:31.317471: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-10-17 23:04:31.850650: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-10-17 23:04:33.705681: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off,

## Configuration

In [1]:
# All model artifacts (checkpoint, scaler, TFLite export, TensorBoard logs)
# share one directory, so their paths are built from a single base string.
_MODEL_DIR = "../app/model/deepfakevoice"

# Input: preprocessed feature table read by the "Load data" cell.
CSV_PATH = "../data/preprocessed.csv"

# Outputs written by the training / conversion cells.
DEEPFAKEVOICE_KERAS_MODEL = f"{_MODEL_DIR}/deepfakevoice_v1.keras"
DEEPFAKEVOICE_SCALER = f"{_MODEL_DIR}/scaler.joblib"
DEEPFAKEVOICE_TFLITE_MODEL = f"{_MODEL_DIR}/deepfakevoice.tflite"
DEEPFAKEVOICE_LOGS = f"{_MODEL_DIR}/deepfakevoice_logs"

# Seed value intended for the randomised steps of the notebook.
RANDOM_SEED = 42

## Load data

In [3]:
# Read the preprocessed feature table configured in CSV_PATH and preview
# its first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer=CSV_PATH)
df.head(n=5)

Unnamed: 0,chroma_stft,rms,spectral_centroid,spectral_bandwidth,rolloff,zcr,mfcc1,mfcc2,mfcc3,mfcc4,...,mfcc12,mfcc13,mfcc14,mfcc15,mfcc16,mfcc17,mfcc18,mfcc19,mfcc20,label
0,0.358431,0.047544,3542.9941,2472.2634,6142.1016,0.229714,-183.50685,34.215317,-25.610506,5.232159,...,-14.175571,-15.062231,-2.411484,-2.136526,4.616874,-3.939895,-5.642972,-8.739243,-9.369484,REAL
1,0.398888,0.01398,2956.4412,2525.692,5812.986,0.171032,-350.51126,58.881863,-23.915743,13.723481,...,-9.554494,-8.560787,-2.228989,-14.121045,-0.228751,-2.58535,-3.479854,-10.985688,-4.435715,REAL
2,0.277066,0.028064,2216.6729,2266.4956,4373.1978,0.103405,-328.8224,84.15872,-20.430117,6.658457,...,-9.122506,-6.528575,5.203861,-6.371897,-2.242782,-7.658447,-6.257457,-6.273143,-3.551254,REAL
3,0.359222,0.035303,2968.641,2467.413,5483.626,0.178067,-265.03745,61.225735,-20.567951,-0.498797,...,-6.477275,-14.361066,0.192269,-5.031093,-0.998947,-3.82671,-2.932528,-9.419309,-3.316403,REAL
4,0.329339,0.024873,2111.4087,2129.4204,3848.326,0.105125,-283.91092,91.65695,-41.411015,-11.166985,...,-2.310187,-10.009683,-3.353666,-7.186924,-0.003199,-9.301208,-5.654605,-5.261728,2.49881,REAL


In [4]:
label_col = "label"

# Encode label (REAL -> 1, FAKE -> 0): LabelEncoder numbers the classes in
# sorted order. The encoded labels go into their own array instead of being
# written back into `df`, so re-running this cell always starts from the
# original string labels and keeps `label_encoder.classes_` meaningful.
label_encoder = LabelEncoder()
y_full = label_encoder.fit_transform(df[label_col]).astype(np.int32)

print("Label classes:", label_encoder.classes_)

# The sigmoid output, the classification report and the inference helper all
# assume exactly this mapping, so fail loudly if the data says otherwise.
assert list(label_encoder.classes_) == ["FAKE", "REAL"], (
    f"unexpected label classes: {list(label_encoder.classes_)}"
)

# Split features/labels: every column except the label is a feature.
X_full = df.drop(columns=[label_col]).values.astype(np.float32)

# Train/Val/Test split: 20% held out for test, then 20% of the remainder for
# validation. Both splits are seeded from the RANDOM_SEED config constant.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X_full, y_full, test_size=0.2, random_state=RANDOM_SEED
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.2, random_state=RANDOM_SEED
)

print("Shapes:", X_train.shape, X_val.shape, X_test.shape)

Label classes: ['FAKE' 'REAL']
Shapes: (33555, 26) (8389, 26) (10486, 26)


## Preprocessing: Standardize

In [5]:
import joblib

# Fit the scaler on the training split only, then apply the same statistics
# to validation and test so no information leaks from the held-out data.
# NOTE: this cell rebinds X_train / X_val / X_test to their scaled versions;
# re-run the split cell before re-running this one, otherwise the data is
# standardised twice.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Save scaler for later (inference). The target directory is created first so
# the dump also works on a fresh checkout where it does not exist yet.
Path(DEEPFAKEVOICE_SCALER).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(scaler, DEEPFAKEVOICE_SCALER)

['../app/model/deepfakevoice/scaler.joblib']

## Build model

In [6]:
# Seed Python, NumPy and TensorFlow in one call so weight initialisation,
# dropout masks and batch shuffling are repeatable between runs.
tf.keras.utils.set_random_seed(RANDOM_SEED)

n_features = X_train.shape[1]

dropout_rate = 0.35
l2_reg = 1e-4
hidden_units = (256, 128, 128)  # width of each hidden Dense block

# Each hidden block is Dense -> PReLU -> Dropout. PReLU has trainable slopes,
# so it is added as a layer of its own rather than passed through Dense's
# `activation` argument (which is meant for stateless functions); the
# computation is the same: PReLU applied to the Dense layer's linear output.
layer_stack = [tf.keras.layers.Input(shape=(n_features,), name="mfcc_input")]
for units in hidden_units:
    layer_stack += [
        tf.keras.layers.Dense(units,
                              kernel_initializer="he_normal",
                              kernel_regularizer=tf.keras.regularizers.l2(l2_reg)),
        tf.keras.layers.PReLU(),
        tf.keras.layers.Dropout(dropout_rate),
    ]
# Single sigmoid unit: the output is the probability of the class encoded as 1.
layer_stack.append(tf.keras.layers.Dense(1, activation="sigmoid", name="output"))

model = tf.keras.Sequential(layer_stack)

model.summary()

E0000 00:00:1760717103.698770   16960 cuda_executor.cc:1309] INTERNAL: CUDA Runtime error: Failed call to cudaGetRuntimeVersion: Error loading CUDA libraries. GPU will not be used.: Error loading CUDA libraries. GPU will not be used.
W0000 00:00:1760717103.706903   16960 gpu_device.cc:2342] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [7]:
# Adam with a 1e-3 starting learning rate; binary cross-entropy matches the
# single sigmoid output. AUC is reported under the name "auc", which is what
# the "val_auc" monitors of the callbacks refer to.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
tracked_metrics = [tf.keras.metrics.AUC(name="auc"), "accuracy"]
model.compile(
    optimizer=optimizer,
    loss="binary_crossentropy",
    metrics=tracked_metrics,
)

## Callbacks

In [9]:
# Pick the next free run index from what is already on disk instead of
# relying on a manual bump: forgetting the bump makes two trainings write
# their event files into the same run directory.
logs_root = Path(DEEPFAKEVOICE_LOGS)
existing_run_indices = [
    int(p.name[len("run_"):])
    for p in logs_root.glob("run_*")
    if p.is_dir() and p.name[len("run_"):].isdigit()
]
# First run is run_001 when no run directory exists yet.
run_index = max(existing_run_indices, default=0) + 1
run_logdir = logs_root / "run_{:03d}".format(run_index)
run_logdir

PosixPath('../app/model/deepfakevoice/deepfakevoice_logs/run_003')

In [10]:
# AUC is a higher-is-better metric, so every callback that monitors "val_auc"
# states mode="max" explicitly. With the default mode="auto" the direction is
# guessed from the metric name, and depending on the Keras version "val_auc"
# may be treated as lower-is-better, which would checkpoint / stop on the
# worst epoch instead of the best one.
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    DEEPFAKEVOICE_KERAS_MODEL, monitor="val_auc", mode="max", save_best_only=True
)
# Stop after 10 epochs without improvement and roll back to the best weights.
early_stopping_cb = tf.keras.callbacks.EarlyStopping(
    monitor="val_auc", mode="max", patience=10, restore_best_weights=True, verbose=1
)
# Halve the learning rate after 5 epochs without improvement.
lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(
    monitor="val_auc", mode="max", factor=0.5, patience=5, verbose=1
)
# Write TensorBoard event files into this run's own directory.
tensorboard_cb = tf.keras.callbacks.TensorBoard(run_logdir)

## Train

In [11]:
# Train for at most 100 epochs; the early-stopping callback may end the run
# sooner. verbose=2 prints one summary line per epoch.
training_callbacks = [checkpoint_cb, early_stopping_cb, lr_scheduler, tensorboard_cb]
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=100,
    verbose=2,
    callbacks=training_callbacks,
    validation_data=(X_val, y_val),
)

Epoch 1/100
525/525 - 3s - 6ms/step - accuracy: 0.8200 - auc: 0.9021 - loss: 0.4990 - val_accuracy: 0.9412 - val_auc: 0.9855 - val_loss: 0.2622 - learning_rate: 1.0000e-03
Epoch 2/100
525/525 - 2s - 3ms/step - accuracy: 0.9153 - auc: 0.9709 - loss: 0.3126 - val_accuracy: 0.9472 - val_auc: 0.9913 - val_loss: 0.2274 - learning_rate: 1.0000e-03
Epoch 3/100
525/525 - 2s - 3ms/step - accuracy: 0.9312 - auc: 0.9801 - loss: 0.2671 - val_accuracy: 0.9546 - val_auc: 0.9922 - val_loss: 0.2039 - learning_rate: 1.0000e-03
Epoch 4/100
525/525 - 2s - 3ms/step - accuracy: 0.9420 - auc: 0.9846 - loss: 0.2393 - val_accuracy: 0.9622 - val_auc: 0.9947 - val_loss: 0.1816 - learning_rate: 1.0000e-03
Epoch 5/100
525/525 - 3s - 5ms/step - accuracy: 0.9456 - auc: 0.9872 - loss: 0.2205 - val_accuracy: 0.9635 - val_auc: 0.9947 - val_loss: 0.1741 - learning_rate: 1.0000e-03
Epoch 6/100
525/525 - 2s - 3ms/step - accuracy: 0.9514 - auc: 0.9890 - loss: 0.2041 - val_accuracy: 0.9665 - val_auc: 0.9957 - val_loss: 0.1

## Evaluate

In [12]:
print("Evaluate on test set:")
# Rollback to best model: reload the checkpoint written by ModelCheckpoint,
# replacing the in-memory `model`.
model = tf.keras.models.load_model(filepath=DEEPFAKEVOICE_KERAS_MODEL)
# `res` is the list returned by evaluate(); the captured log reports loss,
# auc and accuracy for it.
res = model.evaluate(x=X_test, y=y_test, verbose=2)
print(res)

Evaluate on test set:
328/328 - 1s - 2ms/step - accuracy: 0.9879 - auc: 0.9993 - loss: 0.0684
[0.06835038214921951, 0.9992964267730713, 0.9878886342048645]


In [13]:
# predict & classification report
y_pred_prob = model.predict(X_test).ravel()
y_pred = (y_pred_prob >= 0.5).astype(int)
print(classification_report(y_test, y_pred, target_names=["FAKE","REAL"]))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

[1m328/328[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 898us/step
              precision    recall  f1-score   support

        FAKE       1.00      0.98      0.99      5237
        REAL       0.98      1.00      0.99      5249

    accuracy                           0.99     10486
   macro avg       0.99      0.99      0.99     10486
weighted avg       0.99      0.99      0.99     10486

Confusion matrix:
 [[5118  119]
 [   8 5241]]


In [14]:
# Print the TensorBoard log directory as an indented tree. Indentation is
# computed relative to the log root, so first-level run directories sit one
# level under the printed root regardless of how deep the root path is.
logs_root = Path(DEEPFAKEVOICE_LOGS)
print(DEEPFAKEVOICE_LOGS)
for path in sorted(logs_root.glob("**/*")):
    depth = len(path.relative_to(logs_root).parts)
    print("  " * depth + path.name)

../app/model/deepfakevoice/deepfakevoice_logs
          run_001
            train
              events.out.tfevents.1760288310.LAPTOPTHNNDAT.24415.0.v2
              events.out.tfevents.1760288638.LAPTOPTHNNDAT.24415.2.v2
            validation
              events.out.tfevents.1760288313.LAPTOPTHNNDAT.24415.1.v2
              events.out.tfevents.1760288640.LAPTOPTHNNDAT.24415.3.v2
          run_002
            train
              events.out.tfevents.1760368612.LAPTOPTHNNDAT.9851.0.v2
            validation
              events.out.tfevents.1760368615.LAPTOPTHNNDAT.9851.1.v2
          run_003
            train
              events.out.tfevents.1760717118.LAPTOPTHNNDAT.16960.0.v2
            validation
              events.out.tfevents.1760717121.LAPTOPTHNNDAT.16960.1.v2


In [15]:
%load_ext tensorboard
%tensorboard --logdir=../app/model/deepfakevoice/deepfakevoice_logs

## Convert to TFLite

In [16]:
# Convert the in-memory Keras model to a TFLite flatbuffer using the
# converter's default optimisation set.
converter = tf.lite.TFLiteConverter.from_keras_model(model)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
tflite_model = converter.convert()

# Persist the serialized model next to the other artifacts.
with open(DEEPFAKEVOICE_TFLITE_MODEL, mode="wb") as tflite_file:
    tflite_file.write(tflite_model)

INFO:tensorflow:Assets written to: /tmp/tmpxlvzyo13/assets


INFO:tensorflow:Assets written to: /tmp/tmpxlvzyo13/assets


Saved artifact at '/tmp/tmpxlvzyo13'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 26), dtype=tf.float32, name='mfcc_input')
Output Type:
  TensorSpec(shape=(None, 1), dtype=tf.float32, name=None)
Captures:
  136892019898880: TensorSpec(shape=(), dtype=tf.resource, name=None)
  136892020325216: TensorSpec(shape=(), dtype=tf.resource, name=None)
  136892020147824: TensorSpec(shape=(), dtype=tf.resource, name=None)
  136892020791904: TensorSpec(shape=(), dtype=tf.resource, name=None)
  136892020786272: TensorSpec(shape=(), dtype=tf.resource, name=None)
  136892020787504: TensorSpec(shape=(), dtype=tf.resource, name=None)
  136892020792256: TensorSpec(shape=(), dtype=tf.resource, name=None)
  136891820430208: TensorSpec(shape=(), dtype=tf.resource, name=None)
  136892020799824: TensorSpec(shape=(), dtype=tf.resource, name=None)
  136891820434432: TensorSpec(shape=(), dtype=tf.resource, name=None)
  136891820429152: TensorSp

W0000 00:00:1760717293.889692   16960 tf_tfl_flatbuffer_helpers.cc:364] Ignored output_format.
W0000 00:00:1760717293.889740   16960 tf_tfl_flatbuffer_helpers.cc:367] Ignored drop_control_dependency.
2025-10-17 23:08:13.890130: I tensorflow/cc/saved_model/reader.cc:83] Reading SavedModel from: /tmp/tmpxlvzyo13
2025-10-17 23:08:13.890886: I tensorflow/cc/saved_model/reader.cc:52] Reading meta graph with tags { serve }
2025-10-17 23:08:13.890901: I tensorflow/cc/saved_model/reader.cc:147] Reading SavedModel debug info (if present) from: /tmp/tmpxlvzyo13
I0000 00:00:1760717293.897323   16960 mlir_graph_optimization_pass.cc:437] MLIR V1 optimization pass is not enabled
2025-10-17 23:08:13.898832: I tensorflow/cc/saved_model/loader.cc:236] Restoring SavedModel bundle.
2025-10-17 23:08:13.943799: I tensorflow/cc/saved_model/loader.cc:220] Running initialization op on SavedModel bundle at path: /tmp/tmpxlvzyo13
2025-10-17 23:08:13.955650: I tensorflow/cc/saved_model/loader.cc:471] SavedModel 

# Inference with the TFLite Model

In [2]:
import numpy as np
import librosa
import joblib

In [3]:
# Demonstration clips used to sanity-check the exported model: per their file
# names, one original recording and one voice-converted version of it.
_DEMO_DIR = "../data/DEMONSTRATION"
AUDIO_REAL = f"{_DEMO_DIR}/linus-original-DEMO.mp3"
AUDIO_FAKE = f"{_DEMO_DIR}/linus-to-musk-DEMO.mp3"

In [4]:
def extract_features_per_segment(audio_segment, sr=22050):
    """
    Compute the 26-value feature vector of one audio segment.

    The segment is first forced to exactly ``sr`` samples (one second at the
    given sampling rate): shorter input is zero-padded at the end, longer
    input is truncated.

    Parameters
    ----------
    audio_segment : array-like
        Audio samples of the segment; expected to be a 1-D mono signal, as
        produced by ``librosa.load(..., mono=True)``.
    sr : int, default 22050
        Sampling rate of ``audio_segment``; also the target segment length.

    Returns
    -------
    numpy.ndarray
        float32 vector of length 26, ordered like the feature columns of the
        training CSV: chroma_stft, rms, spectral_centroid,
        spectral_bandwidth, rolloff, zcr, then the means of 20 MFCCs.

    Raises
    ------
    RuntimeError
        If any step of the extraction fails. The original error is chained
        as ``__cause__`` so the full traceback is preserved.
    """
    segment_samples = sr
    try:
        # Pad or truncate to ensure segment is correct length
        if len(audio_segment) < segment_samples:
            audio_segment = np.pad(audio_segment, (0, segment_samples - len(audio_segment)))
        elif len(audio_segment) > segment_samples:
            audio_segment = audio_segment[:segment_samples]

        # Six scalar descriptors: each librosa feature is averaged over all
        # of its values for the segment.
        chroma_stft = np.mean(librosa.feature.chroma_stft(y=audio_segment, sr=sr))
        rms = np.mean(librosa.feature.rms(y=audio_segment))
        spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=audio_segment, sr=sr))
        spectral_bandwidth = np.mean(librosa.feature.spectral_bandwidth(y=audio_segment, sr=sr))
        spectral_rolloff = np.mean(librosa.feature.spectral_rolloff(y=audio_segment, sr=sr))
        zcr = np.mean(librosa.feature.zero_crossing_rate(y=audio_segment))

        # 20 MFCCs, each averaged along axis 1 (one mean per coefficient)
        mfccs = librosa.feature.mfcc(y=audio_segment, sr=sr, n_mfcc=20)
        mfccs_mean = np.mean(mfccs, axis=1)

        # Combine all features in the column order used for training
        features = np.array(
            [chroma_stft, rms, spectral_centroid, spectral_bandwidth, spectral_rolloff, zcr, *mfccs_mean],
            dtype=np.float32,
        )

        return features
    except Exception as e:
        # RuntimeError is still caught by callers using `except Exception`;
        # `from e` keeps the underlying error attached.
        raise RuntimeError(f"Feature extraction failed: {str(e)}") from e

In [5]:
def extract_feature(file_path, sr=22050):
    """
    Turn a whole audio file into per-segment feature vectors.

    The file is loaded as mono at sampling rate ``sr`` and cut into
    consecutive windows of ``sr`` samples (one second each). Every window,
    including a shorter final one, is passed to
    ``extract_features_per_segment``.

    Returns the stacked feature vectors as a NumPy array with one row per
    segment. On any error the message is printed and ``None`` is returned.
    """
    try:
        # Decode the audio; librosa returns the samples and the rate used.
        y, sr = librosa.load(file_path, sr=sr, mono=True)

        # One segment per started second of audio.
        segment_len = sr
        num_segments = int(np.ceil(len(y) / segment_len))

        # Slicing clamps at the end of the signal, so the last window is
        # simply shorter when the duration is not a whole number of seconds.
        return np.array([
            extract_features_per_segment(
                y[idx * segment_len:(idx + 1) * segment_len], sr
            )
            for idx in range(num_segments)
        ])

    except Exception as e:
        print(f"Error processing {file_path}: {str(e)}")
        return None

In [6]:
# NOTE: joblib.load unpickles the file, which can execute arbitrary code.
# Only load a scaler written by the training section of this notebook (or
# another trusted source).
scaler = joblib.load(DEEPFAKEVOICE_SCALER)


def load_scaled_features(audio_path):
    """
    Extract the per-segment features of ``audio_path`` and standardise them.

    Returns a ``(raw_features, scaled_features)`` tuple. Raises ``ValueError``
    when no features could be extracted: ``extract_feature`` reports failures
    by returning ``None``, which would otherwise surface as an unrelated
    error inside ``scaler.transform``.
    """
    features = extract_feature(audio_path)
    if features is None or len(features) == 0:
        raise ValueError(f"No features could be extracted from {audio_path!r}")
    return features, scaler.transform(features)


real_features, real_scaled_features = load_scaled_features(AUDIO_REAL)
fake_features, fake_scaled_features = load_scaled_features(AUDIO_FAKE)

In [7]:
from ai_edge_litert.interpreter import Interpreter

# Create a single-threaded interpreter for the exported TFLite model and
# allocate its tensors so it is ready to run.
interpreter = Interpreter(model_path=DEEPFAKEVOICE_TFLITE_MODEL, num_threads=1)
interpreter.allocate_tensors()

# Keep the input/output tensor descriptions around; the predict helper looks
# up the tensor indices from them.
input_details, output_details = (
    interpreter.get_input_details(),
    interpreter.get_output_details(),
)

I0000 00:00:1760720450.600443   37115 port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.


In [8]:
def predict(features):
    """
    Classify a clip from its scaled per-segment features and print the verdict.

    Every row of ``features`` is scored by the TFLite model; a row counts as
    REAL when its probability is at least 0.5 (the same cut-off used for the
    test-set evaluation). The clip is reported as "Fake" when fewer than half
    of the rows are REAL, otherwise as "Real".

    Parameters
    ----------
    features : array-like
        Scaled feature matrix with one row per segment. A single 1-D feature
        vector is accepted and treated as a batch of one.

    Raises
    ------
    ValueError
        If ``features`` contains no values.

    Uses the module-level ``interpreter``, ``input_details`` and
    ``output_details``. Prints the result and returns ``None``.
    """
    # Cast to the dtype the model input declares (set_tensor rejects a
    # mismatching dtype) and make sure there is a batch dimension.
    batch = np.atleast_2d(np.asarray(features, dtype=input_details[0]['dtype']))
    if batch.size == 0:
        # Without this guard the majority vote below would average an empty
        # array (NaN) and silently report "Real".
        raise ValueError("predict() needs at least one feature vector")

    # Resize the input tensor to this batch, then re-allocate before use.
    input_index = input_details[0]['index']
    interpreter.resize_tensor_input(input_index, list(batch.shape))
    interpreter.allocate_tensors()

    interpreter.set_tensor(input_index, batch)

    # Run inference
    interpreter.invoke()

    # Flatten the output to one probability per segment; this also covers
    # the single-segment case without a special branch.
    probs = interpreter.get_tensor(output_details[0]['index']).reshape(-1)

    # Threshold at 0.5 to get binary prediction (1: REAL if probs >= 0.5)
    result_index = (probs >= 0.5).astype(int)

    # Majority vote over the segments; a tie counts as Real.
    if np.average(result_index) < 0.5:
        print("\tPrediction: Fake")
    else:
        print("\tPrediction: Real")

In [9]:
# Run the exported model on both demonstration clips, announcing each one
# before its verdict is printed.
demo_inputs = (
    ("Real audio predict:", real_scaled_features),
    ("Fake audio predict:", fake_scaled_features),
)
for heading, scaled_features in demo_inputs:
    print(heading)
    predict(scaled_features)

Real audio predict:
	Prediction: Real
Fake audio predict:
	Prediction: Fake
