In [None]:
!pip install datasets==3.3.1
!pip install pandas==2.2.3
!pip install numpy==1.26.4
!pip install torch==2.5.1
!pip install torchaudio==2.5.1
!pip install librosa==0.10.2.post1
!pip install tqdm==4.67.1
!pip install matplotlib==3.7.5
!pip install tensorflow==2.18.0
!pip install keras==3.5.0

See the Hugging Face documentation for how to create an access token: https://huggingface.co/docs/hub/en/security-tokens

In [None]:
import os

from datasets import load_dataset

# Read the access token from the environment instead of hardcoding it in the
# notebook (which would leak it when the notebook is shared or committed).
# When HF_TOKEN is unset, `token=None` lets `datasets` fall back to its default
# credential lookup (e.g. a token stored by `huggingface-cli login`).
dataset = load_dataset("nuriachandra/Deepfake-Eval-2024", token=os.environ.get("HF_TOKEN"))

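Download the audio metadata CSV from https://huggingface.co/datasets/nuriachandra/Deepfake-Eval-2024/resolve/main/audio-metadata-publish.csv and save it next to this notebook as `audio_metadata.csv`, which is the path the cells below read.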

In [None]:
import pandas as pd

# Metadata table for the audio clips; later cells filter it by the
# 'Ground Truth' and 'Finetuning Set' columns and read 'Filename'.
df = pd.read_csv(filepath_or_buffer='./audio_metadata.csv')

### Extracting spectrogram images for the Deepfake-Eval-2024 dataset

In [None]:
import os
import torch
import torchaudio.transforms as T
import matplotlib.pyplot as plt
import librosa
from tqdm import tqdm

# Directory that receives one PNG per audio clip; created on first run.
output_dir = "./mel_spectrograms"
os.makedirs(output_dir, exist_ok=True)

# Metadata table keyed by the 'Filename' column.
metadata_df = pd.read_csv("./audio_metadata.csv").set_index('Filename')

def waveform_to_mel_image(waveform, sample_rate, out_path):
    """Render a waveform as a dB-scaled mel-spectrogram image and save it.

    Args:
        waveform: Audio samples. A 1-D input is given a leading channel axis;
            NOTE(review): a multi-channel input is not down-mixed here, so
            callers are presumably expected to pass mono audio - confirm.
        sample_rate: Sampling rate of `waveform`, forwarded to the mel transform.
        out_path: Destination path handed to matplotlib's `savefig`.

    The figure is always closed, even when rendering or saving fails, so a
    caller that catches exceptions in a long loop does not accumulate open
    figures.
    """
    waveform = torch.tensor(waveform, dtype=torch.float32)
    if waveform.dim() == 1:
        waveform = waveform.unsqueeze(0)

    mel_transform = T.MelSpectrogram(
        sample_rate=sample_rate,
        n_fft=1024,
        hop_length=512,
        n_mels=64
    )
    mel_spec = mel_transform(waveform)
    # Convert to decibels, clamping the range to 80 dB below the peak.
    db_transform = T.AmplitudeToDB(top_db=80)
    mel_db = db_transform(mel_spec)
    mel_np = mel_db.squeeze(0).numpy()

    fig, ax = plt.subplots(figsize=(10, 4))
    try:
        ax.imshow(mel_np, origin='lower', aspect='auto', cmap='inferno')
        # No axes, ticks or padding: the saved image contains only the spectrogram.
        ax.axis('off')
        fig.tight_layout(pad=0)
        fig.savefig(out_path, bbox_inches='tight', pad_inches=0)
    finally:
        plt.close(fig)

# Convert every clip of the 'train' split into a mel-spectrogram PNG.
dataset_split = dataset['train']

for index in tqdm(range(len(dataset_split))):
    try:
        audio_data = dataset_split[index]['audio']
        file_name = os.path.basename(audio_data['path'])
        # sr=None keeps the file's native sampling rate; mono=True down-mixes.
        waveform, sample_rate = librosa.load(audio_data['path'], sr=None, mono=True)

        # Strip whatever extension the clip has (.mp3, .wav, ...) instead of
        # only removing the literal substring '.mp3'.
        stem, _extension = os.path.splitext(file_name)
        out_path = os.path.join(output_dir, f"{stem}.png")

        waveform_to_mel_image(waveform, sample_rate, out_path)

    except Exception as e:
        # Best effort: report the failing sample and continue with the rest.
        print(f"Failed to convert sample at index {index}: {e}")


### Data Loading function

In [None]:
from tensorflow.keras.preprocessing import image
import os

def load_images(path, label, df, split):
    """Load the spectrogram images belonging to one class of one data split.

    A file in `path` is selected when the part of its name before the first
    '.' equals the part before the first '.' of a metadata 'Filename' whose row
    has 'Ground Truth' == `label` and 'Finetuning Set' == `split`.

    Args:
        path: Directory containing the spectrogram images.
        label: Value to match in the 'Ground Truth' column. 'Real' is encoded
            as class 0, any other value as class 1.
        df: Metadata DataFrame with 'Filename', 'Ground Truth' and
            'Finetuning Set' columns.
        split: Value to match in the 'Finetuning Set' column.

    Returns:
        Tuple `(images, labels)`: a list of image arrays resized to 224x224 and
        a parallel list of integer class ids. Files that fail to load are
        reported and skipped.
    """
    images = []
    labels = []

    valid_files = df[(df['Ground Truth'] == label) & (df['Finetuning Set'] == split)]['Filename']
    valid_basenames = set(name.split('.')[0] for name in valid_files)

    print(f"Looking for {len(valid_basenames)} matching .png images corresponding to .wav/.mp3 entries.")

    class_id = 0 if label == 'Real' else 1
    matched_files = []

    # Sorted so the order of the returned samples does not depend on the
    # arbitrary order in which the filesystem lists the directory.
    for file in sorted(os.listdir(path)):
        file_basename = file.split('.')[0]
        if file_basename not in valid_basenames:
            continue

        matched_files.append(file)
        try:
            # target_size is (height, width); channels are not part of it.
            img = image.load_img(os.path.join(path, file), target_size=(224, 224))
            images.append(image.img_to_array(img))
            labels.append(class_id)

            if len(images) % 50 == 0:
                print('Loaded', len(images), 'images')
        except Exception as e:
            print(f"Failed to load {file}: {e}")

    print(f'\nTotal {label} {split.lower()} images loaded = {len(images)}')
    print(f"Example matched: {matched_files[:5]}")
    return images, labels


### Defining training and testing set

In [2]:
# Accumulators for the training images and their integer class labels.
x, y = [], []

In [None]:
# Append the 'Real' training spectrograms to the training accumulators.
images, labels = load_images(path='./mel_spectrograms', label='Real', df=df, split='Train')

x.extend(images)
y.extend(labels)

# Sanity check: first sample's array shape and label.
print('Image shape:', x[0].shape)
print('Image label (REAL):', y[0])

In [None]:
# Append the 'Fake' training spectrograms after the 'Real' ones.
images, labels = load_images(path='./mel_spectrograms', label='Fake', df=df, split='Train')

x.extend(images)
y.extend(labels)

# Sanity check: first sample's array shape and the most recently added label.
print('Image shape:', x[0].shape)
print('Image label (FAKE):', y[-1])

In [3]:
# Accumulators for the test images and their integer class labels.
x_test, y_test = [], []

In [None]:
# Append the 'Real' test spectrograms to the test accumulators.
images, labels = load_images('./mel_spectrograms', 'Real', df, 'Test')

x_test += images
y_test += labels

# Sanity check on the test data just loaded (not on the training lists).
print('Image shape:', x_test[0].shape)
print('Image label (REAL):', y_test[0])

In [None]:
# Append the 'Fake' test spectrograms after the 'Real' ones.
images, labels = load_images('./mel_spectrograms', 'Fake', df, 'Test')

x_test += images
y_test += labels

# Sanity check on the most recently added test sample, which is a 'Fake' one
# whenever this cell loaded at least one image.
print('Image shape:', x_test[-1].shape)
print('Image label (FAKE):', y_test[-1])

In [None]:
import numpy as np
from tensorflow.keras.utils import to_categorical

# Stack the per-image arrays into one batch array and divide by 255.
x_train_norm = np.array(x) / 255
x_test_norm = np.array(x_test) / 255

# One-hot encode the 0/1 labels. num_classes is pinned so both splits get two
# columns even if one of them happens to contain a single class.
y_train_encoded = to_categorical(y, num_classes=2)
y_test_encoded = to_categorical(y_test, num_classes=2)

### Loading and Defining Model Architecture

In [None]:
import tensorflow as tf
from keras.layers import GlobalAveragePooling2D
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Flatten, Dense
import tensorflow.keras as K
from tensorflow.keras.applications.mobilenet import MobileNet
from tensorflow.keras.callbacks import EarlyStopping

# Stop once validation loss has not improved for 5 consecutive epochs and roll
# the model back to the weights of the best epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)

# GPUs visible to TensorFlow (empty list when running on CPU only).
gpus = tf.config.list_physical_devices('GPU')

# ImageNet-pretrained MobileNet without its classification top. The input shape
# is stated explicitly to match the 224x224 RGB images loaded earlier.
base_model = MobileNet(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

In [None]:
import tensorflow as tf
from keras.layers import BatchNormalization, Dropout

# Classification head stacked on the MobileNet feature map. A dedicated name is
# used so the global `x` (the list of training images) is not overwritten.
# NOTE(review): Dense/BatchNorm/Dropout are applied to the spatial feature map
# before global pooling; the original layer order is kept as-is - confirm it is
# intended.
head = base_model.output

head = Dense(32, activation='relu')(head)
head = BatchNormalization()(head)
head = Dropout(0.5)(head)
head = GlobalAveragePooling2D()(head)
head = Dense(2, activation='sigmoid')(head)

model = tf.keras.models.Model(inputs=base_model.input, outputs=head)


Freeze every layer of the pretrained MobileNet base so that only the newly added classification head is trained.

In [None]:
# Mark each layer of the pretrained base as non-trainable; this must run before
# `model.compile` for the setting to apply to training.
for pretrained_layer in base_model.layers:
    setattr(pretrained_layer, 'trainable', False)

### Training and Testing loop

In [None]:
# Configure the optimiser, loss and reported metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train for up to 50 epochs; the early-stopping callback may end training sooner.
# NOTE(review): the test split doubles as the validation set here, so early
# stopping is selected on the same data used for evaluation.
hist = model.fit(
    x=x_train_norm,
    y=y_train_encoded,
    batch_size=32,
    epochs=50,
    callbacks=[early_stopping],
    validation_data=(x_test_norm, y_test_encoded),
)