In [1]:
import os
import shutil
import pandas as pd
import numpy as np

from tqdm.notebook import tqdm

from sklearn.decomposition import PCA


import pickle

import warnings
warnings.filterwarnings("ignore")

2024-06-20 08:25:10.231082: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-20 08:25:10.231237: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-20 08:25:10.368719: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
BASE_PATH = '../input/asvpoof-2019-dataset/LA/LA'
FOLDS = 10
SEED = 101
DEBUG = True

# Audio params
SAMPLE_RATE = 16000
DURATION = 5.0 # duration in second
AUDIO_LEN = int(SAMPLE_RATE * DURATION)

# Spectrogram params
N_MELS = 128 # freq axis
N_FFT = 2048
SPEC_WIDTH = 256 # time axis
HOP_LEN = AUDIO_LEN//(SPEC_WIDTH - 1) # non-overlap region
FMAX = SAMPLE_RATE//2 # max frequency
SPEC_SHAPE = [SPEC_WIDTH, N_MELS] # output spectrogram shape

## Adding data from asvspoof-2019

# Meta Data

* `speaker_id` : 		LA_****, a 4-digit speaker ID
* `filename` : 	LA_****, name of the audio file
* `system_id` : 		ID of the speech spoofing system `(A01 - A19)`,  or, for **real** speech SYSTEM-ID is left blank ('-')
* `class_name` : 		**bonafide** for genuine speech, or, **spoof** for fake/spoof speech
* `target` : `1` for **fake/spoof**  and `0` for **real/genuine**

In [3]:
train_df = pd.read_csv(f'{BASE_PATH}/ASVspoof2019_LA_cm_protocols/ASVspoof2019.LA.cm.train.trn.txt',
                       sep=" ", header=None)
train_df.columns =['speaker_id','filename','system_id','null','class_name']
train_df.drop(columns=['null'],inplace=True)
train_df['filepath'] = f'{BASE_PATH}/ASVspoof2019_LA_train/flac/'+train_df.filename+'.flac'
train_df['target'] = (train_df.class_name=='spoof').astype('int32') # set labels 1 for fake and 0 for real
if DEBUG:
    train_df = train_df.groupby(['target']).sample(2500).reset_index(drop=True)
print(f'Train Samples: {len(train_df)}')

'''In actual train has around 22800 samples with 1 and 3000 around with 0'''
train_df.head(2)

Train Samples: 5000


Unnamed: 0,speaker_id,filename,system_id,class_name,filepath,target
0,LA_0091,LA_T_2424483,-,bonafide,../input/asvpoof-2019-dataset/LA/LA/ASVspoof20...,0
1,LA_0085,LA_T_1691318,-,bonafide,../input/asvpoof-2019-dataset/LA/LA/ASVspoof20...,0


In [4]:
count_target_1 = (train_df['target'] == 1).sum()
print("Number of rows with target variable as 1:", count_target_1)
# print(len(train_df)-22296)

Number of rows with target variable as 1: 2500


In [5]:
valid_df = pd.read_csv(f'{BASE_PATH}/ASVspoof2019_LA_cm_protocols/ASVspoof2019.LA.cm.dev.trl.txt',
                       sep=" ", header=None)
valid_df.columns =['speaker_id','filename','system_id','null','class_name']
valid_df.drop(columns=['null'],inplace=True)
valid_df['filepath'] = f'{BASE_PATH}/ASVspoof2019_LA_dev/flac/'+valid_df.filename+'.flac'
valid_df['target'] = (valid_df.class_name=='spoof').astype('int32')
if DEBUG:
    valid_df = valid_df.groupby(['target']).sample(2000).reset_index(drop=True)
print(f'Valid Samples: {len(valid_df)}')


'''In actual val has 22296 samples with 1 and 2548 with 0'''
valid_df.head(2)

Valid Samples: 4000


Unnamed: 0,speaker_id,filename,system_id,class_name,filepath,target
0,LA_0070,LA_D_6289915,-,bonafide,../input/asvpoof-2019-dataset/LA/LA/ASVspoof20...,0
1,LA_0099,LA_D_9253093,-,bonafide,../input/asvpoof-2019-dataset/LA/LA/ASVspoof20...,0


In [6]:
test_df = pd.read_csv(f'{BASE_PATH}/ASVspoof2019_LA_cm_protocols/ASVspoof2019.LA.cm.eval.trl.txt',
                       sep=" ", header=None)
test_df.columns =['speaker_id','filename','system_id','null','class_name']
test_df.drop(columns=['null'],inplace=True)
test_df['filepath'] = f'{BASE_PATH}/ASVspoof2019_LA_eval/flac/'+test_df.filename+'.flac'
test_df['target'] = (test_df.class_name=='spoof').astype('int32')
if DEBUG:
    test_df = test_df.groupby(['target']).sample(3000).reset_index(drop=True)
print(f'Test Samples: {len(test_df)}')


'''In actual test has 63882 samples with 1 and 7355 with 0'''
test_df.head(2)

Test Samples: 6000


Unnamed: 0,speaker_id,filename,system_id,class_name,filepath,target
0,LA_0012,LA_E_5154555,-,bonafide,../input/asvpoof-2019-dataset/LA/LA/ASVspoof20...,0
1,LA_0018,LA_E_1724045,-,bonafide,../input/asvpoof-2019-dataset/LA/LA/ASVspoof20...,0


In [7]:
combined_df = pd.concat([train_df, valid_df, test_df])
combined_df.reset_index(drop=True, inplace=True)

In [8]:
combined_df.iloc[1].filepath

'../input/asvpoof-2019-dataset/LA/LA/ASVspoof2019_LA_train/flac/LA_T_1691318.flac'

In [9]:
X = []
y = []

In [10]:
features = []
labels = []
n_components = 40
max_length = 500

In [11]:
for index, row in combined_df.iterrows():
    filepath = row['filepath']  # Assuming 'filepath' is the column name
    try:
        audio, sr = sf.read(filepath)
        
        # Extract MFCC features
        mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=40)
        
        # Extract Mel-spectrogram features
        melspec = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=40)
        melspec_db = librosa.power_to_db(melspec, ref=np.max)
        
        # Pad or trim the MFCC feature array to a fixed length
        if mfccs.shape[1] < max_length:
            mfccs = np.pad(mfccs, ((0, 0), (0, max_length - mfccs.shape[1])), mode='constant')
        else:
            mfccs = mfccs[:, :max_length]
        
        # Pad or trim the Mel-spectrogram feature array to a fixed length
        if melspec_db.shape[1] < max_length:
            melspec_db = np.pad(melspec_db, ((0, 0), (0, max_length - melspec_db.shape[1])), mode='constant')
        else:
            melspec_db = melspec_db[:, :max_length]
        
        # Perform feature fusion (concatenate MFCC and Mel-spectrogram features)
        fused_features = np.concatenate((mfccs, melspec_db), axis=0)
        
        pca = PCA(n_components=n_components)
        fused_features_pca = pca.fit_transform(fused_features.T)  # Transpose for PCA
        fused_features_pca = fused_features_pca.T
        features.append(fused_features_pca)

        if row['target'] == 1:
            labels.append(1)  # 1 for fake
        else:
            labels.append(0)  # 0 for real
           
    except Exception as e:
        print(f"Error encountered while parsing file: {filepath}")
        continue
          
        
# X, y = np.array(features), np.array(labels)

In [12]:
X.extend(features)
y.extend(labels)


## Adding data from deep-voice-deepfake-voice-recognition

In [13]:
import gc

del features
del labels
gc.collect()

120016

In [14]:
features = []
labels = []

In [15]:
# Segmenting the audio files into segments of 8 seconds

# Reading fake audios, taking only 8 audios(since real has 8 only) and at a gap of 7 to include different variations
folder_path = "/kaggle/input/deep-voice-deepfake-voice-recognition/KAGGLE/AUDIO/FAKE"

count = 0
fake_count = 0
for file in tqdm(os.listdir(folder_path)):
    file_path = os.path.join(folder_path, file)
    try:
        
        if(count==56):
            break
        count += 1
        if(count%7!=0):
            continue 
        print(count)
        
        # Load audio file
        audio, _ = librosa.load(file_path, sr=16000)
        n_samples = len(audio)
        duration = 5
        sample_rate = 16000
        n_segments = int(np.ceil(n_samples / (sample_rate * duration)))
        
        for i in range(n_segments):
            start = i * sample_rate * duration
            end = min((i + 1) * sample_rate * duration, n_samples)
            segment = audio[start:end]
            
            mfccs = librosa.feature.mfcc(y=segment, sr=16000, n_mfcc=40)
            
            melspec = librosa.feature.melspectrogram(y=segment, sr=16000, n_mels=40)
            melspec_db = librosa.power_to_db(melspec, ref=np.max)
            
            # Pad or trim the feature array to a fixed length
            if mfccs.shape[1] < max_length:
                mfccs = np.pad(mfccs, ((0, 0), (0, max_length - mfccs.shape[1])), mode='constant')
            else:
                mfccs = mfccs[:, :max_length]
                
            # Pad or trim the Mel-spectrogram feature array to a fixed length
            if melspec_db.shape[1] < max_length:
                melspec_db = np.pad(melspec_db, ((0, 0), (0, max_length - melspec_db.shape[1])), mode='constant')
            else:
                melspec_db = melspec_db[:, :max_length]
                         
            fused_features = np.concatenate((mfccs, melspec_db), axis=0)
            pca = PCA(n_components=n_components)
            fused_features_pca = pca.fit_transform(fused_features.T)  # Transpose for PCA
            fused_features_pca = fused_features_pca.T
            fake_count+=1
            features.append(fused_features_pca)
            labels.append(1)  # 1 for fake

    except Exception as e:
        print(e)
        print(f"Error encountered while parsing file: {file_path}")
        continue

print("Number of fake audio segments:", fake_count)  
# print(labels)
     


  0%|          | 0/56 [00:00<?, ?it/s]

7
14
21
28
35
42
49
56
Number of fake audio segments: 747


In [16]:
count = 0
real_count = 0
# Reading real audios
folder_path = "/kaggle/input/deep-voice-deepfake-voice-recognition/KAGGLE/AUDIO/REAL"

for file in tqdm(os.listdir(folder_path)):
    file_path = os.path.join(folder_path, file)
    try:
        count+=1
        print(count)
        # Load audio file
        audio, _ = librosa.load(file_path, sr=16000)
        n_samples = len(audio)
        duration = 5
        sample_rate = 16000
        n_segments = int(np.ceil(n_samples / (sample_rate * duration)))
        
        for i in range(n_segments):
            start = i * sample_rate * duration
            end = min((i + 1) * sample_rate * duration, n_samples)
            segment = audio[start:end]
            
            mfccs = librosa.feature.mfcc(y=segment, sr=16000, n_mfcc=40)
            
            melspec = librosa.feature.melspectrogram(y=audio, sr=16000, n_mels=40)
            melspec_db = librosa.power_to_db(melspec, ref=np.max)
            # Pad or trim the feature array to a fixed length
            if mfccs.shape[1] < max_length:
                mfccs = np.pad(mfccs, ((0, 0), (0, max_length - mfccs.shape[1])), mode='constant')
            else:
                mfccs = mfccs[:, :max_length]
            
            if melspec_db.shape[1] < max_length:
                melspec_db = np.pad(melspec_db, ((0, 0), (0, max_length - melspec_db.shape[1])), mode='constant')
            else:
                melspec_db = melspec_db[:, :max_length]
            
            
            fused_features = np.concatenate((mfccs, melspec_db), axis=0)
            pca = PCA(n_components=n_components)
            fused_features_pca = pca.fit_transform(fused_features.T)  # Transpose for PCA
            fused_features_pca = fused_features_pca.T

            real_count+=1
            features.append(fused_features_pca)
            labels.append(0)  # 0 for real

    except Exception as e:
        print(f"Error encountered while parsing file: {file_path}")
        continue
                         
print("Number of real audio segments:", real_count)  
                         
                         
# X_deep_voice = np.array(features)
# y_deep_voice = np.array(labels)

  0%|          | 0/8 [00:00<?, ?it/s]

1
2
3
4
5
6
7
8
Number of real audio segments: 755


In [17]:
X.extend(features)
y.extend(labels)

# Adding data from wavefake

In [18]:
def read_data(folder_path, features, labels, count=500, fake=True):
    number = 0
    for file in tqdm(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file)
        try:
            number+=1
            if(number==count+1):
                break
            # Load audio file
            audio, _ = librosa.load(file_path, sr=16000)

            # Extract features (example: using Mel-Frequency Cepstral Coefficients)
            mfccs = librosa.feature.mfcc(y=audio, sr=16000, n_mfcc=40)

            melspec = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=40)
            melspec_db = librosa.power_to_db(melspec, ref=np.max)
            # Pad or trim the feature array to a fixed length
            if mfccs.shape[1] < max_length:
                mfccs = np.pad(mfccs, ((0, 0), (0, max_length - mfccs.shape[1])), mode='constant')
            else:
                mfccs = mfccs[:, :max_length]
                
            # Pad or trim the Mel-spectrogram feature array to a fixed length
            if melspec_db.shape[1] < max_length:
                melspec_db = np.pad(melspec_db, ((0, 0), (0, max_length - melspec_db.shape[1])), mode='constant')
            else:
                melspec_db = melspec_db[:, :max_length]
                
            fused_features = np.concatenate((mfccs, melspec_db), axis=0)
            pca = PCA(n_components=n_components)
            fused_features_pca = pca.fit_transform(fused_features.T)  # Transpose for PCA
            fused_features_pca = fused_features_pca.T

            features.append(fused_features_pca)

            if fake == True:
                labels.append(1)  # 1 for fake
            else:
                labels.append(0)  # 0 for real

        except Exception as e:
            print(f"Error encountered while parsing file: {file_path}")
            continue

In [19]:
del features
del labels
gc.collect()

38

In [20]:
features = []
labels = []

In [21]:
folder_path = "/kaggle/input/wavefake-test/generated_audio/common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech/generated"
read_data(folder_path, features, labels, 2000, True)
print(len(labels), len(features))

folder_path = "/kaggle/input/wavefake-test/generated_audio/jsut_multi_band_melgan"
read_data(folder_path, features, labels, 2000, True)
print(len(labels), len(features))                         

folder_path = "/kaggle/input/wavefake-test/generated_audio/jsut_parallel_wavegan"
read_data(folder_path, features, labels, 2000, True)
print(len(labels), len(features))

folder_path = "/kaggle/input/wavefake-test/generated_audio/ljspeech_full_band_melgan"
read_data(folder_path, features, labels, 2000, True)
print(len(labels), len(features))                         

folder_path = "/kaggle/input/wavefake-test/generated_audio/ljspeech_hifiGAN"
read_data(folder_path, features, labels, 2000, True)
print(len(labels), len(features))

folder_path = "/kaggle/input/wavefake-test/generated_audio/ljspeech_melgan"
read_data(folder_path, features, labels, 2000, True)
print(len(labels), len(features))                         

folder_path = "/kaggle/input/wavefake-test/generated_audio/ljspeech_melgan_large"
read_data(folder_path, features, labels, 2000, True)
print(len(labels), len(features))

folder_path = "/kaggle/input/wavefake-test/generated_audio/ljspeech_multi_band_melgan"
read_data(folder_path, features, labels, 2000, True)
print(len(labels), len(features))                         

folder_path = "/kaggle/input/wavefake-test/generated_audio/ljspeech_parallel_wavegan"
read_data(folder_path, features, labels, 2000, True)
print(len(labels), len(features))

folder_path = "/kaggle/input/wavefake-test/generated_audio/ljspeech_waveglow"
read_data(folder_path, features, labels, 2000, True)
print(len(labels), len(features))                                              

  0%|          | 0/16283 [00:00<?, ?it/s]

2000 2000


  0%|          | 0/5000 [00:00<?, ?it/s]

4000 4000


  0%|          | 0/5000 [00:00<?, ?it/s]

6000 6000


  0%|          | 0/13100 [00:00<?, ?it/s]

8000 8000


  0%|          | 0/13100 [00:00<?, ?it/s]

10000 10000


  0%|          | 0/13100 [00:00<?, ?it/s]

18000 18000


  0%|          | 0/13100 [00:00<?, ?it/s]

20000 20000


In [22]:
len(X)

16502

In [23]:
X.extend(features)
y.extend(labels)

In [24]:
y.count(1)

28247

In [25]:
del features
del labels
gc.collect()

95

In [26]:
X = np.array(X)
y = np.array(y)
len(y)-np.count_nonzero(y)

8255

if you are running into memory issues you can save the data as follows and so on. else, ignore next 3 cells

In [27]:
n = len(X)
print("len x:",n)
idx_25 = n // 4
idx_50 = n // 2
idx_75 = 3 * n // 4

x_25 = X[:idx_25]
x_50 = X[idx_25:idx_50]
x_75 = X[idx_50:idx_75]
x_100 = X[idx_75:]
print("sum len x:",len(x_25)+len(x_50)+len(x_75)+len(x_100))

ny = len(y)
print("len y:",n)
idx_25 = n // 4
idx_50 = n // 2
idx_75 = 3 * n // 4

# Slice the array into the desired ranges
y_25 = y[:idx_25]
y_50 = y[idx_25:idx_50]
y_75 = y[idx_50:idx_75]
y_100 = y[idx_75:]
print("sum len x:",len(y_25)+len(y_50)+len(y_75)+len(y_100))

len x: 36502
sum len x: 36502
len y: 36502
sum len x: 36502


In [28]:
# Save X
with open('X_fused1.pkl', 'wb') as f:
    pickle.dump(x_25, f)

# Save y
with open('y_fused1.pkl', 'wb') as f:
    pickle.dump(y_25, f)

In [None]:
# Save X
with open('X_fused2.pkl', 'wb') as f:
    pickle.dump(x_50, f)

# Save y
with open('y_fused2.pkl', 'wb') as f:
    pickle.dump(y_50, f)

In [29]:
# Save X
with open('X_fused.pkl', 'wb') as f:
    pickle.dump(X, f)

# Save y
with open('y_fused.pkl', 'wb') as f:
    pickle.dump(y, f)

In [32]:
os.remove("/kaggle/working/X_fused1.pkl")

In [None]:
# Load X
with open('/kaggle/input/pickle-files/X_fused.pkl', 'rb') as f:
    X = pickle.load(f)

# Load y
with open('/kaggle/input/pickle-files/y_fused.pkl', 'rb') as f:
    y = pickle.load(f)
    
