# Feature Extraction
This notebook handles the **feature extraction** from the datasets, preparing them for model training and evaluation.

## What This Notebook Does:
- Counts Audio Files:
    - Counts the number of audio files in each dataset and their respective splits.
    - Provides an overview of the data distribution (fake vs real, training/validation/testing).
- Extracts Audio Features:
    - Extracts chroma, RMS energy, spectral (centroid, bandwidth, roll-off), zero-crossing rate, and MFCC features using Librosa.
- Saves the features in a separate CSV file for each dataset (inside the `Processed_Features` folder):
    - `for_norm_features.csv` (features for the FoR (for-norm) dataset).
    - `release_in_wild_features.csv` (features for the In-The-Wild dataset, referred to as `release-in-the-wild` in this notebook).
- Handles Errors:
    - Logs any errors into an error log file (error_log.txt).
    - Ensures continuous execution even if some files fail to process.
- Uses a progress bar (`tqdm`) for better tracking.

In [1]:
# Importing required libraries
import os
import numpy as np
import pandas as pd
import librosa
from tqdm import tqdm

In [4]:
# Folder that receives the extracted feature CSVs (created if it does not exist yet)
processed_output_dir = "Processed_Features"
os.makedirs(processed_output_dir, exist_ok=True)

# Text file to which per-file processing errors are appended
error_log = "error_log.txt"

# Root folder of each dataset (both are downloaded inside the data folder)
datasets = {
    "for-norm": "data/fake_or_real_dataset/for-norm/for-norm",
    "release-in-the-wild": "data/processed_data",
}

# Split sub-folders of the for-norm dataset
splits = ["training", "validation", "testing"]

# Class sub-folder name -> integer target value
labels = dict(fake=0, real=1)

In [8]:
# Function to count files in the dataset
def count_files_in_dataset(base_dir, splits, labels):
    """Count the ``.wav`` files of a dataset per label and, optionally, per split.

    Parameters
    ----------
    base_dir : str
        Root folder of the dataset.
    splits : iterable of str or None
        Names of the split sub-folders. When truthy, files are looked up in
        ``base_dir/<split>/<label>``. When falsy (``None`` or empty), the
        dataset is treated as flat and files are looked up in
        ``base_dir/<label>``.
    labels : iterable of str
        Names of the label sub-folders (a dict works: its keys are iterated).

    Returns
    -------
    tuple
        ``(file_counts, total_files)``. With splits, ``file_counts`` maps
        ``split -> {label: count}``; without splits it maps ``label -> count``.
        ``total_files`` is the sum of all counts. A folder that is missing
        (or is not a directory) contributes a count of 0.
    """
    file_counts = {}
    total_files = 0

    if splits:  # Structured splits (for-norm)
        for split in splits:
            split_counts = {}
            for label in labels:
                count = _count_wav_files(os.path.join(base_dir, split, label))
                split_counts[label] = count
                total_files += count
            file_counts[split] = split_counts
    else:  # Flat structure (release-in-the-wild)
        for label in labels:
            count = _count_wav_files(os.path.join(base_dir, label))
            file_counts[label] = count
            total_files += count

    return file_counts, total_files


def _count_wav_files(folder_path):
    """Return how many entries of ``folder_path`` have a name ending in ``.wav``.

    Returns 0 when ``folder_path`` is not an existing directory. ``isdir`` is
    used instead of ``exists`` so that a regular file sitting where a folder is
    expected does not make ``os.listdir`` raise.
    """
    if not os.path.isdir(folder_path):
        return 0
    return sum(1 for name in os.listdir(folder_path) if name.endswith(".wav"))

# Tally both datasets: for-norm is organised by split, release-in-the-wild is flat
for_norm_counts, for_norm_total = count_files_in_dataset(
    datasets["for-norm"], splits, labels
)
release_in_wild_counts, release_in_wild_total = count_files_in_dataset(
    datasets["release-in-the-wild"], None, labels
)

# Report the for-norm counts, grouped by split
print("For-Norm Dataset File Counts:")
for split in for_norm_counts:
    split_counts = for_norm_counts[split]
    print(f"{split.capitalize()}:")
    for label in split_counts:
        count = split_counts[label]
        print(f"  - {label.capitalize()}: {count} files")
print(f"Total files in for-norm dataset: {for_norm_total}")

# Report the release-in-the-wild counts (labels only, no splits)
print("\nRelease-In-the-Wild Dataset File Counts:")
for label in release_in_wild_counts:
    count = release_in_wild_counts[label]
    print(f"  - {label.capitalize()}: {count} files")
print(f"Total files in release-in-the-wild dataset: {release_in_wild_total}")

For-Norm Dataset File Counts:
Training:
  - Fake: 26927 files
  - Real: 26941 files
Validation:
  - Fake: 5398 files
  - Real: 5400 files
Testing:
  - Fake: 2370 files
  - Real: 2264 files
Total files in for-norm dataset: 69300

Release-In-the-Wild Dataset File Counts:
  - Fake: 11816 files
  - Real: 19963 files
Total files in release-in-the-wild dataset: 31779


In [10]:
# Function to extract features from audio files
def extract_features(file_path, n_mfcc=20):
    """Extract clip-level audio features from a single file.

    Each frame-wise librosa feature is reduced to one scalar with ``np.mean``.

    Parameters
    ----------
    file_path : str
        Path of the audio file to load with ``librosa.load``.
    n_mfcc : int, optional
        Number of MFCC coefficients to compute (default 20). Coefficient ``i``
        (1-based) is stored under the key ``f"mfcc{i}"``.

    Returns
    -------
    dict or None
        Mapping with the keys ``chroma_stft``, ``rms``, ``spectral_centroid``,
        ``spectral_bandwidth``, ``rolloff``, ``zero_crossing_rate`` and
        ``mfcc1`` .. ``mfcc<n_mfcc>``. ``None`` if anything went wrong; in that
        case one line describing the error is appended to the file named by
        the module-level ``error_log``.
    """
    try:
        # sr=None is passed so librosa does not resample to its default rate
        y, sr = librosa.load(file_path, sr=None)
        features = {
            "chroma_stft": np.mean(librosa.feature.chroma_stft(y=y, sr=sr)),
            "rms": np.mean(librosa.feature.rms(y=y)),
            "spectral_centroid": np.mean(librosa.feature.spectral_centroid(y=y, sr=sr)),
            "spectral_bandwidth": np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr)),
            "rolloff": np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr)),
            "zero_crossing_rate": np.mean(librosa.feature.zero_crossing_rate(y))
        }
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
        # One row per coefficient; average each row over its frames
        for i, coefficient_frames in enumerate(mfccs, start=1):
            features[f"mfcc{i}"] = np.mean(coefficient_frames)
        return features
    except Exception as e:
        # Best effort by design: record the failure and let the caller carry on.
        # The explicit encoding keeps a non-ASCII path or message from raising
        # inside this handler on platforms whose default encoding is not UTF-8.
        with open(error_log, "a", encoding="utf-8") as log_file:
            log_file.write(f"Error processing {file_path}: {e}\n")
        return None

In [11]:
# Feature extraction for for-norm dataset
def process_audio_files_for_norm(base_dir):
    """Extract features for every ``.wav`` file of the for-norm dataset.

    Files are looked up in ``base_dir/<split>/<label>`` for every entry of the
    module-level ``splits`` and ``labels``; folders that are not existing
    directories are skipped.

    Parameters
    ----------
    base_dir : str
        Root folder of the for-norm dataset.

    Returns
    -------
    list of dict
        One dict per successfully processed file: whatever
        ``extract_features`` returned plus the keys ``filename``, ``split``,
        ``label`` (folder name) and ``LABEL`` (``labels[label]``). Files for
        which ``extract_features`` returns a falsy value are left out.
    """
    # Build the work list once so that the progress-bar total is exactly the
    # number of files that get processed (.wav files only) and every folder is
    # listed a single time.
    wav_files = []
    for split in splits:
        for label in labels:
            folder_path = os.path.join(base_dir, split, label)
            if not os.path.isdir(folder_path):
                continue
            # os.listdir order is arbitrary; sorting makes the row order reproducible
            for file in sorted(os.listdir(folder_path)):
                if file.endswith(".wav"):
                    wav_files.append((split, label, folder_path, file))

    all_features = []
    # The context manager closes the bar even if processing is interrupted
    with tqdm(total=len(wav_files), desc="Processing for-norm") as progress_bar:
        for split, label, folder_path, file in wav_files:
            features = extract_features(os.path.join(folder_path, file))
            if features:
                features["filename"] = file
                features["split"] = split
                features["label"] = label
                features["LABEL"] = labels[label]
                all_features.append(features)
            progress_bar.update(1)

    return all_features

# Extract the for-norm features, then persist them as a CSV in the output folder
for_norm_csv = os.path.join(processed_output_dir, "for_norm_features.csv")
for_norm_features = process_audio_files_for_norm(datasets["for-norm"])
for_norm_df = pd.DataFrame(data=for_norm_features)
for_norm_df.to_csv(path_or_buf=for_norm_csv, index=False)
print(f"For-Norm features saved to: {for_norm_csv}")

  return pitch_tuning(
Processing for-norm: 100%|██████████| 69300/69300 [50:46<00:00, 22.75it/s]


For-Norm features saved to: Processed_Features/for_norm_features.csv


In [12]:
# Feature extraction for release_in-the-wild (In the wild dataset)
def process_audio_files_release_in_wild(base_dir):
    """Extract features for every ``.wav`` file of the release-in-the-wild dataset.

    Files are looked up in ``base_dir/<label>`` for every entry of the
    module-level ``labels``; folders that are not existing directories are
    skipped.

    Parameters
    ----------
    base_dir : str
        Root folder of the release-in-the-wild dataset.

    Returns
    -------
    list of dict
        One dict per successfully processed file: whatever
        ``extract_features`` returned plus the keys ``filename``, ``label``
        (folder name) and ``LABEL`` (``labels[label]``). Files for which
        ``extract_features`` returns a falsy value are left out.
    """
    # Build the work list once so that the progress-bar total is exactly the
    # number of files that get processed (.wav files only) and every folder is
    # listed a single time.
    wav_files = []
    for label in labels:
        folder_path = os.path.join(base_dir, label)
        if not os.path.isdir(folder_path):
            continue
        # os.listdir order is arbitrary; sorting makes the row order reproducible
        for file in sorted(os.listdir(folder_path)):
            if file.endswith(".wav"):
                wav_files.append((label, folder_path, file))

    all_features = []
    # The context manager closes the bar even if processing is interrupted
    with tqdm(total=len(wav_files), desc="Processing release-in-the-wild") as progress_bar:
        for label, folder_path, file in wav_files:
            features = extract_features(os.path.join(folder_path, file))
            if features:
                features["filename"] = file
                features["label"] = label
                features["LABEL"] = labels[label]
                all_features.append(features)
            progress_bar.update(1)

    return all_features

# Extract the release-in-the-wild features, then persist them as a CSV in the output folder
release_in_wild_csv = os.path.join(processed_output_dir, "release_in_wild_features.csv")
release_in_wild_features = process_audio_files_release_in_wild(datasets["release-in-the-wild"])
release_in_wild_df = pd.DataFrame(data=release_in_wild_features)
release_in_wild_df.to_csv(path_or_buf=release_in_wild_csv, index=False)
print(f"Release-In-The-Wild features saved to: {release_in_wild_csv}")

Processing release-in-the-wild: 100%|██████████| 31779/31779 [32:39<00:00, 16.22it/s] 


Release-In-The-Wild features saved to: Processed_Features/release_in_wild_features.csv
