# Preprocess Raw Audio

## Import

In [None]:
import os
import numpy as np
import pandas as pd
import librosa

from tqdm import tqdm

from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler

In [9]:
# Root directory scanned for audio; each sub-folder name is used as the class label.
AUDIO_ROOT = "../data/AUDIO"
# Path of the feature table written by the "Save to CSV" step.
OUTPUT_CSV = "../data/preprocessed.csv"

## Extract features

In [3]:
def extract_features_per_segment(audio_segment, sr=22050):
    """Compute a 26-value feature vector for one audio segment.

    The segment is zero-padded at the end or truncated to exactly ``sr``
    samples (one second of audio at rate ``sr``). It is then summarised by
    the global mean of six descriptors (chroma STFT, RMS, spectral centroid,
    spectral bandwidth, spectral roll-off, zero-crossing rate) followed by
    the per-coefficient mean of 20 MFCCs.

    Args:
        audio_segment: Array of audio samples. The caller in this file
            passes 1-D mono audio.
        sr: Sampling rate in Hz; also used as the segment length in samples.

    Returns:
        ``np.ndarray`` of dtype ``float32`` holding the 6 descriptors
        followed by the 20 MFCC means, in that order.

    Raises:
        RuntimeError: If any step fails. The underlying error is attached as
            ``__cause__`` so the original traceback is preserved.
    """
    segment_samples = sr
    try:
        # Pad or truncate so every segment has exactly `segment_samples` samples.
        n_samples = len(audio_segment)
        if n_samples < segment_samples:
            audio_segment = np.pad(audio_segment, (0, segment_samples - n_samples))
        elif n_samples > segment_samples:
            audio_segment = audio_segment[:segment_samples]

        # Six scalar descriptors: each feature matrix is collapsed to its mean.
        chroma_stft = np.mean(librosa.feature.chroma_stft(y=audio_segment, sr=sr))
        rms = np.mean(librosa.feature.rms(y=audio_segment))
        spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=audio_segment, sr=sr))
        spectral_bandwidth = np.mean(librosa.feature.spectral_bandwidth(y=audio_segment, sr=sr))
        spectral_rolloff = np.mean(librosa.feature.spectral_rolloff(y=audio_segment, sr=sr))
        zcr = np.mean(librosa.feature.zero_crossing_rate(y=audio_segment))

        # 20 MFCCs, averaged along axis 1 so one value remains per coefficient.
        mfccs = librosa.feature.mfcc(y=audio_segment, sr=sr, n_mfcc=20)
        mfccs_mean = np.mean(mfccs, axis=1)

        # Order here must match the column names used when saving the CSV.
        return np.array(
            [
                chroma_stft,
                rms,
                spectral_centroid,
                spectral_bandwidth,
                spectral_rolloff,
                zcr,
                *mfccs_mean,
            ],
            dtype=np.float32,
        )
    except Exception as e:
        # Chain explicitly so the root cause stays reachable via __cause__.
        raise RuntimeError(f"Feature extraction failed: {str(e)}") from e

In [4]:
def extract_feature(file_path, sr=22050):
    """
    Load an audio file and compute features for each 1-second segment.

    The file is loaded as mono at rate `sr` and cut into consecutive windows
    of `sr` samples; the final window may be shorter. Returns a NumPy array
    with one feature row per window, or None (after printing the error) if
    loading or feature extraction fails.
    """
    try:
        samples, sr = librosa.load(file_path, sr=sr, mono=True)

        # One window spans exactly one second of audio.
        window = sr

        # Step through the signal one window at a time; slicing clamps the
        # last window to the end of the signal.
        rows = [
            extract_features_per_segment(samples[offset:offset + window], sr)
            for offset in range(0, len(samples), window)
        ]
        return np.array(rows)

    except Exception as err:
        print(f"Error processing {file_path}: {str(err)}")
        return None

In [5]:
# Walk AUDIO_ROOT: each sub-directory name is used as the class label for
# every audio file found inside it.
# os.listdir returns entries in arbitrary order and also lists plain files,
# so keep only directories and sort them to make the row order reproducible.
folders = sorted(
    entry
    for entry in os.listdir(AUDIO_ROOT)
    if os.path.isdir(os.path.join(AUDIO_ROOT, entry))
)
data = []
labels = []

for folder in folders:
    # Sorted for the same reason as above: deterministic processing order.
    files = sorted(os.listdir(os.path.join(AUDIO_ROOT, folder)))
    for file in tqdm(files, desc=f"Processing {folder}"):
        file_path = os.path.join(AUDIO_ROOT, folder, file)
        features = extract_feature(file_path)

        # extract_feature returns None when a file could not be processed;
        # skip such files instead of aborting the whole run.
        if features is not None:
            # One row per 1-second segment, all labelled with the folder name.
            data.extend(features)
            labels.extend([folder] * len(features))

# Convert to numpy arrays
data = np.array(data)
labels = np.array(labels)

print(f"Total samples: {len(data)}")
print(f"Feature shape: {data.shape}")

Processing REAL: 100%|██████████| 8/8 [01:18<00:00,  9.78s/it]
Processing FAKE: 100%|██████████| 56/56 [09:33<00:00, 10.25s/it]

Total samples: 29965
Feature shape: (29965, 26)





## Oversampling with RandomOverSampler

In [6]:
# Show how many segment rows each class contributes before balancing.
print("Class distribution before oversampling:")
# np.unique returns the sorted distinct labels together with their counts.
unique, counts = np.unique(labels, return_counts=True)
for label, count in zip(unique, counts):
    print(f"{label}: {count}")

Class distribution before oversampling:
FAKE: 26215
REAL: 3750


In [8]:
# Encode labels as integers. LabelEncoder orders classes alphabetically,
# so FAKE -> 0 and REAL -> 1.
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)

# Balance the classes by randomly duplicating minority-class rows.
# random_state is fixed so the same rows are duplicated on every run.
# NOTE(review): the duplicated rows are written to the CSV below. If that CSV
# is later split into train/test sets, copies of the same row can land on both
# sides of the split — consider oversampling only the training split. Confirm
# against the downstream training code.
print("Applying oversampling...")
ros = RandomOverSampler(random_state=42)
data_resampled, labels_encoded_resampled = ros.fit_resample(data, labels_encoded)

# Decode the integer labels back to the original folder names.
labels_resampled = label_encoder.inverse_transform(labels_encoded_resampled)

print(f"Total samples after oversampling: {len(data_resampled)}")
# Plain string: this message has no placeholders, so no f-prefix is needed.
print("Class distribution after oversampling:")
unique, counts = np.unique(labels_resampled, return_counts=True)
for label, count in zip(unique, counts):
    print(f"{label}: {count}")

Applying oversampling...
Total samples after oversampling: 52430
Class distribution after oversampling:
FAKE: 26215
REAL: 26215


## Save to CSV

In [10]:
# Column names: the six summary descriptors followed by the 20 MFCC means,
# in the same order extract_features_per_segment packs its feature vector.
feature_columns = [
    "chroma_stft",
    "rms",
    "spectral_centroid",
    "spectral_bandwidth",
    "rolloff",
    "zcr",
]
feature_columns.extend(f"mfcc{i+1}" for i in range(20))

# Assemble the feature table and attach the class label as the last column.
df = pd.DataFrame(data_resampled, columns=feature_columns)
df["label"] = labels_resampled
print(f"Total rows: {len(df)}")
print(f"Columns: {df.columns.tolist()}")

# Write the table without the pandas row index.
df.to_csv(OUTPUT_CSV, index=False)
print(f"Dataset saved to {OUTPUT_CSV}")

Total rows: 52430
Columns: ['chroma_stft', 'rms', 'spectral_centroid', 'spectral_bandwidth', 'rolloff', 'zcr', 'mfcc1', 'mfcc2', 'mfcc3', 'mfcc4', 'mfcc5', 'mfcc6', 'mfcc7', 'mfcc8', 'mfcc9', 'mfcc10', 'mfcc11', 'mfcc12', 'mfcc13', 'mfcc14', 'mfcc15', 'mfcc16', 'mfcc17', 'mfcc18', 'mfcc19', 'mfcc20', 'label']
Dataset saved to ../data/preprocessed.csv
