# 01. Setup and Feature Extraction

This notebook handles the initial setup of the environment and performs feature extraction on the audio dataset.

## Steps:
1.  Mount Google Drive.
2.  Set the project path and working directory.
3.  Install dependencies.
4.  Configure feature extraction parameters.
5.  Extract features from audio files (Real vs. Fake).
6.  Save extracted features for training.

In [None]:
# 1. Mount Google Drive
# The `google.colab` package is provided by the Colab runtime, so this cell is
# only expected to work when the notebook runs there.
from google.colab import drive
# Mount Drive at /content/drive; PROJECT_PATH in the next cell lives under this
# mount point. (The call may prompt for authorization - TODO confirm for your runtime.)
drive.mount('/content/drive')

In [None]:
# 2. Setup Project Path
import os
import sys

# CHANGE THIS PATH to where your project is located in Google Drive
PROJECT_PATH = '/content/drive/MyDrive/TCC'

# Use isdir rather than exists: os.chdir() raises if the path is a regular file,
# so only a real directory may reach the else-branch.
if not os.path.isdir(PROJECT_PATH):
    print(f"WARNING: Path {PROJECT_PATH} does not exist. Please check your Drive structure.")
else:
    # Later cells rely on the working directory (requirements.txt is resolved
    # relative to it) and on the project root being importable (`app.*` imports).
    os.chdir(PROJECT_PATH)
    # Guard the append so re-running this cell does not pile up duplicate
    # sys.path entries in a long-lived kernel.
    if PROJECT_PATH not in sys.path:
        sys.path.append(PROJECT_PATH)
    print(f"Current working directory: {os.getcwd()}")

In [None]:
# 3. Install Dependencies
!pip install -r requirements.txt

In [None]:
# 4. Imports
import logging
import glob
import numpy as np
import joblib
from pathlib import Path
from tqdm.notebook import tqdm

# App imports
from app.core.interfaces.audio import AudioData, FeatureType
from app.domain.services.feature_extraction_service import AudioFeatureExtractionService
from app.domain.services.feature_extraction.types import ExtractionConfig

# Configure logging
# basicConfig() silently does nothing when the root logger already has handlers,
# which hosted notebook runtimes may install before user code runs. force=True
# (Python 3.8+) replaces any existing handlers so the INFO level really applies,
# and it also makes re-running this cell behave the same every time.
logging.basicConfig(level=logging.INFO, force=True)
logger = logging.getLogger(__name__)

In [None]:
# 5. Configuration

# Data locations, all rooted at the project folder:
#   data/raw        input audio
#   data/processed  extracted features written by this notebook
DATA_DIR = os.path.join(PROJECT_PATH, 'data')
RAW_DIR, PROCESSED_DIR = (
    os.path.join(DATA_DIR, folder) for folder in ('raw', 'processed')
)

# The output folder must exist before any features are saved.
os.makedirs(PROCESSED_DIR, exist_ok=True)

# Extraction settings shared by every file that gets processed.
config = ExtractionConfig(
    include_deltas=True,
    sample_rate=16000,
    feature_types=[FeatureType.MFCC, FeatureType.SPECTRAL],  # Adjust as needed
)

# A single service instance is reused for all extraction calls.
service = AudioFeatureExtractionService()
print("Feature Extraction Service initialized.")

In [None]:
# 6. Extraction Helper Function

def process_dataset(data_path, label, save_name, extensions=('.wav', '.mp3', '.flac')):
    """Extract features for every audio file under ``data_path`` and save them.

    The directory is searched recursively. Each matching file is loaded with
    ``AudioData.from_file`` at ``config.sample_rate`` and handed to
    ``service.extract_features``. Files that raise or return an unsuccessful
    result are reported and skipped, so one bad file does not abort the run.

    Args:
        data_path: Root directory to search for audio files.
        label: Class label stored next to every extracted feature (this
            notebook uses 0 for real and 1 for fake).
        save_name: Base name, without extension, of the ``.joblib`` file
            written to ``PROCESSED_DIR``.
        extensions: File suffixes treated as audio. Matching is
            case-insensitive, so ``.WAV`` is picked up as well as ``.wav``.

    Returns:
        A ``(features_list, labels_list)`` tuple of equal length with one
        entry per successfully processed file.
    """
    suffixes = tuple(ext.lower() for ext in extensions)

    # Walk the tree once instead of once per extension. Comparing against the
    # lower-cased path keeps upper-case extensions from being silently dropped,
    # and sorting makes the processing order reproducible between runs
    # (glob returns entries in arbitrary filesystem order).
    candidates = glob.glob(os.path.join(data_path, '**', '*'), recursive=True)
    audio_files = sorted(
        path for path in candidates if path.lower().endswith(suffixes)
    )

    print(f"Found {len(audio_files)} files in {data_path}")
    if not audio_files:
        # Make an empty dataset obvious; an empty result file is still written
        # below so the function's contract stays the same.
        print(f"WARNING: no files ending in {suffixes} under {data_path}")

    features_list = []
    labels_list = []

    for file_path in tqdm(audio_files, desc=f"Processing {label}"):
        try:
            # Load audio
            audio = AudioData.from_file(file_path, sr=config.sample_rate)

            # Extract features
            result = service.extract_features(audio, config)

            if result.is_success and result.data:
                # Features are stored exactly as returned by the service.
                features_list.append(result.data.features)
                labels_list.append(label)
            else:
                print(f"Failed to process {file_path}: {result.errors}")

        except Exception as e:
            # Deliberate best-effort: report the file and continue with the rest.
            print(f"Error processing {file_path}: {e}")

    skipped = len(audio_files) - len(features_list)
    if skipped:
        print(f"Skipped {skipped} of {len(audio_files)} files (see messages above)")

    # Save processed data
    output_path = os.path.join(PROCESSED_DIR, f"{save_name}.joblib")
    joblib.dump({'features': features_list, 'labels': labels_list}, output_path)
    print(f"Saved {len(features_list)} features to {output_path}")
    return features_list, labels_list

In [None]:
# 7. Run Extraction

# Expected layout: data/raw/real and data/raw/fake
REAL_PATH = os.path.join(RAW_DIR, 'real')
FAKE_PATH = os.path.join(RAW_DIR, 'fake')

# One (directory, label, output name) entry per class; real is processed first.
for class_dir, class_label, output_name in (
    (REAL_PATH, 0, 'real_features'),
    (FAKE_PATH, 1, 'fake_features'),
):
    # A missing class folder is reported and skipped rather than treated as an error.
    if not os.path.exists(class_dir):
        print(f"Directory {class_dir} not found.")
        continue
    process_dataset(class_dir, label=class_label, save_name=output_name)