<a href="https://colab.research.google.com/github/varshacvenkat-web/Varsha-Venkatapathy-Engineering-Portfolio-/blob/main/Article_3_Traffic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import shutil
import numpy as np
import librosa
import scipy.signal
import random
import logging

# -------------------- Parameters -------------------- #
# Signal-processing settings
SAMPLE_RATE = 16000                        # Rate (Hz) every audio file is loaded at
FRAME_SIZE = 512                           # Samples per STFT segment
OVERLAP = 256                              # Samples shared by neighbouring segments (half a frame)
TARGET_SHAPE = (1 + FRAME_SIZE // 2, 100)  # (257, 100): frequency bins x time frames kept per file
SNR_VALUES = [-5, 0, 5]                    # Mixing SNRs (dB) generated for every clean/noise pair

# Input locations
CLEAN_AUDIO_PATH = r"C:\Users\enhance\converted_clean_wav"           # Folder of clean recordings
NOISE_PARENT_DIR = r"C:\Users\enhance\Downloads\traffic_noise_wav"   # Folder of traffic-noise recordings

# Output locations
OUTPUT_DIR = "processed_data_traffic_article3"  # Where the .npy feature files are written
CHECKPOINT_DIR = "checkpoints_article3"         # Where per-task checkpoint markers are written

# Make sure both output locations exist before anything tries to write to them
for _required_dir in (OUTPUT_DIR, CHECKPOINT_DIR):
    os.makedirs(_required_dir, exist_ok=True)

# -------------------- Generic Logging and Checkpointing -------------------- #
def setup_logger(log_file=None, level=logging.INFO):
    """
    Configure the root logger to write to the console and, optionally, a file.

    Handlers already attached to the root logger are detached and closed
    first, so calling this more than once (for example when a notebook cell
    is re-run) neither duplicates output nor leaves old log files open.

    Args:
        log_file: Path of the log file to write to. ``None`` (or an empty
            string) disables file logging.
        level: Threshold set on the root logger.

    Returns:
        The configured root logger.
    """
    logger = logging.getLogger()
    logger.setLevel(level)

    # Iterate over a copy because removeHandler() mutates logger.handlers.
    # Closing matters for FileHandler: merely clearing the list would leave
    # its file descriptor open until the process exits.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    formatter = logging.Formatter('%(asctime)s %(levelname)s: %(message)s')

    # Console handler
    ch = logging.StreamHandler()
    ch.setFormatter(formatter)
    logger.addHandler(ch)

    # File handler if log_file is specified
    if log_file:
        # Explicit UTF-8 so non-ASCII file names in messages can be written
        # regardless of the platform's default encoding.
        fh = logging.FileHandler(log_file, encoding="utf-8")
        fh.setFormatter(formatter)
        logger.addHandler(fh)

    return logger

def checkpoint_exists(checkpoint_path):
    """Return True when something already exists at ``checkpoint_path``."""
    already_present = os.path.exists(checkpoint_path)
    return already_present

def write_checkpoint(checkpoint_path, content="done"):
    """Create (or overwrite) the marker file at ``checkpoint_path`` holding ``content``."""
    with open(checkpoint_path, mode="w") as marker:
        marker.write(content)

# Module-wide logger: INFO and above go to the console and to 'preprocessing_article3.log'
logger = setup_logger("preprocessing_article3.log", logging.INFO)

# -------------------- Helper Functions -------------------- #
def clear_folder_contents(folder_path):
    """
    Leave ``folder_path`` existing and, as far as possible, empty.

    A missing folder is created. Otherwise every entry inside it is removed:
    files and symlinks are unlinked, sub-directories are deleted recursively.
    An entry that cannot be removed is logged as an error and skipped.
    """
    def _remove_entry(entry_path):
        # Symlinks are unlinked rather than followed, even when they point at a directory
        if os.path.isfile(entry_path) or os.path.islink(entry_path):
            os.unlink(entry_path)
            return
        if os.path.isdir(entry_path):
            shutil.rmtree(entry_path)

    if os.path.exists(folder_path):
        for entry_name in os.listdir(folder_path):
            entry_path = os.path.join(folder_path, entry_name)
            try:
                _remove_entry(entry_path)
            except Exception as err:
                logger.error(f"Failed to delete {entry_path}. Reason: {err}")
    else:
        os.makedirs(folder_path, exist_ok=True)

    logger.info(f"Cleared contents of folder: {folder_path}")

def extract_files(parent_dir, extension=".wav"):
    """
    Recursively collect the files under ``parent_dir`` whose lower-cased name
    ends with ``extension``.

    Returns the list of matching paths (each joined onto the directory it was
    found in) and logs how many were found.
    """
    matches = [
        os.path.join(current_dir, name)
        for current_dir, _subdirs, names in os.walk(parent_dir)
        for name in names
        if name.lower().endswith(extension)
    ]
    logger.info(f"Found {len(matches)} files in {parent_dir}.")
    return matches

def adjust_shape(array, target_shape):
    """
    Fit a 2-D ``array`` into a zero-filled array of ``target_shape``.

    The top-left region common to both shapes is copied; anything beyond the
    target is cropped and anything missing stays zero.
    """
    rows_kept = min(array.shape[0], target_shape[0])
    cols_kept = min(array.shape[1], target_shape[1])
    common = (slice(0, rows_kept), slice(0, cols_kept))

    fitted = np.zeros(target_shape)
    fitted[common] = array[common]
    return fitted

def mix_clean_noisy(clean_audio, noise_audio, snr_db):
    """
    Add noise to a clean signal so that the mixture has the requested SNR.

    The noise is multiplied by
    ``sqrt(P_clean / (P_noise * 10 ** (snr_db / 10)))``, where ``P`` is the
    mean squared amplitude of each signal, and then added to the clean signal.

    Args:
        clean_audio: Clean signal (array-like).
        noise_audio: Noise signal (array-like) that can be added element-wise
            to ``clean_audio``.
        snr_db: Desired clean-to-noise power ratio in decibels.

    Returns:
        A new array holding ``clean_audio`` plus the scaled noise. If the
        noise has zero power the clean signal is returned unchanged, because
        no finite gain can bring silent noise up to the requested SNR.
    """
    clean_audio = np.asarray(clean_audio)
    noise_audio = np.asarray(noise_audio)

    clean_power = np.mean(clean_audio ** 2)
    noise_power = np.mean(noise_audio ** 2)

    if noise_power == 0:
        # Dividing by a zero noise power would give an infinite gain, and
        # inf * 0 turns the whole mixture into NaN. Adding the noise with a
        # zero gain instead keeps the usual broadcasting and result dtype.
        return clean_audio + 0.0 * noise_audio

    scaling_factor = np.sqrt(clean_power / (noise_power * 10 ** (snr_db / 10)))
    noisy_audio = clean_audio + scaling_factor * noise_audio
    return noisy_audio

def compute_stft_features(audio, frame_size, overlap, sr):
    """
    Return the real and imaginary parts of the STFT of ``audio``.

    The transform uses segments of ``frame_size`` samples overlapping by
    ``overlap`` samples at sampling rate ``sr``. Both parts are padded or
    cropped to the module-level ``TARGET_SHAPE`` before being returned as
    ``(real_part, imag_part)``.
    """
    # Only the complex spectrogram is needed; the frequency and time axes are discarded
    _freqs, _times, spectrum = scipy.signal.stft(
        audio, fs=sr, nperseg=frame_size, noverlap=overlap
    )
    return (
        adjust_shape(spectrum.real, TARGET_SHAPE),
        adjust_shape(spectrum.imag, TARGET_SHAPE),
    )

def preprocess_audio(clean_path, noise_path, snr, sr=SAMPLE_RATE):
    """
    Build paired clean/noisy STFT features for one clean file and one noise file.

    Both files are loaded at sample rate ``sr``. The noise is made as long as
    the clean signal (a random crop when it is longer, wrap-around padding
    when it is shorter), mixed in at ``snr`` dB, and both the clean signal and
    the mixture are converted to STFT features.

    Args:
        clean_path: Path of the clean audio file.
        noise_path: Path of the noise audio file.
        snr: Target signal-to-noise ratio in dB, passed to ``mix_clean_noisy``.
        sr: Sample rate used for loading and for the STFT.

    Returns:
        ``(combined_clean, combined_noisy)``: one row per time frame, each row
        being the frame's real parts followed by its imaginary parts. With
        the module's ``TARGET_SHAPE`` of (257, 100) each array is (100, 514).
        On any error the problem is logged and ``(None, None)`` is returned.
    """
    try:
        clean_audio, _ = librosa.load(clean_path, sr=sr)
        noise_audio, _ = librosa.load(noise_path, sr=sr)

        # Adjust noise length to match clean length
        if len(noise_audio) > len(clean_audio):
            # randint's upper bound is exclusive, so add 1: otherwise the
            # window that ends on the very last noise sample is never picked.
            last_start = len(noise_audio) - len(clean_audio)
            start_idx = np.random.randint(0, last_start + 1)
            noise_audio = noise_audio[start_idx:start_idx + len(clean_audio)]
        else:
            # Shorter (or equal-length) noise is repeated from its start to fill the gap
            noise_audio = np.pad(noise_audio, (0, len(clean_audio) - len(noise_audio)), mode="wrap")

        # Mix clean and noisy audio
        noisy_audio = mix_clean_noisy(clean_audio, noise_audio, snr)

        # Compute STFT features for clean and noisy audio
        clean_real, clean_imag = compute_stft_features(clean_audio, FRAME_SIZE, OVERLAP, sr)
        noisy_real, noisy_imag = compute_stft_features(noisy_audio, FRAME_SIZE, OVERLAP, sr)

        # Transpose so each time frame becomes a row, then append the
        # imaginary parts to the real parts of the same frame:
        # (257, 100) -> (100, 257) -> (100, 514)
        combined_clean = np.concatenate([clean_real.T, clean_imag.T], axis=-1)
        combined_noisy = np.concatenate([noisy_real.T, noisy_imag.T], axis=-1)

        return combined_clean, combined_noisy
    except Exception as e:
        # Deliberate best-effort: one unreadable or malformed file must not
        # abort the whole batch; the caller skips pairs that return None.
        logger.error(f"Error in preprocess_audio for clean: {clean_path}, noise: {noise_path}, SNR: {snr} -> {e}")
        return None, None

# -------------------- Main Execution -------------------- #
if __name__ == "__main__":
    # Checkpoints survive between runs. Wiping the output folder while they
    # are still present would delete every finished result and then skip the
    # tasks that produced them, so the folder is only emptied when there is
    # nothing to resume from.
    has_checkpoints = os.path.isdir(CHECKPOINT_DIR) and any(
        name.endswith(".chk") for name in os.listdir(CHECKPOINT_DIR)
    )
    if has_checkpoints:
        logger.info(f"Existing checkpoints found in {CHECKPOINT_DIR}. Resuming and keeping contents of {OUTPUT_DIR}.")
        os.makedirs(OUTPUT_DIR, exist_ok=True)
    else:
        clear_folder_contents(OUTPUT_DIR)

    # Extract files
    clean_files = extract_files(CLEAN_AUDIO_PATH)
    noise_files = extract_files(NOISE_PARENT_DIR)

    # Calculate total number of tasks (for progress reporting)
    total_tasks = len(clean_files) * len(noise_files) * len(SNR_VALUES)
    current_task = 0

    logger.info("Starting preprocessing of audio files...")

    for clean_file in clean_files:
        # splitext drops only the final extension and works for any letter
        # case, matching the case-insensitive search done by extract_files.
        clean_filename = os.path.splitext(os.path.basename(clean_file))[0]
        clean_output_dir = os.path.join(OUTPUT_DIR, clean_filename)
        os.makedirs(clean_output_dir, exist_ok=True)

        for noise_file in noise_files:
            noise_filename = os.path.splitext(os.path.basename(noise_file))[0]

            for snr in SNR_VALUES:
                current_task += 1
                progress_percent = (current_task / total_tasks) * 100
                logger.info(f"Processing task {current_task} of {total_tasks} ({progress_percent:.2f}%): Clean: {clean_file}, Noise: {noise_file}, SNR: {snr}")

                # Create a unique checkpoint filename for this task
                checkpoint_name = f"{clean_filename}_{noise_filename}_{snr}.chk"
                checkpoint_path = os.path.join(CHECKPOINT_DIR, checkpoint_name)

                # Destination files for this task's features
                clean_combined_file = os.path.join(clean_output_dir, f"clean_combined_{snr}_{noise_filename}.npy")
                noisy_combined_file = os.path.join(clean_output_dir, f"noisy_combined_{snr}_{noise_filename}.npy")

                if checkpoint_exists(checkpoint_path):
                    # A checkpoint only counts when the files it vouches for are still there
                    if os.path.exists(clean_combined_file) and os.path.exists(noisy_combined_file):
                        logger.info(f"Checkpoint exists for task {checkpoint_name}. Skipping.")
                        continue
                    logger.warning(f"Checkpoint exists for task {checkpoint_name} but its output files are missing. Reprocessing.")

                combined_clean, combined_noisy = preprocess_audio(clean_file, noise_file, snr)
                if combined_clean is None or combined_noisy is None:
                    logger.error(f"Failed processing for Clean: {clean_file}, Noise: {noise_file}, SNR: {snr}")
                    continue

                # Save the combined features
                np.save(clean_combined_file, combined_clean)
                np.save(noisy_combined_file, combined_noisy)

                logger.info(f"Saved combined STFT features (clean shape: {combined_clean.shape}, noisy shape: {combined_noisy.shape}) for SNR {snr} in {clean_output_dir}.")
                # Written last so a checkpoint is never left behind for a task that failed to save
                write_checkpoint(checkpoint_path)

    logger.info("Preprocessing complete.")
