## Columns Preprocessing

In [1]:
! pip install librosa soundfile pandas numpy tqdm




[notice] A new release of pip is available: 23.0.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import os
import pandas as pd
import json
import librosa
import soundfile as sf
import numpy as np
import shutil
from tqdm import tqdm

# Names of the dataset partitions handled by this notebook
splits = ["train", "validation", "test"]

# Input root (raw data) and output root (standardized data)
raw_splits_dir = "../raw_splits"
processed_dir = "../processed"

# Ensure the output root exists, then one sub-folder per partition
os.makedirs(processed_dir, exist_ok=True)
for split in splits:
    split_dir = os.path.join(processed_dir, split)
    os.makedirs(split_dir, exist_ok=True)

# Function to preprocess JSON data and save to the new location
def preprocess_json(input_json_path, output_json_path):
    """Flatten the raw metadata records and write them to a new JSON file.

    Every input record is reduced to two fields:

    * ``text`` -- ``item['json']['original_text']`` converted to lower case
    * ``flac`` -- ``item['flac']`` copied unchanged

    Args:
        input_json_path: Path of a JSON file containing a list of records.
        output_json_path: Path the reduced list is written to (indent=2).

    Returns:
        list[dict]: The reduced records, in the same order as the input.

    Raises:
        KeyError: If a record lacks ``json``/``original_text`` or ``flac``.
    """
    # Decode explicitly as UTF-8: without it open() uses the locale-dependent
    # default encoding, which can fail or mis-decode non-ASCII transcripts.
    with open(input_json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    processed_data = [
        {
            'text': item['json']['original_text'].lower(),
            'flac': item['flac'],
        }
        for item in data
    ]

    with open(output_json_path, 'w', encoding='utf-8') as f:
        json.dump(processed_data, f, indent=2)

    return processed_data

# Function to standardize audio files
def standardize_audio(input_audio_path, output_audio_path, target_sr=16000, normalize=True, min_duration_sec=1.0):
    """Resample, optionally normalize, zero-pad and re-save one audio file.

    Args:
        input_audio_path: Path of the audio file to read.
        output_audio_path: Path the standardized audio is written to.
        target_sr: Sample rate of the written file; the audio is resampled
            when the source rate differs.
        normalize: When true, the signal is passed through
            ``librosa.util.normalize`` before saving.
        min_duration_sec: Minimum length of the written audio in seconds;
            shorter signals are padded with trailing zeros.

    Returns:
        tuple: ``(True, duration_in_seconds)`` on success, where the duration
        is that of the written signal; ``(False, 0)`` if any step raised
        (the error is printed, not propagated).
    """
    try:
        # sr=None keeps the file's own sample rate so we only resample on demand
        y, sr = librosa.load(input_audio_path, sr=None)

        if sr != target_sr:
            y = librosa.resample(y, orig_sr=sr, target_sr=target_sr)

        if normalize:
            y = librosa.util.normalize(y)

        # Work in whole samples: deriving the pad length from a float duration
        # difference can truncate to one sample too few.
        min_samples = int(np.ceil(min_duration_sec * target_sr))
        if len(y) < min_samples:
            y = np.pad(y, (0, min_samples - len(y)), mode='constant')

        sf.write(output_audio_path, y, target_sr)

        # Report the length of what was actually written
        return True, len(y) / target_sr
    except Exception as e:
        # Best-effort: one bad file must not abort the whole split
        print(f"Error processing {input_audio_path}: {e}")
        return False, 0

# Main processing loop: for every split, flatten the metadata JSON, standardize
# each referenced audio file into the processed folder, then save the updated
# metadata as JSON and CSV.
for split in splits:
    print(f"\nProcessing {split} split:")

    # Input and output locations for this split
    input_json_path = os.path.join(raw_splits_dir, f"{split}_data.json")
    output_json_path = os.path.join(processed_dir, f"{split}_data.json")
    output_csv_path = os.path.join(processed_dir, f"{split}_data.csv")
    output_split_dir = os.path.join(processed_dir, split)

    # Nothing to do for a split without metadata
    if not os.path.exists(input_json_path):
        print(f"No JSON file found for {split} split: {input_json_path}")
        continue

    print(f"Preprocessing JSON: {input_json_path}")
    processed_json = preprocess_json(input_json_path, output_json_path)
    print(f"Processed {len(processed_json)} JSON records")

    print(f"Standardizing and padding audio files for {split} split")
    durations = []
    processed_count = 0

    for item in tqdm(processed_json, desc="Processing audio files"):
        input_audio_path = item['flac']

        # Records without a usable audio path are left untouched
        if not (isinstance(input_audio_path, str) and os.path.exists(input_audio_path)):
            continue

        # The standardized copy keeps the file name but lives in the split folder
        filename = os.path.basename(input_audio_path)
        output_audio_path = os.path.join(output_split_dir, filename)

        success, duration = standardize_audio(
            input_audio_path,
            output_audio_path,
            target_sr=16000,
            normalize=True,
            min_duration_sec=1.0
        )

        if success:
            # Point the record at the standardized file
            item['flac'] = output_audio_path
            durations.append(duration)
            processed_count += 1

    # Overwrite the metadata so it carries the updated audio paths
    with open(output_json_path, 'w', encoding='utf-8') as f:
        json.dump(processed_json, f, indent=2)

    # Same records as a CSV table
    df = pd.DataFrame(processed_json)
    df.to_csv(output_csv_path, index=False)

    # Print statistics
    if durations:
        print(f"Processed {processed_count} audio files")
        print(f"Min duration: {min(durations):.2f}s")
        print(f"Max duration: {max(durations):.2f}s")
        print(f"Mean duration: {np.mean(durations):.2f}s")

    # Make skipped records visible instead of dropping them silently
    skipped_count = len(processed_json) - processed_count
    if skipped_count:
        print(
            f"Warning: {skipped_count} records were not standardized "
            f"(missing or unreadable audio) and keep their original 'flac' path"
        )

print("\nPreprocessing complete!")
print(f"Processed data saved to: {processed_dir}")


Processing train split:
Preprocessing JSON: ../raw_splits\train_data.json
Processed 80 JSON records
Standardizing and padding audio files for train split


Processing audio files:   0%|          | 0/80 [00:00<?, ?it/s]

Processing audio files: 100%|██████████| 80/80 [00:18<00:00,  4.27it/s]


Processed 80 audio files
Min duration: 1.95s
Max duration: 10.43s
Mean duration: 5.25s

Processing validation split:
Preprocessing JSON: ../raw_splits\validation_data.json
Processed 10 JSON records
Standardizing and padding audio files for validation split


Processing audio files: 100%|██████████| 10/10 [00:00<00:00, 11.05it/s]


Processed 10 audio files
Min duration: 2.63s
Max duration: 11.42s
Mean duration: 6.06s

Processing test split:
Preprocessing JSON: ../raw_splits\test_data.json
Processed 10 JSON records
Standardizing and padding audio files for test split


Processing audio files: 100%|██████████| 10/10 [00:00<00:00, 10.77it/s]

Processed 10 audio files
Min duration: 3.35s
Max duration: 9.20s
Mean duration: 5.06s

Preprocessing complete!
Processed data saved to: ../processed



