## Columns Preprocessing

In [1]:
! pip install librosa soundfile pandas numpy tqdm




[notice] A new release of pip is available: 23.0.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
import os
import pandas as pd
import json
import numpy as np
import soundfile as sf
import librosa
from tqdm import tqdm

# Input directory where the saved data is located
input_dir = "../raw"
# Output directory for preprocessed data
output_dir = "../processed"

# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Target sampling rate for audio standardization
TARGET_SAMPLING_RATE = 16000
# Target audio duration in seconds
TARGET_DURATION = 5  # 5 seconds
# Normalization factor
NORMALIZATION_FACTOR = 0.95

# Process each data file
for file_type in ["train", "validation", "test"]:
    # Try to find the file
    input_file = os.path.join(input_dir, f"{file_type}_data.csv")
    if not os.path.exists(input_file):
        print(f"Skipping {file_type}, file not found: {input_file}")
        continue
    
    print(f"Processing {file_type} data...")
    
    # Load the data
    df = pd.read_csv(input_file)
    
    # Step 1: Drop key and url columns if they exist
    columns_to_drop = [col for col in df.columns if col in ['key', 'url'] or 'key' in col.lower() or 'url' in col.lower()]
    if columns_to_drop:
        df = df.drop(columns=columns_to_drop)
        print(f"Dropped columns: {columns_to_drop}")
    
    # Step 2: Extract 'original_text' from the json column
    print("Extracting 'original_text' from json column...")
    
    # Create a new text column
    df['text'] = None
    
    # Process each row
    for idx, row in tqdm(df.iterrows(), total=len(df)):
        if 'json' in df.columns:
            try:
                json_str = row['json']
                
                # Handle different formats of the json data
                if isinstance(json_str, str):
                    if json_str.strip().startswith('{'):
                        # It's a JSON string
                        json_data = json.loads(json_str)
                    else:
                        # It might be a string representation of a Python dict
                        json_data = eval(json_str)
                elif isinstance(json_str, dict):
                    # It's already a dictionary
                    json_data = json_str
                else:
                    print(f"Unknown JSON format in row {idx}: {type(json_str)}")
                    continue
                
                # Extract original_text
                if 'original_text' in json_data:
                    df.at[idx, 'text'] = json_data['original_text']
                    print(f"Found text: {json_data['original_text']}")
                else:
                    print(f"'original_text' not found in json data: {json_data.keys()}")
            except Exception as e:
                print(f"Error processing json in row {idx}: {e}")
                print(f"Raw json content: {row['json']}")
    
    # Now drop the json column
    if 'json' in df.columns:
        df = df.drop(columns=['json'])
        print("Dropped 'json' column after extracting text")
    
    # Step 3: Ensure all text is lowercase
    print("Converting all text to lowercase...")
    if 'text' in df.columns:
        df['text'] = df['text'].str.lower()
    
    # Step 4: Standardize audio files
    print("Standardizing audio files...")
    
    # Create audio directory
    audio_dir = os.path.join(output_dir, f"{file_type}_audio")
    os.makedirs(audio_dir, exist_ok=True)
    
    # New column for standardized audio paths
    df['standardized_audio'] = None
    
    # Process each audio file
    for idx, row in tqdm(df.iterrows(), total=len(df)):
        if 'flac' in df.columns and pd.notna(row['flac']):
            try:
                # Load the audio file
                audio_path = row['flac']
                
                # Check if it's a string (path) or something else
                if isinstance(audio_path, str) and os.path.exists(audio_path):
                    # Load the audio file
                    audio, sr = librosa.load(audio_path, sr=None)
                    
                    # Resample if needed
                    if sr != TARGET_SAMPLING_RATE:
                        audio = librosa.resample(audio, orig_sr=sr, target_sr=TARGET_SAMPLING_RATE)
                    
                    # Normalize audio
                    if np.abs(audio).max() > 0:
                        audio = audio / np.abs(audio).max() * NORMALIZATION_FACTOR
                    
                    # Step 5: Handle variable length - pad or trim
                    target_length = int(TARGET_DURATION * TARGET_SAMPLING_RATE)
                    
                    if len(audio) < target_length:
                        # Pad with zeros if too short
                        padding = np.zeros(target_length - len(audio))
                        audio = np.concatenate([audio, padding])
                    elif len(audio) > target_length:
                        # Trim if too long (take the first TARGET_DURATION seconds)
                        audio = audio[:target_length]
                    
                    # Save the standardized audio
                    output_audio_path = os.path.join(audio_dir, f"{file_type}_{idx}.flac")
                    sf.write(output_audio_path, audio, TARGET_SAMPLING_RATE)
                    
                    # Update the dataframe
                    df.at[idx, 'standardized_audio'] = output_audio_path
                else:
                    print(f"Audio file not found or invalid: {audio_path}")
            except Exception as e:
                print(f"Error processing audio in row {idx}: {e}")
    
    # Drop the original flac column
    if 'flac' in df.columns:
        df = df.drop(columns=['flac'])
    
    # Save the preprocessed data
    output_file = os.path.join(output_dir, f"{file_type}_preprocessed.csv")
    df.to_csv(output_file, index=False)
    
    # Also save as JSON for convenience
    output_json = os.path.join(output_dir, f"{file_type}_preprocessed.json")
    df.to_json(output_json, orient='records', indent=2)
    
    print(f"Saved preprocessed {file_type} data to {output_file} and {output_json}")
    
    # Verify text extraction
    non_null_text = df['text'].count()
    print(f"Number of rows with extracted text: {non_null_text} out of {len(df)}")

print("Preprocessing complete!")

Processing train data...
Dropped columns: ['__key__', '__url__']
Extracting 'original_text' from json column...


100%|██████████| 10/10 [00:00<00:00, 9988.82it/s]


Error processing json in row 0: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)
Raw json content: {'ChannelID': 0, 'RecordingID': 1, 'SessionID': 0, 'SpeakerID': 402, 'original_text': 'Mary and her family were moving to another city.', 'read_text': 'Mary and her family were moving to another city'}
Error processing json in row 1: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)
Raw json content: {'ChannelID': 0, 'RecordingID': 57, 'SessionID': 0, 'SpeakerID': 402, 'original_text': 'The witch put a spell on the prince, but it was the wrong one.', 'read_text': 'the witch put a spell on the prince but it was the wrong one'}
Error processing json in row 2: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)
Raw json content: {'ChannelID': 0, 'RecordingID': 108, 'SessionID': 0, 'SpeakerID': 402, 'original_text': 'According to the Japanese doctors, it is impossible to determine how long my dad will remain comatose

  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:05<00:00,  1.69it/s]

Saved preprocessed train data to ../processed\train_preprocessed.csv and ../processed\train_preprocessed.json
Number of rows with extracted text: 0 out of 10
Skipping validation, file not found: ../raw\validation_data.csv
Skipping test, file not found: ../raw\test_data.csv
Preprocessing complete!



