In [1]:
import textgrid
import pandas as pd
from collections import Counter
import os

def load_textgrid(file_path):
    """Load a TextGrid file and return the TextGrid object.

    Args:
        file_path: Path of the TextGrid file to read.

    Returns:
        The object produced by ``textgrid.TextGrid.fromFile(file_path)``.

    Raises:
        ValueError: If reading or parsing fails for any reason. The original
            exception is attached as ``__cause__`` so the underlying failure
            stays visible in the traceback.
    """
    try:
        return textgrid.TextGrid.fromFile(file_path)
    except Exception as e:
        # Chain explicitly so the underlying error is reported as the direct cause.
        raise ValueError(f"Error loading TextGrid file: {e}") from e

def get_unique_phones(phones_tier):
    """Return the distinct, non-empty phone labels of a tier as a sorted list."""
    labels = set()
    for entry in phones_tier:
        # Entries with a falsy mark (e.g. an empty string) are not phones.
        if entry.mark:
            labels.add(entry.mark)
    return sorted(labels)

def extract_features_for_iu(iu_interval, phones_tier, cs_tier, unique_phones, source_file):
    """Extract features for a single IU interval, including source file name.

    Args:
        iu_interval: Interval exposing ``minTime``, ``maxTime`` and ``mark``.
        phones_tier: Iterable of phone intervals with the same attributes.
        cs_tier: Accepted for interface compatibility; not consulted here.
        unique_phones: Phone labels that define the phone-count columns.
        source_file: File name of the TextGrid the interval came from.

    Returns:
        A dict with ``source_file`` (a trailing ``.TextGrid`` removed),
        ``iu_text``, ``iu_start``, ``iu_end``, one count per label in
        ``unique_phones``, and ``cs_non_english``.
    """
    iu_xmin, iu_xmax = iu_interval.minTime, iu_interval.maxTime
    iu_text = iu_interval.mark

    # Count labelled phones that lie entirely inside the IU's time span.
    phone_counts = Counter(
        phone.mark
        for phone in phones_tier
        if phone.mark and phone.minTime >= iu_xmin and phone.maxTime <= iu_xmax
    )

    # Fixed-width frequency vector: labels outside unique_phones are ignored,
    # labels that never occur in this IU get 0.
    phone_features = {phone: phone_counts[phone] for phone in unique_phones}

    # The flag is derived from the IU text itself: 1 when it contains the
    # '<unk>' token (case-insensitive). A missing mark counts as empty text.
    cs_features = {
        'cs_non_english': int('<unk>' in (iu_text or '').lower())
    }

    # Strip only a trailing extension; an occurrence in the middle of the
    # name is part of the name and must be kept.
    extension = '.TextGrid'
    if source_file.endswith(extension):
        source_name = source_file[:-len(extension)]
    else:
        source_name = source_file

    features = {
        'source_file': source_name,
        'iu_text': iu_text,
        'iu_start': iu_xmin,
        'iu_end': iu_xmax,
        **phone_features,
        **cs_features
    }

    return features

def process_textgrid_to_features(file_path, unique_phones=None):
    """Process a TextGrid file and return features for all IUs.

    Args:
        file_path: Path of the TextGrid file.
        unique_phones: Optional phone labels defining the phone-count columns.
            When None, the labels are taken from this file's phone tier.

    Returns:
        A ``(features_list, unique_phones)`` tuple: one feature dict per
        interval of the IU tier, and the phone inventory that was used.

    Raises:
        ValueError: If the file cannot be loaded, or if any of the IU, phone
            or CS tiers is absent from the file.
    """
    tg = load_textgrid(file_path)

    # Print available tier names for debugging
    tier_names = [tier.name for tier in tg]
    print(f"Available tiers in TextGrid {file_path}: {tier_names}")

    # Case-insensitive tier lookup. Maps the lowercase name to the label used
    # in the error message; insertion order fixes the order of that message.
    required = {'iu': 'IU', 'phone': 'phone', 'cs': 'CS'}
    found = {}
    for tier in tg:
        key = tier.name.lower()
        if key in required:
            # A later tier with the same name replaces an earlier one.
            found[key] = tier

    # Presence is tracked by key rather than by truthiness of the tier: a tier
    # that defines __len__ is falsy when empty, yet it is still present.
    missing_tiers = [label for key, label in required.items() if key not in found]
    if missing_tiers:
        raise ValueError(f"Missing required tiers in {file_path}: {', '.join(missing_tiers)}. Available tiers: {tier_names}")

    iu_tier = found['iu']
    phones_tier = found['phone']
    cs_tier = found['cs']

    # If unique_phones is not provided, compute it from this file
    if unique_phones is None:
        unique_phones = get_unique_phones(phones_tier)

    source_name = os.path.basename(file_path)
    features_list = [
        extract_features_for_iu(iu, phones_tier, cs_tier, unique_phones, source_name)
        for iu in iu_tier
    ]
    return features_list, unique_phones

def process_directory_to_features(directory, output_csv):
    """Process all TextGrid files in a directory and save features to a single CSV.

    The directory is walked recursively in sorted order so the output is
    reproducible. Every file contributes its own phone inventory; the CSV has
    one count column per phone seen in any file, with 0 where a phone does not
    occur in a row's file.

    Args:
        directory: Root directory searched for ``*.TextGrid`` files.
        output_csv: Path of the CSV file to write.

    Returns:
        None. When no features were extracted, a message is printed and no
        file is written.
    """
    all_features = []
    phone_inventory = set()

    for root, dirs, files in os.walk(directory):
        # Sorting in place makes os.walk descend in a deterministic order.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith('.TextGrid'):
                continue
            file_path = os.path.join(root, file)
            print(f"Processing {file_path}...")
            # Let each file use its own inventory; reusing the first file's
            # inventory would drop phones that only occur in later files.
            features, file_phones = process_textgrid_to_features(file_path)
            all_features.extend(features)
            phone_inventory.update(file_phones)

    if not all_features:
        print("No TextGrid files found or no features extracted.")
        return

    df = pd.DataFrame(all_features)

    # Keep non-phone columns where they were and place the phone columns,
    # sorted, at the position of the first phone column.
    columns = list(df.columns)
    phone_cols = sorted(phone_inventory)
    first_phone = next(
        (i for i, col in enumerate(columns) if col in phone_inventory),
        len(columns),
    )
    leading = columns[:first_phone]
    trailing = [col for col in columns[first_phone:] if col not in phone_inventory]
    df = df.reindex(columns=leading + phone_cols + trailing)

    # Phones absent from a row's file show up as NaN after the union of
    # columns; they are genuine zero counts.
    if phone_cols:
        df[phone_cols] = df[phone_cols].fillna(0).astype(int)

    df.to_csv(output_csv, index=False)
    print(f"All features saved to {output_csv}")

def main():
    """Extract features from the interview TextGrids into one CSV.

    Reads from a sibling directory of the working directory and writes the
    CSV into the working directory.

    Raises:
        FileNotFoundError: If the input directory does not exist.
    """
    # Both paths are relative to the current working directory.
    source_dir = "../interview_textgrids_iu_and_cs_intervals"
    target_csv = "all_textgrid_features.csv"

    if os.path.exists(source_dir):
        process_directory_to_features(source_dir, target_csv)
        return

    raise FileNotFoundError(f"Directory {source_dir} not found")

# Run the extraction only when this code is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()

Processing ../interview_textgrids_iu_and_cs_intervals/VF32A_English_I2_20190213.TextGrid...
Available tiers in TextGrid ../interview_textgrids_iu_and_cs_intervals/VF32A_English_I2_20190213.TextGrid: ['task', 'IU', 'convenience-IU', 'word', 'phone', 'CS']
Processing ../interview_textgrids_iu_and_cs_intervals/VF20B_English_I2_20181203.TextGrid...
Available tiers in TextGrid ../interview_textgrids_iu_and_cs_intervals/VF20B_English_I2_20181203.TextGrid: ['task', 'IU', 'convenience-IU', 'word', 'phone', 'CS']
Processing ../interview_textgrids_iu_and_cs_intervals/VF19B_English_I1_20190213.TextGrid...
Available tiers in TextGrid ../interview_textgrids_iu_and_cs_intervals/VF19B_English_I1_20190213.TextGrid: ['task', 'IU', 'convenience-IU', 'word', 'phone', 'CS']
Processing ../interview_textgrids_iu_and_cs_intervals/VM34A_English_I2_20191028.TextGrid...
Available tiers in TextGrid ../interview_textgrids_iu_and_cs_intervals/VM34A_English_I2_20191028.TextGrid: ['task', 'IU', 'convenience-IU', 'wo

In [2]:
# Reload the feature table written by the extraction step and tabulate how
# many rows have cs_non_english set to 1 versus 0.
data = pd.read_csv("all_textgrid_features.csv")
data['cs_non_english'].value_counts()

cs_non_english
0    33962
1      596
Name: count, dtype: int64