# In [2]:
import os
import pandas as pd
import numpy as np 
import whisper
import glob
import torch
from tqdm import tqdm
import time
import json
import re
import traceback
import sys 

# --- Run mode ---
# With IS_TEST_RUN enabled only the first NUM_FILES_TEST_RUN recordings are processed.
IS_TEST_RUN = False
NUM_FILES_TEST_RUN = 3

# --- Script location ---
# All paths below are anchored at the script's folder; sessions without
# __file__ (e.g. interactive ones) fall back to the current working directory.
if "__file__" in globals():
    script_dir = os.path.dirname(os.path.abspath(__file__))
else:
    print("Warning: __file__ not defined. Using current working directory.")
    script_dir = os.getcwd()

# --- Group-specific input / output ---
GROUP_NAME = "PureDutchChildren_7_11_Large_3_Turbo"
SPEAKER_CODES_FILE = os.path.join(script_dir, 'output_codes', 'pure_dutch_children_7_11_codes.txt')
OUTPUT_DIR = os.path.join(script_dir, 'whisper_transcriptions', GROUP_NAME)

# --- JASMIN corpus layout (relative to the script location) ---
_JASMIN_DATA_SUBDIR = "jasmin-data/Data/data/"
audio_root_dir = os.path.join(script_dir, _JASMIN_DATA_SUBDIR)  # root that contains the 'audio' folder
base_data_path = os.path.join(script_dir, _JASMIN_DATA_SUBDIR + "meta/text")
jasmin_ort_base_path = os.path.join(script_dir, _JASMIN_DATA_SUBDIR + "annot/text/ort")  # where .ort files are looked up

# Recording metadata tables, one per region
nl_recordings_path = os.path.join(base_data_path, "nl/recordings.txt")
vl_recordings_path = os.path.join(base_data_path, "vl/recordings.txt")

# --- Whisper settings ---
WHISPER_MODEL_NAME = "large-v3-turbo"  # any name accepted by whisper.load_model, e.g. "base", "medium", "large-v3"
TRANSCRIPTION_LANGUAGE = "nl"  # language code handed to the transcriber

# --- Component filter ---
# None or "" disables the filter so every component is processed.
TARGET_COMPONENT = "comp-q"

# --- Column names expected in recordings.txt ---
REC_META_SPEAKER_COL = "SpeakerID"    # speaker identifier
REC_META_COMPONENT_COL = "Component"  # corpus component
REC_META_FILEROOT_COL = "Root"        # unique file identifier (file name without extension)

# --- Data Loading Function (Robust Version - from previous script) ---
def load_data_with_delimiters(file_path, potential_delimiters=('\t', r'\s+'), encoding='ISO-8859-1', expected_cols=None):
    """Load a delimited text file, trying several delimiter/encoding combinations.

    UTF-8 is tried first, then ``encoding``; for each encoding every entry of
    ``potential_delimiters`` is tried in order. The first attempt that yields
    an acceptable table wins.

    Args:
        file_path: Path of the file to read.
        potential_delimiters: Separators to try, in order. Separators longer
            than one character (such as the regex ``\\s+``) are parsed with
            pandas' Python engine.
        encoding: Fallback encoding used after UTF-8.
        expected_cols: Optional column names that must all be present. When
            omitted, any result with more than one column is accepted.

    Returns:
        The loaded DataFrame (column names stripped of surrounding whitespace),
        or None when no combination produced an acceptable table.
    """
    last_exception = None
    # UTF-8 first; dict.fromkeys drops a duplicate when the caller's fallback is also UTF-8.
    encodings_to_try = list(dict.fromkeys(['utf-8', encoding]))

    for enc in encodings_to_try:
        for delim_raw in potential_delimiters:
            read_kwargs = dict(sep=delim_raw, encoding=enc, on_bad_lines='warn',
                               skipinitialspace=True, comment='#', skip_blank_lines=True)
            if len(delim_raw) > 1:
                # Multi-character (regex) separators need the Python engine, which
                # rejects `low_memory`; passing it made this attempt fail every time.
                read_kwargs['engine'] = 'python'
            else:
                read_kwargs['low_memory'] = False

            try:
                df = pd.read_csv(file_path, **read_kwargs)
                if df.empty:
                    continue
                df.columns = df.columns.str.strip()

                if expected_cols:
                    if any(col not in df.columns for col in expected_cols):
                        continue  # wrong delimiter for this file; try the next one
                    detail = ""
                elif df.shape[1] > 1:
                    detail = f" ({df.shape[1]} columns found)"
                else:
                    continue  # a single column means the delimiter did not split anything

                # Reject results whose first column is unnamed or entirely empty.
                first_col_name = df.columns[0]
                if first_col_name and df[first_col_name].notna().any():
                    print(f"Successfully loaded {os.path.basename(file_path)} with delimiter {delim_raw!r}, encoding '{enc}'{detail}.")
                    return df
            except Exception as e:
                # Best effort: remember the failure and move on to the next combination.
                last_exception = e

    print(f"Error: Could not successfully load file {file_path} with any specified delimiter/encoding.")
    if last_exception:
        print(f"Last error encountered: {last_exception}")
    return None

# --- Helper Functions (Adapted from Whisper script) ---

def load_speaker_codes(filepath):
    """Load speaker codes from a text file holding one code per line.

    Blank lines are skipped and surrounding whitespace is removed. The file is
    decoded as UTF-8 with an optional byte-order mark, so a BOM written by some
    editors does not end up glued to the first code.

    Args:
        filepath: Path of the speaker-codes file.

    Returns:
        List of codes in file order; an empty list when the file is missing or
        cannot be read (a message is printed in both cases).
    """
    if not os.path.exists(filepath):
        print(f"Error: Speaker codes file not found at '{filepath}'")
        return []
    try:
        # 'utf-8-sig' reads plain UTF-8 unchanged and strips a leading BOM if present.
        with open(filepath, 'r', encoding='utf-8-sig') as f:
            codes = [code for code in (line.strip() for line in f) if code]
    except Exception as e:
        print(f"Error reading speaker codes file '{filepath}': {e}")
        return []
    print(f"Loaded {len(codes)} speaker codes from {os.path.basename(filepath)}")
    return codes

def find_ort_file(speaker_code, component, file_root, config):
    """
    Locate the .ort reference file that belongs to a recording.

    The expected location is
    <config['JASMIN_ORT_BASE_PATH']>/<component>/<region>/<file_root>.ort,
    where region is 'nl' for speaker codes starting with 'N' and 'vl' for
    codes starting with 'V'.

    Args: speaker_code, component, file_root, config dict with JASMIN_ORT_BASE_PATH.
    Returns: Normalised path to the .ort file, or None when an argument is
             empty, the region cannot be derived, or the file does not exist.
    """
    if not (speaker_code and component and file_root):
        print(f"  Warning: Missing info for .ort lookup (Speaker: {speaker_code}, Comp: {component}, Root: {file_root}).")
        return None

    # The first letter of the speaker code selects the region folder.
    if speaker_code.startswith('N'):
        region = 'nl'
    elif speaker_code.startswith('V'):
        region = 'vl'
    else:
        print(f"  Warning: Cannot determine region from speaker code '{speaker_code}' for .ort lookup.")
        return None

    candidate = os.path.normpath(
        os.path.join(config['JASMIN_ORT_BASE_PATH'], component, region, f"{file_root}.ort")
    )
    if os.path.exists(candidate):
        return candidate
    return None

def _quoted_value(line):
    """Return the text between the first and last double quote on *line*, or None."""
    match = re.search(r'"(.*)"', line)
    return match.group(1) if match else None


def _parse_textgrid_long(lines, target_tier_names=('ort-mau', 'ort', 'transcript')):
    """Collect non-empty intervals from the wanted tiers of a long-format TextGrid.

    A tier is wanted when its class is "IntervalTier" and its lower-cased name
    is in *target_tier_names*. Returns (segments, names of tiers that contributed).
    """
    segments = []
    matched_tiers = []
    tier_class = None
    tier_name = None
    interval = None  # dict while inside an interval of a wanted tier, else None

    for raw_line in lines:
        line = raw_line.strip()
        lowered = line.lower()

        if lowered.startswith("item ["):
            # A new tier begins: forget everything known about the previous one.
            tier_class = tier_name = interval = None
        elif lowered.startswith("class ="):
            # In Praat's long format the class line precedes the name line.
            tier_class = _quoted_value(line)
        elif lowered.startswith("name ="):
            name = _quoted_value(line)
            tier_name = name.lower() if name is not None else None
        elif lowered.startswith("intervals ["):
            wanted = tier_class == "IntervalTier" and tier_name in target_tier_names
            interval = {} if wanted else None
        elif interval is None:
            # Tier-level xmin/xmax and anything inside unwanted tiers.
            continue
        elif line.startswith("xmin ="):
            interval['start'] = float(line.split('=', 1)[1].strip())
        elif line.startswith("xmax ="):
            interval['end'] = float(line.split('=', 1)[1].strip())
        elif line.startswith("text ="):
            text = (_quoted_value(line) or "").strip()
            # Only keep intervals that carry actual text.
            if text and 'start' in interval and 'end' in interval:
                segments.append({'start': interval['start'], 'end': interval['end'], 'text': text})
                if tier_name not in matched_tiers:
                    matched_tiers.append(tier_name)
            interval = None  # the text line closes the interval

    return segments, matched_tiers


def _parse_timestamp_triples(lines):
    """Collect segments from consecutive line triples: start, end, "quoted text".

    Lines that do not start such a triple are skipped one at a time; triples
    whose text is empty are consumed but not returned.
    """
    segments = []
    i = 0
    while i + 2 < len(lines):
        try:
            start_time = float(lines[i].strip())
            end_time = float(lines[i + 1].strip())
        except ValueError:
            i += 1
            continue
        text_match = re.match(r'\s*"(.*)"\s*', lines[i + 2])
        if text_match is None:
            i += 1
            continue
        text = text_match.group(1).strip()
        if text:
            segments.append({'start': start_time, 'end': end_time, 'text': text})
        i += 3
    return segments


def parse_ort_file(ort_path):
    """Parse a .ort reference transcription into a list of timed text segments.

    Two layouts are understood:
      1. Praat long-format TextGrid: intervals of IntervalTiers named
         'ort-mau', 'ort' or 'transcript' (case-insensitive).
      2. A plain sequence of line triples (start, end, "text"), tried when
         the first strategy finds nothing.

    Args:
        ort_path: Path of the .ort file (may be None or empty).

    Returns:
        List of {'start': float, 'end': float, 'text': str} dicts containing
        only segments with non-empty text, or None when the file is missing,
        unreadable, or yields no segment.
    """
    if not ort_path or not os.path.exists(ort_path):
        return None

    content = None
    detected_encoding = None
    # latin1 accepts every byte sequence, so it is the last resort; encodings
    # listed after it could never be reached.
    for encoding in ('utf-8', 'latin1'):
        try:
            with open(ort_path, 'r', encoding=encoding) as f:
                content = f.read()
        except UnicodeDecodeError:
            continue
        except OSError as e:
            print(f"  Warning: Error reading .ort file {ort_path} with encoding {encoding}: {e}")
            return None
        detected_encoding = encoding
        break
    if content is None:
        print(f"  Warning: Could not read .ort file '{os.path.basename(ort_path)}' with tried encodings.")
        return None

    lines = content.strip().split('\n')
    try:
        segments = []

        # --- Strategy 1: Praat long-format TextGrid ---
        if len(lines) > 1 and ("TextGrid" in lines[1] or "ooTextFile" in lines[0]):
            segments, tier_names = _parse_textgrid_long(lines)
            if segments:
                print(f"  Parsed {len(segments)} segments from TextGrid tier '{', '.join(tier_names)}'.")

        # --- Strategy 2: plain (start, end, "text") triples ---
        if not segments:
            segments = _parse_timestamp_triples(lines)
            if segments:
                print(f"  Parsed {len(segments)} segments using simple timestamp format.")

        if segments:
            return segments
        print(f"  Warning: Could not parse segments from '{os.path.basename(ort_path)}' (encoding: {detected_encoding}).")
        return None
    except Exception as e:
        # Malformed numbers and similar problems must not abort the whole run.
        print(f"  Error during parsing .ort file {ort_path}: {e}")
        traceback.print_exc()
        return None

def format_whisper_segments(result):
    """Reduce a Whisper result to a list of plain segment dictionaries.

    Each entry of result['segments'] becomes a dict with the keys 'start',
    'end', 'text' (whitespace-stripped, '' if absent) and 'words' ([] if
    absent). A result without 'segments' yields an empty list.
    """
    return [
        {
            'start': segment.get('start'),
            'end': segment.get('end'),
            'text': segment.get('text', '').strip(),
            'words': segment.get('words', []),
        }
        for segment in result.get('segments', [])
    ]

def clean_text_for_wer(text):
    """Normalise text for WER scoring.

    Upper-cases the input, drops every character that is not a word
    character, whitespace or '*' (kept for potential markers), and collapses
    whitespace runs into single spaces. Non-string input yields "".
    """
    if not isinstance(text, str):
        return ""
    uppercased = text.upper()
    without_punctuation = re.sub(r'[^\w\s*]', '', uppercased)
    return re.sub(r'\s+', ' ', without_punctuation).strip()

def prepare_wer_data(whisper_segments, ort_segments):
    """Build the reference/hypothesis record stored for later WER analysis.

    The texts of all segments are joined with single spaces and cleaned with
    clean_text_for_wer. Returns a one-element list holding a dict with 'ref',
    'hyp' and the placeholder fields 'op' / 'csid' (a real WER tool has to
    fill those in), or [] when either segment list is empty or None.
    """
    if not (ort_segments and whisper_segments):
        return []

    def joined_text(segments):
        # Segments with a missing or empty text are left out.
        return " ".join(seg['text'] for seg in segments if seg.get('text'))

    return [{
        'ref': clean_text_for_wer(joined_text(ort_segments)),
        'hyp': clean_text_for_wer(joined_text(whisper_segments)),
        'op': "?",
        'csid': "0 0 0 0",
    }]

def _json_default(obj):
    """Fallback for json.dumps: convert NumPy scalars and arrays to built-in types.

    Raises TypeError for anything else, as the json module expects; returning
    the object unchanged would surface as a misleading "Circular reference
    detected" error.
    """
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")


def save_results(output_dir, file_base_name, data_dict):
    """Write the detailed JSON and the plain-text transcription for one recording.

    Creates ``<output_dir>/<file_base_name>.json`` with the whole ``data_dict``
    and ``<output_dir>/<file_base_name>.txt`` with ``data_dict['transcription']``
    (empty when missing or None). Failures are reported on stdout and do not
    raise, so one bad file cannot stop the batch.

    Args:
        output_dir: Target directory; created if necessary.
        file_base_name: File name without extension.
        data_dict: Result data; may contain NumPy scalars and arrays.
    """
    os.makedirs(output_dir, exist_ok=True)  # Ensure dir exists
    json_path = os.path.join(output_dir, f"{file_base_name}.json")
    txt_path = os.path.join(output_dir, f"{file_base_name}.txt")

    try:
        # Serialise first so a failure cannot leave a truncated JSON file behind.
        payload = json.dumps(data_dict, indent=2, ensure_ascii=False, default=_json_default)
        with open(json_path, "w", encoding="utf-8") as f:
            f.write(payload)
    except Exception as e:
        print(f"  Error saving JSON to {json_path}: {e}")

    try:
        with open(txt_path, "w", encoding="utf-8") as f:
            f.write(data_dict.get('transcription') or '')
    except Exception as e:
        print(f"  Error saving TXT to {txt_path}: {e}")

def generate_wer_analysis_file(results_list, output_dir, group_name):
    """Write ``wer_analysis_<group_name>.txt`` for input to WER calculation tools.

    For every result whose reference was parsed, the per-file JSON written
    earlier (``<speaker_code>_<component>_<file_root>.json`` in ``output_dir``)
    is reloaded and each of its 'wer_data' items is emitted as four lines:
    ref, hyp, op and #csid, all prefixed with the file id.

    Args:
        results_list: Summary dicts with 'reference_parsed', 'speaker_code',
            'component' and 'file_root'.
        output_dir: Directory holding the per-file JSON files; the analysis
            file is written there as well.
        group_name: Used in the output file name and in messages.

    Returns:
        None. Problems are reported on stdout instead of being raised.
    """
    wer_output_path = os.path.join(output_dir, f"wer_analysis_{group_name}.txt")
    valid_results = [r for r in results_list if r.get('reference_parsed', False)]
    if not valid_results:
        print(f"No results with parsed references found for group '{group_name}', skipping WER analysis file.")
        return

    print(f"Generating WER analysis file for '{group_name}': {wer_output_path}")
    entries_written = 0  # counts ref/hyp/op/csid groups, not individual lines
    try:
        with open(wer_output_path, "w", encoding="utf-8") as f:
            for result in valid_results:
                file_id = f"{result['speaker_code']}_{result['component']}_{result['file_root']}"
                json_file = os.path.join(output_dir, f"{file_id}.json")

                wer_data_list = []
                if os.path.exists(json_file):
                    try:
                        with open(json_file, "r", encoding="utf-8") as jf:
                            wer_data_list = json.load(jf).get('wer_data', [])
                    except Exception as e:
                        print(f"  Warn: Could not reload JSON {json_file} for WER output: {e}")

                if not wer_data_list:
                    print(f"  Warn: No WER data found in {json_file} for {file_id}")
                    continue

                for wer_item in wer_data_list:
                    f.write(f"{file_id} ref {wer_item.get('ref','')}\n")
                    f.write(f"{file_id} hyp {wer_item.get('hyp','')}\n")
                    f.write(f"{file_id} op  {wer_item.get('op','?')}\n")
                    f.write(f"{file_id} #csid {wer_item.get('csid','0 0 0 0')}\n")
                    entries_written += 1

        if entries_written > 0:
            # The counter already advances once per entry, so it is reported as is.
            print(f"WER analysis file generated with {entries_written} entries.")
        else:
            print("WER analysis file generation attempted, but no valid entries found/written.")
    except Exception as e:
        print(f"Error generating WER analysis file: {e}")


# --- Main Execution ---
def main():
    """Transcribe the target group's recordings with Whisper and write all outputs.

    Steps:
      1. Verify that the metadata files, the speaker-codes file and the audio
         root exist (exits with status 1 otherwise).
      2. Load the speaker codes and the nl/vl recordings metadata, then keep
         the rows of the target speakers (and of TARGET_COMPONENT, if set).
      3. Load the Whisper model (returns early if that fails).
      4. For every selected recording: locate the wav file, look up and parse
         the .ort reference, transcribe, and save a JSON/TXT pair.
      5. Print a summary, write the summary CSV and the WER analysis file.

    A failure on a single recording is reported and counted as skipped; it
    does not abort the run.
    """
    config = {
        'IS_TEST_RUN': IS_TEST_RUN,
        'NUM_FILES_TEST_RUN': NUM_FILES_TEST_RUN,
        'OUTPUT_DIR': OUTPUT_DIR,  # Specific output dir for the group
        'WHISPER_MODEL_NAME': WHISPER_MODEL_NAME,
        'TRANSCRIPTION_LANGUAGE': TRANSCRIPTION_LANGUAGE,
        'JASMIN_ORT_BASE_PATH': jasmin_ort_base_path,
        'AUDIO_ROOT_DIR': audio_root_dir,
        'TARGET_COMPONENT': TARGET_COMPONENT,
        'GROUP_NAME': GROUP_NAME,  # Add group name to config
    }

    # --- Setup ---
    os.makedirs(config['OUTPUT_DIR'], exist_ok=True)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"--- Starting Transcription for Group: {config['GROUP_NAME']} ---")
    print(f"Using device: {device}")

    # --- Check Input Paths ---
    input_paths_to_check = [nl_recordings_path, vl_recordings_path, SPEAKER_CODES_FILE]
    missing_paths = [p for p in input_paths_to_check if not os.path.exists(p)]
    if missing_paths:
        print("\nError: Required input files not found:")
        for p in missing_paths:
            print(f"- {os.path.abspath(p)}")
        sys.exit(1)
    if not os.path.isdir(config['AUDIO_ROOT_DIR']):
        print(f"\nError: Audio root directory not found: {os.path.abspath(config['AUDIO_ROOT_DIR'])}")
        sys.exit(1)
    print("\nAll required input files and directories found.")

    # --- Load Speaker Codes for the target group ---
    target_speaker_codes = load_speaker_codes(SPEAKER_CODES_FILE)
    if not target_speaker_codes:
        print(f"No speaker codes loaded for group {config['GROUP_NAME']}. Exiting.")
        sys.exit(1)

    # --- Load Recordings Metadata ---
    print("\n--- Loading Recording Metadata ---")
    expected_rec_cols = [REC_META_SPEAKER_COL, REC_META_COMPONENT_COL, REC_META_FILEROOT_COL]
    loaded_rec_dfs = [
        df for df in (
            load_data_with_delimiters(nl_recordings_path, expected_cols=expected_rec_cols),
            load_data_with_delimiters(vl_recordings_path, expected_cols=expected_rec_cols),
        )
        if df is not None
    ]
    if not loaded_rec_dfs:
        print("\nError: Failed to load any recording metadata. Exiting.")
        sys.exit(1)
    all_recordings_df = pd.concat(loaded_rec_dfs, ignore_index=True)
    print(f"Combined {len(loaded_rec_dfs)} recording metadata file(s): {len(all_recordings_df)} total entries.")

    # --- Filter Recordings Metadata for Target Group ---
    print(f"\n--- Filtering Recordings for Group: {config['GROUP_NAME']} ---")
    speaker_codes_set = set(target_speaker_codes)
    group_recordings_df = all_recordings_df[all_recordings_df[REC_META_SPEAKER_COL].isin(speaker_codes_set)].copy()

    # Optional: Filter by component
    if config['TARGET_COMPONENT']:
        print(f"Filtering recordings for component: '{config['TARGET_COMPONENT']}'")
        group_recordings_df = group_recordings_df[group_recordings_df[REC_META_COMPONENT_COL] == config['TARGET_COMPONENT']].copy()
        print(f"Found {len(group_recordings_df)} recordings matching speakers and component.")
    else:
        print(f"Found {len(group_recordings_df)} recordings matching speakers (all components).")

    if group_recordings_df.empty:
        print(f"No recordings found for group {config['GROUP_NAME']} after filtering. Exiting.")
        sys.exit(1)

    # --- Load Whisper Model ---
    print(f"\n--- Loading Whisper Model '{config['WHISPER_MODEL_NAME']}' ---")
    try:
        model = whisper.load_model(config['WHISPER_MODEL_NAME'], device=device)
        print("Model loaded.")
    except Exception as e:
        print(f"Error loading Whisper model: {e}")
        return

    # --- Select files for processing (Test Run or All) ---
    if config['IS_TEST_RUN']:
        recordings_to_process = group_recordings_df.head(config['NUM_FILES_TEST_RUN'])
        print(f"--- TEST RUN: Processing first {len(recordings_to_process)} recordings for {config['GROUP_NAME']} ---")
    else:
        recordings_to_process = group_recordings_df
        print(f"--- Processing all {len(recordings_to_process)} recordings for {config['GROUP_NAME']} ---")

    # --- Process Recordings ---
    results_summary = []
    start_time_group = time.time()
    processed_count = 0
    skipped_count = 0
    ort_found_parsed_count = 0
    ort_not_found_count = 0
    ort_found_not_parsed_count = 0

    for index, row in tqdm(recordings_to_process.iterrows(), total=len(recordings_to_process), desc=f"Transcribing {config['GROUP_NAME']}"):
        try:
            # Extract info from recordings metadata row
            speaker_code = str(row[REC_META_SPEAKER_COL])
            file_root = str(row[REC_META_FILEROOT_COL])
            component = str(row[REC_META_COMPONENT_COL])
            region = 'nl' if speaker_code.startswith('N') else 'vl' if speaker_code.startswith('V') else None

            if not region:
                print(f"  Warning: Cannot determine region for SpeakerID '{speaker_code}'. Skipping.")
                skipped_count += 1
                continue

            # Construct original audio path
            wav_dir = os.path.join(config['AUDIO_ROOT_DIR'], 'audio', 'wav', component, region)
            wav_path = os.path.normpath(os.path.join(wav_dir, f"{file_root}.wav"))
            file_name = os.path.basename(wav_path)
            file_base_name = f"{speaker_code}_{component}_{file_root}"

            print(f"\nProcessing: {file_name} (Speaker: {speaker_code}, Comp: {component})")

            # Resolve the audio file; only report it as missing once the fallback has failed too.
            if not os.path.exists(wav_path):
                # sorted() makes the choice deterministic when several files match.
                matching_files = sorted(glob.glob(os.path.join(wav_dir, f"{file_root}*.wav")))
                if not matching_files:
                    print(f"  CRITICAL: Audio file not found at '{wav_path}'. Skipping.")
                    skipped_count += 1
                    continue
                print(f"  INFO: Found alternative audio file via fallback: {os.path.basename(matching_files[0])}")
                wav_path = matching_files[0]  # Use the first match
                file_name = os.path.basename(wav_path)

            # Find and Parse Reference (.ort) File
            ort_path = find_ort_file(speaker_code, component, file_root, config)
            ort_segments = parse_ort_file(ort_path) if ort_path else None
            reference_parsed = ort_segments is not None
            reference_found_path = ort_path if ort_path else ""

            if reference_parsed:
                print(f"  Reference found and parsed: {os.path.basename(ort_path)}")
            elif ort_path:
                print(f"  Reference file found ({os.path.basename(ort_path)}) but FAILED TO PARSE.")
            else:
                print("  Reference .ort file NOT FOUND.")

            # Transcribe with Whisper
            try:
                result = model.transcribe(
                    wav_path, language=config['TRANSCRIPTION_LANGUAGE'],
                    word_timestamps=True, verbose=False  # Set verbose=True temporarily for more Whisper output
                )
                transcription = result["text"]
                whisper_segments = format_whisper_segments(result)
                print("  Transcription finished.")
            except Exception as e:
                print(f"  ERROR during Whisper transcription: {e}")
                traceback.print_exc()
                skipped_count += 1
                continue

            # Count the reference status only for recordings that were actually
            # transcribed, so the rate "of processed files" printed below stays consistent.
            if reference_parsed:
                ort_found_parsed_count += 1
            elif ort_path:
                ort_found_not_parsed_count += 1
            else:
                ort_not_found_count += 1

            # Prepare Data for WER analysis file
            wer_data = prepare_wer_data(whisper_segments, ort_segments if reference_parsed else [])

            # Save detailed JSON and simple TXT
            output_data = {
                'original_wav_path': wav_path,  # Path of the audio file that was transcribed
                'speaker_code': speaker_code,
                'component': component,
                'file_root': file_root,
                'transcription': transcription,
                'whisper_segments': whisper_segments,
                'ort_segments': ort_segments if reference_parsed else [],
                'wer_data': wer_data,
                'reference_found_path': reference_found_path,
                'reference_parsed': reference_parsed,
            }
            save_results(config['OUTPUT_DIR'], file_base_name, output_data)

            # Collect summary info
            results_summary.append({
                "processed_wav_file": file_name,  # Name of the file actually processed
                "speaker_code": speaker_code,
                "component": component,
                "file_root": file_root,
                "transcription": transcription,
                "detected_language": result.get("language", ""),
                "reference_found_path": reference_found_path,
                "reference_parsed": reference_parsed,
                "ref_text_preview": (" ".join(s['text'] for s in ort_segments)[:80]+"..." if reference_parsed else ""),
                "num_whisper_segments": len(whisper_segments),
                "num_ort_segments": len(ort_segments) if reference_parsed else 0,
            })
            processed_count += 1

        except Exception as loop_error:
            # Last line of defence: one bad row must not stop the whole batch.
            print(f"!! Unexpected Error processing recording at index {index}: {loop_error}")
            print(f"   Row data: {row.to_dict()}")
            traceback.print_exc()
            skipped_count += 1

    # --- Group Summary ---
    total_time_group = time.time() - start_time_group
    num_processed_successfully = processed_count

    print(f"\n--- {config['GROUP_NAME']} Processing Summary ---")
    print(f"Selected {len(recordings_to_process)} recordings for processing.")
    print(f"Successfully processed: {num_processed_successfully}")
    print(f"Skipped due to errors/missing files: {skipped_count}")
    print(f"Total processing time: {total_time_group:.2f} seconds")
    if num_processed_successfully > 0:
        print(f"Avg. time per processed file: {total_time_group / num_processed_successfully:.2f} seconds")

    print(f"\n--- {config['GROUP_NAME']} Reference File Summary ---")
    print(f"Found & Parsed: {ort_found_parsed_count}")
    print(f"Found but Failed to Parse: {ort_found_not_parsed_count}")
    print(f"Not Found: {ort_not_found_count}")
    if num_processed_successfully > 0:
        parse_success_rate = (ort_found_parsed_count / num_processed_successfully) * 100
        print(f"Parse Success Rate (of processed files): {parse_success_rate:.1f}%")

    # --- Save Group Summary CSV ---
    if results_summary:
        summary_df = pd.DataFrame(results_summary)
        csv_path = os.path.join(config['OUTPUT_DIR'], f"transcription_summary_{config['GROUP_NAME']}.csv")
        try:
            summary_df.to_csv(csv_path, index=False, encoding='utf-8-sig')
            print(f"\nSummary results for {config['GROUP_NAME']} saved to: {csv_path}")
        except Exception as e:
            print(f"\nError saving summary CSV for {config['GROUP_NAME']}: {e}")

    # --- Generate Group WER Analysis File ---
    generate_wer_analysis_file(results_summary, config['OUTPUT_DIR'], config['GROUP_NAME'])

    print(f"===== Finished Group: {config['GROUP_NAME']} =====")


# Run the pipeline only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()


# Captured cell output from a run configured with model 'large-v2-turbo' (not the
# 'large-v3-turbo' set in the configuration above); kept for reference:
#
# --- Starting Transcription for Group: PureDutchChildren_7_11_Large_2_Turbo ---
# Using device: cuda
#
# All required input files and directories found.
# Loaded 79 speaker codes from pure_dutch_children_7_11_codes.txt
#
# --- Loading Recording Metadata ---
# Successfully loaded recordings.txt with delimiter '\t', encoding 'utf-8'.
# Successfully loaded recordings.txt with delimiter '\t', encoding 'utf-8'.
# Combined 2 recording metadata file(s): 995 total entries.
#
# --- Filtering Recordings for Group: PureDutchChildren_7_11_Large_2_Turbo ---
# Filtering recordings for component: 'comp-q'
# Found 78 recordings matching speakers and component.
#
# --- Loading Whisper Model 'large-v2-turbo' ---
# Error loading Whisper model: Model large-v2-turbo not found; available models = ['tiny.en', 'tiny', 'base.en', 'base', 'small.en', 'small', 'medium.en', 'medium', 'large-v1', 'large-v2', 'large-v3', 'large', 'large-v3-turbo', 'turbo']
