In [15]:
import os
import pandas as pd
import numpy as np 
from transformers import pipeline 
import glob
import torch
from tqdm import tqdm
import time
import json
import re
import traceback 
import sys 

# --- Configuration ---
# Set IS_TEST_RUN to True to process only NUM_FILES_TEST_RUN files
IS_TEST_RUN = False
NUM_FILES_TEST_RUN = 3
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# --- Paths ---
try:
    script_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    print("Warning: __file__ not defined. Using current working directory.")
    script_dir = os.getcwd()

GROUP_NAME = "PureDutchChildren_7_11_Large_3_finetuned" 
SPEAKER_CODES_FILE = os.path.join(script_dir, 'output_codes', 'pure_dutch_children_7_11_codes.txt')
OUTPUT_DIR = os.path.join(script_dir, 'whisper_transcriptions', GROUP_NAME)

# Base paths for JASMIN data relative to script location
base_data_path = os.path.join(script_dir, "jasmin-data/Data/data/meta/text")
audio_root_dir = os.path.join(script_dir, "jasmin-data/Data/data/") # Root containing 'audio' folder
jasmin_ort_base_path = os.path.join(script_dir, "jasmin-data/Data/data/annot/text/ort") # Base path for finding .ort files

# Input recording metadata paths
nl_recordings_path = os.path.join(base_data_path, "nl/recordings.txt")
vl_recordings_path = os.path.join(base_data_path, "vl/recordings.txt")

# Model Path (Path to your fine-tuned model directory) and Language
FINETUNED_MODEL_PATH = "/home/tdamen/whisper-finetuned-synthetic-only-large-v3" # Path to your fine-tuned model
TRANSCRIPTION_LANGUAGE = "nl"   # Language code for transcription (e.g., "nl" for Dutch/Flemish)

# Component Filter (Set to None or empty string "" to process all components)
TARGET_COMPONENT = "comp-q" # e.g., "comp-q" or None

REC_META_SPEAKER_COL = "SpeakerID" # Column name for speaker ID in recordings.txt
REC_META_COMPONENT_COL = "Component"   # Column name for component in recordings.txt
REC_META_FILEROOT_COL = "Root"   # Column name for the unique file identifier in recordings.txt

# --- Data Loading Function (Robust Version - from previous script) ---
def load_data_with_delimiters(file_path, potential_delimiters=['\t', r'\s+'], encoding='ISO-8859-1', expected_cols=None):
    """Attempts to load a CSV/text file using a list of potential delimiters."""
    last_exception = None
    encodings_to_try = ['utf-8', encoding] # Try UTF-8 first

    for enc in encodings_to_try:
        for delim_raw in potential_delimiters:
            delim_repr = repr(delim_raw)
            try:
                engine = 'python' if delim_raw == r'\s+' else None
                df = pd.read_csv(file_path, sep=delim_raw, encoding=enc,
                                 engine=engine, on_bad_lines='warn', low_memory=False,
                                 skipinitialspace=True, comment='#', skip_blank_lines=True)

                if df.empty: continue
                df.columns = df.columns.str.strip()

                if expected_cols:
                    missing_cols = [col for col in expected_cols if col not in df.columns]
                    if not missing_cols:
                        first_col_name = df.columns[0]
                        if first_col_name and df[first_col_name].notna().any():
                            print(f"Successfully loaded {os.path.basename(file_path)} with delimiter {delim_repr}, encoding '{enc}'.")
                            return df
                elif df.shape[1] > 1:
                     first_col_name = df.columns[0]
                     if first_col_name and df[first_col_name].notna().any():
                         print(f"Successfully loaded {os.path.basename(file_path)} with delimiter {delim_repr}, encoding '{enc}' ({df.shape[1]} columns found).")
                         return df
            except pd.errors.ParserError as pe: last_exception = pe
            except Exception as e: last_exception = e

    print(f"Error: Could not successfully load file {file_path} with any specified delimiter/encoding.")
    if last_exception: print(f"Last error encountered: {last_exception}")
    return None

# --- Helper Functions ---
def load_speaker_codes(filepath):
    """Loads speaker codes from a text file (one code per line)."""
    if not os.path.exists(filepath):
        print(f"Error: Speaker codes file not found at '{filepath}'")
        return []
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            codes = [line.strip() for line in f if line.strip()]
        print(f"Loaded {len(codes)} speaker codes from {os.path.basename(filepath)}")
        return codes
    except Exception as e:
        print(f"Error reading speaker codes file '{filepath}': {e}")
        return []

def find_ort_file(speaker_code, component, file_root, config):
    """
    Finds the corresponding .ort file path based on known JASMIN structure.
    Args: speaker_code, component, file_root, config dict with JASMIN_ORT_BASE_PATH.
    Returns: Full path to .ort file or None.
    """
    if not all([speaker_code, component, file_root]):
        print(f"  Warning: Missing info for .ort lookup (Speaker: {speaker_code}, Comp: {component}, Root: {file_root}).")
        return None
    ort_filename = f"{file_root}.ort"
    region = 'nl' if speaker_code.startswith('N') else 'vl' if speaker_code.startswith('V') else None
    if not region:
        print(f"  Warning: Cannot determine region from speaker code '{speaker_code}' for .ort lookup.")
        return None
    ort_path = os.path.join(config['JASMIN_ORT_BASE_PATH'], component, region, ort_filename)
    ort_path = os.path.normpath(ort_path)
    return ort_path if os.path.exists(ort_path) else None

def parse_ort_file(ort_path):
    """ Parses a .ort file (potentially Praat TextGrid), trying multiple encodings. """
    if not ort_path or not os.path.exists(ort_path): return None
    encodings_to_try = ['utf-8', 'latin1', 'iso-8859-1', 'cp1252']
    content = None; detected_encoding = None
    for encoding in encodings_to_try:
        try:
            with open(ort_path, 'r', encoding=encoding) as f: content = f.read()
            detected_encoding = encoding; break
        except UnicodeDecodeError: continue
        except Exception as e:
            print(f"  Warning: Error reading .ort file {ort_path} with encoding {encoding}: {e}")
            if not isinstance(e, UnicodeDecodeError): return None
    if content is None:
        print(f"  Warning: Could not read .ort file '{os.path.basename(ort_path)}' with tried encodings.")
        return None

    segments = []; lines = content.strip().split('\n')
    try:
        # --- Strategy 1: Parse as Praat TextGrid ---
        if len(lines) > 1 and ("TextGrid" in lines[1] or "ooTextFile" in lines[0]):
            in_item = False; is_interval_tier = False; current_segment = {}
            tier_name = None
            # Find the relevant tier (e.g., 'ORT-MAU', 'ORT', 'transcript') - case insensitive
            tier_found = False
            target_tier_names = ['ort-mau', 'ort', 'transcript'] # Add other potential names
            for i, line in enumerate(lines):
                 line_lower = line.strip().lower()
                 if line_lower.startswith("item ["): tier_found = False # Reset for new item
                 if line_lower.startswith("name ="):
                     current_tier_name_match = re.search(r'"(.*)"', line.strip())
                     if current_tier_name_match:
                         current_tier_name = current_tier_name_match.group(1).lower()
                         if current_tier_name in target_tier_names:
                             tier_name = current_tier_name
                             tier_found = True
                 if tier_found and "IntervalTier" in line: is_interval_tier = True; continue
                 if tier_found and is_interval_tier and line.strip().startswith("item ["): in_item = True; current_segment = {}; continue
                 if in_item and is_interval_tier:
                    line_strip = line.strip()
                    if line_strip.startswith("xmin ="): current_segment['start'] = float(line_strip.split('=')[1].strip())
                    elif line_strip.startswith("xmax ="): current_segment['end'] = float(line_strip.split('=')[1].strip())
                    elif line_strip.startswith("text ="):
                        text_match = re.search(r'"(.*)"', line_strip)
                        text = text_match.group(1).strip() if text_match else ""
                        current_segment['text'] = text
                        if 'start' in current_segment and 'end' in current_segment and 'text' in current_segment:
                            # Only add segments with actual text content
                            if current_segment['text'] and not current_segment['text'].isspace():
                                segments.append(current_segment)
                            in_item = False # Reset for next interval in the tier
            if segments: print(f"  Parsed {len(segments)} segments from TextGrid tier '{tier_name}'.")

        # --- Strategy 2: Simple format (Timestamp, Timestamp, "Text") ---
        if not segments:
            i = 0
            while i < len(lines):
                try:
                    if i + 2 < len(lines):
                        start_time = float(lines[i].strip())
                        end_time = float(lines[i+1].strip())
                        text_match = re.match(r'\s*"(.*)"\s*', lines[i+2])
                        if text_match:
                            text = text_match.group(1).strip()
                            if text and not text.isspace(): segments.append({'start': start_time, 'end': end_time, 'text': text})
                            i += 3; continue
                except (ValueError, IndexError): pass
                i += 1
            if segments: print(f"  Parsed {len(segments)} segments using simple timestamp format.")

        if segments: return segments
        else:
            print(f"  Warning: Could not parse segments from '{os.path.basename(ort_path)}' (encoding: {detected_encoding}).")
            return None
    except Exception as e:
        print(f"  Error during parsing .ort file {ort_path}: {e}"); traceback.print_exc(); return None

def format_whisper_segments(result_dict):
    """
    Converts Whisper result segments (from Hugging Face pipeline or openai-whisper)
    into a standardized list of dictionaries.
    For Hugging Face pipeline with return_timestamps=True, result_dict["segments"] will be
    a list of {'text': ..., 'start': ..., 'end': ...}. The 'words' key will be missing.
    """
    segments_out = []
    for seg in result_dict.get('segments', []):
        segments_out.append({
            'start': seg.get('start'),
            'end': seg.get('end'),
            'text': seg.get('text', '').strip(),
            'words': seg.get('words', []) # This will be empty if input `seg` doesn't have 'words'
        })
    return segments_out

def clean_text_for_wer(text):
    """Basic text cleaning for WER calculation (uppercase, remove punctuation)."""
    if not isinstance(text, str): return ""
    text = text.upper()
    text = re.sub(r'[^\w\s*]', '', text) # Keep '*' for potential markers
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def prepare_wer_data(whisper_segments, ort_segments):
    """Prepares reference and hypothesis strings for WER analysis file."""
    if not ort_segments or not whisper_segments: return []
    ref_full = " ".join([seg['text'] for seg in ort_segments if seg.get('text')])
    hyp_full = " ".join([seg['text'] for seg in whisper_segments if seg.get('text')])
    ref_clean = clean_text_for_wer(ref_full)
    hyp_clean = clean_text_for_wer(hyp_full)
    ops = "?"; csid = "0 0 0 0" # Placeholders, real WER tool needed
    return [{'ref': ref_clean, 'hyp': hyp_clean, 'op': ops, 'csid': csid}]

def save_results(output_dir, file_base_name, data_dict):
    """Saves detailed JSON and simple TXT results."""
    os.makedirs(output_dir, exist_ok=True) # Ensure dir exists
    json_path = os.path.join(output_dir, f"{file_base_name}.json")
    txt_path = os.path.join(output_dir, f"{file_base_name}.txt")
    try:
        with open(json_path, "w", encoding="utf-8") as f:
            # Convert numpy types for JSON serialization
            def convert_numpy(obj):
                if isinstance(obj, np.integer): return int(obj)
                elif isinstance(obj, np.floating): return float(obj)
                elif isinstance(obj, np.ndarray): return obj.tolist()
                return obj
            json.dump(data_dict, f, indent=2, ensure_ascii=False, default=convert_numpy)
    except Exception as e: print(f"  Error saving JSON to {json_path}: {e}")
    try:
        with open(txt_path, "w", encoding="utf-8") as f:
            f.write(data_dict.get('transcription', ''))
    except Exception as e: print(f"  Error saving TXT to {txt_path}: {e}")

def generate_wer_analysis_file(results_list, output_dir, group_name):
    """Generates a text file suitable for input to WER calculation tools."""
    wer_output_path = os.path.join(output_dir, f"wer_analysis_{group_name}.txt")
    entries_written = 0
    valid_results = [r for r in results_list if r.get('reference_parsed', False)]
    if not valid_results:
        print(f"No results with parsed references found for group '{group_name}', skipping WER analysis file.")
        return

    print(f"Generating WER analysis file for '{group_name}': {wer_output_path}")
    try:
        with open(wer_output_path, "w", encoding="utf-8") as f:
            for result in valid_results:
                file_id = f"{result['speaker_code']}_{result['component']}_{result['file_root']}"
                json_file = os.path.join(output_dir, f"{file_id}.json")
                wer_data_list = []
                if os.path.exists(json_file):
                    try:
                        with open(json_file, "r", encoding="utf-8") as jf:
                             detailed_data = json.load(jf)
                             wer_data_list = detailed_data.get('wer_data', [])
                    except Exception as e: print(f"  Warn: Could not reload JSON {json_file} for WER output: {e}")

                if wer_data_list:
                     for wer_item in wer_data_list:
                         f.write(f"{file_id} ref {wer_item.get('ref','')}\n")
                         f.write(f"{file_id} hyp {wer_item.get('hyp','')}\n")
                         f.write(f"{file_id} op  {wer_item.get('op','?')}\n")
                         f.write(f"{file_id} #csid {wer_item.get('csid','0 0 0 0')}\n")
                         entries_written += 1
                else: print(f"  Warn: No WER data found in {json_file} for {file_id}")
        if entries_written > 0: print(f"WER analysis file generated with {entries_written // 4} entries.")
        else: print(f"WER analysis file generation attempted, but no valid entries found/written.")
    except Exception as e: print(f"Error generating WER analysis file: {e}")


# --- Main Execution ---
def main():
    # Combine configuration into a dictionary
    config = {
        'IS_TEST_RUN': IS_TEST_RUN,
        'NUM_FILES_TEST_RUN': NUM_FILES_TEST_RUN,
        'OUTPUT_DIR': OUTPUT_DIR, # Specific output dir for the group
        'FINETUNED_MODEL_PATH': FINETUNED_MODEL_PATH, # Use the path to your fine-tuned model
        'TRANSCRIPTION_LANGUAGE': TRANSCRIPTION_LANGUAGE,
        'JASMIN_ORT_BASE_PATH': jasmin_ort_base_path,
        'AUDIO_ROOT_DIR': audio_root_dir,
        'TARGET_COMPONENT': TARGET_COMPONENT,
        'GROUP_NAME': GROUP_NAME, # Add group name to config
    }

    # --- Setup ---
    os.makedirs(config['OUTPUT_DIR'], exist_ok=True)
    device_string = "cuda" if torch.cuda.is_available() else "cpu"
    # For Hugging Face pipeline, device can be 0 for cuda:0, 1 for cuda:1, or -1 for CPU
    device_pipeline = 0 if torch.cuda.is_available() else -1
    print(f"--- Starting Transcription for Group: {config['GROUP_NAME']} ---")
    print(f"Using device: {device_string}")
    overall_start_time = time.time()

    # --- Check Input Paths ---
    input_paths_to_check = [nl_recordings_path, vl_recordings_path, SPEAKER_CODES_FILE, config['FINETUNED_MODEL_PATH']]
    missing_paths = [p for p in input_paths_to_check if not os.path.exists(p)]
    if missing_paths:
        print("\nError: Required input files/directories not found:")
        for p in missing_paths: print(f"- {os.path.abspath(p)}")
        sys.exit(1)
    if not os.path.isdir(config['AUDIO_ROOT_DIR']):
        print(f"\nError: Audio root directory not found: {os.path.abspath(config['AUDIO_ROOT_DIR'])}")
        sys.exit(1)
    print("\nAll required input files and directories found.")

    # --- Load Speaker Codes for the target group ---
    target_speaker_codes = load_speaker_codes(SPEAKER_CODES_FILE)
    if not target_speaker_codes:
        print(f"No speaker codes loaded for group {config['GROUP_NAME']}. Exiting.")
        sys.exit(1)

    # --- Load Recordings Metadata ---
    print("\n--- Loading Recording Metadata ---")
    expected_rec_cols = [REC_META_SPEAKER_COL, REC_META_COMPONENT_COL, REC_META_FILEROOT_COL]
    nl_recordings = load_data_with_delimiters(nl_recordings_path, expected_cols=expected_rec_cols)
    vl_recordings = load_data_with_delimiters(vl_recordings_path, expected_cols=expected_rec_cols)
    all_recordings_df = None
    loaded_rec_dfs = []
    if nl_recordings is not None: loaded_rec_dfs.append(nl_recordings)
    if vl_recordings is not None: loaded_rec_dfs.append(vl_recordings)
    if loaded_rec_dfs:
        all_recordings_df = pd.concat(loaded_rec_dfs, ignore_index=True)
        print(f"Combined {len(loaded_rec_dfs)} recording metadata file(s): {len(all_recordings_df)} total entries.")
    else:
        print("\nError: Failed to load any recording metadata. Exiting.")
        sys.exit(1)

    # --- Filter Recordings Metadata for Target Group ---
    print(f"\n--- Filtering Recordings for Group: {config['GROUP_NAME']} ---")
    speaker_codes_set = set(target_speaker_codes)
    group_recordings_df = all_recordings_df[all_recordings_df[REC_META_SPEAKER_COL].isin(speaker_codes_set)].copy()

    # Filter by component
    if config['TARGET_COMPONENT']:
        print(f"Filtering recordings for component: '{config['TARGET_COMPONENT']}'")
        group_recordings_df = group_recordings_df[group_recordings_df[REC_META_COMPONENT_COL] == config['TARGET_COMPONENT']].copy()
        print(f"Found {len(group_recordings_df)} recordings matching speakers and component.")
    else:
        print(f"Found {len(group_recordings_df)} recordings matching speakers (all components).")

    if group_recordings_df.empty:
        print(f"No recordings found for group {config['GROUP_NAME']} after filtering. Exiting.")
        sys.exit(1)

    # --- Load Fine-tuned Whisper Model using Hugging Face Pipeline ---
    print(f"\n--- Loading Fine-tuned Whisper Model from '{config['FINETUNED_MODEL_PATH']}' ---")
    try:
        # Using Hugging Face pipeline for automatic speech recognition
        asr_pipeline = pipeline(
            "automatic-speech-recognition",
            model=config['FINETUNED_MODEL_PATH'],
            device=device_pipeline # Use 0, 1, etc. for GPU, -1 for CPU
        )
        print("Fine-tuned model loaded into ASR pipeline.")
    except Exception as e:
        print(f"Error loading fine-tuned Whisper model with Hugging Face pipeline: {e}")
        import traceback
        traceback.print_exc()
        return

    # --- Select files for processing (Test Run or All) ---
    if config['IS_TEST_RUN']:
        recordings_to_process = group_recordings_df.head(config['NUM_FILES_TEST_RUN'])
        print(f"--- TEST RUN: Processing first {len(recordings_to_process)} recordings for {config['GROUP_NAME']} ---")
    else:
        recordings_to_process = group_recordings_df
        print(f"--- Processing all {len(recordings_to_process)} recordings for {config['GROUP_NAME']} ---")

    # --- Process Recordings ---
    results_summary = []
    start_time_group = time.time()
    processed_count = 0
    skipped_count = 0
    ort_found_parsed_count = 0
    ort_not_found_count = 0
    ort_found_not_parsed_count = 0

    for index, row in tqdm(recordings_to_process.iterrows(), total=len(recordings_to_process), desc=f"Transcribing {config['GROUP_NAME']}"):
        try:
            # Extract info from recordings metadata row
            speaker_code = str(row[REC_META_SPEAKER_COL])
            file_root = str(row[REC_META_FILEROOT_COL])
            component = str(row[REC_META_COMPONENT_COL])
            region = 'nl' if speaker_code.startswith('N') else 'vl' if speaker_code.startswith('V') else None

            if not region:
                print(f"  Warning: Cannot determine region for SpeakerID '{speaker_code}'. Skipping.")
                skipped_count += 1
                continue

            # Construct original audio path
            wav_path = os.path.join(config['AUDIO_ROOT_DIR'], 'audio', 'wav', component, region, f"{file_root}.wav")
            wav_path = os.path.normpath(wav_path)
            file_name = os.path.basename(wav_path)
            # Use a consistent, unique base name for output files
            file_base_name = f"{speaker_code}_{component}_{file_root}"

            print(f"\nProcessing: {file_name} (Speaker: {speaker_code}, Comp: {component})")

            # Check if audio file exists
            if not os.path.exists(wav_path):
                print(f"  CRITICAL: Audio file not found at '{wav_path}'. Skipping.")
                fallback_pattern = os.path.join(config['AUDIO_ROOT_DIR'], 'audio', 'wav', component, region, f"{file_root}*.wav")
                matching_files = glob.glob(fallback_pattern)
                if matching_files:
                    print(f"  INFO: Found alternative audio file via fallback: {os.path.basename(matching_files[0])}")
                    wav_path = matching_files[0]
                    file_name = os.path.basename(wav_path)
                else:
                    skipped_count += 1
                    continue

            # Find and Parse Reference (.ort) File
            ort_path = find_ort_file(speaker_code, component, file_root, config)
            ort_segments = parse_ort_file(ort_path) if ort_path else None
            reference_parsed = ort_segments is not None
            reference_found_path = ort_path if ort_path else ""

            if reference_parsed:
                print(f"  Reference found and parsed: {os.path.basename(ort_path)}")
                ort_found_parsed_count +=1
            elif ort_path:
                print(f"  Reference file found ({os.path.basename(ort_path)}) but FAILED TO PARSE.")
                ort_found_not_parsed_count += 1
            else:
                print(f"  Reference .ort file NOT FOUND.")
                ort_not_found_count += 1

            # Transcribe with Whisper (using Hugging Face Pipeline)
            print(f"  Starting transcription with fine-tuned model...")

            try:
                pipeline_output = asr_pipeline(
                    wav_path,
                    chunk_length_s=30, # Recommended for long audio files
                    generate_kwargs={
                        "language": config['TRANSCRIPTION_LANGUAGE'], # Make sure this matches your fine-tuned model's language
                        "task": "transcribe"
                        # "forced_decoder_ids": None, # Keep if you didn't modify generation_config.json, otherwise optional
                        # "suppress_tokens": [], # Explicitly set if you want to override model's config
                        # "begin_suppress_tokens": [220, 50257] # Explicitly set if you want to override model's config
                    },
                    return_timestamps=True # Gets segment-level timestamps. Use "word" for word-level.
                )


                
                transcription = pipeline_output["text"]

                # Adapt whisper_segments generation
                # pipeline_output["chunks"] (with return_timestamps=True) is a list of dicts:
                # [{'text': 'segment text', 'timestamp': (start_time, end_time)}, ...]
                # 'words' will be empty as segment-level pipeline output doesn't provide them directly here.
                temp_hf_segments = []
                for chunk in pipeline_output.get("chunks", []): # .get("chunks", []) is safer
                    temp_hf_segments.append({
                        'text': chunk['text'],
                        'start': chunk['timestamp'][0],
                        'end': chunk['timestamp'][1],
                        'words': [] # Word-level details not directly available here
                    })
                # Pass it in the expected dict structure to format_whisper_segments
                whisper_segments = format_whisper_segments({"segments": temp_hf_segments})
                print(f"  Transcription finished.")
                # print(f"  DEBUG: Whisper Segments: {whisper_segments[:2]}") # For debugging
            except Exception as e:
                print(f"  ERROR during Whisper transcription: {e}")
                import traceback
                traceback.print_exc()
                skipped_count += 1
                continue

            # Prepare Data for WER analysis file
            wer_data = prepare_wer_data(whisper_segments, ort_segments if reference_parsed else [])

            # Save detailed JSON and simple TXT
            output_data = {
                'original_wav_path': wav_path,
                'speaker_code': speaker_code,
                'component': component,
                'file_root': file_root,
                'transcription': transcription,
                'whisper_segments': whisper_segments,
                'ort_segments': ort_segments if reference_parsed else [],
                'wer_data': wer_data,
                'reference_found_path': reference_found_path,
                'reference_parsed': reference_parsed,
                # The pipeline itself doesn't usually return 'language' in the main output dict
                # The language is part of the model's config or specified in generate_kwargs
                'language_used_for_generation': config['TRANSCRIPTION_LANGUAGE'],
            }
            save_results(config['OUTPUT_DIR'], file_base_name, output_data)

            # Collect summary info
            results_summary.append({
                "processed_wav_file": file_name,
                "speaker_code": speaker_code,
                "component": component,
                "file_root": file_root,
                "transcription": transcription,
                "reference_found_path": reference_found_path,
                "reference_parsed": reference_parsed,
                "ref_text_preview": (" ".join(s['text'] for s in ort_segments)[:80]+"..." if reference_parsed and ort_segments else ""),
                "num_whisper_segments": len(whisper_segments),
                "num_ort_segments": len(ort_segments) if reference_parsed and ort_segments else 0,
            })
            processed_count += 1

        except Exception as loop_error:
             print(f"!! Unexpected Error processing recording at index {index}: {loop_error}")
             print(f"   Row data: {row.to_dict()}")
             traceback.print_exc()
             skipped_count += 1

    # --- Group Summary ---
    end_time_group = time.time()
    total_time_group = end_time_group - start_time_group
    num_processed_successfully = processed_count

    print(f"\n--- {config['GROUP_NAME']} Processing Summary ---")
    print(f"Selected {len(recordings_to_process)} recordings for processing.")
    print(f"Successfully processed: {num_processed_successfully}")
    print(f"Skipped due to errors/missing files: {skipped_count}")
    print(f"Total processing time: {total_time_group:.2f} seconds")
    if num_processed_successfully > 0:
        print(f"Avg. time per processed file: {total_time_group / num_processed_successfully:.2f} seconds")

    print(f"\n--- {config['GROUP_NAME']} Reference File Summary ---")
    print(f"Found & Parsed: {ort_found_parsed_count}")
    print(f"Found but Failed to Parse: {ort_found_not_parsed_count}")
    print(f"Not Found: {ort_not_found_count}")
    if num_processed_successfully > 0:
        parse_success_rate = (ort_found_parsed_count / num_processed_successfully) * 100
        print(f"Parse Success Rate (of processed files): {parse_success_rate:.1f}%")

    # --- Save Group Summary CSV ---
    if results_summary:
        summary_df = pd.DataFrame(results_summary)
        csv_path = os.path.join(config['OUTPUT_DIR'], f"transcription_summary_{config['GROUP_NAME']}.csv")
        try:
            summary_df.to_csv(csv_path, index=False, encoding='utf-8-sig')
            print(f"\nSummary results for {config['GROUP_NAME']} saved to: {csv_path}")
        except Exception as e:
            print(f"\nError saving summary CSV for {config['GROUP_NAME']}: {e}")

    # --- Generate Group WER Analysis File ---
    generate_wer_analysis_file(results_summary, config['OUTPUT_DIR'], config['GROUP_NAME'])

    print(f"===== Finished Group: {config['GROUP_NAME']} =====")


if __name__ == "__main__":
    main()


--- Starting Transcription for Group: PureDutchChildren_7_11_Large_3_finetuned ---
Using device: cuda

All required input files and directories found.
Loaded 79 speaker codes from pure_dutch_children_7_11_codes.txt

--- Loading Recording Metadata ---
Successfully loaded recordings.txt with delimiter '\t', encoding 'utf-8'.
Successfully loaded recordings.txt with delimiter '\t', encoding 'utf-8'.
Combined 2 recording metadata file(s): 995 total entries.

--- Filtering Recordings for Group: PureDutchChildren_7_11_Large_3_finetuned ---
Filtering recordings for component: 'comp-q'
Found 78 recordings matching speakers and component.

--- Loading Fine-tuned Whisper Model from '/home/tdamen/whisper-finetuned-synthetic-only-large-v3' ---


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


Fine-tuned model loaded into ASR pipeline.
--- Processing all 78 recordings for PureDutchChildren_7_11_Large_3_finetuned ---


Transcribing PureDutchChildren_7_11_Large_3_finetuned:   0%|          | 0/78 [00:00<?, ?it/s]


Processing: fn000049.wav (Speaker: N000025, Comp: comp-q)
  Parsed 164 segments using simple timestamp format.
  Reference found and parsed: fn000049.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned:   1%|▏         | 1/78 [00:46<1:00:03, 46.80s/it]

  Transcription finished.

Processing: fn000051.wav (Speaker: N000026, Comp: comp-q)
  Parsed 218 segments using simple timestamp format.
  Reference found and parsed: fn000051.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned:   3%|▎         | 2/78 [02:03<1:21:15, 64.15s/it]

  Transcription finished.

Processing: fn000060.wav (Speaker: N000028, Comp: comp-q)
  Parsed 187 segments using simple timestamp format.
  Reference found and parsed: fn000060.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned:   4%|▍         | 3/78 [02:57<1:14:47, 59.83s/it]

  Transcription finished.

Processing: fn000062.wav (Speaker: N000029, Comp: comp-q)
  Parsed 235 segments using simple timestamp format.
  Reference found and parsed: fn000062.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned:   5%|▌         | 4/78 [04:14<1:21:54, 66.41s/it]

  Transcription finished.

Processing: fn000064.wav (Speaker: N000030, Comp: comp-q)
  Parsed 224 segments using simple timestamp format.
  Reference found and parsed: fn000064.ort
  Starting transcription with fine-tuned model...


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Transcribing PureDutchChildren_7_11_Large_3_finetuned:   6%|▋         | 5/78 [05:14<1:17:53, 64.02s/it]

  Transcription finished.

Processing: fn000066.wav (Speaker: N000031, Comp: comp-q)
  Parsed 244 segments using simple timestamp format.
  Reference found and parsed: fn000066.ort
  Starting transcription with fine-tuned model...


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Transcribing PureDutchChildren_7_11_Large_3_finetuned:   8%|▊         | 6/78 [06:08<1:12:52, 60.73s/it]

  Transcription finished.

Processing: fn000068.wav (Speaker: N000032, Comp: comp-q)
  Parsed 168 segments using simple timestamp format.
  Reference found and parsed: fn000068.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned:   9%|▉         | 7/78 [06:56<1:07:02, 56.65s/it]

  Transcription finished.

Processing: fn000070.wav (Speaker: N000033, Comp: comp-q)
  Parsed 136 segments using simple timestamp format.
  Reference found and parsed: fn000070.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned:  10%|█         | 8/78 [07:39<1:00:55, 52.23s/it]

  Transcription finished.

Processing: fn000073.wav (Speaker: N000034, Comp: comp-q)
  Parsed 167 segments using simple timestamp format.
  Reference found and parsed: fn000073.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned:  12%|█▏        | 9/78 [08:35<1:01:20, 53.34s/it]

  Transcription finished.

Processing: fn000074.wav (Speaker: N000027, Comp: comp-q)
  Parsed 176 segments using simple timestamp format.
  Reference found and parsed: fn000074.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned:  13%|█▎        | 10/78 [09:19<57:27, 50.70s/it] 

  Transcription finished.

Processing: fn000076.wav (Speaker: N000036, Comp: comp-q)
  Parsed 166 segments using simple timestamp format.
  Reference found and parsed: fn000076.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned:  14%|█▍        | 11/78 [10:16<58:30, 52.40s/it]

  Transcription finished.

Processing: fn000078.wav (Speaker: N000037, Comp: comp-q)
  Parsed 158 segments using simple timestamp format.
  Reference found and parsed: fn000078.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned:  15%|█▌        | 12/78 [11:20<1:01:45, 56.15s/it]

  Transcription finished.

Processing: fn000080.wav (Speaker: N000038, Comp: comp-q)
  Parsed 175 segments using simple timestamp format.
  Reference found and parsed: fn000080.ort
  Starting transcription with fine-tuned model...


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Transcribing PureDutchChildren_7_11_Large_3_finetuned:  17%|█▋        | 13/78 [12:25<1:03:39, 58.77s/it]

  Transcription finished.

Processing: fn000082.wav (Speaker: N000039, Comp: comp-q)
  Parsed 224 segments using simple timestamp format.
  Reference found and parsed: fn000082.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned:  18%|█▊        | 14/78 [13:41<1:08:09, 63.90s/it]

  Transcription finished.

Processing: fn000084.wav (Speaker: N000040, Comp: comp-q)
  Parsed 170 segments using simple timestamp format.
  Reference found and parsed: fn000084.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned:  19%|█▉        | 15/78 [15:01<1:12:09, 68.72s/it]

  Transcription finished.

Processing: fn000087.wav (Speaker: N000041, Comp: comp-q)
  Parsed 164 segments using simple timestamp format.
  Reference found and parsed: fn000087.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned:  21%|██        | 16/78 [16:27<1:16:31, 74.05s/it]

  Transcription finished.

Processing: fn000090.wav (Speaker: N000042, Comp: comp-q)
  Parsed 169 segments using simple timestamp format.
  Reference found and parsed: fn000090.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned:  22%|██▏       | 17/78 [17:35<1:13:20, 72.15s/it]

  Transcription finished.

Processing: fn000092.wav (Speaker: N000043, Comp: comp-q)
  Parsed 162 segments using simple timestamp format.
  Reference found and parsed: fn000092.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned:  23%|██▎       | 18/78 [18:38<1:09:24, 69.42s/it]

  Transcription finished.

Processing: fn000094.wav (Speaker: N000044, Comp: comp-q)
  Parsed 217 segments using simple timestamp format.
  Reference found and parsed: fn000094.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned:  24%|██▍       | 19/78 [20:02<1:12:28, 73.71s/it]

  Transcription finished.

Processing: fn000100.wav (Speaker: N000047, Comp: comp-q)
  Parsed 194 segments using simple timestamp format.
  Reference found and parsed: fn000100.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned:  26%|██▌       | 20/78 [21:08<1:08:58, 71.36s/it]

  Transcription finished.

Processing: fn000102.wav (Speaker: N000048, Comp: comp-q)
  Parsed 174 segments using simple timestamp format.
  Reference found and parsed: fn000102.ort
  Starting transcription with fine-tuned model...


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Transcribing PureDutchChildren_7_11_Large_3_finetuned:  27%|██▋       | 21/78 [22:29<1:10:45, 74.48s/it]

  Transcription finished.

Processing: fn000104.wav (Speaker: N000049, Comp: comp-q)
  Parsed 140 segments using simple timestamp format.
  Reference found and parsed: fn000104.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned:  28%|██▊       | 22/78 [23:40<1:08:28, 73.37s/it]

  Transcription finished.

Processing: fn000108.wav (Speaker: N000054, Comp: comp-q)
  Parsed 288 segments using simple timestamp format.
  Reference found and parsed: fn000108.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned:  29%|██▉       | 23/78 [25:16<1:13:27, 80.13s/it]

  Transcription finished.

Processing: fn000110.wav (Speaker: N000051, Comp: comp-q)
  Parsed 208 segments using simple timestamp format.
  Reference found and parsed: fn000110.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned:  31%|███       | 24/78 [26:40<1:13:01, 81.14s/it]

  Transcription finished.

Processing: fn000112.wav (Speaker: N000052, Comp: comp-q)
  Parsed 186 segments using simple timestamp format.
  Reference found and parsed: fn000112.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned:  32%|███▏      | 25/78 [27:50<1:08:46, 77.87s/it]

  Transcription finished.

Processing: fn000117.wav (Speaker: N000055, Comp: comp-q)
  Parsed 178 segments using simple timestamp format.
  Reference found and parsed: fn000117.ort
  Starting transcription with fine-tuned model...


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Transcribing PureDutchChildren_7_11_Large_3_finetuned:  33%|███▎      | 26/78 [28:47<1:02:01, 71.56s/it]

  Transcription finished.

Processing: fn000119.wav (Speaker: N000056, Comp: comp-q)
  Parsed 141 segments using simple timestamp format.
  Reference found and parsed: fn000119.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned:  35%|███▍      | 27/78 [29:36<55:04, 64.80s/it]  

  Transcription finished.

Processing: fn000121.wav (Speaker: N000057, Comp: comp-q)
  Parsed 124 segments using simple timestamp format.
  Reference found and parsed: fn000121.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned:  36%|███▌      | 28/78 [30:20<48:53, 58.66s/it]

  Transcription finished.

Processing: fn000123.wav (Speaker: N000058, Comp: comp-q)
  Parsed 137 segments using simple timestamp format.
  Reference found and parsed: fn000123.ort
  Starting transcription with fine-tuned model...


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Transcribing PureDutchChildren_7_11_Large_3_finetuned:  37%|███▋      | 29/78 [30:56<42:14, 51.72s/it]

  Transcription finished.

Processing: fn000125.wav (Speaker: N000059, Comp: comp-q)
  Parsed 119 segments using simple timestamp format.
  Reference found and parsed: fn000125.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned:  38%|███▊      | 30/78 [31:34<38:06, 47.63s/it]

  Transcription finished.

Processing: fn000127.wav (Speaker: N000060, Comp: comp-q)
  Parsed 145 segments using simple timestamp format.
  Reference found and parsed: fn000127.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned:  40%|███▉      | 31/78 [32:18<36:26, 46.52s/it]

  Transcription finished.

Processing: fn000129.wav (Speaker: N000061, Comp: comp-q)
  Parsed 215 segments using simple timestamp format.
  Reference found and parsed: fn000129.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned:  41%|████      | 32/78 [33:36<42:59, 56.07s/it]

  Transcription finished.

Processing: fn000133.wav (Speaker: N000062, Comp: comp-q)
  Parsed 140 segments using simple timestamp format.
  Reference found and parsed: fn000133.ort
  Starting transcription with fine-tuned model...


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Transcribing PureDutchChildren_7_11_Large_3_finetuned:  42%|████▏     | 33/78 [34:31<41:54, 55.88s/it]

  Transcription finished.

Processing: fn000135.wav (Speaker: N000063, Comp: comp-q)
  Parsed 142 segments using simple timestamp format.
  Reference found and parsed: fn000135.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned:  44%|████▎     | 34/78 [35:19<39:11, 53.44s/it]

  Transcription finished.

Processing: fn000137.wav (Speaker: N000064, Comp: comp-q)
  Parsed 166 segments using simple timestamp format.
  Reference found and parsed: fn000137.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned:  45%|████▍     | 35/78 [36:28<41:32, 57.97s/it]

  Transcription finished.

Processing: fn000143.wav (Speaker: N000066, Comp: comp-q)
  Parsed 112 segments using simple timestamp format.
  Reference found and parsed: fn000143.ort
  Starting transcription with fine-tuned model...


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Transcribing PureDutchChildren_7_11_Large_3_finetuned:  46%|████▌     | 36/78 [37:13<37:49, 54.03s/it]

  Transcription finished.

Processing: fn000145.wav (Speaker: N000067, Comp: comp-q)
  Parsed 210 segments using simple timestamp format.
  Reference found and parsed: fn000145.ort
  Starting transcription with fine-tuned model...


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Transcribing PureDutchChildren_7_11_Large_3_finetuned:  47%|████▋     | 37/78 [38:24<40:26, 59.19s/it]

  Transcription finished.

Processing: fn000149.wav (Speaker: N000070, Comp: comp-q)
  Parsed 205 segments using simple timestamp format.
  Reference found and parsed: fn000149.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned:  49%|████▊     | 38/78 [39:26<39:59, 59.99s/it]

  Transcription finished.

Processing: fn000151.wav (Speaker: N000069, Comp: comp-q)
  Parsed 117 segments using simple timestamp format.
  Reference found and parsed: fn000151.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned:  50%|█████     | 39/78 [40:20<37:52, 58.27s/it]

  Transcription finished.

Processing: fn000494.wav (Speaker: N000194, Comp: comp-q)
  Parsed 223 segments using simple timestamp format.
  Reference found and parsed: fn000494.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned:  51%|█████▏    | 40/78 [41:44<41:43, 65.88s/it]

  Transcription finished.

Processing: fn000497.wav (Speaker: N000195, Comp: comp-q)
  Parsed 236 segments using simple timestamp format.
  Reference found and parsed: fn000497.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned:  53%|█████▎    | 41/78 [43:08<43:59, 71.34s/it]

  Transcription finished.

Processing: fn000503.wav (Speaker: N000197, Comp: comp-q)
  Parsed 240 segments using simple timestamp format.
  Reference found and parsed: fn000503.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned:  54%|█████▍    | 42/78 [44:33<45:22, 75.62s/it]

  Transcription finished.

Processing: fn000510.wav (Speaker: N000199, Comp: comp-q)
  Parsed 182 segments using simple timestamp format.
  Reference found and parsed: fn000510.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned:  55%|█████▌    | 43/78 [45:47<43:49, 75.12s/it]

  Transcription finished.

Processing: fn000515.wav (Speaker: N000200, Comp: comp-q)
  Parsed 194 segments using simple timestamp format.
  Reference found and parsed: fn000515.ort
  Starting transcription with fine-tuned model...


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Transcribing PureDutchChildren_7_11_Large_3_finetuned:  56%|█████▋    | 44/78 [47:14<44:35, 78.68s/it]

  Transcription finished.

Processing: fn000518.wav (Speaker: N000201, Comp: comp-q)
  Parsed 225 segments using simple timestamp format.
  Reference found and parsed: fn000518.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned:  58%|█████▊    | 45/78 [48:25<42:02, 76.42s/it]

  Transcription finished.

Processing: fn000521.wav (Speaker: N000202, Comp: comp-q)
  Parsed 191 segments using simple timestamp format.
  Reference found and parsed: fn000521.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned:  59%|█████▉    | 46/78 [49:24<37:56, 71.13s/it]

  Transcription finished.

Processing: fn000524.wav (Speaker: N000203, Comp: comp-q)
  Parsed 253 segments using simple timestamp format.
  Reference found and parsed: fn000524.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned:  60%|██████    | 47/78 [50:52<39:22, 76.21s/it]

  Transcription finished.

Processing: fn000534.wav (Speaker: N000206, Comp: comp-q)
  Parsed 188 segments using simple timestamp format.
  Reference found and parsed: fn000534.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned:  62%|██████▏   | 48/78 [52:17<39:21, 78.72s/it]

  Transcription finished.

Processing: fn000537.wav (Speaker: N000207, Comp: comp-q)
  Parsed 183 segments using simple timestamp format.
  Reference found and parsed: fn000537.ort
  Starting transcription with fine-tuned model...


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Transcribing PureDutchChildren_7_11_Large_3_finetuned:  63%|██████▎   | 49/78 [53:33<37:41, 77.99s/it]

  Transcription finished.

Processing: fn000546.wav (Speaker: N000210, Comp: comp-q)
  Parsed 181 segments using simple timestamp format.
  Reference found and parsed: fn000546.ort
  Starting transcription with fine-tuned model...


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Transcribing PureDutchChildren_7_11_Large_3_finetuned:  64%|██████▍   | 50/78 [54:42<35:06, 75.23s/it]

  Transcription finished.

Processing: fn000549.wav (Speaker: N000211, Comp: comp-q)
  Parsed 254 segments using simple timestamp format.
  Reference found and parsed: fn000549.ort
  Starting transcription with fine-tuned model...


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Transcribing PureDutchChildren_7_11_Large_3_finetuned:  65%|██████▌   | 51/78 [56:00<34:12, 76.02s/it]

  Transcription finished.

Processing: fn000555.wav (Speaker: N000213, Comp: comp-q)
  Parsed 184 segments using simple timestamp format.
  Reference found and parsed: fn000555.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned:  67%|██████▋   | 52/78 [57:03<31:20, 72.31s/it]

  Transcription finished.

Processing: fn000558.wav (Speaker: N000214, Comp: comp-q)
  Parsed 193 segments using simple timestamp format.
  Reference found and parsed: fn000558.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned:  68%|██████▊   | 53/78 [58:45<33:49, 81.20s/it]

  Transcription finished.

Processing: fv160205.wav (Speaker: V000215, Comp: comp-q)
  Parsed 141 segments using simple timestamp format.
  Reference found and parsed: fv160205.ort
  Starting transcription with fine-tuned model...


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


  Transcription finished.

Processing: fv170058.wav (Speaker: V000072, Comp: comp-q)
  Parsed 112 segments using simple timestamp format.
  Reference found and parsed: fv170058.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned:  71%|███████   | 55/78 [1:00:03<22:48, 59.52s/it]

  Transcription finished.

Processing: fv170061.wav (Speaker: V000074, Comp: comp-q)
  Parsed 145 segments using simple timestamp format.
  Reference found and parsed: fv170061.ort
  Starting transcription with fine-tuned model...


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


  Transcription finished.

Processing: fv170064.wav (Speaker: V000077, Comp: comp-q)
  Parsed 115 segments using simple timestamp format.
  Reference found and parsed: fv170064.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned:  73%|███████▎  | 57/78 [1:01:51<19:42, 56.32s/it]

  Transcription finished.

Processing: fv170073.wav (Speaker: V000086, Comp: comp-q)
  Parsed 118 segments using simple timestamp format.
  Reference found and parsed: fv170073.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned:  74%|███████▍  | 58/78 [1:02:08<14:48, 44.44s/it]

  Transcription finished.

Processing: fv170074.wav (Speaker: V000087, Comp: comp-q)
  Parsed 215 segments using simple timestamp format.
  Reference found and parsed: fv170074.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned:  76%|███████▌  | 59/78 [1:03:01<14:55, 47.12s/it]

  Transcription finished.

Processing: fv170075.wav (Speaker: V000088, Comp: comp-q)
  Parsed 199 segments using simple timestamp format.
  Reference found and parsed: fv170075.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned:  77%|███████▋  | 60/78 [1:03:53<14:31, 48.42s/it]

  Transcription finished.

Processing: fv170076.wav (Speaker: V000089, Comp: comp-q)
  Parsed 146 segments using simple timestamp format.
  Reference found and parsed: fv170076.ort
  Starting transcription with fine-tuned model...


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Transcribing PureDutchChildren_7_11_Large_3_finetuned:  78%|███████▊  | 61/78 [1:04:32<12:54, 45.57s/it]

  Transcription finished.

Processing: fv170077.wav (Speaker: V000090, Comp: comp-q)
  Parsed 173 segments using simple timestamp format.
  Reference found and parsed: fv170077.ort
  Starting transcription with fine-tuned model...


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Transcribing PureDutchChildren_7_11_Large_3_finetuned:  79%|███████▉  | 62/78 [1:05:24<12:40, 47.51s/it]

  Transcription finished.

Processing: fv170078.wav (Speaker: V000091, Comp: comp-q)
  Parsed 174 segments using simple timestamp format.
  Reference found and parsed: fv170078.ort
  Starting transcription with fine-tuned model...


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Transcribing PureDutchChildren_7_11_Large_3_finetuned:  81%|████████  | 63/78 [1:06:11<11:52, 47.49s/it]

  Transcription finished.

Processing: fv170079.wav (Speaker: V000092, Comp: comp-q)
  Parsed 165 segments using simple timestamp format.
  Reference found and parsed: fv170079.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned:  82%|████████▏ | 64/78 [1:07:10<11:54, 51.03s/it]

  Transcription finished.

Processing: fv170080.wav (Speaker: V000093, Comp: comp-q)
  Parsed 124 segments using simple timestamp format.
  Reference found and parsed: fv170080.ort
  Starting transcription with fine-tuned model...


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Transcribing PureDutchChildren_7_11_Large_3_finetuned:  83%|████████▎ | 65/78 [1:07:42<09:47, 45.23s/it]

  Transcription finished.

Processing: fv170081.wav (Speaker: V000094, Comp: comp-q)
  Parsed 198 segments using simple timestamp format.
  Reference found and parsed: fv170081.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned:  85%|████████▍ | 66/78 [1:08:51<10:27, 52.28s/it]

  Transcription finished.

Processing: fv170083.wav (Speaker: V000096, Comp: comp-q)
  Parsed 161 segments using simple timestamp format.
  Reference found and parsed: fv170083.ort
  Starting transcription with fine-tuned model...


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Transcribing PureDutchChildren_7_11_Large_3_finetuned:  86%|████████▌ | 67/78 [1:10:08<10:56, 59.68s/it]

  Transcription finished.

Processing: fv170084.wav (Speaker: V000097, Comp: comp-q)
  Parsed 147 segments using simple timestamp format.
  Reference found and parsed: fv170084.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned:  87%|████████▋ | 68/78 [1:11:20<10:34, 63.48s/it]

  Transcription finished.

Processing: fv170101.wav (Speaker: V000115, Comp: comp-q)
  Parsed 125 segments using simple timestamp format.
  Reference found and parsed: fv170101.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned:  88%|████████▊ | 69/78 [1:12:14<09:06, 60.73s/it]

  Transcription finished.

Processing: fv170107.wav (Speaker: V000121, Comp: comp-q)
  Parsed 160 segments using simple timestamp format.
  Reference found and parsed: fv170107.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned:  90%|████████▉ | 70/78 [1:13:05<07:40, 57.62s/it]

  Transcription finished.

Processing: fv170110.wav (Speaker: V000124, Comp: comp-q)
  Parsed 161 segments using simple timestamp format.
  Reference found and parsed: fv170110.ort
  Starting transcription with fine-tuned model...


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Transcribing PureDutchChildren_7_11_Large_3_finetuned:  91%|█████████ | 71/78 [1:13:40<05:55, 50.83s/it]

  Transcription finished.

Processing: fv170114.wav (Speaker: V000128, Comp: comp-q)
  Parsed 185 segments using simple timestamp format.
  Reference found and parsed: fv170114.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned:  92%|█████████▏| 72/78 [1:14:23<04:51, 48.64s/it]

  Transcription finished.

Processing: fv170115.wav (Speaker: V000129, Comp: comp-q)
  Parsed 153 segments using simple timestamp format.
  Reference found and parsed: fv170115.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned:  94%|█████████▎| 73/78 [1:15:22<04:17, 51.56s/it]

  Transcription finished.

Processing: fv170120.wav (Speaker: V000131, Comp: comp-q)
  Parsed 125 segments using simple timestamp format.
  Reference found and parsed: fv170120.ort
  Starting transcription with fine-tuned model...


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Transcribing PureDutchChildren_7_11_Large_3_finetuned:  95%|█████████▍| 74/78 [1:15:59<03:08, 47.17s/it]

  Transcription finished.

Processing: fv170128.wav (Speaker: V000138, Comp: comp-q)
  Parsed 133 segments using simple timestamp format.
  Reference found and parsed: fv170128.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned:  96%|█████████▌| 75/78 [1:16:52<02:27, 49.09s/it]

  Transcription finished.

Processing: fv170132.wav (Speaker: V000142, Comp: comp-q)
  Parsed 150 segments using simple timestamp format.
  Reference found and parsed: fv170132.ort
  Starting transcription with fine-tuned model...


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Transcribing PureDutchChildren_7_11_Large_3_finetuned:  97%|█████████▋| 76/78 [1:17:50<01:43, 51.83s/it]

  Transcription finished.

Processing: fv170196.wav (Speaker: V000222, Comp: comp-q)
  Parsed 155 segments using simple timestamp format.
  Reference found and parsed: fv170196.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned:  99%|█████████▊| 77/78 [1:19:01<00:57, 57.60s/it]

  Transcription finished.

Processing: fv170198.wav (Speaker: V000221, Comp: comp-q)
  Parsed 129 segments using simple timestamp format.
  Reference found and parsed: fv170198.ort
  Starting transcription with fine-tuned model...


Transcribing PureDutchChildren_7_11_Large_3_finetuned: 100%|██████████| 78/78 [1:20:26<00:00, 61.88s/it]

  Transcription finished.

--- PureDutchChildren_7_11_Large_3_finetuned Processing Summary ---
Selected 78 recordings for processing.
Successfully processed: 78
Skipped due to errors/missing files: 0
Total processing time: 4826.96 seconds
Avg. time per processed file: 61.88 seconds

--- PureDutchChildren_7_11_Large_3_finetuned Reference File Summary ---
Found & Parsed: 78
Found but Failed to Parse: 0
Not Found: 0
Parse Success Rate (of processed files): 100.0%

Summary results for PureDutchChildren_7_11_Large_3_finetuned saved to: /home/tdamen/whisper_transcriptions/PureDutchChildren_7_11_Large_3_finetuned/transcription_summary_PureDutchChildren_7_11_Large_3_finetuned.csv
Generating WER analysis file for 'PureDutchChildren_7_11_Large_3_finetuned': /home/tdamen/whisper_transcriptions/PureDutchChildren_7_11_Large_3_finetuned/wer_analysis_PureDutchChildren_7_11_Large_3_finetuned.txt
WER analysis file generated with 19 entries.
===== Finished Group: PureDutchChildren_7_11_Large_3_finetuned


