In [5]:
import json
import os
import re
import sys
from tqdm.notebook import tqdm 
import numpy as np 

# --- Normalization and helper functions ---
def normalize_jasmin_reference(text):
    """
    Normalize a JASMIN reference transcript so it can be scored against ASR output.

    The text is upper-cased, annotation markers and non-speech tokens are
    removed, immediate word repetitions and stutter fragments are collapsed,
    punctuation (except apostrophes) is dropped and whitespace is squeezed.

    Args:
        text: Raw reference transcript. ``None`` or an empty string is allowed.

    Returns:
        The normalized, upper-cased transcript; ``""`` when nothing is left
        or when ``text`` is empty/``None``.

    NOTE(review): the filler list below also removes JA / NEE / OKÉ / AH,
    while ``normalize_hypothesis`` keeps them. Confirm this asymmetry is
    intended for the WER computation.
    """
    if not text:
        return ""
    text = text.upper()

    # Tier/speaker labels that leak into the transcript text.
    text = re.sub(r'INTERVALTIER|INTERVIEWSTER|GEÏNTERVIEWDE', '', text)
    # Marker codes written as "*" + letter (*A, *U, *F, *S, *Z, ...); the
    # word the marker is attached to is kept.
    text = re.sub(r'\*[A-Z]', '', text)
    # Hesitations.
    text = re.sub(r'\b(?:UH|UHM)\b', '', text)
    # Immediate repetitions of the same word ("DE DE") collapse to one.
    text = re.sub(r'\b(\w+)(\s+\1)+\b', r'\1', text)

    # Stutters: drop a fragment of 3+ characters when the next word starts
    # with it and is longer ("NEDER NEDERLAND" -> "NEDERLAND"). The index is
    # not advanced after a pop so chained fragments are handled as well.
    words = text.split()
    i = 0
    while i < len(words) - 1:
        if len(words[i]) >= 3 and words[i+1].startswith(words[i]) and len(words[i]) < len(words[i+1]):
            words.pop(i)
        else:
            i += 1
    text = ' '.join(words)

    # --- Non-speech annotations ---
    # Phrase-level annotations must be removed BEFORE the generic token
    # rules: those rules strip the trailing XXX, after which these phrases
    # could never match and their remaining words stayed in the reference.
    text = re.sub(r'U MAG MU XXX', '', text)
    text = re.sub(r'CETAIT PAS XXX', '', text)
    text = re.sub(r'\b(GGG|MMM|XXX|PFF|UHU|AH|JA|NEE|OKÉ)\b', '', text)  # specific annotations/fillers
    text = re.sub(r'^TEKST INLEIDING$', '', text)  # segment that is only this instruction
    text = re.sub(r'\b([A-Z])\1{2,}\b', '', text)  # 3+ repetitions of one letter, e.g. GGG
    text = re.sub(r'^[A-Z]$', '', text)  # segment that is a single capital letter
    # Word boundaries keep longer words that merely contain "POEH" intact.
    text = re.sub(r'\bPOEH\b', '', text)
    text = re.sub(r'\s*\*\*\*\s*', ' ', text)

    # Remove punctuation except for apostrophes.
    text = re.sub(r'[^\w\s\']', '', text)
    # Squeeze whitespace and trim both ends.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def normalize_hypothesis(text):
    """
    Normalize hypothesis text (Whisper output) for comparison with the reference.

    Upper-cases the text and then applies, in order: replacement of "***"
    (with any surrounding whitespace) by one space, removal of every
    character that is not a word character, whitespace or an apostrophe,
    and collapsing of whitespace runs. The result is stripped.

    Returns "" for empty or None input.
    """
    if not text:
        return ""

    # (pattern, replacement) pairs; the order matters.
    substitutions = (
        (r'\s*\*\*\*\s*', ' '),   # "***" placeholder
        (r'[^\w\s\']', ''),       # punctuation except apostrophes
        (r'\s+', ' '),            # whitespace runs
    )
    cleaned = text.upper()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

def process_file(json_file_path, ref_output_path, hyp_output_path):
    """
    Append one "<file_id> <text>" line for a JSON file to the ref and hyp outputs.

    The normalized texts of all 'ort_segments' are concatenated into the
    reference line and those of all 'whisper_segments' into the hypothesis
    line. When 'reference_parsed' is truthy and a maximum reference 'end'
    time could be determined, only Whisper segments whose 'start' is at or
    before that time are kept: a segment without 'start' is dropped, a
    segment with an unparsable 'start' is kept (with a warning).

    Returns 1 when both lines were written, 0 when the JSON could not be
    loaded or the output files could not be written.
    """
    try:
        with open(json_file_path, 'r', encoding='utf-8') as handle:
            data = json.load(handle)
    except Exception as e:
        print(f"Error loading/decoding JSON {json_file_path}: {e}", file=sys.stderr)
        return 0

    # Derive the ID from the file name; fall back to the bare stem.
    base_name = os.path.basename(json_file_path)
    id_match = re.match(r"([NV]\d+)(?:_comp-q)?_(fv\d+|fn\d+)", base_name)
    if id_match is None:
        file_id = os.path.splitext(base_name)[0]
        print(f"Warning: Using fallback file ID for {base_name}: {file_id}", file=sys.stderr)
    else:
        file_id = "_".join(id_match.group(1, 2))

    # --- Reference side: collect text and the latest segment end time ---
    ort_segments = data.get('ort_segments', [])
    reference_parsed = data.get('reference_parsed', False)
    ref_parts = []
    max_ref_time = None

    if reference_parsed and not ort_segments:
        print(f"  Warning: reference_parsed=True but ort_segments is empty for {file_id}. Using full hypothesis.")
        reference_parsed = False
    elif reference_parsed:
        for seg in ort_segments:
            raw_ref = seg.get('text', '')
            cleaned_ref = normalize_jasmin_reference(raw_ref) if raw_ref else ""
            if cleaned_ref:
                ref_parts.append(cleaned_ref)

            raw_end = seg.get('end')
            if raw_end is None:
                continue
            try:
                end_seconds = float(raw_end)
            except (ValueError, TypeError):
                print(f"Warning: Invalid end time '{seg.get('end')}' in ORT segment for {file_id}. Skipping for max_ref_time.", file=sys.stderr)
                continue
            if max_ref_time is None or end_seconds > max_ref_time:
                max_ref_time = end_seconds

        if max_ref_time is None:
            # No usable end time: fall back to the unfiltered hypothesis.
            print(f"  Warning: Could not determine max reference end time for {file_id} despite reference_parsed=True. Using full hypothesis.")
            reference_parsed = False
        else:
            print(f"  Info: Max reference end time for {file_id}: {max_ref_time:.3f}s")

    full_ref_norm = ' '.join(ref_parts)

    # --- Hypothesis side: optionally cut off at the reference end time ---
    apply_cutoff = bool(reference_parsed) and max_ref_time is not None
    hyp_parts = []
    for seg in data.get('whisper_segments', []):
        raw_hyp = seg.get('text', '')
        if not raw_hyp:
            continue
        if apply_cutoff:
            try:
                start = seg.get('start')
                within_reference = start is not None and float(start) <= max_ref_time
            except (ValueError, TypeError):
                print(f"Warning: Invalid start time '{seg.get('start')}' in Whisper segment for {file_id}. Including segment by default.", file=sys.stderr)
                within_reference = True
            if not within_reference:
                continue
        cleaned_hyp = normalize_hypothesis(raw_hyp)
        if cleaned_hyp:
            hyp_parts.append(cleaned_hyp)

    full_hyp_norm = ' '.join(hyp_parts)

    # --- One line per file, written even when the text ended up empty, so
    # the ref and hyp files keep corresponding lines ---
    try:
        with open(ref_output_path, 'a', encoding='utf-8') as f_ref, \
             open(hyp_output_path, 'a', encoding='utf-8') as f_hyp:
            f_ref.write(f"{file_id} {full_ref_norm}\n")
            f_hyp.write(f"{file_id} {full_hyp_norm}\n")
    except Exception as e:
        print(f"An error occurred during writing for {json_file_path}: {e}", file=sys.stderr)
        return 0  # Indicate failure

    return 1

# Notebook progress marker, printed once the definitions above have run.
print("Main processing function 'process_file' REVISED for file-level concatenation AND HYPOTHESIS FILTERING.")

# --- Configuration ---
# Every path below is built relative to this base directory.
notebook_base_dir = "."

# Separate folder name so these results are not confused with other runs.
output_folder_name = "kaldi_formatted_output_filtered/large3"
output_base_dir = os.path.join(notebook_base_dir, output_folder_name)

# --- Group 1: Dutch Speakers ---
# The source JSONs live one level above the base directory.
json_input_dir_nl = os.path.join(notebook_base_dir, "../whisper_transcriptions/PureDutchChildren_7_11_Large_3_finetuned")
ref_output_file_nl_unsorted, hyp_output_file_nl_unsorted = (
    os.path.join(output_base_dir, file_name)
    for file_name in (
        "ref_nl_large3_finetuned_unsorted.txt",
        "hyp_nl_large3_finetuned_unsorted.txt",
    )
)

# --- Group 2: French Speakers ---
json_input_dir_fr = os.path.join(notebook_base_dir, "../whisper_transcriptions/PureFrenchChildren_7_11_Large_3_finetuned")
ref_output_file_fr_unsorted, hyp_output_file_fr_unsorted = (
    os.path.join(output_base_dir, file_name)
    for file_name in (
        "ref_fr_large3_finetuned_unsorted.txt",
        "hyp_fr_large3_finetuned_unsorted.txt",
    )
)

# --- Function to run processing for a group (includes sorting) ---
def _sorted_output_path(unsorted_path):
    """Return unsorted_path with "_unsorted" removed from the file name only."""
    directory, file_name = os.path.split(unsorted_path)
    return os.path.join(directory, file_name.replace("_unsorted", ""))


def _write_sorted_by_id(src_path, dst_path):
    """Write the lines of src_path to dst_path, ordered by their first space-delimited field.

    A missing src_path yields an empty dst_path. All lines are read before
    dst_path is opened, so src_path and dst_path may be the same file.
    """
    lines = []
    if os.path.exists(src_path):
        with open(src_path, 'r', encoding='utf-8') as src:
            lines = [line.rstrip('\n') for line in src]
    # Python compares str by code point, which equals the byte order of the
    # UTF-8 encoding, i.e. the order `LC_ALL=C sort -k1,1` produces (key
    # field first, whole line as tie-breaker).
    lines.sort(key=lambda line: (line.partition(' ')[0], line))
    with open(dst_path, 'w', encoding='utf-8') as dst:
        dst.writelines(f"{line}\n" for line in lines)


def run_group_processing(group_name, json_dir, ref_out_unsorted, hyp_out_unsorted):
    """
    Convert every ``*.json`` file in ``json_dir`` into sorted ref/hyp text files.

    Each JSON file is passed to ``process_file``, which appends to the two
    unsorted files. Afterwards both files are sorted by their first field
    into sibling files whose names have "_unsorted" removed, and the unsorted
    intermediates are deleted. Existing outputs are cleared first.

    Args:
        group_name: Label used in progress and log messages.
        json_dir: Directory containing the input JSON files.
        ref_out_unsorted: Path of the intermediate reference file.
        hyp_out_unsorted: Path of the intermediate hypothesis file.

    Returns:
        None. Problems are reported on stderr; the function returns early
        when the input directory is missing or cannot be listed, or when an
        output directory cannot be created.
    """
    print(f"\n--- Processing Group: {group_name} ---")
    print(f"Input JSON directory: {json_dir}")
    print(f"Output Ref file (unsorted): {ref_out_unsorted}")
    print(f"Output Hyp file (unsorted): {hyp_out_unsorted}")

    if not os.path.isdir(json_dir):
        print(f"Error: Input directory not found: {json_dir}", file=sys.stderr)
        print(f"Skipping processing for {group_name}.")
        return

    # Make sure the directory of BOTH outputs exists (dict.fromkeys keeps the
    # order and drops the duplicate when they share a directory).
    output_dirs = dict.fromkeys(
        os.path.dirname(path) for path in (ref_out_unsorted, hyp_out_unsorted)
    )
    for output_dir in output_dirs:
        if output_dir and not os.path.exists(output_dir):
            try:
                os.makedirs(output_dir, exist_ok=True)
                print(f"Created output directory: {output_dir}")
            except OSError as e:
                print(f"Error creating output directory {output_dir}: {e}", file=sys.stderr)
                return

    # Only the file name is rewritten, so a directory that happens to contain
    # "_unsorted" no longer redirects the sorted output elsewhere.
    ref_out_sorted = _sorted_output_path(ref_out_unsorted)
    hyp_out_sorted = _sorted_output_path(hyp_out_unsorted)
    for stale_path in (ref_out_unsorted, hyp_out_unsorted, ref_out_sorted, hyp_out_sorted):
        if os.path.exists(stale_path):
            os.remove(stale_path)
    print(f"Cleared existing output files for {group_name}.")

    try:
        json_files = sorted(f for f in os.listdir(json_dir) if f.endswith(".json"))
    except OSError:
        # OSError also covers PermissionError, which the message refers to.
        print(f"Error: Cannot list files in {json_dir}. Check permissions.", file=sys.stderr)
        return
    if not json_files:
        print(f"Warning: No JSON files found in {json_dir}", file=sys.stderr)

    print(f"Found {len(json_files)} JSON files for {group_name}. Starting processing...")
    total_files_processed = 0
    for filename in tqdm(json_files, desc=f"Processing {group_name}"):
        json_path = os.path.join(json_dir, filename)
        # process_file returns 1 on success and 0 on failure.
        total_files_processed += process_file(json_path, ref_out_unsorted, hyp_out_unsorted)
    print(f"Finished processing {group_name}. Total files processed: {total_files_processed}")

    print(f"Sorting output files for {group_name}...")
    try:
        # Sorting is done in Python rather than through a shell command: no
        # quoting problems with unusual paths, no platform-specific `sort`,
        # and a failure raises here instead of being silently ignored.
        _write_sorted_by_id(ref_out_unsorted, ref_out_sorted)
        _write_sorted_by_id(hyp_out_unsorted, hyp_out_sorted)
        print(f"Created sorted files: {ref_out_sorted} and {hyp_out_sorted}")

        # Remove intermediate unsorted files if they exist. When a path had
        # no "_unsorted" part, source and target are the same file and it
        # must be kept.
        print(f"Removing intermediate unsorted files for {group_name}...")
        for unsorted_path, sorted_path in (
            (ref_out_unsorted, ref_out_sorted),
            (hyp_out_unsorted, hyp_out_sorted),
        ):
            if unsorted_path != sorted_path and os.path.exists(unsorted_path):
                os.remove(unsorted_path)

    except (OSError, UnicodeError) as e:
        print(f"Error sorting or removing files for {group_name}: {e}", file=sys.stderr)
        print("Please check the files manually.")

    print("-" * 30)


# --- Execute for all groups ---
# Run the same pipeline for every speaker group, in this order.
for group_label, group_json_dir, group_ref_path, group_hyp_path in (
    ("Dutch Speakers", json_input_dir_nl, ref_output_file_nl_unsorted, hyp_output_file_nl_unsorted),
    ("French Speakers", json_input_dir_fr, ref_output_file_fr_unsorted, hyp_output_file_fr_unsorted),
):
    run_group_processing(group_label, group_json_dir, group_ref_path, group_hyp_path)

# Closing reminders for whoever runs the notebook.
for closing_message in (
    "\nScript finished.",
    f"IMPORTANT: Check the final sorted .txt files in '{output_folder_name}'.",
    "Each file should now contain one line per original JSON file processed, with *** removed.",
    "Use these final sorted files with compute-wer or Karwi analysis.",
):
    print(closing_message)



Main processing function 'process_file' REVISED for file-level concatenation AND HYPOTHESIS FILTERING.

--- Processing Group: Dutch Speakers ---
Input JSON directory: ./../whisper_transcriptions/PureDutchChildren_7_11_Large_3_finetuned
Output Ref file (unsorted): ./kaldi_formatted_output_filtered/large3/ref_nl_large3_finetuned_unsorted.txt
Output Hyp file (unsorted): ./kaldi_formatted_output_filtered/large3/hyp_nl_large3_finetuned_unsorted.txt
Cleared existing output files for Dutch Speakers.
Found 78 JSON files for Dutch Speakers. Starting processing...


Processing Dutch Speakers:   0%|          | 0/78 [00:00<?, ?it/s]

  Info: Max reference end time for N000025_fn000049: 460.538s
  Info: Max reference end time for N000026_fn000051: 639.572s
  Info: Max reference end time for N000027_fn000074: 433.625s
  Info: Max reference end time for N000028_fn000060: 401.276s
  Info: Max reference end time for N000029_fn000062: 541.946s
  Info: Max reference end time for N000030_fn000064: 487.742s
  Info: Max reference end time for N000031_fn000066: 561.143s
  Info: Max reference end time for N000032_fn000068: 402.875s
  Info: Max reference end time for N000033_fn000070: 303.053s
  Info: Max reference end time for N000034_fn000073: 381.539s
  Info: Max reference end time for N000036_fn000076: 370.815s
  Info: Max reference end time for N000037_fn000078: 384.000s
  Info: Max reference end time for N000038_fn000080: 394.794s
  Info: Max reference end time for N000039_fn000082: 572.632s
  Info: Max reference end time for N000040_fn000084: 401.630s
  Info: Max reference end time for N000041_fn000087: 395.136s
  Info: 

Processing French Speakers:   0%|          | 0/26 [00:00<?, ?it/s]

  Info: Max reference end time for V000070_fv170059: 392.825s
  Info: Max reference end time for V000112_fv170099: 306.394s
  Info: Max reference end time for V000113_fv170098: 307.500s
  Info: Max reference end time for V000127_fv170113: 274.950s
  Info: Max reference end time for V000130_fv170116: 213.500s
  Info: Max reference end time for V000132_fv170121: 430.000s
  Info: Max reference end time for V000133_fv170119: 447.951s
  Info: Max reference end time for V000134_fv170124: 294.500s
  Info: Max reference end time for V000135_fv170125: 270.616s
  Info: Max reference end time for V000136_fv170126: 368.500s
  Info: Max reference end time for V000137_fv170127: 329.000s
  Info: Max reference end time for V000141_fv170131: 357.929s
  Info: Max reference end time for V000168_fv170154: 337.860s
  Info: Max reference end time for V000173_fv170159: 282.563s
  Info: Max reference end time for V000193_fv170174: 329.153s
  Info: Max reference end time for V000194_fv170165: 283.500s
  Info: 