In [9]:
import os
import pandas as pd
import numpy as np
import traceback
import sys # For sys.exit

# --- Language codes and age window of the target child cohort ---
LANG_DUT = 'dut'
LANG_FRA = 'fra'
CHILD_MIN_AGE = 7
CHILD_MAX_AGE = 11

# --- Resolve the folder that relative paths are anchored to ---
# __file__ is undefined in interactive sessions; fall back to the CWD there.
try:
    script_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    print("Warning: __file__ not defined. Using current working directory for relative paths.")
    script_dir = os.getcwd()

# The 'jasmin-data' folder is expected to sit next to the script.
base_data_path = os.path.join(script_dir, "jasmin-data/Data/data/meta/text")
# Folder that receives the generated speaker-code lists.
output_base_dir = os.path.join(script_dir, 'output_codes')

# Speaker metadata files that are read.
input_files = {
    'nl_speakers': os.path.join(base_data_path, "nl/speakers.txt"),
    'vl_speakers': os.path.join(base_data_path, "vl/speakers.txt"),
}
# Speaker-code lists that are written (French group: L1=fra, L2=fra).
output_files = {
    'pure_dutch_children_codes': os.path.join(output_base_dir, 'pure_dutch_children_7_11_codes.txt'),
    'pure_french_children_codes': os.path.join(output_base_dir, 'pure_french_children_7_11_codes.txt'),
}
# Single lookup table for every path used by the script.
paths = {'input': input_files, 'output': output_files}

# --- Abort early when an input file is absent ---
input_paths_to_check = [*paths['input'].values()]
missing_paths = [candidate for candidate in input_paths_to_check
                 if not os.path.exists(candidate)]

if not missing_paths:
    print("\nAll required input speaker files found.")
else:
    print("\nError: Required input speaker data files not found:")
    for missing_file in missing_paths:
        print(f"- {os.path.abspath(missing_file)}")
    print("Please check the paths and ensure the 'jasmin-data' folder structure is correct relative to the script.")
    sys.exit(1)

# --- Create the output folder on first use ---
if not os.path.exists(output_base_dir):
    try:
        os.makedirs(output_base_dir)
    except OSError as mkdir_error:
        print(f"Error creating output directory {output_base_dir}: {mkdir_error}")
        sys.exit(1)
    else:
        print(f"Created output directory: {output_base_dir}")

# --- Data Loading Function (Robust Version) ---
def _first_column_has_speaker_ids(df):
    """Return True when any value of the first column starts with 'N' or 'V' followed by a digit."""
    first_col_name = df.columns[0]
    if not first_col_name:
        return False
    return bool(df[first_col_name].astype(str).str.match(r'^[NV]\d+').any())


def load_data_with_delimiters(file_path, potential_delimiters=('\t', r'\s+'), encoding='ISO-8859-1', expected_cols=None):
    """Load a delimited text file, trying several delimiter/encoding combinations.

    Every encoding (UTF-8 first, then ``encoding``) is combined with every
    delimiter until one parse is accepted. A parse is accepted when

    * the frame is not empty,
    * all ``expected_cols`` are present (or, when ``expected_cols`` is not
      given, more than one column was found), and
    * the first column contains at least one speaker-ID-like value
      (``N``/``V`` followed by digits).

    Args:
        file_path: Path of the file to read.
        potential_delimiters: Iterable of separators passed to
            ``pandas.read_csv``. Single-character separators are tried before
            multi-character (regex) ones, each group in the given order: a
            regex such as ``\\s+`` also matches tabs and would merge empty
            tab-separated fields, so the exact separator must win when both
            fit.
        encoding: Fallback encoding tried after UTF-8.
        expected_cols: Optional column names that must all be present
            (compared after stripping whitespace from the header).

    Returns:
        The loaded ``DataFrame`` with stripped column names, or ``None`` when
        no combination produced an acceptable parse. Errors are reported on
        stdout, never raised.
    """
    last_exception = None
    # UTF-8 first; dict.fromkeys removes a repeated encoding but keeps order.
    encodings_to_try = list(dict.fromkeys(['utf-8', encoding]))

    def _is_regex(delim):
        # pandas treats separators longer than one character as regexes.
        return isinstance(delim, str) and len(delim) > 1

    delimiters = list(potential_delimiters)
    ordered_delimiters = ([d for d in delimiters if not _is_regex(d)]
                          + [d for d in delimiters if _is_regex(d)])

    for enc in encodings_to_try:
        for delim_raw in ordered_delimiters:
            # comment='#' ignores potential comment lines in the data file.
            read_kwargs = {
                'sep': delim_raw,
                'encoding': enc,
                'on_bad_lines': 'warn',
                'skipinitialspace': True,
                'comment': '#',
                'skip_blank_lines': True,
            }
            if _is_regex(delim_raw):
                # Regex separators need the python engine, which rejects the
                # 'low_memory' option with a ValueError, so it is left out.
                read_kwargs['engine'] = 'python'
            else:
                read_kwargs['low_memory'] = False

            try:
                df = pd.read_csv(file_path, **read_kwargs)
                if df.empty:
                    continue

                # Clean column names before comparing them.
                df.columns = df.columns.str.strip()

                if expected_cols:
                    layout_ok = all(col in df.columns for col in expected_cols)
                    detail = ""
                else:
                    layout_ok = df.shape[1] > 1
                    detail = f" ({df.shape[1]} columns found)"

                if layout_ok and _first_column_has_speaker_ids(df):
                    print(f"Successfully loaded {os.path.basename(file_path)} with delimiter "
                          f"{delim_raw!r}, encoding '{enc}'{detail}.")
                    return df
            except Exception as exc:
                # Deliberate best effort: any failure (decode error, parser
                # error, missing file, ...) just moves on to the next attempt;
                # the last one is reported below.
                last_exception = exc

    print(f"Error: Could not successfully load file {file_path} with any specified delimiter/encoding.")
    if last_exception is not None:
        print(f"Last error encountered: {last_exception}")
    return None


# --- Speaker Identification Function (Pure Dutch & Pure French Children 7-11) ---
def identify_pure_language_children(df, min_age=CHILD_MIN_AGE, max_age=CHILD_MAX_AGE):
    """
    Split the child speakers of ``df`` into a Pure Dutch and a Pure French group.

    Only rows whose numeric 'Age' lies in [min_age, max_age] are considered.
    - Pure Dutch: HomeLanguage1 is 'dut' and HomeLanguage2 is 'dut' or missing
    - Pure French: HomeLanguage1 is 'fra' and HomeLanguage2 is 'fra'

    Language values are compared lower-cased and stripped; 'nan', 'none' and
    the empty string count as missing. Returns the tuple
    (pure_dutch_children, pure_french_children); both are empty DataFrames when
    the input is None/empty, a required column is absent, or nobody is in the
    age range.
    """
    print(f"\n--- Identifying Pure Dutch & Pure French children (Age {min_age}-{max_age}) ---")

    if df is None or df.empty:
        print("Input DataFrame is empty. Cannot identify speakers.")
        return pd.DataFrame(), pd.DataFrame()

    missing_req = [
        needed
        for needed in ('Age', 'HomeLanguage1', 'HomeLanguage2', 'RegionSpeaker')
        if needed not in df.columns
    ]
    if missing_req:
        print(f"Error: Missing required columns: {missing_req}. Cannot proceed.")
        return pd.DataFrame(), pd.DataFrame()

    # Work on a copy so the caller's frame is left untouched.
    speakers = df.copy()
    speakers['Age'] = pd.to_numeric(speakers['Age'], errors='coerce')
    for lang_col in ('HomeLanguage1', 'HomeLanguage2'):
        # Normalise case/whitespace, then map the textual "missing" markers to NaN.
        tidy = speakers[lang_col].astype(str).str.lower().str.strip()
        speakers[lang_col] = tidy.replace(['nan', 'none', ''], np.nan)
    speakers['RegionSpeaker'] = speakers['RegionSpeaker'].astype(str).str.strip()

    # Step 1: keep the requested age window (both bounds inclusive).
    in_age_window = speakers['Age'].between(min_age, max_age)
    children = speakers.loc[in_age_window].copy()
    print(f"Found {len(children)} speakers aged {min_age}-{max_age}.")
    if children.empty:
        print("No speakers found in the specified age range.")
        return pd.DataFrame(), pd.DataFrame()

    # Step 2: language masks over the child subset.
    first_lang = children['HomeLanguage1']
    second_lang = children['HomeLanguage2']
    second_is_dutch_or_missing = (second_lang == LANG_DUT) | second_lang.isna()
    dutch_mask = (first_lang == LANG_DUT) & second_is_dutch_or_missing
    french_mask = (first_lang == LANG_FRA) & (second_lang == LANG_FRA)

    # Step 3: materialise both groups.
    pure_dutch_children = children[dutch_mask].copy()
    pure_french_children = children[french_mask].copy()

    print(f"Identified {len(pure_dutch_children)} Pure Dutch children (L1=dut, L2=dut/empty).")
    print(f"Identified {len(pure_french_children)} Pure French children (L1=fra, L2=fra).")
    print("-" * 70)

    return pure_dutch_children, pure_french_children

# --- Function to Save Speaker Codes ---
def save_speaker_codes(speaker_df, output_filepath):
    """Write the sorted, unique 'RegionSpeaker' codes of ``speaker_df`` to a file.

    One code is written per line (UTF-8). When ``speaker_df`` is None or
    empty, an empty file is created instead so the output always exists.
    Missing (NaN) codes are skipped. The parent directory is created when
    needed; a bare filename (no directory part) is written to the current
    working directory.

    Args:
        speaker_df: DataFrame holding a 'RegionSpeaker' column, or None.
        output_filepath: Destination path of the code list.

    Returns:
        Number of codes written; 0 when there was nothing to save, the
        'RegionSpeaker' column is missing, or the file could not be written.
        I/O errors are reported on stdout, never raised.
    """
    file_label = os.path.basename(output_filepath)
    has_speakers = speaker_df is not None and not speaker_df.empty

    if has_speakers and 'RegionSpeaker' not in speaker_df.columns:
        print(f"Error: 'RegionSpeaker' column not found. Cannot save codes for {file_label}.")
        return 0

    if has_speakers:
        # NaN is not a usable code and cannot be ordered against strings.
        speaker_codes = sorted(speaker_df['RegionSpeaker'].dropna().unique().tolist())
    else:
        print(f"No speakers to save for {file_label}.")
        speaker_codes = []

    try:
        parent_dir = os.path.dirname(output_filepath)
        # os.makedirs('') raises, so only create a directory when there is one.
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        with open(output_filepath, 'w', encoding='utf-8') as f:
            f.writelines(f"{code}\n" for code in speaker_codes)
    except (OSError, UnicodeError) as e:
        if has_speakers:
            print(f"Error writing speaker codes to {output_filepath}: {e}")
        else:
            print(f"Error creating empty file {output_filepath}: {e}")
        return 0

    if not has_speakers:
        print(f"Created empty file: {output_filepath}")
        return 0

    print(f"Saved {len(speaker_codes)} unique speaker codes to: {output_filepath}")
    return len(speaker_codes)

# --- Main Execution ---
if __name__ == "__main__":
    print(f"Starting Speaker Identification for Pure Dutch & Pure French Children ({CHILD_MIN_AGE}-{CHILD_MAX_AGE})...")

    # Columns a parse of a speakers file must contain to be accepted
    # (extend this list if more columns are needed at load time).
    expected_speaker_cols = [
        'RegionSpeaker', 'ResPlace', 'Gender', 'Age', 'BirthPlace', 'Group',
        'HomeLanguage1', 'HomeLanguage2', 'CEF',
    ]

    # Read the NL file first, then the VL file, with identical loader settings.
    delimiter_candidates = [r'\s+', '\t']
    nl_speakers, vl_speakers = (
        load_data_with_delimiters(
            paths['input'][source_key],
            potential_delimiters=delimiter_candidates,
            expected_cols=expected_speaker_cols,
        )
        for source_key in ('nl_speakers', 'vl_speakers')
    )

    # Merge whichever files loaded; give up only when neither did.
    loaded_dfs = [frame for frame in (nl_speakers, vl_speakers) if frame is not None]
    if not loaded_dfs:
        combined_df = None
        print("\nError: Failed to load any speaker data. Exiting.")
        sys.exit(1)

    combined_df = pd.concat(loaded_dfs, ignore_index=True)
    print(f"\nCombined {len(loaded_dfs)} speaker file(s): {len(combined_df)} total entries.")

    # Select the two target child groups.
    pure_dutch_df, pure_french_df = identify_pure_language_children(
        combined_df, min_age=CHILD_MIN_AGE, max_age=CHILD_MAX_AGE
    )

    # Write one code list per group, Dutch first.
    print("\n--- Saving Speaker Codes ---")
    group_outputs = (
        (pure_dutch_df, 'pure_dutch_children_codes'),
        (pure_french_df, 'pure_french_children_codes'),
    )
    for group_df, output_key in group_outputs:
        save_speaker_codes(group_df, paths['output'][output_key])

    print("\nScript finished.")



All required input speaker files found.
Created output directory: /home/tdamen/output_codes
Starting Speaker Identification for Pure Dutch & Pure French Children (7-11)...
Successfully loaded speakers.txt with delimiter '\t', encoding 'utf-8'.
Successfully loaded speakers.txt with delimiter '\t', encoding 'utf-8'.

Combined 2 speaker file(s): 509 total entries.

--- Identifying Pure Dutch & Pure French children (Age 7-11) ---
Found 136 speakers aged 7-11.
Identified 79 Pure Dutch children (L1=dut, L2=dut/empty).
Identified 26 Pure French children (L1=fra, L2=fra).
----------------------------------------------------------------------

--- Saving Speaker Codes ---
Saved 79 unique speaker codes to: /home/tdamen/output_codes/pure_dutch_children_7_11_codes.txt
Saved 26 unique speaker codes to: /home/tdamen/output_codes/pure_french_children_7_11_codes.txt

Script finished.
