In [None]:
#| default_exp data

In [None]:
%load_ext autoreload
%autoreload 2

# Data loading and validation

Functions for loading and validating linguistic corpus data.

In [None]:
#| export
import pandas as pd
from pathlib import Path
from typing import Set, Union

In [None]:
#| export
def read_linguistic_data(fp: Union[str, Path], 
                        expected_columns: Union[Set[str], None] = None) -> pd.DataFrame:
    """Read and validate linguistic corpus data from an Excel file.

    The file is read with ``pd.read_excel``, surrounding whitespace is
    stripped from string column headers, the presence of all required
    columns is checked, and non-breaking spaces (``\\xa0``) in the ``text``
    column are replaced by regular spaces.

    Args:
        fp: Path to the Excel file.
        expected_columns: Required column names (any iterable of names is
            accepted). If None, the default column set defined below is used.

    Returns:
        DataFrame with stripped column headers and a normalized ``text``
        column (when a ``text`` column is present).

    Raises:
        ValueError: If any required column is missing. The message lists the
            missing columns in sorted order.
    """
    if expected_columns is None:
        # Default columns for 2025-05-19 format
        required = {
            'ID', 'text',
            'transitivity', 'causativity', 'subject_animacy', 'subject_role',
            'gpt_transitivity', 'gpt_causativity', 'gpt_subject_animacy', 'gpt_subject_role',
            'pos', 'gpt_pos'
        }
    else:
        # Accept lists/tuples as well as sets; set difference below needs a set.
        required = set(expected_columns)

    df = pd.read_excel(fp)

    # Strip whitespace from string headers only. Using the `.str` accessor here
    # would turn non-string headers (e.g. a numeric year column) into NaN or
    # raise when every header is numeric.
    df.columns = [col.strip() if isinstance(col, str) else col for col in df.columns]

    # Check that all required columns are present
    missing_columns = required - set(df.columns)
    if missing_columns:
        # Sorted so the message is deterministic across runs.
        missing_sorted = sorted(missing_columns, key=str)
        raise ValueError(f'{fp}: Missing required columns: {missing_sorted}')

    # Replace non-breaking spaces with spaces. Only string cells are touched:
    # `.str.replace` would silently convert numeric cells to NaN and fail on a
    # non-object column. The column is optional when the caller supplies a
    # custom `expected_columns` that does not include it.
    if 'text' in df.columns:
        df['text'] = df['text'].map(
            lambda cell: cell.replace('\xa0', ' ') if isinstance(cell, str) else cell
        )

    return df

In [None]:
# Helper function for loading multiple files

# First, let's define a helper function, then test our data loading functions.

In [None]:
#| export
def load_experiment_data(data_dir: Union[str, Path], 
                        pattern: str = "*_sub.xlsx") -> list[Path]:
    """Collect the paths of all data files in a directory matching a pattern.

    Despite the name, no file contents are read here; only paths are returned.

    Args:
        data_dir: Directory containing data files
        pattern: Glob pattern for file matching (non-recursive unless the
            pattern itself contains ``**``)

    Returns:
        Sorted list of paths matching ``pattern`` inside ``data_dir``
    """
    data_dir = Path(data_dir)
    # glob() yields entries in arbitrary filesystem order; sort so that
    # indexing (e.g. "the first file") and processing order are reproducible.
    return sorted(data_dir.glob(pattern))


In [None]:
# Test the data loading functions with sample data.
# The check is skipped (with a warning) when the sample directory has no
# matching files, so the notebook still runs where the data is absent.

# Test with a known file
test_files = load_experiment_data('../in/2025-05-19/Subsamples/', '*_sub.xlsx')
print(f"Found {len(test_files)} test files")

if test_files:
    # Test reading the first file
    df = read_linguistic_data(test_files[0])
    print(f"Successfully loaded {test_files[0].name}: {df.shape[0]} rows, {df.shape[1]} columns")
    
    # Verify ID column exists
    assert 'ID' in df.columns, "ID column should be present"
    print("✓ ID column found")
    
    # A sheet with headers but no rows would make .iloc[0] below fail with an
    # opaque IndexError; fail with a clear message instead.
    assert not df.empty, f"{test_files[0].name} contains no data rows"
    
    # Verify sample ID format
    sample_id = df['ID'].iloc[0]
    assert isinstance(sample_id, str), "ID should be a string"
    print(f"✓ Sample ID format: {sample_id}")
else:
    print("⚠ No test files found - skipping data loading test")
