@@ -2,24 +2,171 @@
 Data Loader Module
 
 This module provides functionality for loading and validating CSV data files
-for clustering analysis. It handles:
+for clustering analysis.
 
-- CSV file reading with proper encoding and delimiter detection
-- Data validation (checking for required columns, data types, etc.)
-- Basic data quality checks (missing values, duplicates, etc.)
-- Data integrity verification before processing
-- Error handling for malformed or incompatible data files
+Key Features:
+- File existence validation
+- CSV format validation with clear error handling
+- Numerical data type verification
+- Empty file detection
+- Comprehensive error messages for debugging
 
 Typical usage:
-    from clustering_toolkit.data_loader import load_data, validate_data
 
-    data = load_data('path/to/data.csv')
-    is_valid, errors = validate_data(data)
+    from clustering_toolkit.data_loader import load_csv
+
+    # Load and validate a CSV file with numerical data
+    data = load_csv('path/to/data.csv')
+
+    # The function will raise informative exceptions if:
+    # - File does not exist
+    # - File is not a valid CSV
+    # - File contains no data
+    # - File does not contain numerical columns
+
+Usage Notes:
+- All validation is performed automatically during loading
+- Error messages indicate specific issues and remediation steps
+- The function expects CSV files with numerical data suitable for clustering
 """
 
 import pandas as pd
 from pathlib import Path
-from typing import Optional, Tuple, List
+from typing import Tuple, List
+import numpy as np
+
+
+def load_csv(filepath: str) -> pd.DataFrame:
+    """
+    Load and validate a CSV file containing numerical data for clustering analysis.
+
+    This function performs comprehensive validation including:
+    - File existence check
+    - CSV format validation
+    - Data type verification (numerical columns)
+    - Empty file detection
+    - Edge case handling (headers only, single columns)
+
+    Args:
+        filepath: Path to the CSV file to load
+
+    Returns:
+        pd.DataFrame: DataFrame containing validated numerical data
+
+    Raises:
+        FileNotFoundError: If the specified file does not exist at the given path.
+            Remediation: Verify the file path is correct and the file exists.
+
+        pd.errors.ParserError: If the file cannot be parsed as valid CSV.
+            Remediation: Check the file format, ensure proper CSV structure with
+            consistent delimiters and quoted fields.
+
+        pd.errors.EmptyDataError: If the file is empty or contains only headers.
+            Remediation: Ensure the file contains data rows, not just column headers.
+
+        ValueError: If the file does not contain numerical data suitable for clustering.
+            Remediation: Ensure all columns contain numerical (int/float) data types.
+            Non-numerical columns must be removed or converted before clustering.
+
+    Examples:
+        >>> data = load_csv('customer_data.csv')
+        >>> print(data.shape)
+        (1000, 5)
+
+        >>> # This will raise FileNotFoundError
+        >>> data = load_csv('nonexistent.csv')
+
+        >>> # This will raise ValueError if columns are not numerical
+        >>> data = load_csv('text_data.csv')
+    """
+    # Step 1: File existence validation
+    path = Path(filepath)
+    if not path.exists():
+        raise FileNotFoundError(
+            f"File not found: {filepath}\n"
+            f"Remediation: Verify the file path is correct and the file exists at the specified location."
+        )
+
+    # Step 2: CSV parsing with error handling
+    try:
+        df = pd.read_csv(filepath)
+    except pd.errors.ParserError as e:
+        raise pd.errors.ParserError(
+            f"Failed to parse CSV file: {filepath}\n"
+            f"Original error: {str(e)}\n"
+            f"Remediation: Check the file format - ensure it's a valid CSV with consistent "
+            f"delimiters (commas), properly quoted fields, and no malformed rows."
+        )
+    except pd.errors.EmptyDataError as e:
+        raise pd.errors.EmptyDataError(
+            f"CSV file is empty: {filepath}\n"
+            f"Remediation: Ensure the file contains data rows, not just headers or blank lines."
+        )
+    except UnicodeDecodeError as e:
+        raise ValueError(
+            f"Encoding error while reading CSV file: {filepath}\n"
+            f"Original error: {str(e)}\n"
+            f"Remediation: The file may have an incompatible encoding. "
+            f"Try saving the file with UTF-8 encoding or specify encoding parameter."
+        )
+    except Exception as e:
+        raise ValueError(
+            f"Unexpected error loading CSV file: {filepath}\n"
+            f"Original error: {type(e).__name__}: {str(e)}\n"
+            f"Remediation: Verify the file is a valid CSV and not corrupted."
+        )
+
+    # Step 3: Empty DataFrame detection
+    if df.empty:
+        raise ValueError(
+            f"CSV file contains no data rows: {filepath}\n"
+            f"File shape: {df.shape}\n"
+            f"Remediation: Ensure the file contains at least one row of data beyond the header."
+        )
+
+    # Step 4: Check for columns (edge case: file with no columns)
+    if len(df.columns) == 0:
+        raise ValueError(
+            f"CSV file contains no columns: {filepath}\n"
+            f"Remediation: Ensure the file has at least one column of data."
+        )
+
+    # Step 5: Edge case - only headers, no data rows
+    if len(df) == 0:
+        raise ValueError(
+            f"CSV file contains only headers with no data rows: {filepath}\n"
+            f"Number of columns: {len(df.columns)}\n"
+            f"Column names: {list(df.columns)}\n"
+            f"Remediation: Add data rows to the CSV file."
+        )
+
+    # Step 6: Numerical data validation
+    numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
+    non_numerical_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()
+
+    if len(numerical_cols) == 0:
+        raise ValueError(
+            f"CSV file contains no numerical columns: {filepath}\n"
+            f"File shape: {df.shape}\n"
+            f"Column names and types:\n"
+            f"{df.dtypes.to_string()}\n"
+            f"Remediation: Clustering requires numerical data. Ensure columns contain "
+            f"integer or float values. Convert or remove non-numerical columns (strings, dates, etc.)."
+        )
+
+    if len(non_numerical_cols) > 0:
+        raise ValueError(
+            f"CSV file contains non-numerical columns: {filepath}\n"
+            f"File shape: {df.shape}\n"
+            f"Non-numerical columns ({len(non_numerical_cols)}): {non_numerical_cols}\n"
+            f"Numerical columns ({len(numerical_cols)}): {numerical_cols}\n"
+            f"Column types:\n"
+            f"{df.dtypes.to_string()}\n"
+            f"Remediation: Remove or convert non-numerical columns to numerical types. "
+            f"Clustering algorithms require all features to be numerical (int or float). "
+            f"Consider encoding categorical variables or removing text/date columns."
+        )
+
+    return df
 
 
 def load_data(file_path: str, **kwargs) -> pd.DataFrame:
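
A minimal sketch of how calling code might consume the new `load_csv` and react to the failure modes documented in its docstring. The module path `clustering_toolkit.data_loader` is taken from the "Typical usage" section above; the recovery strategy (printing the remediation message and exiting) is only an illustrative assumption, not part of this change.

```python
# Illustrative only: one way a caller might handle the exceptions that
# load_csv documents. The module path comes from the docstring above;
# the exit-on-failure behaviour is an assumption for the example.
import sys

import pandas as pd

from clustering_toolkit.data_loader import load_csv


def load_or_exit(path: str) -> pd.DataFrame:
    """Load a CSV for clustering, printing the remediation message on failure."""
    try:
        return load_csv(path)
    except FileNotFoundError as exc:
        # Raised by Step 1 when the path does not exist.
        print(f"Missing file:\n{exc}")
    except (pd.errors.ParserError, pd.errors.EmptyDataError) as exc:
        # Raised by Step 2 for malformed or completely empty CSV input.
        print(f"Unreadable CSV:\n{exc}")
    except ValueError as exc:
        # Raised by Steps 3-6 for empty frames or non-numerical columns.
        print(f"Data not suitable for clustering:\n{exc}")
    sys.exit(1)


if __name__ == "__main__":
    data = load_or_exit("path/to/data.csv")
    print(data.shape)
```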
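A rough pytest-style sketch of how the validation branches described above could be exercised; it assumes pytest is available and uses the same module path as the docstring. The CSV contents are invented purely to trigger each branch and are not part of this change.

```python
# Rough test sketch (assumes pytest and the module path from the docstring).
# The file contents below are made up to hit the validation branches.
import pytest

from clustering_toolkit.data_loader import load_csv


def test_missing_file_raises(tmp_path):
    # Step 1: nonexistent path -> FileNotFoundError
    with pytest.raises(FileNotFoundError):
        load_csv(str(tmp_path / "does_not_exist.csv"))


def test_header_only_file_raises(tmp_path):
    # Step 3: headers but no data rows -> ValueError
    csv_file = tmp_path / "headers_only.csv"
    csv_file.write_text("a,b,c\n")
    with pytest.raises(ValueError):
        load_csv(str(csv_file))


def test_non_numerical_columns_raise(tmp_path):
    # Step 6: 'name' is a text column -> ValueError
    csv_file = tmp_path / "mixed.csv"
    csv_file.write_text("name,score\nalice,1.5\nbob,2.0\n")
    with pytest.raises(ValueError):
        load_csv(str(csv_file))


def test_numerical_csv_loads(tmp_path):
    # Happy path: all-numerical data passes every check
    csv_file = tmp_path / "numbers.csv"
    csv_file.write_text("x,y\n1,2\n3,4\n")
    df = load_csv(str(csv_file))
    assert list(df.columns) == ["x", "y"]
    assert df.shape == (2, 2)
```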