feat: add data loading and validation module with CSV handling, feature separation, and comprehensive error checking

artemisTurintech · artemisTurintech · commit 1c948f2ed3a7 · 2025-10-23T06:25:59.000Z
diff --git a/data_loader.py b/data_loader.py
@@ -0,0 +1,118 @@
+"""Data loading and validation module for ML CLI application."""
+
+import pandas as pd
+from pathlib import Path
+from typing import Tuple, List
+
+
+def load_and_validate_data(file_path: str, target_column: str) -> Tuple[pd.DataFrame, pd.Series, List[str]]:
+    """
+    Load and validate CSV data for machine learning tasks.
+    
+    This function performs comprehensive validation on the input CSV file,
+    ensuring it meets all requirements for model training or prediction.
+    It separates features from the target variable and identifies numeric columns.
+    
+    Parameters
+    ----------
+    file_path : str
+        Path to the CSV file to load. Must be a valid, readable CSV file.
+    target_column : str
+        Name of the target column to separate from features.
+        Must exist in the CSV file's columns.
+    
+    Returns
+    -------
+    Tuple[pd.DataFrame, pd.Series, List[str]]
+        A tuple containing:
+        - X (pd.DataFrame): Features DataFrame with target column removed
+        - y (pd.Series): Target column as a Series
+        - numeric_features (List[str]): List of numeric feature column names
+    
+    Raises
+    ------
+    FileNotFoundError
+        If the specified file does not exist or is not accessible.
+    ValueError
+        If the CSV file is empty, malformed, or fails validation checks:
+        - Target column does not exist in the DataFrame
+        - No feature columns remain after removing the target column
+        - CSV file has no data rows (empty after header)
+    pd.errors.ParserError
+        If the file cannot be parsed as a valid CSV format.
+    PermissionError
+        If the file exists but cannot be read due to permission issues.
+    
+    Examples
+    --------
+    >>> X, y, numeric_cols = load_and_validate_data('data.csv', 'target')
+    >>> print(f"Features shape: {X.shape}, Target shape: {y.shape}")
+    >>> print(f"Numeric columns: {numeric_cols}")
+    
+    Notes
+    -----
+    - Missing values (NaN) are preserved and not filtered out
+    - Non-numeric columns are retained in the features DataFrame
+    - The function identifies numeric columns using pandas dtypes (int64, float64, etc.)
+    - The target column is always removed from features, regardless of its dtype
+    """
+    # Validate file existence and readability
+    path = Path(file_path)
+    
+    if not path.exists():
+        raise FileNotFoundError(f"The file '{file_path}' does not exist.")
+    
+    if not path.is_file():
+        raise FileNotFoundError(f"The path '{file_path}' is not a file.")
+    
+    # Check file readability
+    try:
+        with open(path, 'r') as f:
+            pass  # Just checking if we can open it
+    except PermissionError as e:
+        raise PermissionError(f"Permission denied when trying to read '{file_path}'.") from e
+    except Exception as e:
+        raise ValueError(f"Unable to read file '{file_path}': {str(e)}") from e
+    
+    # Load CSV with error handling
+    try:
+        df = pd.read_csv(file_path)
+    except pd.errors.EmptyDataError as e:
+        raise ValueError(f"The CSV file '{file_path}' is empty.") from e
+    except pd.errors.ParserError as e:
+        raise pd.errors.ParserError(f"Failed to parse '{file_path}' as a valid CSV file: {str(e)}") from e
+    except Exception as e:
+        raise ValueError(f"Error reading CSV file '{file_path}': {str(e)}") from e
+    
+    # Validate that DataFrame is not empty
+    if df.empty:
+        raise ValueError(f"The CSV file '{file_path}' contains no data rows.")
+    
+    if len(df.columns) == 0:
+        raise ValueError(f"The CSV file '{file_path}' contains no columns.")
+    
+    # Validate target column exists
+    if target_column not in df.columns:
+        available_columns = "', '".join(df.columns.tolist())
+        raise ValueError(
+            f"Target column '{target_column}' not found in CSV file. "
+            f"Available columns: '{available_columns}'"
+        )
+    
+    # Separate target from features
+    y = df[target_column].copy()
+    X = df.drop(columns=[target_column])
+    
+    # Validate at least one feature column remains
+    if X.shape[1] == 0:
+        raise ValueError(
+            f"No feature columns remain after removing target column '{target_column}'. "
+            f"The CSV must contain at least one feature column in addition to the target."
+        )
+    
+    # Identify numeric columns in features
+    numeric_features = X.select_dtypes(include=['int64', 'int32', 'int16', 'int8', 
+                                                  'float64', 'float32', 'float16', 
+                                                  'uint64', 'uint32', 'uint16', 'uint8']).columns.tolist()
+    
+    return X, y, numeric_features