|
| 1 | +"""Data loading and validation module for ML CLI application.""" |
| 2 | + |
| 3 | +import pandas as pd |
| 4 | +from pathlib import Path |
| 5 | +from typing import Tuple, List |
| 6 | + |
| 7 | + |
def load_and_validate_data(file_path: str, target_column: str) -> Tuple[pd.DataFrame, pd.Series, List[str]]:
    """
    Load a CSV file and split it into features, target and numeric feature names.

    The file is checked for existence and readability, parsed with
    ``pandas.read_csv`` using default options, and validated to contain
    at least one data row, the requested target column and at least one
    additional feature column.

    Parameters
    ----------
    file_path : str
        Path to the CSV file to load.
    target_column : str
        Name of the column to separate from the features.
        Must be one of the CSV file's columns.

    Returns
    -------
    Tuple[pd.DataFrame, pd.Series, List[str]]
        A tuple containing:
        - X (pd.DataFrame): all columns except the target column
        - y (pd.Series): an independent copy of the target column
        - numeric_features (List[str]): names of the columns of X whose
          dtype is a signed/unsigned integer or floating point type

    Raises
    ------
    FileNotFoundError
        If ``file_path`` does not exist or is not a regular file.
    PermissionError
        If the file exists but cannot be opened due to permission issues.
    pd.errors.ParserError
        If the file cannot be parsed as CSV.
    ValueError
        If the file cannot be read for another reason, is empty, has no
        data rows, lacks ``target_column``, or has no feature columns
        left once the target column is removed.

    Examples
    --------
    >>> X, y, numeric_cols = load_and_validate_data('data.csv', 'target')  # doctest: +SKIP
    >>> print(f"Features shape: {X.shape}, Target shape: {y.shape}")  # doctest: +SKIP
    >>> print(f"Numeric columns: {numeric_cols}")  # doctest: +SKIP

    Notes
    -----
    - Missing values (NaN) are preserved and not filtered out
    - Non-numeric columns are retained in the features DataFrame
    - The target column is always removed from features, regardless of its dtype
    """
    path = Path(file_path)

    if not path.exists():
        raise FileNotFoundError(f"The file '{file_path}' does not exist.")

    if not path.is_file():
        raise FileNotFoundError(f"The path '{file_path}' is not a file.")

    # Probe readability up front so permission problems are reported with a
    # dedicated message. Binary mode: nothing is decoded, we only open/close.
    try:
        with open(path, 'rb'):
            pass
    except PermissionError as e:
        raise PermissionError(f"Permission denied when trying to read '{file_path}'.") from e
    except OSError as e:
        raise ValueError(f"Unable to read file '{file_path}': {str(e)}") from e

    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        # The file vanished between the checks above and the read; keep the
        # documented exception type instead of wrapping it in ValueError.
        raise
    except PermissionError as e:
        raise PermissionError(f"Permission denied when trying to read '{file_path}'.") from e
    except pd.errors.EmptyDataError as e:
        raise ValueError(f"The CSV file '{file_path}' is empty.") from e
    except pd.errors.ParserError as e:
        raise pd.errors.ParserError(f"Failed to parse '{file_path}' as a valid CSV file: {str(e)}") from e
    except Exception as e:
        # Boundary handler: any other read failure (e.g. a decoding error)
        # is reported uniformly as ValueError.
        raise ValueError(f"Error reading CSV file '{file_path}': {str(e)}") from e

    # The column check must precede the row check: a frame without columns
    # is also "empty", which would otherwise make this branch unreachable.
    if len(df.columns) == 0:
        raise ValueError(f"The CSV file '{file_path}' contains no columns.")

    if df.empty:
        raise ValueError(f"The CSV file '{file_path}' contains no data rows.")

    if target_column not in df.columns:
        # str() each label so building the message can never fail itself.
        available_columns = "', '".join(str(column) for column in df.columns)
        raise ValueError(
            f"Target column '{target_column}' not found in CSV file. "
            f"Available columns: '{available_columns}'"
        )

    # Copy the target so later changes to y cannot affect the parsed frame.
    y = df[target_column].copy()
    X = df.drop(columns=[target_column])

    if X.shape[1] == 0:
        raise ValueError(
            f"No feature columns remain after removing target column '{target_column}'. "
            f"The CSV must contain at least one feature column in addition to the target."
        )

    numeric_dtypes = [
        'int64', 'int32', 'int16', 'int8',
        'float64', 'float32', 'float16',
        'uint64', 'uint32', 'uint16', 'uint8',
    ]
    numeric_features = X.select_dtypes(include=numeric_dtypes).columns.tolist()

    return X, y, numeric_features
0 commit comments