@@ -2,24 +2,171 @@
 Data Loader Module
 
 This module provides functionality for loading and validating CSV data files
-for clustering analysis. It handles:
+for clustering analysis.
 
-- CSV file reading with proper encoding and delimiter detection
-- Data validation (checking for required columns, data types, etc.)
-- Basic data quality checks (missing values, duplicates, etc.)
-- Data integrity verification before processing
-- Error handling for malformed or incompatible data files
+Key Features:
+- File existence validation
+- CSV format validation with clear error handling
+- Numerical data type verification
+- Empty file detection
+- Comprehensive error messages for debugging
 
 Typical usage:
-    from clustering_toolkit.data_loader import load_data, validate_data
 
-    data = load_data('path/to/data.csv')
-    is_valid, errors = validate_data(data)
+    from clustering_toolkit.data_loader import load_csv
+
+    # Load and validate a CSV file with numerical data
+    data = load_csv('path/to/data.csv')
+
+    # The function will raise informative exceptions if:
+    # - File does not exist
+    # - File is not a valid CSV
+    # - File contains no data
+    # - File does not contain numerical columns
+
+Usage Notes:
+- All validation is performed automatically during loading
+- Error messages indicate specific issues and remediation steps
+- The function expects CSV files with numerical data suitable for clustering
 """
 
 import pandas as pd
 from pathlib import Path
-from typing import Optional, Tuple, List
+from typing import Tuple, List
+import numpy as np
+
+
+def load_csv(filepath: str) -> pd.DataFrame:
+    """
+    Load and validate a CSV file containing numerical data for clustering analysis.
+
+    This function performs comprehensive validation including:
+    - File existence check
+    - CSV format validation
+    - Data type verification (numerical columns)
+    - Empty file detection
+    - Edge case handling (headers only, single columns)
+
+    Args:
+        filepath: Path to the CSV file to load
+
+    Returns:
+        pd.DataFrame: DataFrame containing validated numerical data
+
+    Raises:
+        FileNotFoundError: If the specified file does not exist at the given path.
+            Remediation: Verify the file path is correct and the file exists.
+
+        pd.errors.ParserError: If the file cannot be parsed as valid CSV.
+            Remediation: Check the file format, ensure proper CSV structure with
+            consistent delimiters and quoted fields.
+
+        pd.errors.EmptyDataError: If the file is empty or contains only headers.
+            Remediation: Ensure the file contains data rows, not just column headers.
+
+        ValueError: If the file does not contain numerical data suitable for clustering.
+            Remediation: Ensure all columns contain numerical (int/float) data types.
+            Non-numerical columns must be removed or converted before clustering.
+
+    Examples:
+        >>> data = load_csv('customer_data.csv')
+        >>> print(data.shape)
+        (1000, 5)
+
+        >>> # This will raise FileNotFoundError
+        >>> data = load_csv('nonexistent.csv')
+
+        >>> # This will raise ValueError if columns are not numerical
+        >>> data = load_csv('text_data.csv')
+    """
+    # Step 1: File existence validation
+    path = Path(filepath)
+    if not path.exists():
+        raise FileNotFoundError(
+            f"File not found: {filepath}\n"
+            f"Remediation: Verify the file path is correct and the file exists at the specified location."
+        )
+
+    # Step 2: CSV parsing with error handling
+    try:
+        df = pd.read_csv(filepath)
+    except pd.errors.ParserError as e:
+        raise pd.errors.ParserError(
+            f"Failed to parse CSV file: {filepath}\n"
+            f"Original error: {str(e)}\n"
+            f"Remediation: Check the file format - ensure it's a valid CSV with consistent "
+            f"delimiters (commas), properly quoted fields, and no malformed rows."
+        )
+    except pd.errors.EmptyDataError as e:
+        raise pd.errors.EmptyDataError(
+            f"CSV file is empty: {filepath}\n"
+            f"Remediation: Ensure the file contains data rows, not just headers or blank lines."
+        )
+    except UnicodeDecodeError as e:
+        raise ValueError(
+            f"Encoding error while reading CSV file: {filepath}\n"
+            f"Original error: {str(e)}\n"
+            f"Remediation: The file may have an incompatible encoding. "
+            f"Try saving the file with UTF-8 encoding or specify encoding parameter."
+        )
+    except Exception as e:
+        raise ValueError(
+            f"Unexpected error loading CSV file: {filepath}\n"
+            f"Original error: {type(e).__name__}: {str(e)}\n"
+            f"Remediation: Verify the file is a valid CSV and not corrupted."
+        )
+
+    # Step 3: Empty DataFrame detection
+    if df.empty:
+        raise ValueError(
+            f"CSV file contains no data rows: {filepath}\n"
+            f"File shape: {df.shape}\n"
+            f"Remediation: Ensure the file contains at least one row of data beyond the header."
+        )
+
+    # Step 4: Check for columns (edge case: file with no columns)
+    if len(df.columns) == 0:
+        raise ValueError(
+            f"CSV file contains no columns: {filepath}\n"
+            f"Remediation: Ensure the file has at least one column of data."
+        )
+
+    # Step 5: Edge case - only headers, no data rows
+    if len(df) == 0:
+        raise ValueError(
+            f"CSV file contains only headers with no data rows: {filepath}\n"
+            f"Number of columns: {len(df.columns)}\n"
+            f"Column names: {list(df.columns)}\n"
+            f"Remediation: Add data rows to the CSV file."
+        )
+
+    # Step 6: Numerical data validation
+    numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
+    non_numerical_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()
+
+    if len(numerical_cols) == 0:
+        raise ValueError(
+            f"CSV file contains no numerical columns: {filepath}\n"
+            f"File shape: {df.shape}\n"
+            f"Column names and types:\n"
+            f"{df.dtypes.to_string()}\n"
+            f"Remediation: Clustering requires numerical data. Ensure columns contain "
+            f"integer or float values. Convert or remove non-numerical columns (strings, dates, etc.)."
+        )
+
+    if len(non_numerical_cols) > 0:
+        raise ValueError(
+            f"CSV file contains non-numerical columns: {filepath}\n"
+            f"File shape: {df.shape}\n"
+            f"Non-numerical columns ({len(non_numerical_cols)}): {non_numerical_cols}\n"
+            f"Numerical columns ({len(numerical_cols)}): {numerical_cols}\n"
+            f"Column types:\n"
+            f"{df.dtypes.to_string()}\n"
+            f"Remediation: Remove or convert non-numerical columns to numerical types. "
+            f"Clustering algorithms require all features to be numerical (int or float). "
+            f"Consider encoding categorical variables or removing text/date columns."
+        )
+
+    return df
 
 
 def load_data(file_path: str, **kwargs) -> pd.DataFrame:
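
A minimal sketch of how calling code might consume the new `load_csv` and react to the failure modes documented in its docstring. The module path `clustering_toolkit.data_loader` is taken from the "Typical usage" section above; the recovery strategy (printing the remediation message and exiting) is only an illustrative assumption, not part of this change.

```python
# Illustrative only: one way a caller might handle the exceptions that
# load_csv documents. The module path comes from the docstring above;
# the exit-on-failure behaviour is an assumption for the example.
import sys

import pandas as pd

from clustering_toolkit.data_loader import load_csv


def load_or_exit(path: str) -> pd.DataFrame:
    """Load a CSV for clustering, printing the remediation message on failure."""
    try:
        return load_csv(path)
    except FileNotFoundError as exc:
        # Raised by Step 1 when the path does not exist.
        print(f"Missing file:\n{exc}")
    except (pd.errors.ParserError, pd.errors.EmptyDataError) as exc:
        # Raised by Step 2 for malformed or completely empty CSV input.
        print(f"Unreadable CSV:\n{exc}")
    except ValueError as exc:
        # Raised by Steps 3-6 for empty frames or non-numerical columns.
        print(f"Data not suitable for clustering:\n{exc}")
    sys.exit(1)


if __name__ == "__main__":
    data = load_or_exit("path/to/data.csv")
    print(data.shape)
```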
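A rough pytest-style sketch of how the validation branches described above could be exercised; it assumes pytest is available and uses the same module path as the docstring. The CSV contents are invented purely to trigger each branch and are not part of this change.

```python
# Rough test sketch (assumes pytest and the module path from the docstring).
# The file contents below are made up to hit the validation branches.
import pytest

from clustering_toolkit.data_loader import load_csv


def test_missing_file_raises(tmp_path):
    # Step 1: nonexistent path -> FileNotFoundError
    with pytest.raises(FileNotFoundError):
        load_csv(str(tmp_path / "does_not_exist.csv"))


def test_header_only_file_raises(tmp_path):
    # Step 3: headers but no data rows -> ValueError
    csv_file = tmp_path / "headers_only.csv"
    csv_file.write_text("a,b,c\n")
    with pytest.raises(ValueError):
        load_csv(str(csv_file))


def test_non_numerical_columns_raise(tmp_path):
    # Step 6: 'name' is a text column -> ValueError
    csv_file = tmp_path / "mixed.csv"
    csv_file.write_text("name,score\nalice,1.5\nbob,2.0\n")
    with pytest.raises(ValueError):
        load_csv(str(csv_file))


def test_numerical_csv_loads(tmp_path):
    # Happy path: all-numerical data passes every check
    csv_file = tmp_path / "numbers.csv"
    csv_file.write_text("x,y\n1,2\n3,4\n")
    df = load_csv(str(csv_file))
    assert list(df.columns) == ["x", "y"]
    assert df.shape == (2, 2)
```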