Skip to content

Commit fb61df7

Browse files
artemisTurintech authored and paulsbrookes committed
feat(data-loader): implement CSV loading with comprehensive validation pipeline and error handling
1 parent 28b183a commit fb61df7

File tree

1 file changed

+157
-10
lines changed

1 file changed

+157
-10
lines changed

clustering_toolkit/data_loader.py

Lines changed: 157 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,24 +2,171 @@
22
Data Loader Module
33
44
This module provides functionality for loading and validating CSV data files
5-
for clustering analysis. It handles:
5+
for clustering analysis.
66
7-
- CSV file reading with proper encoding and delimiter detection
8-
- Data validation (checking for required columns, data types, etc.)
9-
- Basic data quality checks (missing values, duplicates, etc.)
10-
- Data integrity verification before processing
11-
- Error handling for malformed or incompatible data files
7+
Key Features:
8+
- File existence validation
9+
- CSV format validation with clear error handling
10+
- Numerical data type verification
11+
- Empty file detection
12+
- Comprehensive error messages for debugging
1213
1314
Typical usage:
14-
from clustering_toolkit.data_loader import load_data, validate_data
15+
from clustering_toolkit.data_loader import load_csv
1516
16-
data = load_data('path/to/data.csv')
17-
is_valid, errors = validate_data(data)
17+
# Load and validate a CSV file with numerical data
18+
data = load_csv('path/to/data.csv')
19+
20+
# The function will raise informative exceptions if:
21+
# - File does not exist
22+
# - File is not a valid CSV
23+
# - File contains no data
24+
# - File does not contain numerical columns
25+
26+
Usage Notes:
27+
- All validation is performed automatically during loading
28+
- Error messages indicate specific issues and remediation steps
29+
- The function expects CSV files with numerical data suitable for clustering
1830
"""
1931

2032
import pandas as pd
2133
from pathlib import Path
22-
from typing import Optional, Tuple, List
34+
from typing import Tuple, List
35+
import numpy as np
36+
37+
38+
def load_csv(filepath: str) -> pd.DataFrame:
    """
    Load a CSV file and verify that every column holds numerical data.

    Validation is performed in this order:
        1. The path must exist.
        2. ``pd.read_csv`` must be able to parse the file.
        3. The resulting DataFrame must not be empty (this also covers a
           file that contains a header row but no data rows).
        4. Every column must have a numerical dtype.

    Args:
        filepath: Path to the CSV file to load.

    Returns:
        pd.DataFrame: The parsed data, unchanged, with only numerical columns.

    Raises:
        FileNotFoundError: If nothing exists at ``filepath``.
            Remediation: Verify the file path is correct and the file exists.

        pd.errors.ParserError: If the file cannot be parsed as valid CSV.
            Remediation: Check the file format, ensure proper CSV structure
            with consistent delimiters and quoted fields.

        pd.errors.EmptyDataError: If the file has no content at all (no
            header and no rows), as reported by ``pd.read_csv``.
            Remediation: Ensure the file contains a header and data rows.

        ValueError: If the file has a header but no data rows, if it has no
            numerical columns, if it has any non-numerical column, if it
            cannot be decoded, or if ``pd.read_csv`` fails with any other
            error. Note that ``ParserError`` and ``EmptyDataError`` are
            themselves ``ValueError`` subclasses in pandas.
            Remediation: Ensure all columns contain numerical (int/float)
            data. Non-numerical columns must be removed or converted first.

    Examples:
        >>> data = load_csv('customer_data.csv')  # doctest: +SKIP
        >>> print(data.shape)  # doctest: +SKIP
        (1000, 5)

        >>> # This will raise FileNotFoundError
        >>> data = load_csv('nonexistent.csv')  # doctest: +SKIP

        >>> # This will raise ValueError if columns are not numerical
        >>> data = load_csv('text_data.csv')  # doctest: +SKIP
    """
    # Step 1: File existence validation
    path = Path(filepath)
    if not path.exists():
        raise FileNotFoundError(
            f"File not found: {filepath}\n"
            f"Remediation: Verify the file path is correct and the file exists at the specified location."
        )

    # Step 2: CSV parsing with error handling.
    # Every re-raise uses ``from e`` so the original pandas/IO exception stays
    # attached as ``__cause__`` and its traceback is not lost.
    try:
        df = pd.read_csv(filepath)
    except pd.errors.ParserError as e:
        raise pd.errors.ParserError(
            f"Failed to parse CSV file: {filepath}\n"
            f"Original error: {str(e)}\n"
            f"Remediation: Check the file format - ensure it's a valid CSV with consistent "
            f"delimiters (commas), properly quoted fields, and no malformed rows."
        ) from e
    except pd.errors.EmptyDataError as e:
        raise pd.errors.EmptyDataError(
            f"CSV file is empty: {filepath}\n"
            f"Remediation: Ensure the file contains data rows, not just headers or blank lines."
        ) from e
    except UnicodeDecodeError as e:
        raise ValueError(
            f"Encoding error while reading CSV file: {filepath}\n"
            f"Original error: {str(e)}\n"
            f"Remediation: The file may have an incompatible encoding. "
            f"Try saving the file with UTF-8 encoding or specify encoding parameter."
        ) from e
    except Exception as e:
        # Deliberate catch-all: callers are promised a ValueError for any
        # other read failure (e.g. the path is a directory).
        raise ValueError(
            f"Unexpected error loading CSV file: {filepath}\n"
            f"Original error: {type(e).__name__}: {str(e)}\n"
            f"Remediation: Verify the file is a valid CSV and not corrupted."
        ) from e

    # Step 3: Empty DataFrame detection.
    # ``df.empty`` is True when either axis has length zero, so this single
    # check covers both "no columns" and "header only, no data rows".
    # Separate checks for those cases placed after it could never run.
    if df.empty:
        raise ValueError(
            f"CSV file contains no data rows: {filepath}\n"
            f"File shape: {df.shape}\n"
            f"Remediation: Ensure the file contains at least one row of data beyond the header."
        )

    # Step 4: Numerical data validation
    numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    non_numerical_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()

    if not numerical_cols:
        raise ValueError(
            f"CSV file contains no numerical columns: {filepath}\n"
            f"File shape: {df.shape}\n"
            f"Column names and types:\n"
            f"{df.dtypes.to_string()}\n"
            f"Remediation: Clustering requires numerical data. Ensure columns contain "
            f"integer or float values. Convert or remove non-numerical columns (strings, dates, etc.)."
        )

    if non_numerical_cols:
        raise ValueError(
            f"CSV file contains non-numerical columns: {filepath}\n"
            f"File shape: {df.shape}\n"
            f"Non-numerical columns ({len(non_numerical_cols)}): {non_numerical_cols}\n"
            f"Numerical columns ({len(numerical_cols)}): {numerical_cols}\n"
            f"Column types:\n"
            f"{df.dtypes.to_string()}\n"
            f"Remediation: Remove or convert non-numerical columns to numerical types. "
            f"Clustering algorithms require all features to be numerical (int or float). "
            f"Consider encoding categorical variables or removing text/date columns."
        )

    return df
23170

24171

25172
def load_data(file_path: str, **kwargs) -> pd.DataFrame:

0 commit comments

Comments
 (0)