Skip to content

Commit 1c948f2

Browse files
feat: add data loading and validation module with CSV handling, feature separation, and comprehensive error checking
1 parent e2c1533 commit 1c948f2

File tree

1 file changed

+118
-0
lines changed

1 file changed

+118
-0
lines changed

data_loader.py

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
"""Data loading and validation module for ML CLI application."""
2+
3+
import pandas as pd
4+
from pathlib import Path
5+
from typing import Tuple, List
6+
7+
8+
def load_and_validate_data(file_path: str, target_column: str) -> Tuple[pd.DataFrame, pd.Series, List[str]]:
    """
    Load a CSV file, validate it, and split it into features and target.

    The file is checked for existence and readability, parsed with
    ``pandas.read_csv``, and then validated: it must have at least one data
    row, contain ``target_column``, and have at least one other column.

    Parameters
    ----------
    file_path : str
        Path to the CSV file to load.
    target_column : str
        Name of the column to split off as the target. Must be present in
        the CSV header.

    Returns
    -------
    Tuple[pd.DataFrame, pd.Series, List[str]]
        A tuple containing:
        - X (pd.DataFrame): all columns except the target
        - y (pd.Series): an independent copy of the target column
        - numeric_features (List[str]): names of the columns of X whose
          dtype is a NumPy signed/unsigned integer or float

    Raises
    ------
    FileNotFoundError
        If the path does not exist or is not a regular file.
    PermissionError
        If the file exists but cannot be opened for reading.
    pd.errors.ParserError
        If pandas cannot parse the file as CSV.
    ValueError
        If the file cannot be read for any other reason, or fails validation:
        - The file is completely empty
        - The file has a header but no data rows
        - The target column is not among the columns
        - No feature columns remain after removing the target column

    Examples
    --------
    >>> X, y, numeric_cols = load_and_validate_data('data.csv', 'target')
    >>> print(f"Features shape: {X.shape}, Target shape: {y.shape}")
    >>> print(f"Numeric columns: {numeric_cols}")

    Notes
    -----
    - Missing values (NaN) are preserved and not filtered out
    - Non-numeric columns are retained in the features DataFrame
    - The target column is always removed from features, regardless of its dtype
    """
    path = Path(file_path)

    if not path.exists():
        raise FileNotFoundError(f"The file '{file_path}' does not exist.")

    if not path.is_file():
        raise FileNotFoundError(f"The path '{file_path}' is not a file.")

    # Probe readability before parsing so that a permission problem is
    # reported as PermissionError instead of a generic CSV read failure.
    try:
        with open(path, 'r'):
            pass
    except PermissionError as e:
        raise PermissionError(f"Permission denied when trying to read '{file_path}'.") from e
    except Exception as e:
        raise ValueError(f"Unable to read file '{file_path}': {str(e)}") from e

    try:
        df = pd.read_csv(file_path)
    except pd.errors.EmptyDataError as e:
        raise ValueError(f"The CSV file '{file_path}' is empty.") from e
    except pd.errors.ParserError as e:
        raise pd.errors.ParserError(f"Failed to parse '{file_path}' as a valid CSV file: {str(e)}") from e
    except (FileNotFoundError, PermissionError):
        # The file vanished or lost read permission between the probe and the
        # read; keep the documented exception types instead of masking them
        # as ValueError in the catch-all below.
        raise
    except Exception as e:
        raise ValueError(f"Error reading CSV file '{file_path}': {str(e)}") from e

    # Check columns before rows: DataFrame.empty is also True for a frame
    # without columns, so testing it first would make this branch unreachable
    # and report the wrong problem.
    if len(df.columns) == 0:
        raise ValueError(f"The CSV file '{file_path}' contains no columns.")

    if df.empty:
        raise ValueError(f"The CSV file '{file_path}' contains no data rows.")

    if target_column not in df.columns:
        # str() guards the join against non-string column labels.
        available_columns = "', '".join(str(column) for column in df.columns)
        raise ValueError(
            f"Target column '{target_column}' not found in CSV file. "
            f"Available columns: '{available_columns}'"
        )

    # Copy the target so later mutation of y cannot touch the loaded frame.
    y = df[target_column].copy()
    X = df.drop(columns=[target_column])

    if X.shape[1] == 0:
        raise ValueError(
            f"No feature columns remain after removing target column '{target_column}'. "
            f"The CSV must contain at least one feature column in addition to the target."
        )

    # Explicit dtype list (rather than 'number') keeps the selection limited
    # to plain NumPy integer and float columns.
    numeric_features = X.select_dtypes(
        include=[
            'int64', 'int32', 'int16', 'int8',
            'float64', 'float32', 'float16',
            'uint64', 'uint32', 'uint16', 'uint8',
        ]
    ).columns.tolist()

    return X, y, numeric_features

0 commit comments

Comments
 (0)