Skip to content

Commit 6400a9f

Browse files
artemisTurintechpaulsbrookes
authored andcommitted
build: add project dependencies to pyproject.toml for clustering analysis package
Add click, pandas, scikit-learn, seaborn, matplotlib, and pyyaml dependencies with minimum version requirements for CLI, data processing, clustering, and visualization
1 parent acc6e7a commit 6400a9f

File tree

7 files changed

+832
-1
lines changed

7 files changed

+832
-1
lines changed

clustering_toolkit/__init__.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
"""
2+
Clustering Toolkit Package
3+
4+
A modular package for data clustering analysis, providing functionality for:
5+
- Data loading and validation
6+
- Data preprocessing and transformation
7+
- Clustering algorithm implementations
8+
- Cluster evaluation metrics
9+
- Visualization and reporting
10+
"""
11+
12+
__version__ = "0.1.0"
13+
14+
__all__ = [
15+
"data_loader",
16+
"preprocessor",
17+
"clustering",
18+
"evaluation",
19+
"visualization",
20+
]

clustering_toolkit/clustering.py

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
"""
2+
Clustering Module
3+
4+
This module provides implementations and wrappers for various clustering
5+
algorithms. It supports multiple clustering approaches:
6+
7+
- K-Means clustering
8+
- Hierarchical clustering (Agglomerative, Divisive)
9+
- DBSCAN (Density-Based Spatial Clustering)
10+
- Gaussian Mixture Models (GMM)
11+
- Spectral clustering
12+
- Mean Shift clustering
13+
14+
The module provides a unified interface for applying different clustering
15+
algorithms and comparing their results.
16+
17+
Typical usage:
18+
from clustering_toolkit.clustering import KMeansClustering, DBSCANClustering
19+
20+
kmeans = KMeansClustering(n_clusters=3)
21+
labels = kmeans.fit_predict(data)
22+
23+
dbscan = DBSCANClustering(eps=0.5, min_samples=5)
24+
labels = dbscan.fit_predict(data)
25+
"""
26+
27+
import pandas as pd
28+
import numpy as np
29+
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering, SpectralClustering
30+
from sklearn.mixture import GaussianMixture
31+
from typing import Optional, Union, Literal
32+
from abc import ABC, abstractmethod
33+
34+
35+
class ClusteringAlgorithm(ABC):
36+
"""Base class for clustering algorithms."""
37+
38+
def __init__(self):
39+
self.labels_ = None
40+
self.model = None
41+
42+
@abstractmethod
43+
def fit(self, data: Union[pd.DataFrame, np.ndarray]):
44+
"""Fit the clustering model to the data."""
45+
pass
46+
47+
@abstractmethod
48+
def predict(self, data: Union[pd.DataFrame, np.ndarray]) -> np.ndarray:
49+
"""Predict cluster labels for the data."""
50+
pass
51+
52+
def fit_predict(self, data: Union[pd.DataFrame, np.ndarray]) -> np.ndarray:
53+
"""Fit the model and return cluster labels."""
54+
self.fit(data)
55+
return self.predict(data)
56+
57+
58+
class KMeansClustering(ClusteringAlgorithm):
59+
"""K-Means clustering implementation."""
60+
61+
def __init__(self, n_clusters: int = 3, random_state: Optional[int] = 42, **kwargs):
62+
super().__init__()
63+
self.n_clusters = n_clusters
64+
self.random_state = random_state
65+
self.model = KMeans(n_clusters=n_clusters, random_state=random_state, **kwargs)
66+
67+
def fit(self, data: Union[pd.DataFrame, np.ndarray]):
68+
"""Fit K-Means model to the data."""
69+
self.model.fit(data)
70+
self.labels_ = self.model.labels_
71+
return self
72+
73+
def predict(self, data: Union[pd.DataFrame, np.ndarray]) -> np.ndarray:
74+
"""Predict cluster labels."""
75+
return self.model.predict(data)
76+
77+
78+
class DBSCANClustering(ClusteringAlgorithm):
79+
"""DBSCAN clustering implementation."""
80+
81+
def __init__(self, eps: float = 0.5, min_samples: int = 5, **kwargs):
82+
super().__init__()
83+
self.eps = eps
84+
self.min_samples = min_samples
85+
self.model = DBSCAN(eps=eps, min_samples=min_samples, **kwargs)
86+
87+
def fit(self, data: Union[pd.DataFrame, np.ndarray]):
88+
"""Fit DBSCAN model to the data."""
89+
self.labels_ = self.model.fit_predict(data)
90+
return self
91+
92+
def predict(self, data: Union[pd.DataFrame, np.ndarray]) -> np.ndarray:
93+
"""Return cluster labels (DBSCAN doesn't support prediction on new data)."""
94+
return self.labels_
95+
96+
97+
class HierarchicalClustering(ClusteringAlgorithm):
98+
"""Hierarchical (Agglomerative) clustering implementation."""
99+
100+
def __init__(self, n_clusters: int = 3, linkage: str = 'ward', **kwargs):
101+
super().__init__()
102+
self.n_clusters = n_clusters
103+
self.linkage = linkage
104+
self.model = AgglomerativeClustering(n_clusters=n_clusters, linkage=linkage, **kwargs)
105+
106+
def fit(self, data: Union[pd.DataFrame, np.ndarray]):
107+
"""Fit hierarchical clustering model to the data."""
108+
self.labels_ = self.model.fit_predict(data)
109+
return self
110+
111+
def predict(self, data: Union[pd.DataFrame, np.ndarray]) -> np.ndarray:
112+
"""Return cluster labels (Hierarchical clustering doesn't support prediction)."""
113+
return self.labels_
114+
115+
116+
def cluster_data(
117+
data: Union[pd.DataFrame, np.ndarray],
118+
algorithm: Literal['kmeans', 'dbscan', 'hierarchical'] = 'kmeans',
119+
**kwargs
120+
) -> np.ndarray:
121+
"""
122+
Cluster data using the specified algorithm.
123+
124+
Args:
125+
data: Data to cluster
126+
algorithm: Clustering algorithm to use
127+
**kwargs: Algorithm-specific parameters
128+
129+
Returns:
130+
Array of cluster labels
131+
"""
132+
if algorithm == 'kmeans':
133+
clusterer = KMeansClustering(**kwargs)
134+
elif algorithm == 'dbscan':
135+
clusterer = DBSCANClustering(**kwargs)
136+
elif algorithm == 'hierarchical':
137+
clusterer = HierarchicalClustering(**kwargs)
138+
else:
139+
raise ValueError(f"Unknown algorithm: {algorithm}")
140+
141+
return clusterer.fit_predict(data)

clustering_toolkit/data_loader.py

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
"""
2+
Data Loader Module
3+
4+
This module provides functionality for loading and validating CSV data files
5+
for clustering analysis. It handles:
6+
7+
- CSV file reading with proper encoding and delimiter detection
8+
- Data validation (checking for required columns, data types, etc.)
9+
- Basic data quality checks (missing values, duplicates, etc.)
10+
- Data integrity verification before processing
11+
- Error handling for malformed or incompatible data files
12+
13+
Typical usage:
14+
from clustering_toolkit.data_loader import load_data, validate_data
15+
16+
data = load_data('path/to/data.csv')
17+
is_valid, errors = validate_data(data)
18+
"""
19+
20+
import pandas as pd
21+
from pathlib import Path
22+
from typing import Optional, Tuple, List
23+
24+
25+
def load_data(file_path: str, **kwargs) -> pd.DataFrame:
26+
"""
27+
Load data from a CSV file.
28+
29+
Args:
30+
file_path: Path to the CSV file
31+
**kwargs: Additional arguments to pass to pandas.read_csv()
32+
33+
Returns:
34+
DataFrame containing the loaded data
35+
36+
Raises:
37+
FileNotFoundError: If the file doesn't exist
38+
pd.errors.ParserError: If the file cannot be parsed
39+
"""
40+
path = Path(file_path)
41+
if not path.exists():
42+
raise FileNotFoundError(f"File not found: {file_path}")
43+
44+
return pd.read_csv(file_path, **kwargs)
45+
46+
47+
def validate_data(data: pd.DataFrame) -> Tuple[bool, List[str]]:
48+
"""
49+
Validate the loaded data for clustering analysis.
50+
51+
Args:
52+
data: DataFrame to validate
53+
54+
Returns:
55+
Tuple of (is_valid, error_messages)
56+
"""
57+
errors = []
58+
59+
if data.empty:
60+
errors.append("Data is empty")
61+
62+
if len(data.columns) == 0:
63+
errors.append("No columns found in data")
64+
65+
return len(errors) == 0, errors

0 commit comments

Comments
 (0)