In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import (
    silhouette_score,
    davies_bouldin_score,
    calinski_harabasz_score,
    adjusted_rand_score
)
from sklearn.decomposition import PCA

import json
import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings("ignore")

# Plot defaults: figure size first, then the seaborn grid style (same order as before).
plt.rcParams['figure.figsize'] = (8, 5)
sns.set(style="whitegrid")

# Create the output folders up front so later cells can write into them.
for artifact_dir in ("artifacts/figures", "artifacts/labels"):
    Path(artifact_dir).mkdir(parents=True, exist_ok=True)

# Datasets to process, in this order.
DATASET_PATHS = [
    "data/S07-hw-dataset-01.csv",
    "data/S07-hw-dataset-02.csv",
    "data/S07-hw-dataset-04.csv",
]

# Empty notebook-level dictionaries; `best_configs` is re-created and filled by
# the model-selection cell further down.
all_metrics, best_configs = {}, {}

In [17]:
def get_column_types(df):
    """Split the feature columns of ``df`` into numeric and non-numeric names.

    The ``sample_id`` column is never treated as a feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table.

    Returns
    -------
    tuple[list, list]
        ``(num_cols, cat_cols)``: names of the columns whose dtype is a numpy
        number, followed by the names of every other feature column. Both
        lists keep the column order of ``df``.
    """
    feature_names = [name for name in df.columns if name != "sample_id"]
    features = df[feature_names]
    numeric_part = features.select_dtypes(include=[np.number])
    other_part = features.select_dtypes(exclude=[np.number])
    return numeric_part.columns.tolist(), other_part.columns.tolist()

def safe_silhouette(X, labels):
    """Silhouette score over non-noise points, or ``-1`` when it is undefined.

    Samples labelled ``-1`` (the DBSCAN noise label) are dropped before
    scoring, so the value describes only the points that were assigned to a
    cluster.

    Parameters
    ----------
    X : array-like supporting boolean row indexing
        Feature matrix, one row per sample.
    labels : array-like of int
        Cluster label per sample; ``-1`` marks noise. Plain lists are accepted.

    Returns
    -------
    float or int
        ``silhouette_score`` of the non-noise samples, or the sentinel ``-1``
        when fewer than two clusters remain or when every remaining sample
        forms its own cluster.
    """
    labels = np.asarray(labels)  # `labels != -1` on a plain list would be a bare bool
    keep = labels != -1
    kept_labels = labels[keep]
    n_clusters = len(np.unique(kept_labels))

    # silhouette_score is defined only for 2 <= n_clusters <= n_samples - 1 and
    # raises ValueError outside that range; map both ends to the sentinel.
    if n_clusters < 2 or n_clusters >= kept_labels.shape[0]:
        return -1
    return silhouette_score(X[keep], kept_labels)

def compute_metrics(X, labels, algo_name):
    """Compute internal clustering quality metrics for one labelling.

    For ``algo_name == "DBSCAN"`` the samples labelled ``-1`` (noise) are
    excluded before scoring; for any other algorithm every sample is scored.

    Parameters
    ----------
    X : array-like supporting boolean row indexing
        Feature matrix, one row per sample.
    labels : array-like of int
        Cluster label per sample; ``-1`` marks noise. Plain lists are accepted.
    algo_name : str
        Name of the algorithm that produced ``labels``. Only the exact value
        ``"DBSCAN"`` switches on noise filtering.

    Returns
    -------
    dict[str, float]
        Keys ``silhouette``, ``davies_bouldin``, ``calinski_harabasz`` and
        ``noise_ratio`` (share of samples labelled ``-1``). When the scores are
        undefined (fewer than two clusters, or every scored sample is its own
        cluster) the worst-case sentinels ``-1.0`` / ``inf`` / ``-1.0`` are
        returned.
    """
    labels = np.asarray(labels)
    noise_ratio = float(np.mean(labels == -1)) if labels.size else 0.0

    if algo_name == "DBSCAN":
        keep = labels != -1
        scored_labels = labels[keep]
    else:
        keep = None
        scored_labels = labels

    n_clusters = len(np.unique(scored_labels))
    # All three sklearn scores require 2 <= n_clusters <= n_samples - 1.
    if 2 <= n_clusters < scored_labels.shape[0]:
        scored_X = X if keep is None else X[keep]
        sil = silhouette_score(scored_X, scored_labels)
        db = davies_bouldin_score(scored_X, scored_labels)
        ch = calinski_harabasz_score(scored_X, scored_labels)
    else:
        # Davies-Bouldin is "lower is better", so its worst-case sentinel is
        # +inf; a -1 here would look like the best possible score.
        sil, db, ch = -1, np.inf, -1

    return {
        "silhouette": float(sil),
        "davies_bouldin": float(db),
        "calinski_harabasz": float(ch),
        "noise_ratio": float(noise_ratio)
    }

In [23]:
def _build_preprocessor(num_cols, cat_cols):
    """Assemble the ColumnTransformer for one dataset.

    Numeric columns get median imputation followed by standard scaling;
    non-numeric columns get most-frequent imputation followed by dense one-hot
    encoding. A branch is added only when its column list is non-empty, and
    columns that belong to neither list are dropped.
    """
    branches = []
    if num_cols:
        numeric_branch = Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),
        ])
        branches.append(("num", numeric_branch, num_cols))
    if cat_cols:
        categorical_branch = Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
        ])
        branches.append(("cat", categorical_branch, cat_cols))
    return ColumnTransformer(branches, remainder="drop")


# Per-dataset inputs for the clustering cells, keyed by dataset path.
results = {}

for ds_path in DATASET_PATHS:
    banner = '=' * 60
    print(f"\n{banner}")
    print(f"Обработка: {ds_path}")
    print(banner)

    # Load the table and separate the identifier column from the features.
    df = pd.read_csv(ds_path)
    sample_ids = df["sample_id"].copy()
    X_raw = df.drop(columns=["sample_id"])

    # Quick look at the data: shape, share of missing values, column kinds.
    print("Форма:", X_raw.shape)
    print("Пропуски:\n", X_raw.isnull().mean())
    num_cols, cat_cols = get_column_types(df)
    print("Числовые признаки:", num_cols)
    print("Категориальные признаки:", cat_cols)

    # Impute, scale and encode in a single fitted transformer.
    preprocessor = _build_preprocessor(num_cols, cat_cols)
    X_processed = preprocessor.fit_transform(X_raw)
    print(f"\nПосле препроцессинга: {X_processed.shape}")

    results[ds_path] = {
        "sample_ids": sample_ids,
        "X_raw": X_raw,
        "X_processed": X_processed,
        "num_cols": num_cols,
        "cat_cols": cat_cols,
    }


Обработка: data/S07-hw-dataset-01.csv
Форма: (12000, 8)
Пропуски:
 f01    0.0
f02    0.0
f03    0.0
f04    0.0
f05    0.0
f06    0.0
f07    0.0
f08    0.0
dtype: float64
Числовые признаки: ['f01', 'f02', 'f03', 'f04', 'f05', 'f06', 'f07', 'f08']
Категориальные признаки: []

После препроцессинга: (12000, 8)

Обработка: data/S07-hw-dataset-02.csv
Форма: (8000, 3)
Пропуски:
 x1         0.0
x2         0.0
z_noise    0.0
dtype: float64
Числовые признаки: ['x1', 'x2', 'z_noise']
Категориальные признаки: []

После препроцессинга: (8000, 3)

Обработка: data/S07-hw-dataset-04.csv
Форма: (10000, 32)
Пропуски:
 cat_a    0.0000
cat_b    0.0000
n01      0.0174
n02      0.0189
n03      0.0199
n04      0.0192
n05      0.0201
n06      0.0183
n07      0.0204
n08      0.0194
n09      0.0195
n10      0.0189
n11      0.0204
n12      0.0202
n13      0.0197
n14      0.0198
n15      0.0186
n16      0.0191
n17      0.0212
n18      0.0212
n19      0.0187
n20      0.0203
n21      0.0215
n22      0.0196
n23    

In [19]:
from itertools import product

# Model selection: for every dataset sweep KMeans over k and DBSCAN over an
# eps x min_samples grid, keep the labelling with the highest silhouette,
# save diagnostic figures and write the winning labels to CSV.
#
# NOTE(review): the DBSCAN silhouette comes from safe_silhouette, which scores
# non-noise points only, while the KMeans silhouette covers every sample. The
# two numbers are compared directly below, exactly as in the original cell;
# check `noise_ratio` in metrics_summary before trusting a DBSCAN winner.

# Search grids are the same for every dataset, so build them once.
k_range = range(2, 11)
eps_vals = np.linspace(0.3, 2.0, 10)
min_samples_vals = [3, 5, 10]
plot_min_samples = 5  # min_samples slice shown on the silhouette-vs-eps figure

metrics_summary = {}
best_configs = {}

for ds_path in DATASET_PATHS:

    X = results[ds_path]["X_processed"]
    sample_ids = results[ds_path]["sample_ids"]
    ds_name = Path(ds_path).stem

    metrics_summary[ds_name] = {}
    best_score = -np.inf
    best_model_info = None
    best_labels = None

    # ---- KMeans sweep over k ------------------------------------------------
    sil_scores_kmeans = []
    for k in k_range:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = kmeans.fit_predict(X)

        # compute_metrics already evaluates the silhouette; reuse that value
        # instead of running the pairwise-distance computation a second time.
        metrics = compute_metrics(X, labels, "KMeans")
        metrics_summary[ds_name][f"KMeans_k{k}"] = metrics
        sil = metrics["silhouette"]
        sil_scores_kmeans.append(sil)

        if sil > best_score:
            best_score = sil
            best_model_info = {"algo": "KMeans", "params": {"k": k}}
            best_labels = labels.copy()

    fig, ax = plt.subplots()
    ax.plot(k_range, sil_scores_kmeans, marker='o')
    ax.set_title(f"{ds_name}: Silhouette vs k (KMeans)")
    ax.set_xlabel("k")
    ax.set_ylabel("Silhouette Score")
    fig.savefig(f"artifacts/figures/sil_kmeans_{ds_name}.png", dpi=150, bbox_inches='tight')
    plt.close(fig)

    # ---- DBSCAN grid search -------------------------------------------------
    best_sil_dbscan = -1
    best_dbscan_params = None
    best_dbscan_labels = None
    dbscan_sil = {}  # (eps, min_samples) -> silhouette over non-noise points

    for eps, min_samples in product(eps_vals, min_samples_vals):
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        labels = dbscan.fit_predict(X)
        # safe_silhouette returns -1 when fewer than two clusters were found,
        # so such configurations can never beat the initial best of -1.
        sil = safe_silhouette(X, labels)
        dbscan_sil[(eps, min_samples)] = sil
        if sil > best_sil_dbscan:
            best_sil_dbscan = sil
            # Plain float keeps the printed config free of np.float64(...)
            # wrappers; the value itself is unchanged.
            best_dbscan_params = {"eps": float(eps), "min_samples": min_samples}
            best_dbscan_labels = labels.copy()

    if best_dbscan_params is not None:
        metrics = compute_metrics(X, best_dbscan_labels, "DBSCAN")
        model_key = f"DBSCAN_eps{best_dbscan_params['eps']:.2f}_ms{best_dbscan_params['min_samples']}"
        metrics_summary[ds_name][model_key] = metrics

        if best_sil_dbscan > best_score:
            best_score = best_sil_dbscan
            best_model_info = {"algo": "DBSCAN", "params": best_dbscan_params}
            best_labels = best_dbscan_labels.copy()

    # ---- Silhouette vs eps figure ------------------------------------------
    # DBSCAN gives the same labels for the same data and parameters, so the
    # scores from the grid above are reused; a fit happens here only if the
    # plotted min_samples value is not part of the grid.
    sil_vs_eps = []
    for eps in eps_vals:
        grid_key = (eps, plot_min_samples)
        if grid_key not in dbscan_sil:
            labels = DBSCAN(eps=eps, min_samples=plot_min_samples).fit_predict(X)
            dbscan_sil[grid_key] = safe_silhouette(X, labels)
        sil_vs_eps.append(dbscan_sil[grid_key])

    fig, ax = plt.subplots()
    ax.plot(eps_vals, sil_vs_eps, marker='o')
    ax.set_title(f"{ds_name}: Silhouette vs eps (DBSCAN, min_samples={plot_min_samples})")
    ax.set_xlabel("eps")
    ax.set_ylabel("Silhouette Score")
    fig.savefig(f"artifacts/figures/sil_dbscan_{ds_name}.png", dpi=150, bbox_inches='tight')
    plt.close(fig)

    # ---- Persist the winning labelling -------------------------------------
    output_df = pd.DataFrame({
        "sample_id": sample_ids,
        "cluster_label": best_labels
    })
    # File suffix is the part of the dataset name after the last '-'.
    output_df.to_csv(f"artifacts/labels/labels_hw07_{ds_name.split('-')[-1]}.csv", index=False)

    best_configs[ds_name] = best_model_info
    print(f"Лучший метод для {ds_name}: {best_model_info}")

Лучший метод для S07-hw-dataset-01: {'algo': 'KMeans', 'params': {'k': 2}}
Лучший метод для S07-hw-dataset-02: {'algo': 'DBSCAN', 'params': {'eps': np.float64(0.6777777777777778), 'min_samples': 3}}
Лучший метод для S07-hw-dataset-04: {'algo': 'DBSCAN', 'params': {'eps': np.float64(1.6222222222222222), 'min_samples': 10}}


In [20]:
# Project every dataset onto its first two principal components and colour
# the points by the cluster labels saved by the model-selection cell.
for ds_path in DATASET_PATHS:
    ds_name = Path(ds_path).stem
    X = results[ds_path]["X_processed"]

    # Labels are read back from disk, so the figure shows what was exported.
    label_file = f"artifacts/labels/labels_hw07_{ds_name.split('-')[-1]}.csv"
    labels_df = pd.read_csv(label_file)
    labels = labels_df["cluster_label"].values
    # Guard against a stale label file left over from a different run.
    assert len(labels) == X.shape[0], (
        f"{label_file}: {len(labels)} labels for {X.shape[0]} samples"
    )

    pca = PCA(n_components=2, random_state=42)
    X_pca = pca.fit_transform(X)

    fig, ax = plt.subplots()
    scatter = ax.scatter(X_pca[:, 0], X_pca[:, 1], c=labels, cmap="tab10", s=10)
    fig.colorbar(scatter, ax=ax)
    ax.set_title(f"{ds_name}: PCA (лучшая кластеризация)")
    ax.set_xlabel("PC1")
    ax.set_ylabel("PC2")
    fig.savefig(f"artifacts/figures/pca_{ds_name}.png", dpi=150, bbox_inches='tight')
    plt.close(fig)  # figures are created in a loop; release each one

In [21]:
# Stability check on the first dataset: refit KMeans with five different seeds
# and measure how well the resulting partitions agree (pairwise ARI).
ds_path = DATASET_PATHS[0]
ds_name = Path(ds_path).stem
X = results[ds_path]["X_processed"]

# The selected model for this dataset may be DBSCAN, whose params hold no "k".
# In that case use the KMeans run with the highest silhouette recorded in
# metrics_summary (keys look like "KMeans_k<k>").
best_cfg = best_configs[ds_name]
if best_cfg["algo"] == "KMeans":
    k_stability = best_cfg["params"]["k"]
else:
    kmeans_runs = {
        run_name: run_metrics
        for run_name, run_metrics in metrics_summary[ds_name].items()
        if run_name.startswith("KMeans_k")
    }
    best_run = max(kmeans_runs, key=lambda run_name: kmeans_runs[run_name]["silhouette"])
    k_stability = int(best_run[len("KMeans_k"):])

labels_list = []
for seed in range(5):
    kmeans = KMeans(n_clusters=k_stability, random_state=seed, n_init=10)
    labels = kmeans.fit_predict(X)
    labels_list.append(labels)

# ARI for every unordered pair of runs.
ari_scores = [
    adjusted_rand_score(labels_list[i], labels_list[j])
    for i in range(len(labels_list))
    for j in range(i + 1, len(labels_list))
]

mean_ari = float(np.mean(ari_scores))
std_ari = float(np.std(ari_scores))
print(f"Средний ARI: {mean_ari:.4f} ± {std_ari:.4f}")

with open("artifacts/stability_ari.json", "w", encoding="utf-8") as f:
    json.dump({
        "dataset": ds_name,
        "ari_scores": [float(x) for x in ari_scores],
        "mean_ari": mean_ari,
        "std_ari": std_ari
    }, f, indent=2)

Средний ARI: 1.0000 ± 0.0000


In [22]:
# Shallow copy of the per-dataset metrics (same keys, same inner dict objects).
final_metrics = dict(metrics_summary)

# Write both summaries as indented JSON under artifacts/.
for out_path, payload in (
    ("artifacts/metrics_summary.json", final_metrics),
    ("artifacts/best_configs.json", best_configs),
):
    with open(out_path, "w") as f:
        json.dump(payload, f, indent=2)