In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris, load_wine, load_breast_cancer, load_digits, fetch_california_housing
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import entropy, variation
from DynamicHybridSampler import DynamicHybridSampler  # Make sure your class is in this file or properly imported
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_openml

# List of dataset names to load from OpenML (UCI datasets are mirrored here)
dataset_names = [
    'iris',
    'wine',
    'breast-cancer',
    'heart-disease',
    'adult',
    'bank-marketing',
    'mushroom',
    'yeast',
    'ecoli',
    'abalone'
]

def load_datasets(names):
    datasets = {}
    for name in names:
        try:
            data = fetch_openml(name, version=1, as_frame=True)
            df = data.frame
            datasets[name] = df
        except Exception as e:
            print(f"Failed to load {name}: {e}")
    return datasets

datasets = load_datasets(dataset_names)

In [2]:
from scipy.stats import entropy

def compute_kl_divergence(original, sample, bins=10):
    scores = {}
    numeric_cols = original.select_dtypes(include=np.number).columns
    for col in numeric_cols:
        orig_hist, _ = np.histogram(original[col].dropna(), bins=bins, density=True)
        sample_hist, _ = np.histogram(sample[col].dropna(), bins=bins, density=True)
        orig_hist += 1e-6  # Avoid zero division
        sample_hist += 1e-6
        scores[col] = entropy(orig_hist, sample_hist)
    return scores

def category_coverage(original, sample):
    coverage = {}
    cat_cols = original.select_dtypes(include='object').columns
    for col in cat_cols:
        orig_vals = set(original[col].dropna().unique())
        sample_vals = set(sample[col].dropna().unique())
        coverage[col] = len(sample_vals & orig_vals) / max(len(orig_vals), 1)
    return coverage

results_summary = {}

for name, df in datasets.items():
    print(f"Processing dataset: {name}")
    try:
        sampler = DynamicHybridSampler()
        sampler.fit(df)
        sampled = sampler.get_sample()
        explanation = sampler.get_explanation()

        kl_scores = compute_kl_divergence(df, sampled)
        coverage_scores = category_coverage(df, sampled)

        results_summary[name] = {
            "original_size": len(df),
            "sample_size": len(sampled),
            "kl_divergence": kl_scores,
            "category_coverage": coverage_scores,
            "explanation_steps": explanation[:5]  # Preview first few steps
        }
    except Exception as e:
        print(f"Failed to process {name}: {e}")


Profiling data...
Sampling rare categories from categorical columns...
Applying stratified sampling on 'class'...
Stratified sampling on 'class' completed
No temporal data found, applying systematic sampling...
Systematic sampling applied with interval 1
Checking for multi-stage sampling opportunities...
Multistage sampling skipped due to missing hierarchical fields
Encoding categorical columns for clustering...
Running DBSCAN clustering...
Silhouette Score for DBSCAN: 0.21


Processing dataset: iris


Included 3 samples from noise (outlier)
Included 3 samples from cluster 0
Included 3 samples from cluster 1
Included 3 samples from cluster 2
Included 3 samples from cluster 3
Sampling from skewed numerical features using refined PPS...
Final sample size: 100
Sample size achieved: 100
Profiling data...
Sampling rare categories from categorical columns...
Applying stratified sampling on 'class'...
Stratified sampling on 'class' completed
No temporal data found, applying systematic sampling...
Systematic sampling applied with interval 1
Checking for multi-stage sampling opportunities...
Multistage sampling skipped due to missing hierarchical fields
Encoding categorical columns for clustering...
Running DBSCAN clustering...
DBSCAN clustering did not produce multiple clusters
Included 3 samples from noise (outlier)
Sampling from skewed numerical features using refined PPS...
RPPS sampling on skewed column 'Malic_acid'
RPPS sampling on skewed column 'Magnesium'
Final sample size: 100
Sample

Processing dataset: wine
Processing dataset: breast-cancer


Final sample size: 100
Sample size achieved: 100
Profiling data...
Sampling rare categories from categorical columns...
No temporal data found, applying systematic sampling...
Systematic sampling applied with interval 3
Checking for multi-stage sampling opportunities...
Multistage sampling skipped due to missing hierarchical fields
Encoding categorical columns for clustering...
Running DBSCAN clustering...
DBSCAN clustering did not produce multiple clusters
Included 3 samples from noise (outlier)
Sampling from skewed numerical features using refined PPS...
RPPS sampling on skewed column 'chol'
RPPS sampling on skewed column 'fbs'
RPPS sampling on skewed column 'oldpeak'
RPPS sampling on skewed column 'ca'
Final sample size: 100
Sample size achieved: 100
Profiling data...
Sampling rare categories from categorical columns...
Included 1 rare samples from column 'native-country'
Applying stratified sampling on 'occupation'...


Processing dataset: heart-disease
Processing dataset: adult


Stratified sampling on 'occupation' completed
No temporal data found, applying systematic sampling...
Systematic sampling applied with interval 488
Checking for multi-stage sampling opportunities...
Multistage sampling skipped due to missing hierarchical fields
Encoding categorical columns for clustering...
Running DBSCAN clustering...
Silhouette Score for DBSCAN: -0.43
Included 3 samples from noise (outlier)
Included 3 samples from cluster 0
Included 3 samples from cluster 1
Included 3 samples from cluster 2
Included 3 samples from cluster 3
Included 3 samples from cluster 4
Included 3 samples from cluster 5
Included 3 samples from cluster 6
Included 3 samples from cluster 7
Included 3 samples from cluster 8
Included 3 samples from cluster 9
Included 3 samples from cluster 10
Included 3 samples from cluster 11
Included 3 samples from cluster 12
Included 3 samples from cluster 13
Included 3 samples from cluster 14
Included 3 samples from cluster 15
Included 3 samples from cluster 16
In

Processing dataset: bank-marketing


Silhouette Score for DBSCAN: -0.34
Included 3 samples from noise (outlier)
Included 3 samples from cluster 0
Included 3 samples from cluster 1
Included 3 samples from cluster 2
Included 3 samples from cluster 3
Included 3 samples from cluster 4
Included 3 samples from cluster 5
Included 3 samples from cluster 6
Included 3 samples from cluster 7
Included 3 samples from cluster 8
Included 3 samples from cluster 9
Included 3 samples from cluster 10
Included 3 samples from cluster 11
Included 3 samples from cluster 12
Included 3 samples from cluster 13
Included 3 samples from cluster 14
Included 3 samples from cluster 15
Included 3 samples from cluster 16
Included 3 samples from cluster 17
Included 3 samples from cluster 18
Included 3 samples from cluster 19
Included 3 samples from cluster 20
Included 3 samples from cluster 21
Included 3 samples from cluster 22
Included 3 samples from cluster 23
Included 3 samples from cluster 24
Included 3 samples from cluster 25
Included 3 samples from c

Processing dataset: mushroom
Processing dataset: yeast


No temporal data found, applying systematic sampling...
Systematic sampling applied with interval 14
Checking for multi-stage sampling opportunities...
Multistage sampling skipped due to missing hierarchical fields
Encoding categorical columns for clustering...
Running DBSCAN clustering...
Silhouette Score for DBSCAN: -0.09
Included 3 samples from noise (outlier)
Included 3 samples from cluster 0
Included 3 samples from cluster 1
Included 3 samples from cluster 2
Included 3 samples from cluster 3
Included 3 samples from cluster 4
Included 3 samples from cluster 5
Included 3 samples from cluster 6
Sampling from skewed numerical features using refined PPS...
RPPS sampling on skewed column 'mit'
RPPS sampling on skewed column 'erl'
RPPS sampling on skewed column 'pox'
RPPS sampling on skewed column 'vac'
RPPS sampling on skewed column 'nuc'
Final sample size: 100
Sample size achieved: 100
Profiling data...
Sampling rare categories from categorical columns...
Applying stratified sampling o

Processing dataset: ecoli
Processing dataset: abalone


Silhouette Score for DBSCAN: 0.13
Included 3 samples from noise (outlier)
Included 3 samples from cluster 0
Included 3 samples from cluster 1
Included 3 samples from cluster 2
Included 3 samples from cluster 3
Included 3 samples from cluster 4
Included 3 samples from cluster 5
Included 3 samples from cluster 6
Sampling from skewed numerical features using refined PPS...
RPPS sampling on skewed column 'Height'
Final sample size: 100
Sample size achieved: 100


In [3]:
for dataset, metrics in results_summary.items():
    print(f"\nDataset: {dataset}")
    print(f"Original Size: {metrics['original_size']}, Sample Size: {metrics['sample_size']}")
    print("KL Divergence:")
    for col, score in metrics['kl_divergence'].items():
        print(f"  {col}: {score:.4f}")
    print("Category Coverage:")
    for col, coverage in metrics['category_coverage'].items():
        print(f"  {col}: {coverage:.2%}")
    print("Explanation Steps:")
    for step in metrics['explanation_steps']:
        print(f"  - {step}")



Dataset: iris
Original Size: 150, Sample Size: 100
KL Divergence:
  sepallength: 0.1006
  sepalwidth: 0.1367
  petallength: 0.0608
  petalwidth: 0.7295
Category Coverage:
Explanation Steps:
  - Profiling data...
  - Sampling rare categories from categorical columns...
  - Applying stratified sampling on 'class'...
  - Stratified sampling on 'class' completed
  - No temporal data found, applying systematic sampling...

Dataset: wine
Original Size: 178, Sample Size: 100
KL Divergence:
  Alcohol: 0.3138
  Malic_acid: 0.1046
  Ash: 0.5451
  Alcalinity_of_ash: 0.3135
  Magnesium: 0.3637
  Total_phenols: 0.2285
  Flavanoids: 0.4095
  Nonflavanoid_phenols: 0.0352
  Proanthocyanins: 0.3462
  Color_intensity: 0.0993
  Hue: 0.3900
  OD280%2FOD315_of_diluted_wines: 0.0676
  Proline: 0.0677
Category Coverage:
Explanation Steps:
  - Profiling data...
  - Sampling rare categories from categorical columns...
  - Applying stratified sampling on 'class'...
  - Stratified sampling on 'class' completed
