In [2]:
import pandas as pd
import numpy as np
from knsmote import KMeansSMOTE
# from imblearn.over_sampling import KMeansSMOTE
from sklearn.cluster import MiniBatchKMeans
from collections import Counter
from data import get_data

In [3]:
# Load the feature matrix, the labels, and the feature names
X, y, X_feature_names = get_data()

# Report how many samples each class has before any resampling
class_counts = Counter(y)
print("Original class distribution:", class_counts)

Original class distribution: Counter({4: 22503, 3: 8570, 0: 306, 2: 148, 1: 29})


In [8]:
import numpy as np
from collections import Counter
from sklearn.cluster import MiniBatchKMeans
from imblearn.over_sampling import KMeansSMOTE
from scipy import sparse

# Load features, labels, and feature names
X, y, X_feature_names = get_data()

# Remember the pre-resampling size; the checks further down compare row
# positions against this count
original_sample_count = len(y)
print("Original class distribution:", Counter(y))
print(f"Original dataset size: {original_sample_count} samples")

# The checks below index rows as X[i]; switch a COO input to CSR, which
# supports that kind of indexing
input_is_coo = sparse.isspmatrix_coo(X)
if input_is_coo:
    X = X.tocsr()
    print("Converted COO matrix to CSR matrix for indexing")

# Keep the experiment simple: oversample a single minority class
minority_class = 1  # class 1 has only 29 samples in the distribution printed above
target_count = 100  # requested sample count for that class (kept small for easy verification)

# Build the pieces of the oversampler separately, then assemble them
cluster_estimator = MiniBatchKMeans(n_clusters=20, n_init=1, random_state=0)
requested_counts = {minority_class: target_count}
smote = KMeansSMOTE(
    kmeans_estimator=cluster_estimator,
    cluster_balance_threshold=0.001,
    sampling_strategy=requested_counts,
    random_state=42,
)

# Run the resampling on the full dataset
X_resampled, y_resampled = smote.fit_resample(X, y)

# Make the resampled matrix row-indexable as well if it came back in COO format
output_is_coo = sparse.isspmatrix_coo(X_resampled)
if output_is_coo:
    X_resampled = X_resampled.tocsr()
    print("Converted resampled COO matrix to CSR matrix for indexing")

# Summarize the result of the resampling
print(f"Resampled dataset size: {len(y_resampled)} samples")
print("Resampled class distribution:", Counter(y_resampled))

# Verification test 1: Check if original samples are preserved at the beginning
print("\n--- Testing if original samples are preserved at the beginning ---")

# Spot check: print the comparison for the first few rows so the result is
# readable. This loop is for display only; the verdict is computed below.
for i in range(min(5, original_sample_count)):
    # For sparse matrices, use .toarray() to get actual values
    original_row = X[i].toarray() if sparse.issparse(X) else X[i]
    resampled_row = X_resampled[i].toarray() if sparse.issparse(X_resampled) else X_resampled[i]

    match = np.array_equal(original_row, resampled_row)
    print(f"Sample {i}: Original matches resampled? {match}")
    if not match:
        break

# Full check: compare ALL original rows against the leading rows of the
# resampled matrix, so the summary line below is true for the whole dataset
# and not only for the five rows displayed above.
if sparse.issparse(X) or sparse.issparse(X_resampled):
    # Element-wise `!=` on two CSR matrices yields a sparse boolean matrix
    # whose non-zero entries mark the differing positions.
    original_csr = sparse.csr_matrix(X)
    leading_csr = sparse.csr_matrix(X_resampled)[:original_sample_count]
    samples_match = bool(
        original_csr.shape == leading_csr.shape
        and (original_csr != leading_csr).count_nonzero() == 0
    )
else:
    leading_rows = np.asarray(X_resampled)[:original_sample_count]
    samples_match = bool(np.array_equal(np.asarray(X), leading_rows))

print("\nAll original samples preserved at beginning:", samples_match)

# Verification test 2: Check a specific feature to see the difference between original and synthetic
print("\n--- Checking some features in original vs synthetic samples ---")


def _feature_column(matrix, column):
    """Return one column of a dense or SciPy sparse 2-D matrix as a 1-D NumPy array.

    Parameters
    ----------
    matrix : array-like or scipy.sparse matrix
        Two-dimensional data, one row per sample.
    column : int
        Index of the column to extract.

    Returns
    -------
    numpy.ndarray
        The selected column as a flat array.
    """
    if sparse.issparse(matrix):
        # Slice the column while still sparse and densify only that column,
        # instead of materializing the whole matrix. CSC is used because it
        # supports column slicing regardless of the input's sparse format.
        return matrix.tocsc()[:, column].toarray().ravel()
    return np.asarray(matrix)[:, column]


# Select a feature (column) to compare
feature_idx = 0  # First feature
feature_name = X_feature_names[feature_idx] if X_feature_names is not None else f"Feature {feature_idx}"

# Extract that feature from the original and the resampled data; each matrix
# is handled according to its own type (dense or sparse)
original_feature_values = _feature_column(X, feature_idx)
resampled_feature_values = _feature_column(X_resampled, feature_idx)

# Print statistics for that feature to see the difference
print(f"Feature '{feature_name}' statistics:")
print(f"  Original: mean={original_feature_values.mean():.4f}, std={original_feature_values.std():.4f}")
print(f"  Resampled: mean={resampled_feature_values.mean():.4f}, std={resampled_feature_values.std():.4f}")

# Look at new synthetic samples for the minority class
# (f-string, so the class label is interpolated into the header)
print(f"\n--- Examining synthetic samples for class {minority_class} ---")

# A row counts as synthetic here when it carries the minority label AND sits at
# a position past the original row count. This assumes the resampler appends
# new rows after the original ones (verification test 1 checks the leading rows).
labels_resampled = np.asarray(y_resampled)  # works for list / Series / ndarray labels
is_appended = np.arange(len(labels_resampled)) >= original_sample_count
synthetic_indices = np.flatnonzero((labels_resampled == minority_class) & is_appended)
print(f"Number of synthetic samples for class {minority_class}: {len(synthetic_indices)}")

if len(synthetic_indices) > 0:
    # Slicing already copes with fewer than five entries
    print(f"First few synthetic sample indices: {synthetic_indices[:5]}")

    # Loop-invariant type checks, hoisted out of the row scans below
    X_is_sparse = sparse.issparse(X)
    resampled_is_sparse = sparse.issparse(X_resampled)

    # Check if the first 3 synthetic samples are truly new (not in original data)
    for i in synthetic_indices[:3]:
        # Get the synthetic sample as a dense row
        synthetic_sample = X_resampled[i].toarray() if resampled_is_sparse else X_resampled[i]

        # Scan the original rows and stop at the first exact match
        for j in range(original_sample_count):
            original_sample = X[j].toarray() if X_is_sparse else X[j]

            if np.array_equal(synthetic_sample, original_sample):
                print(f"Synthetic sample at index {i} is identical to original sample at index {j}")
                break
        else:
            # The scan finished without a break: no original row is identical
            print(f"Synthetic sample at index {i} is truly new (not found in original data)")

Original class distribution: Counter({4: 22503, 3: 8570, 0: 306, 2: 148, 1: 29})
Original dataset size: 31556 samples
Converted COO matrix to CSR matrix for indexing
Resampled dataset size: 31628 samples
Resampled class distribution: Counter({4: 22503, 3: 8570, 0: 306, 2: 148, 1: 101})

--- Testing if original samples are preserved at the beginning ---
Sample 0: Original matches resampled? True
Sample 1: Original matches resampled? True
Sample 2: Original matches resampled? True
Sample 3: Original matches resampled? True
Sample 4: Original matches resampled? True

All original samples preserved at beginning: True

--- Checking some features in original vs synthetic samples ---
Feature 'category_0' statistics:
  Original: mean=0.0000, std=0.0056
  Resampled: mean=0.0000, std=0.0056

--- Examining synthetic samples for class {minority_class} ---
Number of synthetic samples for class 1: 72
First few synthetic sample indices: [31556 31557 31558 31559 31560]
Synthetic sample at index 31556 