In [1]:
import pandas as pd
import numpy as np
from knsmote import KMeansSMOTE
# from imblearn.over_sampling import KMeansSMOTE
from sklearn.cluster import MiniBatchKMeans
from collections import Counter
from data import get_data
import scipy.sparse as sparse

In [2]:
# Load the feature matrix X and the target labels y via the project-local
# `data.get_data` helper. X may be a scipy sparse matrix (the next cell checks
# for COO). X_feature_names is presumably the column names of X -- TODO confirm.
X, y, X_feature_names = get_data()

In [3]:
# COO matrices cannot be row-indexed, so switch a COO input over to CSR;
# any other container is passed through unchanged.
X = X.tocsr() if sparse.isspmatrix_coo(X) else X

# Report the class balance before resampling and remember how many rows
# belong to the original data set.
print("Original class distribution:", Counter(y))
original_sample_count = len(y)

# Number of synthetic rows wanted for every class, and the distinct labels.
samples_per_class = 15000
unique_classes = np.unique(y)

# Per-class target handed to the sampler: the class's current size plus
# samples_per_class, so that enough rows remain once the original rows are
# set aside again.
sampling_strategy = dict(
    (label, samples_per_class + np.sum(y == label))
    for label in unique_classes
)

# Clustering model used by KMeansSMOTE, followed by the sampler itself.
cluster_model = MiniBatchKMeans(n_clusters=20, n_init=1, random_state=0)
smote = KMeansSMOTE(
    sampling_strategy=sampling_strategy,  # per-class targets computed above
    kmeans_estimator=cluster_model,
    cluster_balance_threshold=0.001,
    random_state=42,
)

# Run the over-sampling.
X_resampled, y_resampled = smote.fit_resample(X, y)

# Same COO -> CSR normalisation for the sampler's output.
if sparse.isspmatrix_coo(X_resampled):
    X_resampled = X_resampled.tocsr()

print("Resampled class distribution:", Counter(y_resampled))

# Collect the purely synthetic rows: one dense block of features and one label
# vector per class, stacked together after the loop.
synthetic_X = []
synthetic_y = []

# Seeded generator so that padding with duplicates (only needed when the
# sampler under-delivers for a class) is reproducible across runs.
rng = np.random.default_rng(42)

# Rows at positions >= original_sample_count are treated as synthetic, i.e. the
# sampler is assumed to append generated rows after the original ones
# -- TODO confirm for the local `knsmote` implementation.
# The mask does not depend on the class, so it is built once.
is_synthetic = np.arange(len(y_resampled)) >= original_sample_count

# For each class, extract exactly `samples_per_class` synthetic samples
for class_label in unique_classes:
    print(f"Extracting synthetic samples for class {class_label}...")

    # Positions of the synthetic rows that carry this label
    synthetic_class_indices = np.where((y_resampled == class_label) & is_synthetic)[0]

    # If the sampler did not generate enough synthetic samples, pad with duplicates
    if len(synthetic_class_indices) < samples_per_class:
        print(f"Warning: Only generated {len(synthetic_class_indices)} synthetic samples for class {class_label}")

        # Nothing to duplicate from: fail with an explicit message instead of
        # the opaque error raised when sampling from an empty array.
        if len(synthetic_class_indices) == 0:
            raise ValueError(
                f"No synthetic samples were generated for class {class_label}; "
                f"cannot draw {samples_per_class} samples from an empty pool."
            )

        needed = samples_per_class - len(synthetic_class_indices)
        duplicate_indices = rng.choice(synthetic_class_indices, size=needed, replace=True)

        # Genuine synthetic rows first, then the duplicates
        synthetic_class_indices = np.concatenate([synthetic_class_indices, duplicate_indices])

    # The sampler may overshoot the target; keep only the first `samples_per_class`
    synthetic_class_indices = synthetic_class_indices[:samples_per_class]

    # Extract the synthetic samples as a dense block.
    # NOTE(review): densifying every class keeps the whole result in memory as a
    # dense array; if X_resampled is sparse this can be large -- consider
    # stacking sparse blocks instead if memory becomes a problem.
    if sparse.issparse(X_resampled):
        class_synthetic_X = X_resampled[synthetic_class_indices].toarray()
    else:
        class_synthetic_X = X_resampled[synthetic_class_indices]

    class_synthetic_y = y_resampled[synthetic_class_indices]

    # Add to our collection
    synthetic_X.append(class_synthetic_X)
    synthetic_y.append(class_synthetic_y)

# Combine all synthetic samples
all_synthetic_X = np.vstack(synthetic_X)
all_synthetic_y = np.concatenate(synthetic_y)

# Every class contributes exactly `samples_per_class` rows, and X/y stay aligned.
assert all_synthetic_X.shape[0] == len(unique_classes) * samples_per_class
assert all_synthetic_X.shape[0] == len(all_synthetic_y)

Original class distribution: Counter({4: 22503, 3: 8570, 0: 306, 2: 148, 1: 29})
Resampled class distribution: Counter({4: 37509, 3: 23578, 0: 15309, 2: 15152, 1: 15031})
Extracting synthetic samples for class 0...
Extracting synthetic samples for class 1...
Extracting synthetic samples for class 2...
Extracting synthetic samples for class 3...
Extracting synthetic samples for class 4...


In [4]:
# Sanity check: the row count should be n_classes * samples_per_class
# (5 classes * 15000 = 75000 for the run recorded below).
all_synthetic_X.shape

(75000, 5287)

In [5]:
import h5py

In [6]:
def save_data_withh5py(X_res, y_res, path="output/features4ausw4linearsvc_trainsampledonlynew.h5"):
    """Write a feature matrix and its labels to an HDF5 file.

    The matrix is stored as its CSR component arrays plus its shape, in the
    datasets ``X_data``, ``X_indices``, ``X_indptr`` and ``X_shape``; the
    labels are stored in the dataset ``y``. An existing file at ``path`` is
    overwritten (the file is opened in ``"w"`` mode).

    Parameters
    ----------
    X_res : 2-D array-like or scipy sparse matrix
        Feature matrix. Dense input and sparse input in any format are
        converted to CSR before writing.
    y_res : array-like
        Labels, one per row of ``X_res``.
    path : str or path-like, optional
        Destination file. Defaults to the location this notebook has always
        written to. Missing parent directories are created.

    Raises
    ------
    ValueError
        If ``y_res`` has a different number of entries than ``X_res`` has rows.
    """
    import os  # local import so the function does not rely on another cell

    # Always normalise to CSR. Checking only "is it sparse?" is not enough:
    # CSC exposes `indices`/`indptr` with column-oriented meaning (the file
    # would be silently wrong when read back as CSR) and COO has neither.
    if sparse.issparse(X_res):
        X_csr = X_res.tocsr()
    else:
        X_csr = sparse.csr_matrix(X_res)

    # Misaligned features/labels are always a bug; catch it before writing.
    if np.ndim(y_res) >= 1 and np.shape(y_res)[0] != X_csr.shape[0]:
        raise ValueError(
            f"X has {X_csr.shape[0]} rows but y has {np.shape(y_res)[0]} entries"
        )

    # Make sure the destination directory exists before the file is created.
    parent_dir = os.path.dirname(os.fspath(path))
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with h5py.File(path, "w") as f:
        # Save the CSR components of X (no HDF5-level compression is applied)
        f.create_dataset("X_data", data=X_csr.data)
        f.create_dataset("X_indices", data=X_csr.indices)
        f.create_dataset("X_indptr", data=X_csr.indptr)
        f.create_dataset("X_shape", data=X_csr.shape)

        # Save y as a dense dataset
        f.create_dataset("y", data=y_res)

# Persist the synthetic-only feature matrix and its labels to the HDF5 file.
save_data_withh5py(all_synthetic_X, all_synthetic_y)