In [1]:
import pandas as pd
import numpy as np
from knsmote import KMeansSMOTE
# from imblearn.over_sampling import KMeansSMOTE
from sklearn.cluster import MiniBatchKMeans
from collections import Counter
from data import get_data
import scipy.sparse as sparse

In [2]:
# Define features and target variable
X, y, X_feature_names = get_data()

# Print original class distribution
print("Original class distribution:", Counter(y))

Original class distribution: Counter({4: 22503, 3: 8570, 0: 306, 2: 148, 1: 29})


In [3]:
# Apply KMeansSMOTE to oversample minority classes
# Adjust KMeansSMOTE parameters
smote = KMeansSMOTE(
    kmeans_estimator=MiniBatchKMeans(n_clusters=20, n_init=1, random_state=0),  # Increase clusters
    cluster_balance_threshold=0.001,  # Lower threshold to allow smaller clusters
    sampling_strategy="auto",  # Try different values like {1: 500, 2: 500} for fine-tuning
    random_state=42
)

# Apply SMOTE
X_res, y_res = smote.fit_resample(X, y)

# Print new class distribution
print("Resampled class distribution:", Counter(y_res))

Resampled class distribution: Counter({3: 22510, 2: 22509, 0: 22506, 1: 22505, 4: 22503})


In [4]:
X_res.shape

(112533, 5287)

In [5]:
len(X_feature_names.tolist())

5287

In [6]:
# Convert back to DataFrame and save the oversampled dataset
X_resampled_df = pd.DataFrame(X_res.toarray(), columns=X_feature_names.tolist())
y_resampled_df = pd.DataFrame(y_res, columns=["impact"])

# Combine resampled features with impact
resampled_df = pd.concat([y_resampled_df, X_resampled_df], axis=1)



In [7]:
res = np.isin(X_res.toarray(), X).all(axis=1)

  mask |= (ar1 == a)


In [8]:
res.sum()

0

In [9]:
import h5py

In [10]:
def save_data_withh5py(X_res, y_res):
    # Convert X to sparse CSR if it's dense
    if not sparse.issparse(X_res):
        X_res = sparse.csr_matrix(X_res)
    with h5py.File("output/features4ausw4linearsvc_trainsampled.h5", "w") as f:
        # Save sparse matrix X as a compressed dataset
        f.create_dataset("X_data", data=X_res.data)
        f.create_dataset("X_indices", data=X_res.indices)
        f.create_dataset("X_indptr", data=X_res.indptr)
        f.create_dataset("X_shape", data=X_res.shape)

        # Save y as a dense dataset
        f.create_dataset("y", data=y_res)

save_data_withh5py(X_res, y_res)