In [1]:
import pandas as pd
import numpy as np
from knsmote import KMeansSMOTE
# from imblearn.over_sampling import KMeansSMOTE
from sklearn.cluster import MiniBatchKMeans
from collections import Counter
from data import get_data

In [2]:
# Load the features (X), the target labels (y) and the feature names from the
# project-local `data` module.
X, y, X_feature_names = get_data()

# Show how many samples each class has before any resampling is applied.
print("Original class distribution:", Counter(y))

Original class distribution: Counter({3: 22503, 1: 4386, 2: 3593, 0: 1074})


In [3]:
# Oversample the minority classes with KMeansSMOTE.
# The clustering step is a MiniBatchKMeans with 10 clusters, a single
# initialisation and a fixed seed so that repeated runs give the same clusters.
kmeans_estimator = MiniBatchKMeans(n_clusters=10, n_init=1, random_state=0)

# NOTE(review): `KMeansSMOTE` is imported from the local `knsmote` module; the
# parameter notes below assume it mirrors imbalanced-learn's class - confirm.
#  * cluster_balance_threshold: presumably the minority share a cluster needs
#    before it is used for oversampling.
#  * sampling_strategy="auto": in the recorded run every class except the
#    largest one was raised to roughly the majority count. If a dict is passed
#    instead, its values are presumably target counts per class, so they would
#    have to be at least the current class counts shown above.
smote = KMeansSMOTE(
    kmeans_estimator=kmeans_estimator,
    cluster_balance_threshold=0.01,
    sampling_strategy="auto",
    random_state=42,
)

# Fit on the original data and generate the synthetic samples.
X_res, y_res = smote.fit_resample(X, y)

# Report the per-class sample counts after resampling.
print("Resampled class distribution:", Counter(y_res))

found 0 physical cores < 1
  File "c:\Users\paul-\anaconda3\envs\python38\lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


Resampled class distribution: Counter({1: 22506, 2: 22506, 0: 22506, 3: 22503})


In [10]:
# Sanity check: (n_samples, n_features) of the resampled feature matrix.
X_res.shape

(90021, 5287)

In [11]:
# Sanity check: the number of feature names should equal the number of
# columns of X_res, since the names are used as column labels below.
len(X_feature_names.tolist())

5287

In [12]:
# Build a labelled DataFrame from the resampled data (nothing is written to
# disk in this cell).
#
# The feature matrix stays sparse-backed: X_res.toarray() would allocate
# n_samples * n_features dense values (90021 x 5287 in the recorded run) and
# pd.concat would then copy that block a second time.
X_resampled_df = pd.DataFrame.sparse.from_spmatrix(
    X_res, columns=X_feature_names.tolist()
)

# Build the label column from a flat array so it is always named "urgency" and
# gets a default RangeIndex, whatever container y_res is (array, list, or a
# Series carrying a different name, which a plain columns=[...] would drop).
y_resampled_df = pd.DataFrame({"urgency": np.asarray(y_res).ravel()})

# Target column first, followed by one column per feature.
resampled_df = pd.concat([y_resampled_df, X_resampled_df], axis=1)



In [14]:
import h5py

In [16]:
def save_data_withh5py(
    X_res,
    y_res,
    path="output/features4dringl4linearsvc_trainsampled.h5",
    compression=None,
):
    """Write a sparse feature matrix and its labels to an HDF5 file.

    The matrix is stored as its CSR components in four datasets
    (``X_data``, ``X_indices``, ``X_indptr``, ``X_shape``) and the labels in
    ``y``. It can be rebuilt with
    ``scipy.sparse.csr_matrix((X_data, X_indices, X_indptr), shape=X_shape)``.

    Parameters
    ----------
    X_res : scipy sparse matrix
        Feature matrix in any sparse format; it is converted to CSR before
        being written so the stored components always have the same meaning.
    y_res : array-like
        Labels, written as a dense dataset.
    path : str, optional
        Destination file. Defaults to the location this notebook has always
        used. An existing file at that path is overwritten (mode ``"w"``).
    compression : str or None, optional
        Passed to ``h5py`` for the three CSR array datasets (for example
        ``"gzip"``). ``None`` (the default) stores them uncompressed.

    Raises
    ------
    AttributeError
        If ``X_res`` is not a sparse matrix (it has no ``tocsr`` method).
    """
    import os  # stdlib; imported here so the cell is self-contained

    # data/indices/indptr are only unambiguous once the layout is fixed; a CSC
    # input written as-is would be read back as a different matrix.
    X_csr = X_res.tocsr()

    # Make sure the target directory exists before h5py tries to create the file.
    out_dir = os.path.dirname(path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with h5py.File(path, "w") as f:
        # Sparse matrix X, stored as its CSR (compressed sparse row) components.
        f.create_dataset("X_data", data=X_csr.data, compression=compression)
        f.create_dataset("X_indices", data=X_csr.indices, compression=compression)
        f.create_dataset("X_indptr", data=X_csr.indptr, compression=compression)
        f.create_dataset("X_shape", data=X_csr.shape)

        # Labels y, stored as a dense dataset.
        f.create_dataset("y", data=y_res)

# Write the resampled features and labels to the HDF5 file.
save_data_withh5py(X_res, y_res)