How can you use Python to handle imbalanced datasets for classification tasks?


In [2]:
pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp313-cp313-win_amd64.whl.metadata (15 kB)
Collecting numpy>=1.19.5 (from scikit-learn)
  Using cached numpy-2.2.1-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.15.1-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.6.1-cp313-cp313-win_amd64.whl (11.1 MB)
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
   --- ------------------------------------ 1.0/11.1 MB 6.2 MB/s eta 0:00:02
   ------ --------------------------------- 1.8/11.1 MB 4.7 MB/s eta 0:00:02
   ---------- ----------------------------- 2.9/11.1 MB 4.9 MB/s eta 0:00:02
   --------------- ------------------------ 4.2/11.1 MB 5.1 MB/s eta 0:00:0

In [5]:
from collections import Counter
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

# Load dataset (using diabetes dataset as an example)
data = load_diabetes()
X, y = data.data, (data.target > 140).astype(int)  # Binarizing target for classification

# Display original class distribution
print("Original class distribution:", Counter(y))

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Manually oversample the minority class
def oversample(X, y):
    X_majority = X[y == 0]
    X_minority = X[y == 1]
    y_majority = y[y == 0]
    y_minority = y[y == 1]
    
    num_majority = len(y_majority)
    num_minority = len(y_minority)
    
    if num_minority < num_majority:
        oversample_size = num_majority - num_minority
        indices = np.random.choice(len(X_minority), oversample_size, replace=True)
        X_oversampled = np.vstack([X, X_minority[indices]])
        y_oversampled = np.hstack([y, y_minority[indices]])
        return X_oversampled, y_oversampled
    else:
        print("No need for oversampling, classes are balanced or minority is larger.")
        return X, y

# Apply manual oversampling
X_train_resampled, y_train_resampled = oversample(X_train, y_train)

# Display resampled class distribution
print("Resampled class distribution:", Counter(y_train_resampled))

# Train a classifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_resampled, y_train_resampled)

# Predict and evaluate
predictions = clf.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, predictions))
print("Accuracy:", accuracy_score(y_test, predictions))


Original class distribution: Counter({np.int64(1): 221, np.int64(0): 221})
No need for oversampling, classes are balanced or minority is larger.
Resampled class distribution: Counter({np.int64(1): 160, np.int64(0): 149})
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.72      0.76        72
           1       0.71      0.79      0.74        61

    accuracy                           0.75       133
   macro avg       0.75      0.75      0.75       133
weighted avg       0.76      0.75      0.75       133

Accuracy: 0.7518796992481203
