In [9]:
!pip install sdv



In [13]:
# Import required libraries
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE

In [15]:
# Generate a 1000-sample, 20-feature binary problem in which class 1 is given
# a 5% weight (weights=[0.95, 0.05]) and no labels are randomly flipped.
raw_features, raw_labels = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    weights=[0.95, 0.05],
    flip_y=0,
    random_state=42,
)

# Wrap the raw arrays in pandas objects with readable column names
feature_names = [f'feature_{idx}' for idx in range(raw_features.shape[1])]
X = pd.DataFrame(raw_features, columns=feature_names)
y = pd.Series(raw_labels)

# Show how many samples fall into each class before any resampling
print("Original dataset shape:")
print(y.value_counts())

# Hold out 20% of the rows for testing; stratify on y so both splits keep
# the same class proportions as the full dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

def train_and_evaluate(X_fit, y_fit, X_eval, y_eval, label, random_state=42):
    """Fit a Random Forest and print its confusion matrix and classification report.

    Both experiments in this notebook (with and without SMOTE) share the exact
    same train / predict / report steps, so they are factored out here to keep
    the two runs strictly comparable.

    Parameters
    ----------
    X_fit, y_fit
        Features and labels the classifier is trained on.
    X_eval, y_eval
        Features and labels the fitted classifier is evaluated on.
    label : str
        Text shown inside the ``--- <label> ---`` banner above the metrics.
    random_state : int, default 42
        Seed forwarded to ``RandomForestClassifier``.

    Returns
    -------
    tuple
        ``(fitted_model, predictions)`` where ``predictions`` is the output of
        ``fitted_model.predict(X_eval)``.
    """
    model = RandomForestClassifier(random_state=random_state)
    model.fit(X_fit, y_fit)

    predictions = model.predict(X_eval)

    print(f"\n--- {label} ---")
    print("Confusion Matrix:")
    print(confusion_matrix(y_eval, predictions))
    print("\nClassification Report:")
    print(classification_report(y_eval, predictions))

    return model, predictions


# --- Without SMOTE ---
# Baseline: train directly on the imbalanced training split
model_without_smote, y_pred_without_smote = train_and_evaluate(
    X_train, y_train, X_test, y_test, label="Without SMOTE"
)

# --- With SMOTE ---
# Oversample the training split only; X_test / y_test are left untouched so
# both models are scored on the same, un-resampled hold-out data.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Check resampled class distribution
print("\nResampled dataset shape:")
print(pd.Series(y_resampled).value_counts())

# Same model and evaluation as the baseline, but fitted on the resampled data
model_with_smote, y_pred_with_smote = train_and_evaluate(
    X_resampled, y_resampled, X_test, y_test, label="With SMOTE"
)

Original dataset shape:
0    950
1     50
Name: count, dtype: int64

--- Without SMOTE ---
Confusion Matrix:
[[190   0]
 [  9   1]]

Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.98       190
           1       1.00      0.10      0.18        10

    accuracy                           0.95       200
   macro avg       0.98      0.55      0.58       200
weighted avg       0.96      0.95      0.94       200


Resampled dataset shape:
0    760
1    760
Name: count, dtype: int64

--- With SMOTE ---
Confusion Matrix:
[[186   4]
 [  7   3]]

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.98      0.97       190
           1       0.43      0.30      0.35        10

    accuracy                           0.94       200
   macro avg       0.70      0.64      0.66       200
weighted avg       0.94      0.94      0.94       200

