<a href="https://colab.research.google.com/github/vishnu9358862212/AIML-EXP/blob/main/EXPERIMENT9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer

# Load the dataset into a DataFrame: one column per feature, plus a trailing
# 'target' column holding the class label.
data = load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target

# Display basic details
print("Dataset Info:")
# DataFrame.info() writes its summary straight to stdout and returns None,
# so it must not be wrapped in print() (that would emit a stray "None" line).
df.info()
print("\nClass Distribution:")
print(df['target'].value_counts())


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness e

In [2]:
from imblearn.over_sampling import RandomOverSampler

# Random oversampling of the full dataset (features = every column except
# 'target'), seeded so the resampled rows are reproducible.
ros = RandomOverSampler(random_state=42)
X_ros, y_ros = ros.fit_resample(df.drop(columns="target"), df["target"])

# Report how many samples each class has after resampling
print(
    "\nClass Distribution After Oversampling:",
    pd.Series(y_ros).value_counts(),
    sep="\n",
)



Class Distribution After Oversampling:
target
0    357
1    357
Name: count, dtype: int64


In [3]:
from imblearn.under_sampling import RandomUnderSampler

# Random undersampling of the full dataset (features = every column except
# 'target'), seeded so the retained rows are reproducible.
rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(df.drop(columns="target"), df["target"])

# Report how many samples each class has after resampling
print(
    "\nClass Distribution After Undersampling:",
    pd.Series(y_rus).value_counts(),
    sep="\n",
)



Class Distribution After Undersampling:
target
0    212
1    212
Name: count, dtype: int64


In [4]:
from imblearn.over_sampling import SMOTE

# SMOTE resampling of the full dataset (features = every column except
# 'target'), seeded for reproducibility.
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(df.drop(columns="target"), df["target"])

# Report how many samples each class has after resampling
print(
    "\nClass Distribution After SMOTE:",
    pd.Series(y_smote).value_counts(),
    sep="\n",
)



Class Distribution After SMOTE:
target
0    357
1    357
Name: count, dtype: int64


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Hold out 20% of the original (un-resampled) data as a fixed test set.
# NOTE(review): no `stratify=` argument is passed, so the class ratio of the
# two partitions is not forced to match the full dataset - confirm intended.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns="target"),
    df["target"],
    test_size=0.2,
    random_state=42,
)

# Function to train and evaluate Logistic Regression
from sklearn.metrics import classification_report, roc_auc_score

def evaluate_model(X_train, X_test, y_train, y_test, class_weight=None):
    """Fit a logistic regression and score it on the given test data.

    Parameters
    ----------
    X_train, y_train
        Features and labels used to fit the model.
    X_test, y_test
        Features and labels used for scoring.
    class_weight
        Forwarded unchanged to ``LogisticRegression(class_weight=...)``;
        ``None`` by default.

    Returns
    -------
    tuple
        ``(report, auc)``: ``report`` is the ``classification_report`` of
        the hard predictions (requested with ``output_dict=True``), and
        ``auc`` is ``roc_auc_score`` computed from the index-1 column of
        ``predict_proba``.
    """
    clf = LogisticRegression(
        max_iter=10000,
        random_state=42,
        class_weight=class_weight,
    ).fit(X_train, y_train)

    # AUC is computed from scores, the report from hard labels.
    scores = clf.predict_proba(X_test)[:, 1]
    labels = clf.predict(X_test)

    return (
        classification_report(y_test, labels, output_dict=True),
        roc_auc_score(y_test, scores),
    )


In [12]:
# Step 1: rebalance the TRAINING split only, reusing the sampler objects
# created in the earlier cells. The test split is never resampled.
X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)        # random oversampling
X_train_rus, y_train_rus = rus.fit_resample(X_train, y_train)        # random undersampling
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)  # SMOTE

# Step 2: fit and score one model per technique, always on the same test split.

# Baseline: original training data, no rebalancing
original_report, original_auc = evaluate_model(
    X_train, X_test, y_train, y_test
)

# Random oversampling
ros_report, ros_auc = evaluate_model(
    X_train_ros, X_test, y_train_ros, y_test
)

# Random undersampling
rus_report, rus_auc = evaluate_model(
    X_train_rus, X_test, y_train_rus, y_test
)

# SMOTE
smote_report, smote_auc = evaluate_model(
    X_train_smote, X_test, y_train_smote, y_test
)

# Class weighting: original training data, class_weight="balanced"
class_weight_report, class_weight_auc = evaluate_model(
    X_train, X_test, y_train, y_test, class_weight="balanced"
)


In [13]:
import pandas as pd

# Collect (label, classification report, AUC) for every technique, then
# build the comparison table from the weighted-average F1 and the AUC.
runs = [
    ("Original", original_report, original_auc),
    ("Random Oversampling", ros_report, ros_auc),
    ("Random Undersampling", rus_report, rus_auc),
    ("SMOTE", smote_report, smote_auc),
    ("Class Weighting", class_weight_report, class_weight_auc),
]

results = {
    "Technique": [label for label, _, _ in runs],
    "F1-Score": [report["weighted avg"]["f1-score"] for _, report, _ in runs],
    "AUC": [auc for _, _, auc in runs],
}

results_df = pd.DataFrame(results)
print(results_df)


              Technique  F1-Score       AUC
0              Original  0.955801  0.997707
1   Random Oversampling  0.973621  0.995742
2  Random Undersampling  0.973621  0.997380
3                 SMOTE  0.964738  0.997380
4       Class Weighting  0.973621  0.997707
