<a href="https://colab.research.google.com/github/tanumeena28/ML-Assignment/blob/main/ml_lab5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report
from imblearn.over_sampling import RandomOverSampler, SMOTE

# Load the bundled Iris data and wrap it in pandas containers:
# a feature frame with named columns and a label series.
iris = load_iris()
X = pd.DataFrame(data=iris.data, columns=iris.feature_names)
y = pd.Series(data=iris.target)

# Train & evaluate Linear Regression used as a (forced) classifier
def evaluate_model(X_train, X_val, y_train, y_val, method="Original"):
    """Fit LinearRegression on integer class labels and score it as a classifier.

    The continuous predictions are rounded to the nearest integer and then
    clipped to the range of labels seen in ``y_train``. Without the clip a
    prediction such as -0.6 or 2.7 would round to -1 or 3, i.e. a class that
    does not exist; such phantom classes show up as extra rows in the
    classification report and drag the macro-averaged F1 down.

    Parameters
    ----------
    X_train, y_train : training features and integer labels.
    X_val, y_val : validation features and integer labels.
    method : str
        Name printed in the banner above the classification report.

    Returns
    -------
    (accuracy, macro_f1) computed on the validation data.
    """
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Continuous predictions -> nearest integer, limited to labels that exist
    label_min, label_max = np.min(y_train), np.max(y_train)
    y_pred = np.clip(np.rint(model.predict(X_val)), label_min, label_max).astype(int)

    acc = accuracy_score(y_val, y_pred)
    macro_f1 = f1_score(y_val, y_pred, average="macro")
    print(f"\n===== {method} =====")
    print(classification_report(y_val, y_pred))
    return acc, macro_f1


# (train, validation, test) fractions for each experiment
splits = [(0.8, 0.1, 0.1), (0.7, 0.15, 0.15)]
results = []

for train_size, val_size, test_size in splits:
    # Whole-number percentages, reused for the banner and for the result label
    train_pct = int(train_size*100)
    val_pct = int(val_size*100)
    test_pct = int(test_size*100)
    split_label = f"{train_pct}-{val_pct}-{test_pct}"
    print(f"\n=== Split: Train {train_pct}%, Val {val_pct}%, Test {test_pct}% ===")

    # Stage 1: carve the training portion off the full dataset (stratified)
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, train_size=train_size, stratify=y, random_state=42
    )

    # Stage 2: divide the remainder between validation and test
    relative_val = val_size / (val_size + test_size)
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, train_size=relative_val, stratify=y_temp, random_state=42
    )

    # Baseline: train on the split as-is
    acc, f1 = evaluate_model(X_train, X_val, y_train, y_val, method="Original")
    results.append((split_label, "Original", acc, f1))

    # Random oversampling of the training data only
    ros = RandomOverSampler(random_state=42)
    X_res, y_res = ros.fit_resample(X_train, y_train)
    acc, f1 = evaluate_model(X_res, X_val, y_res, y_val, method="Random Oversampling")
    results.append((split_label, "RandomOversampling", acc, f1))

    # SMOTE on the training data only
    smote = SMOTE(random_state=42, k_neighbors=1)  # you can also try k=5
    X_res, y_res = smote.fit_resample(X_train, y_train)
    acc, f1 = evaluate_model(X_res, X_val, y_res, y_val, method="SMOTE")
    results.append((split_label, "SMOTE", acc, f1))


# One row per (split, method) pair
result_columns = ["Split", "Method", "Accuracy", "Macro_F1"]
df_results = pd.DataFrame(results, columns=result_columns)
print("\n===== Final Comparison Table =====")
print(df_results)



=== Split: Train 80%, Val 10%, Test 10% ===

===== Original =====
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         5

    accuracy                           1.00        15
   macro avg       1.00      1.00      1.00        15
weighted avg       1.00      1.00      1.00        15


===== Random Oversampling =====
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         5

    accuracy                           1.00        15
   macro avg       1.00      1.00      1.00        15
weighted avg       1.00      1.00      1.00        15


===== SMOTE =====
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
          

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split


# Read the Iris CSV from the working directory
df = pd.read_csv("Iris.csv")

# Labels come from the "Species" column; features are every column except
# "Species" and "Id"
y = df["Species"]
X = df.drop(columns=["Species", "Id"])

# Replace the label values with integer codes
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
encoder.fit(y)
y = encoder.transform(y)


In [None]:
# Stratified hold-out: 20% of the rows go to a temporary pool, the rest is training data.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Halve the temporary pool (stratified) into validation and test parts.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
    stratify=y_temp,
)


In [None]:
# Show the relative class frequencies of each split, in train/validation/test order.
for split_name, split_labels in (
    ("Train", y_train),
    ("Validation", y_val),
    ("Test", y_test),
):
    class_share = pd.Series(split_labels).value_counts(normalize=True)
    print(f"{split_name} class distribution:\n", class_share)


Train class distribution:
 0    0.333333
2    0.333333
1    0.333333
Name: proportion, dtype: float64
Validation class distribution:
 0    0.333333
2    0.333333
1    0.333333
Name: proportion, dtype: float64
Test class distribution:
 0    0.333333
1    0.333333
2    0.333333
Name: proportion, dtype: float64


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Fit a logistic-regression classifier on the training split
model = LogisticRegression(max_iter=200)
model = model.fit(X_train, y_train)

# Score it on the validation split
y_pred_val = model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_pred_val)
print("Validation Accuracy:", val_accuracy)
print(classification_report(y_val, y_pred_val))


Validation Accuracy: 0.9333333333333333
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      0.80      0.89         5
           2       0.83      1.00      0.91         5

    accuracy                           0.93        15
   macro avg       0.94      0.93      0.93        15
weighted avg       0.94      0.93      0.93        15



In [None]:
# Build the imbalanced copy from the untouched training split. Deriving it from
# X_train / y_train here (instead of reading X_train_imbalanced, which no earlier
# cell defines) lets the cell run on a fresh kernel and makes re-running it
# idempotent: it always starts from the full training split.
# Resetting the index keeps the positions of X and y aligned.
X_train_imbalanced = pd.DataFrame(X_train).reset_index(drop=True)
y_train_imbalanced = pd.Series(y_train).reset_index(drop=True)

# Select 15 random samples from class 0
mask = (y_train_imbalanced == 0)
drop_idx = y_train_imbalanced[mask].sample(15, random_state=42).index

# Drop those indices from both features and labels
X_train_imbalanced = X_train_imbalanced.drop(drop_idx)
y_train_imbalanced = y_train_imbalanced.drop(drop_idx)

print("Class distribution before oversampling:\n", y_train_imbalanced.value_counts())


Class distribution before oversampling:
 2    40
1    40
0    25
Name: count, dtype: int64


In [None]:
from imblearn.over_sampling import RandomOverSampler
import pandas as pd

# Re-balance the imbalanced training data with random oversampling
ros = RandomOverSampler(random_state=42)
resampled_pair = ros.fit_resample(X_train_imbalanced, y_train_imbalanced)
X_resampled, y_resampled = resampled_pair

# Per-class counts after resampling
ros_counts = pd.Series(y_resampled).value_counts()
print("After Random Oversampling:\n", ros_counts)


After Random Oversampling:
 0    40
2    40
1    40
Name: count, dtype: int64


In [None]:
from imblearn.over_sampling import SMOTE

# Re-balance the imbalanced training data with SMOTE (default neighbour settings)
smote = SMOTE(random_state=42)
smote_pair = smote.fit_resample(X_train_imbalanced, y_train_imbalanced)
X_smote, y_smote = smote_pair

# Per-class counts after resampling
smote_counts = pd.Series(y_smote).value_counts()
print("After SMOTE:\n", smote_counts)


After SMOTE:
 0    40
2    40
1    40
Name: count, dtype: int64


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Model 1: logistic regression fitted on the randomly oversampled training data
model_ros = LogisticRegression(max_iter=200)
model_ros = model_ros.fit(X_resampled, y_resampled)
y_pred_ros = model_ros.predict(X_val)

ros_val_accuracy = accuracy_score(y_val, y_pred_ros)
print("Validation Accuracy (Random Oversampling):", ros_val_accuracy)
print(classification_report(y_val, y_pred_ros))

# Model 2: logistic regression fitted on the SMOTE-resampled training data
model_smote = LogisticRegression(max_iter=200)
model_smote = model_smote.fit(X_smote, y_smote)
y_pred_smote = model_smote.predict(X_val)

smote_val_accuracy = accuracy_score(y_val, y_pred_smote)
print("Validation Accuracy (SMOTE):", smote_val_accuracy)
print(classification_report(y_val, y_pred_smote))


Validation Accuracy (Random Oversampling): 0.9333333333333333
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      0.80      0.89         5
           2       0.83      1.00      0.91         5

    accuracy                           0.93        15
   macro avg       0.94      0.93      0.93        15
weighted avg       0.94      0.93      0.93        15

Validation Accuracy (SMOTE): 0.9333333333333333
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      0.80      0.89         5
           2       0.83      1.00      0.91         5

    accuracy                           0.93        15
   macro avg       0.94      0.93      0.93        15
weighted avg       0.94      0.93      0.93        15



In [None]:
# Re-align the positional index of features and labels before sampling.
# NOTE: this cell mutates X_train_imbalanced / y_train_imbalanced in place, so it
# builds on whatever rows were already removed and drops 15 more on every run.
X_train_imbalanced = X_train_imbalanced.reset_index(drop=True)
y_train_imbalanced = y_train_imbalanced.reset_index(drop=True)

# Pick 15 random rows whose label is class 1
mask = y_train_imbalanced.eq(1)
drop_idx = y_train_imbalanced.loc[mask].sample(n=15, random_state=42).index

# Remove the picked rows from both features and labels
X_train_imbalanced = X_train_imbalanced.drop(index=drop_idx)
y_train_imbalanced = y_train_imbalanced.drop(index=drop_idx)

remaining_counts = y_train_imbalanced.value_counts()
print("Class distribution before oversampling:\n", remaining_counts)


Class distribution before oversampling:
 2    40
0    25
1    25
Name: count, dtype: int64


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import RandomOverSampler, SMOTE

# 1. Load the dataset and separate the label column from the features
df = pd.read_csv("Iris.csv")
y = df["Species"]
X = df.drop(columns=["Species", "Id"])

# Replace the label values with integer codes
encoder = LabelEncoder()
y = encoder.fit_transform(y)

# Stratified 80/10/10 split: hold out 20%, then halve the hold-out
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

# One record per (minority class, method) pair
results = []

# 2. Treat each class in turn as the one that loses samples
for cls in np.unique(y_train):
    print(f"\n===== Experiment: Undersample Class {cls} =====")

    # Positionally indexed copies so feature rows and labels stay aligned
    X_train_df = pd.DataFrame(X_train, columns=X.columns).reset_index(drop=True)
    y_train_df = pd.Series(y_train).reset_index(drop=True)

    # Randomly remove 15 rows belonging to the current class
    mask = y_train_df.eq(cls)
    drop_idx = y_train_df.loc[mask].sample(n=15, random_state=42).index
    X_train_imb = X_train_df.drop(index=drop_idx).reset_index(drop=True)
    y_train_imb = y_train_df.drop(index=drop_idx).reset_index(drop=True)

    print("Class distribution before oversampling:\n", y_train_imb.value_counts())

    # 2a. Random Oversampling -> logistic regression -> validation metrics
    ros = RandomOverSampler(random_state=42)
    X_ros, y_ros = ros.fit_resample(X_train_imb, y_train_imb)

    model_ros = LogisticRegression(max_iter=200)
    model_ros.fit(X_ros, y_ros)
    y_pred_ros = model_ros.predict(X_val)
    acc_ros = accuracy_score(y_val, y_pred_ros)
    report_ros = classification_report(y_val, y_pred_ros, output_dict=True)

    results.append(
        dict(
            Minority_Class=cls,
            Method="Random Oversampling",
            Accuracy=acc_ros,
            Macro_F1=report_ros["macro avg"]["f1-score"],
        )
    )

    # 2b. SMOTE -> logistic regression -> validation metrics
    smote = SMOTE(random_state=42)
    X_smote, y_smote = smote.fit_resample(X_train_imb, y_train_imb)

    model_smote = LogisticRegression(max_iter=200)
    model_smote.fit(X_smote, y_smote)
    y_pred_smote = model_smote.predict(X_val)
    acc_smote = accuracy_score(y_val, y_pred_smote)
    report_smote = classification_report(y_val, y_pred_smote, output_dict=True)

    results.append(
        dict(
            Minority_Class=cls,
            Method="SMOTE",
            Accuracy=acc_smote,
            Macro_F1=report_smote["macro avg"]["f1-score"],
        )
    )

# 3. Collect the records into a comparison table
results_df = pd.DataFrame(results)
print("\n===== Final Comparison Table =====\n")
print(results_df)




===== Experiment: Undersample Class 0 =====
Class distribution before oversampling:
 2    40
1    40
0    25
Name: count, dtype: int64

===== Experiment: Undersample Class 1 =====
Class distribution before oversampling:
 0    40
2    40
1    25
Name: count, dtype: int64

===== Experiment: Undersample Class 2 =====
Class distribution before oversampling:
 0    40
1    40
2    25
Name: count, dtype: int64

===== Final Comparison Table =====

   Minority_Class               Method  Accuracy  Macro_F1
0               0  Random Oversampling  0.933333   0.93266
1               0                SMOTE  0.933333   0.93266
2               1  Random Oversampling  0.933333   0.93266
3               1                SMOTE  0.933333   0.93266
4               2  Random Oversampling  1.000000   1.00000
5               2                SMOTE  1.000000   1.00000


In [None]:
# ==== Iris: Linear Regression + Oversampling/SMOTE Experiments ====
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, classification_report, f1_score

from imblearn.over_sampling import RandomOverSampler, SMOTE

# ----------------- Helpers -----------------
def stratified_split(X, y, train_p, val_p, test_p, seed=42):
    """Split (X, y) into stratified train / validation / test parts.

    The fractions are converted to integer row counts up front. Passing float
    sizes such as ``test_size=1 - 0.7`` is fragile: ``1 - 0.7`` evaluates to
    ``0.30000000000000004`` and scikit-learn takes the ceiling of
    ``test_size * n_samples``, so 150 rows would yield 46 held-out rows
    (104 training rows) instead of the intended 45 / 105.

    Parameters
    ----------
    X, y : features and labels of equal length.
    train_p, val_p, test_p : float
        Fractions for each part; must sum to 1.
    seed : int
        ``random_state`` used for both splits.

    Returns
    -------
    X_tr, X_val, X_te, y_tr, y_val, y_te
    """
    assert abs(train_p + val_p + test_p - 1.0) < 1e-9

    # Integer row counts; the test part receives whatever remains
    n_total = len(y)
    n_train = int(round(train_p * n_total))
    n_val = int(round(val_p * n_total))

    # First: train vs temp (temp = validation + test rows)
    X_tr, X_tmp, y_tr, y_tmp = train_test_split(
        X, y, train_size=n_train, stratify=y, random_state=seed
    )
    # Then: temp -> val & test
    X_val, X_te, y_val, y_te = train_test_split(
        X_tmp, y_tmp, train_size=n_val, stratify=y_tmp, random_state=seed
    )
    return X_tr, X_val, X_te, y_tr, y_val, y_te

def make_imbalance_drop(X_tr, y_tr, minority_cls, n_drop, seed=42):
    """Drop up to ``n_drop`` random rows of one class from a training set.

    Features and labels are re-indexed positionally (0..n-1) first so that the
    same index labels can be dropped from both.

    Parameters
    ----------
    X_tr : DataFrame or array-like
        Training features. A DataFrame keeps its own column names; anything
        else is wrapped in a DataFrame with default column labels. (The
        function no longer reads column names from a notebook-global ``X``.)
    y_tr : array-like
        Training labels, same length as ``X_tr``.
    minority_cls
        Label of the class to thin out.
    n_drop : int
        Number of rows to remove. Capped so that at least one row of the
        class survives; if the class is absent nothing is removed.
    seed : int
        ``random_state`` for the row sampling.

    Returns
    -------
    (X_imb, y_imb) : DataFrame and Series with a fresh 0..m-1 index.
    """
    if isinstance(X_tr, pd.DataFrame):
        X_df = X_tr.reset_index(drop=True)
    else:
        X_df = pd.DataFrame(X_tr)
    y_sr = pd.Series(y_tr).reset_index(drop=True)

    idx_pool = y_sr[y_sr == minority_cls].index
    # keep at least 1 sample; clamp at 0 so an absent class does not request -1 rows
    n_drop = max(0, min(n_drop, len(idx_pool) - 1))

    if n_drop > 0:
        drop_idx = idx_pool.to_series().sample(n_drop, random_state=seed).index
        X_df = X_df.drop(drop_idx)
        y_sr = y_sr.drop(drop_idx)

    X_imb = X_df.reset_index(drop=True)
    y_imb = y_sr.reset_index(drop=True)
    return X_imb, y_imb

def train_eval_linear_ovr(X_tr, y_tr, X_eval, y_eval):
    """
    Linear Regression used as a classifier:
      - one-hot encode y
      - fit multi-output LinearRegression
      - predict scores and take the highest-scoring column per row
      - translate that column position back to the original class label

    The translation through ``ohe.categories_`` matters whenever the labels
    are not exactly 0..k-1 (e.g. string labels, or a class missing from the
    training data): a bare ``argmax`` yields column positions, not labels.

    Parameters
    ----------
    X_tr, y_tr : training features and labels.
    X_eval, y_eval : evaluation features and labels.

    Returns
    -------
    (accuracy, macro_f1, report) where ``report`` is the text produced by
    ``classification_report`` with ``zero_division=0``.
    """
    ohe = OneHotEncoder(sparse_output=False)
    Y_tr = ohe.fit_transform(np.array(y_tr).reshape(-1, 1))  # (n, n_classes)

    lin = LinearRegression()
    lin.fit(X_tr, Y_tr)
    scores = lin.predict(X_eval)  # (m, n_classes)

    # Column j of the one-hot matrix corresponds to ohe.categories_[0][j]
    class_labels = ohe.categories_[0]
    y_pred = class_labels[scores.argmax(axis=1)]

    acc = accuracy_score(y_eval, y_pred)
    macro_f1 = f1_score(y_eval, y_pred, average="macro")
    report = classification_report(y_eval, y_pred, zero_division=0)
    return acc, macro_f1, report

def run_block(X_tr, y_tr, X_val, y_val, minority_cls, title, sampler):
    """Resample the training data with ``sampler`` and score on validation.

    The sampler's ``fit_resample`` is applied to the training data only; the
    resampled data is then passed to ``train_eval_linear_ovr`` together with
    the validation data.

    Returns
    -------
    (row, report) : ``row`` is a dict with the keys ``Minority_Class``,
    ``Method``, ``Val_Accuracy`` and ``Val_MacroF1``; ``report`` is the
    report value returned by ``train_eval_linear_ovr``.
    """
    resampled_X, resampled_y = sampler.fit_resample(X_tr, y_tr)
    metrics = train_eval_linear_ovr(resampled_X, resampled_y, X_val, y_val)
    acc, macro_f1, report = metrics
    summary = dict(
        Minority_Class=minority_cls,
        Method=title,
        Val_Accuracy=acc,
        Val_MacroF1=macro_f1,
    )
    return summary, report

# ----------------- Load & Encode -----------------
df = pd.read_csv("Iris.csv")
y = LabelEncoder().fit_transform(df["Species"])  # integer-coded labels
X = df.drop(columns=["Species", "Id"])

# ----------------- Experiment Settings -----------------
# (train fraction, validation fraction, test fraction, display tag)
split_schemes = [
    (0.80, 0.10, 0.10, "80/10/10"),
    (0.70, 0.15, 0.15, "70/15/15"),
]
minority_drop = 15  # rows removed from the chosen class in the training split

all_results = []

for train_p, val_p, test_p, tag in split_schemes:
    print(f"\n================= Split Scheme: {tag} =================")
    X_train, X_val, X_test, y_train, y_val, y_test = stratified_split(
        X, y, train_p, val_p, test_p, seed=42
    )

    # Reference run: full training split, no induced imbalance, no resampling
    base_acc, base_f1, base_report = train_eval_linear_ovr(X_train, y_train, X_val, y_val)
    print("\n--- Baseline (No Imbalance) on Validation ---")
    print(base_report)
    all_results.append(
        dict(
            Split=tag,
            Minority_Class="None",
            Method="Baseline (Linear OVR)",
            Val_Accuracy=base_acc,
            Val_MacroF1=base_f1,
        )
    )

    # Each class takes a turn as the minority
    for cls in np.unique(y_train):
        # Induce the imbalance on the training data only
        X_imb, y_imb = make_imbalance_drop(X_train, y_train, minority_cls=cls, n_drop=minority_drop, seed=42)

        print(f"\n--- Class distribution after dropping from class {cls} (train only) ---")
        print(pd.Series(y_imb).value_counts())

        # Samplers compared on the same imbalanced data:
        #   1) random oversampling
        #   2) SMOTE setting (a): "take any 2 samples", approximated via k_neighbors=2
        #   3) SMOTE setting (b): "nearest sample", k_neighbors=1
        ros = RandomOverSampler(random_state=42)
        smote_two = SMOTE(random_state=42, k_neighbors=2)
        smote_one = SMOTE(random_state=42, k_neighbors=1)
        sampler_runs = (
            (ros, "Random Oversampling", "\nRandom Oversampling (Validation Report):\n"),
            (smote_two, "SMOTE (k_neighbors=2)", "\nSMOTE k=2 (Validation Report):\n"),
            (smote_one, "SMOTE (k_neighbors=1, nearest)", "\nSMOTE k=1 (nearest) (Validation Report):\n"),
        )
        for sampler, method_name, banner in sampler_runs:
            row, report = run_block(X_imb, y_imb, X_val, y_val, cls, method_name, sampler)
            print(banner, report)
            row["Split"] = tag
            all_results.append(row)

# ----------------- Final Tables -----------------
results_df = pd.DataFrame(all_results)
print("\n================= FINAL COMPARISON (Validation) =================")
sorted_results = results_df.sort_values(by=["Split", "Minority_Class", "Method"])
print(sorted_results.reset_index(drop=True))

# (Optional) Also evaluate the best model(s) on the untouched test set.
# You can re-train on the *oversampled training data* and then predict on X_test similarly.




--- Baseline (No Imbalance) on Validation ---
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       0.75      0.60      0.67         5
           2       0.67      0.80      0.73         5

    accuracy                           0.80        15
   macro avg       0.81      0.80      0.80        15
weighted avg       0.81      0.80      0.80        15


--- Class distribution after dropping from class 0 (train only) ---
2    40
1    40
0    25
Name: count, dtype: int64

Random Oversampling (Validation Report):
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       0.75      0.60      0.67         5
           2       0.67      0.80      0.73         5

    accuracy                           0.80        15
   macro avg       0.81      0.80      0.80        15
weighted avg       0.81      0.80      0.80        15


SMOTE k=2 (Validation Repor