In [1]:
# pipeline_pca_multi_models.py
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# --- load data (Wine dataset) ---
df = pd.read_csv("Wine.csv")   # ensure Wine.csv is in same folder
X = df.iloc[:, 0:13].values
y = df.iloc[:, 13].values

# --- train/test split (preserve class balance) ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)

# common PCA choices (include float for explained variance)
pca_choices = [2, 3, 5, 8, 0.95]   # last entry keeps components explaining 95% variance

# cross-validation splitter
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# Model configurations: (name, estimator, param_grid)
model_configs = [
    ("Logistic", LogisticRegression(max_iter=5000, solver="liblinear", random_state=0),
     {"pca__n_components": pca_choices, "clf__C": [0.01, 0.1, 1, 10]}),

    ("SVM_Linear", SVC(kernel="linear", random_state=0),
     {"pca__n_components": pca_choices, "clf__C": [0.1, 1, 10]}),

    ("SVM_RBF", SVC(kernel="rbf", random_state=0),
     {"pca__n_components": pca_choices, "clf__C": [0.1, 1, 10], "clf__gamma": ["scale", "auto"]}),

    ("DecisionTree", DecisionTreeClassifier(random_state=0),
     {"pca__n_components": pca_choices, "clf__max_depth": [None, 3, 5, 8]}),

    ("RandomForest", RandomForestClassifier(random_state=0, n_jobs=-1),
     {"pca__n_components": pca_choices, "clf__n_estimators": [50, 100], "clf__max_depth": [None, 5, 10]}),

    ("KNN", KNeighborsClassifier(),
     {"pca__n_components": pca_choices, "clf__n_neighbors": [3,5,7]}),

    ("GaussianNB", GaussianNB(),
     {"pca__n_components": pca_choices})
]

results = []

for name, estimator, param_grid in model_configs:
    print("\n" + "="*80)
    print(f"Running GridSearchCV for: {name}")

    # pipeline: scaler -> pca -> classifier (clf)
    pipe = Pipeline([("scaler", StandardScaler()), ("pca", PCA()), ("clf", estimator)])

    # GridSearchCV
    gs = GridSearchCV(pipe, param_grid, cv=cv, scoring="accuracy", n_jobs=-1, verbose=0)
    gs.fit(X_train, y_train)

    best_pipe = gs.best_estimator_
    best_params = gs.best_params_
    best_cv_score = gs.best_score_

    # Evaluate on test set
    y_pred = best_pipe.predict(X_test)
    test_acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred, zero_division=0)

    print(f"Model: {name}")
    print(f" Best CV score: {best_cv_score:.4f}")
    print(f" Test accuracy : {test_acc:.4f}")
    print(" Best params:", best_params)
    print(" Confusion matrix:\n", cm)
    print(" Classification report:\n", report)

    # capture results
    results.append({
        "model": name,
        "best_cv_score": best_cv_score,
        "test_acc": test_acc,
        "best_params": best_params,
        "confusion_matrix": cm,
        "clf_object": best_pipe
    })

# summary table
summary_df = pd.DataFrame([{
    "Model": r["model"],
    "CV_Acc": round(r["best_cv_score"], 4),
    "Test_Acc": round(r["test_acc"], 4),
    "Best_Params": r["best_params"]
} for r in results])

print("\n" + "="*80)
print("Summary of models:")
print(summary_df.sort_values("Test_Acc", ascending=False).reset_index(drop=True))



Running GridSearchCV for: Logistic




Model: Logistic
 Best CV score: 0.9722
 Test accuracy : 1.0000
 Best params: {'clf__C': 10, 'pca__n_components': 8}
 Confusion matrix:
 [[12  0  0]
 [ 0 14  0]
 [ 0  0 10]]
 Classification report:
               precision    recall  f1-score   support

           1       1.00      1.00      1.00        12
           2       1.00      1.00      1.00        14
           3       1.00      1.00      1.00        10

    accuracy                           1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36


Running GridSearchCV for: SVM_Linear
Model: SVM_Linear
 Best CV score: 0.9650
 Test accuracy : 0.9722
 Best params: {'clf__C': 1, 'pca__n_components': 5}
 Confusion matrix:
 [[11  1  0]
 [ 0 14  0]
 [ 0  0 10]]
 Classification report:
               precision    recall  f1-score   support

           1       1.00      0.92      0.96        12
           2       0.93      1.00      0.97        14
           3       1.