<a href="https://colab.research.google.com/github/vishalallen0018-ai/Python-Decision-Tree/blob/main/data%20set.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
"""
Lung Cancer Classification with Decision Tree + PCA (≥95% variance)
Usage:
    python main.py [path_to_csv]

- Expects columns like: Name, Surname, Age, Smokes, AreaQ, Alcohol (or Alkhol), Result
- Outputs figures and metrics to ./outputs/
"""

import sys
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

def main():
    csv_path = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("lung_cancer_examples.csv")
    out_dir = Path("outputs")
    out_dir.mkdir(exist_ok=True)

    if not csv_path.exists():
        print(f"[ERROR] CSV not found at {csv_path.resolve()}. Put your dataset in this folder.")
        sys.exit(1)

    df = pd.read_csv(csv_path)

    # Harmonize column names (Alcohol vs Alkhol) and trim whitespace
    df.columns = [c.strip() for c in df.columns]
    if "Alkhol" in df.columns and "Alcohol" not in df.columns:
        df = df.rename(columns={"Alkhol": "Alcohol"})

    # Drop identifiers if present
    for col in ["Name", "Surname"]:
        if col in df.columns:
            df = df.drop(columns=[col])

    if "Result" not in df.columns:
        raise ValueError("Target column 'Result' not found in CSV.")

    X = df.drop(columns=["Result"]).copy()
    y = df["Result"].copy()

    # Coerce numeric
    for col in X.columns:
        X[col] = pd.to_numeric(X[col], errors="coerce")

    # Impute + scale
    imputer = SimpleImputer(strategy="median")
    X_imp = imputer.fit_transform(X)

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_imp)

    # Split
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)

    # Baseline DT
    dt = DecisionTreeClassifier(random_state=42)
    dt.fit(X_train, y_train)
    y_pred = dt.predict(X_test)

    def metrics(y_true, y_pred):
        return (
            accuracy_score(y_true, y_pred),
            precision_score(y_true, y_pred, zero_division=0),
            recall_score(y_true, y_pred, zero_division=0),
            f1_score(y_true, y_pred, zero_division=0),
            confusion_matrix(y_true, y_pred),
            classification_report(y_true, y_pred, zero_division=0)
        )

    b_acc, b_prec, b_rec, b_f1, b_cm, b_report = metrics(y_test, y_pred)
    print("\n=== Baseline Decision Tree ===")
    print(b_report)

    # Save baseline CM
    plt.figure()
    plt.imshow(b_cm, interpolation='nearest')
    plt.title('Baseline DT - Confusion Matrix')
    plt.xlabel('Predicted'); plt.ylabel('Actual')
    for i in range(b_cm.shape[0]):
        for j in range(b_cm.shape[1]):
            plt.text(j, i, str(b_cm[i, j]), ha='center', va='center')
    plt.tight_layout(); plt.savefig(out_dir / 'baseline_confusion_matrix.png', dpi=150); plt.close()

    # Feature importances
    orig_cols = [c for c in df.columns if c != "Result"]
    importances = getattr(dt, 'feature_importances_', None)
    if importances is not None:
        fi = pd.Series(importances, index=orig_cols).sort_values(ascending=False)
        fi.to_csv(out_dir / "baseline_feature_importances.csv")
        ax = fi.plot(kind='bar', title='Baseline DT - Feature Importances')
        ax.set_ylabel('Importance')
        plt.tight_layout(); plt.savefig(out_dir / 'baseline_feature_importances.png', dpi=150); plt.close()

    # PCA (>=95% variance)
    pca = PCA(n_components=0.95, svd_solver='full', random_state=42)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)

    cum_var = np.cumsum(pca.explained_variance_ratio_)
    plt.figure()
    plt.plot(np.arange(1, len(cum_var)+1), cum_var, marker='o')
    plt.axhline(0.95, linestyle='--')
    plt.title('PCA - Cumulative Explained Variance')
    plt.xlabel('Number of Components'); plt.ylabel('Cumulative Variance Ratio')
    plt.grid(True); plt.tight_layout(); plt.savefig(out_dir / 'pca_cumulative_variance.png', dpi=150); plt.close()

    # DT on PCA components
    dt_pca = DecisionTreeClassifier(random_state=42)
    dt_pca.fit(X_train_pca, y_train)
    y_pred_pca = dt_pca.predict(X_test_pca)

    p_acc, p_prec, p_rec, p_f1, p_cm, p_report = metrics(y_test, y_pred_pca)
    print("\n=== Decision Tree on PCA Components (>=95% variance) ===")
    print(p_report)

    plt.figure()
    plt.imshow(p_cm, interpolation='nearest')
    plt.title('DT on PCA - Confusion Matrix')
    plt.xlabel('Predicted'); plt.ylabel('Actual')
    for i in range(p_cm.shape[0]):
        for j in range(p_cm.shape[1]):
            plt.text(j, i, str(p_cm[i, j]), ha='center', va='center')
    plt.tight_layout(); plt.savefig(out_dir / 'pca_confusion_matrix.png', dpi=150); plt.close()

    # Comparison table
    comparison = pd.DataFrame({
        'Model': ['Baseline DT', 'DT + PCA (>=95% var)'],
        'Accuracy': [b_acc, p_acc],
        'Precision': [b_prec, p_prec],
        'Recall': [b_rec, p_rec],
        'F1': [b_f1, p_f1]
    })
    comparison.to_csv(out_dir / 'metrics_comparison.csv', index=False)
    print("\n=== Metrics Comparison ===\n", comparison)

    # Answer script
    lines = []
    lines.append("# Answer Script\\n")
    lines.append("## 1. Preprocessing\\n- Dropped identifiers (Name, Surname) if present.\\n- Median imputation.\\n- StandardScaler on features.\\n- Stratified 80/20 train-test split.\\n")
    lines.append("## 2. Baseline Decision Tree\\n")
    lines.append(f"- Accuracy: {b_acc:.3f}, Precision: {b_prec:.3f}, Recall: {b_rec:.3f}, F1: {b_f1:.3f}.\\n")
    lines.append("Confusion Matrix:\\n" + str(b_cm) + "\\n")
    if importances is not None:
        lines.append("- Top baseline feature: " + pd.Series(importances, index=orig_cols).sort_values(ascending=False).index[0] + "\\n")
    lines.append("## 3. PCA + Decision Tree\\n")
    lines.append(f"- Components retained: {X_train_pca.shape[1]} (>=95% variance).\\n")
    lines.append(f"- Accuracy: {p_acc:.3f}, Precision: {p_prec:.3f}, Recall: {p_rec:.3f}, F1: {p_f1:.3f}.\\n")
    lines.append("Confusion Matrix:\\n" + str(p_cm) + "\\n")
    lines.append("## 4. Discussion\\n")
    lines.append(f"- PCA effect on F1: {p_f1 - b_f1:+.3f}. PCA reduces dimensionality and collinearity; trees are already robust to scaling, so changes may be small or data-dependent.\\n")
    (out_dir / "answer_script.md").write_text("\\n".join(lines), encoding="utf-8")

if __name__ == "__main__":
    main()


[ERROR] CSV not found at /content/-f. Put your dataset in this folder.


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
