In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, confusion_matrix)
from sklearn.feature_selection import mutual_info_classif, SelectKBest
import seaborn as sns
import matplotlib.pyplot as plt

from tabpfn import TabPFNClassifier

# === 1) Load data ===
df = pd.read_csv("/kaggle/input/breast-cancer-wisconsin-data/data.csv")

# Drop the columns that are not used as features. errors="ignore" skips any
# of the listed names that is absent from the frame.
df = df.drop(columns=["id", "Unnamed: 32"], errors="ignore")

# Target / Features: "diagnosis" is the label, every other column is an input
y = df["diagnosis"]
X = df.drop(columns=["diagnosis"])

# Encode the string labels as integers (M=1, B=0)
le = LabelEncoder()
y = le.fit_transform(y)

# === 2) Train/Test Split (before feature selection to avoid leakage) ===
# 80/20 split, stratified on the label so both parts keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# === 3) Mutual Information Feature Selection (fit ONLY on train) ===

TOP_K = 15

# Score every feature against the training labels, then rank the scores
# from highest to lowest, keyed by column name.
mi_scores = mutual_info_classif(X_train, y_train, random_state=42)
mi_series = pd.Series(mi_scores, index=X_train.columns)
mi_series = mi_series.sort_values(ascending=False)

# Names of the TOP_K best-ranked features
top_mi = mi_series.head(TOP_K)
top_features = top_mi.index.tolist()

print("\n=== Top-{} Features by Mutual Information ===".format(TOP_K))
for rank, (name, mi) in enumerate(top_mi.items(), start=1):
    print(f"{rank:2d}. {name:25s}  MI = {mi:.4f}")

# Horizontal bar chart of the TOP_K highest mutual-information scores
top_mi_scores = mi_series.head(TOP_K)
plt.figure(figsize=(8, 5))
sns.barplot(x=top_mi_scores.values, y=top_mi_scores.index, orient='h')
plt.title(f"Mutual Information (Top {TOP_K})")
plt.xlabel("MI Score")
plt.ylabel("Feature")
plt.tight_layout()
plt.show()

# Keep only the selected columns; .copy() gives frames that are independent
# of X_train / X_test.
X_train_sel = X_train.loc[:, top_features].copy()
X_test_sel = X_test.loc[:, top_features].copy()

# === 4) TabPFN Classifier
clf = TabPFNClassifier(device="cpu")
clf.fit(X_train_sel, y_train)

# Predictions: hard labels for the threshold metrics, and the second
# probability column (class 1) for ROC AUC.
y_pred = clf.predict(X_test_sel)
y_proba = clf.predict_proba(X_test_sel)[:, 1]

# === 5) Metrics ===
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)

print("\n=== TabPFN + MI (Top-{}) Metrics ===".format(TOP_K))
# Labels are padded to a common width so the colons line up in the output.
for label, value in (
    ("Accuracy ", acc),
    ("Precision", prec),
    ("Recall   ", rec),
    ("F1 Score ", f1),
    ("ROC AUC  ", auc),
):
    print(f"{label}: {value:.4f}")

# === 6) Confusion Matrix ===
# Same tick labels on both axes; the axis titles below mark rows as the
# actual class and columns as the predicted class.
class_names = ['Benign', 'Malignant']
cm = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(3.2, 3.2))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap="Blues",
    cbar=False,
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("TabPFN (Top-{} via Mutual Information)".format(TOP_K))
plt.tight_layout()
plt.show()

In [None]:
# Imports
import os
import kagglehub
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_classif

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report
)

import matplotlib.pyplot as plt
import seaborn as sns

# Download + Load data
path = kagglehub.dataset_download("uciml/breast-cancer-wisconsin-data")
print("Path to dataset files:", path)
print("Files:", os.listdir(path))

# Read data.csv from the downloaded dataset directory
csv_path = os.path.join(path, "data.csv")
df = pd.read_csv(csv_path)

# Clean: drop the columns that are not used as features. errors="ignore"
# skips any of the listed names that is absent from the frame.
df = df.drop(columns=["id", "Unnamed: 32"], errors="ignore")

# Target / Features
y = df["diagnosis"]                # 'M' or 'B'
X = df.drop(columns=["diagnosis"])

# Encode the string labels as integers: B=0, M=1
le = LabelEncoder()
y = le.fit_transform(y)

# Split (80/20), stratified on the label
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Preprocess: every column of X_train is routed through median imputation;
# anything not listed in num_cols would be dropped.
num_cols = list(X_train.columns)
numeric_pre = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])
preprocess = ColumnTransformer(
    transformers=[("num", numeric_pre, num_cols)],
    remainder="drop",
)

# RandomForest (no Feature Selection)
# Baseline: imputation followed directly by the forest, using all columns.
base_forest = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features="sqrt",
    random_state=42,
    n_jobs=-1,
)
rf_base = Pipeline(steps=[("prep", preprocess), ("rf", base_forest)])

rf_base.fit(X_train, y_train)
y_pred_base = rf_base.predict(X_test)
# Second probability column (class 1) is what ROC-AUC is computed from
y_proba_base = rf_base.predict_proba(X_test)[:, 1]

print("\n=== RandomForest (no FS) ===")
# Metrics that are computed from the hard label predictions
for label, scorer in (
    ("Accuracy :", accuracy_score),
    ("Precision:", precision_score),
    ("Recall   :", recall_score),
    ("F1-Score :", f1_score),
):
    print(label, scorer(y_test, y_pred_base))
print("ROC-AUC  :", roc_auc_score(y_test, y_proba_base))

base_report = classification_report(
    y_test, y_pred_base, target_names=["Benign", "Malignant"]
)
print("\nClassification Report:\n", base_report)


# RandomForest + Feature Selection (Mutual Information)
from functools import partial

from sklearn.base import clone

# mutual_info_classif is randomized; pinning random_state makes the selected
# feature set reproducible from run to run.
mi_score_func = partial(mutual_info_classif, random_state=42)

rf_fs = Pipeline(steps=[
    # Use an independent copy of the preprocessing step so that fitting this
    # pipeline does not refit the instance held by another pipeline.
    ("prep", clone(preprocess)),
    # Keep the 10 features with the highest mutual information with the label
    ("kbest", SelectKBest(score_func=mi_score_func, k=10)),
    ("rf", RandomForestClassifier(
        n_estimators=300,
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        max_features="sqrt",
        random_state=42,
        n_jobs=-1
    ))
])

# Selection is fitted inside the pipeline, i.e. on the training split only
rf_fs.fit(X_train, y_train)
y_pred_fs  = rf_fs.predict(X_test)
# Second probability column (class 1) is what ROC-AUC is computed from
y_proba_fs = rf_fs.predict_proba(X_test)[:, 1]

print("\n=== RandomForest + Feature Selection (Mutual Info) ===")
print("Accuracy :", accuracy_score(y_test, y_pred_fs))
print("Precision:", precision_score(y_test, y_pred_fs))
print("Recall   :", recall_score(y_test, y_pred_fs))
print("F1-Score :", f1_score(y_test, y_pred_fs))
print("ROC-AUC  :", roc_auc_score(y_test, y_proba_fs))
print("\nClassification Report:\n",
      classification_report(y_test, y_pred_fs, target_names=["Benign","Malignant"]))



# Confusion matrices for both RandomForest variants, drawn side by side.
# Tick labels use the class names instead of the raw 0/1 codes (B=0, M=1).
class_names = ["Benign", "Malignant"]

# Confusion Matrix - no Feature Selection
cm_base = confusion_matrix(y_test, y_pred_base)

# Confusion Matrix - with Feature Selection
cm_fs = confusion_matrix(y_test, y_pred_fs)

fig, axes = plt.subplots(1, 2, figsize=(6,3))

# One (axis, matrix, title) entry per panel: left = no FS, right = FS (Mutual Info)
panels = (
    (axes[0], cm_base, "RandomForest "),
    (axes[1], cm_fs, "RandomForest + FS (MI)"),
)
for ax, cm_panel, panel_title in panels:
    sns.heatmap(cm_panel, annot=True, fmt="d", cmap="Blues", ax=ax,
                xticklabels=class_names, yticklabels=class_names)
    ax.set_title(panel_title)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")

plt.tight_layout()
plt.show()

# Best effort: ask scikit-learn to emit pandas output from transformers.
# This runs after the pipelines above were fitted, so the feature-name
# extraction below does not depend on it. Only the failures expected from a
# scikit-learn release without this option are tolerated (missing set_config
# -> ImportError, unknown keyword -> TypeError); anything else surfaces.
try:
    from sklearn import set_config
    set_config(transform_output="pandas")
except (ImportError, TypeError):
    pass

# Extract the names of the selected features (after SelectKBest).
# The selector's input columns are num_cols in order (the preprocessing step
# has a single transformer over num_cols and drops the remainder), so the
# boolean support mask can index num_cols directly.
kbest_step = rf_fs.named_steps["kbest"]
selected_mask = kbest_step.get_support()
selected_features = np.array(num_cols)[selected_mask]

# The forest was trained on the selected columns only, so its importances
# pair one-to-one with selected_features.
rf_model = rf_fs.named_steps["rf"]
importances = rf_model.feature_importances_

feat_imp = pd.DataFrame({"feature": selected_features,
                         "importance": importances}).sort_values("importance", ascending=False)

print("\nTop features (RF + MI):")
print(feat_imp.head(10))

plt.figure(figsize=(6,3))
sns.barplot(x="importance", y="feature", data=feat_imp.head(10))
plt.title("Top Features (RF + Mutual Info)")
plt.tight_layout()
plt.show()