In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

from tabpfn import TabPFNClassifier

# === Load data ===
# Baseline experiment: TabPFN trained on every feature column of the CSV.
df = pd.read_csv("/kaggle/input/breast-cancer-wisconsin-data/data.csv")

# Drop non-feature columns, but only if the CSV actually contains them
for col in ["id", "Unnamed: 32"]:
    if col in df.columns:
        df.drop(columns=[col], inplace=True)

# Features and target
X = df.drop(columns=["diagnosis"])
y = df["diagnosis"]

# Encode labels as integers. LabelEncoder assigns codes in sorted class order,
# so (assuming the column holds "B"/"M") this gives B=0, M=1.
le = LabelEncoder()
y = le.fit_transform(y)

# Train/test split (stratified so both splits keep the class ratio)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# === TabPFN Classifier ===
# Runs on CPU; pass device="cuda" instead when a GPU is available.
clf = TabPFNClassifier(device="cpu")
clf.fit(X_train, y_train)

# Predictions
y_pred = clf.predict(X_test)
# Column 1 holds the probability of the positive class (encoded label 1)
y_proba = clf.predict_proba(X_test)[:, 1]

# Metrics (binary scorers use label 1 as the positive class by default)
print("Accuracy :", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall   :", recall_score(y_test, y_pred))
print("F1 Score :", f1_score(y_test, y_pred))
print("ROC AUC  :", roc_auc_score(y_test, y_proba))

# Confusion Matrix
# Pin the label order so row/column 0 is always class 0 and row/column 1 is
# class 1, keeping the tick labels below aligned with the matrix.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
plt.figure(figsize=(3,3))
sns.heatmap(cm, annot=True, fmt='d', cmap="Blues",
            xticklabels=['Benign','Malignant'], yticklabels=['Benign','Malignant'])
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("TabPFN Confusion Matrix")
plt.tight_layout()  # keep the class-name tick labels inside the small figure
plt.show()

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, confusion_matrix)
from sklearn.feature_selection import mutual_info_classif, SelectKBest
import seaborn as sns
import matplotlib.pyplot as plt

from tabpfn import TabPFNClassifier

# === 1) Load data ===
df = pd.read_csv("/kaggle/input/breast-cancer-wisconsin-data/data.csv")

# Remove the non-feature columns; any that are absent from the CSV are skipped
df = df.drop(columns=["id", "Unnamed: 32"], errors="ignore")

# Target first, then everything else as the feature matrix
y = df["diagnosis"]
X = df.drop(columns=["diagnosis"])

# Integer-encode the diagnosis labels (sorted class order: B=0, M=1)
le = LabelEncoder()
y = le.fit_transform(y)

# === 2) Train/Test Split (done before feature selection to avoid leakage) ===
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# === 3) Mutual Information Feature Selection (scored on the train split only) ===

TOP_K = 15

# MI between each training feature and the training labels, highest first
mi_scores = mutual_info_classif(X_train, y_train, random_state=42)
mi_series = pd.Series(mi_scores, index=X_train.columns).sort_values(ascending=False)

# Keep the K best-scoring features; this slice is reused for the report and plot
top_mi = mi_series.head(TOP_K)
top_features = top_mi.index.tolist()

print("\n=== Top-{} Features by Mutual Information ===".format(TOP_K))
for rank, (feature_name, mi_value) in enumerate(top_mi.items(), start=1):
    print(f"{rank:2d}. {feature_name:25s}  MI = {mi_value:.4f}")

# Horizontal bar chart of the selected features' MI scores
plt.figure(figsize=(8, 5))
sns.barplot(x=top_mi.values, y=top_mi.index, orient='h')
plt.title(f"Mutual Information (Top {TOP_K})")
plt.xlabel("MI Score")
plt.ylabel("Feature")
plt.tight_layout()
plt.show()

# Restrict both splits to the selected columns
X_train_sel = X_train.loc[:, top_features].copy()
X_test_sel = X_test.loc[:, top_features].copy()

# === 4) TabPFN Classifier
clf = TabPFNClassifier(device="cpu")
clf.fit(X_train_sel, y_train)

# Hard predictions plus the probability of the positive class (column 1)
y_pred = clf.predict(X_test_sel)
y_proba = clf.predict_proba(X_test_sel)[:, 1]

# === 5) Metrics ===
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)

print("\n=== TabPFN + MI (Top-{}) Metrics ===".format(TOP_K))
print(f"Accuracy : {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall   : {rec:.4f}")
print(f"F1 Score : {f1:.4f}")
print(f"ROC AUC  : {auc:.4f}")

# === 6) Confusion Matrix ===
cm = confusion_matrix(y_test, y_pred)
class_names = ['Benign', 'Malignant']
plt.figure(figsize=(3.2, 3.2))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap="Blues",
    cbar=False,
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("TabPFN (Top-{} via Mutual Information)".format(TOP_K))
plt.tight_layout()
plt.show()