In [1]:
from functools import partial
from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.metrics import (
    classification_report,
    precision_recall_fscore_support,
    accuracy_score,
    confusion_matrix
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the CSV
# Single, editable location of the processed inputs (previously repeated inside each read_csv call).
DATA_DIR = Path("/home/val/workspaces/histotab/data/processed")

# Each cohort table is read with its first CSV column as the row index.
df_msk = pd.read_csv(DATA_DIR / "msk_pattern_gene_mutations.csv", index_col=0)
df_tcga = pd.read_csv(DATA_DIR / "tcga_pattern_gene_mutations.csv", index_col=0)

# Stack the two cohorts row-wise into one frame.
df = pd.concat([df_msk, df_tcga], axis=0)

In [3]:
# Remove the identifier columns, then keep only the rows that carry a histologic-pattern label.
df_clean = (
    df.drop(columns=["Study ID", "Patient ID"])
    .dropna(subset=["Predominant Histologic Pattern"])
)


In [4]:
# Display the cleaned frame: the histologic-pattern label followed by the per-gene status columns.
df_clean

Unnamed: 0_level_0,Predominant Histologic Pattern,EGFR,KRAS,BRAF,TP53,PIK3CA,PTEN,RBM10,TERT,SETD2,...,ALK,AKT1,SMARCA4,ARID1A,ARID2,RIT1,MAP2K1,SPOP,NFE2L2,TSC2
Sample ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P-0000208-T01-WES,Unknown,WT,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
P-0000219-T01-IM3,Solid,MUT,WT,MUT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
P-0000280-T01-IM3,Acinar,MUT,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
P-0000348-T01-IM3,Papillary,MUT,WT,WT,MUT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
P-0000459-T01-IM3,Papillary,WT,WT,WT,WT,WT,WT,WT,MUT,WT,...,WT,WT,WT,WT,MUT,WT,WT,WT,WT,WT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-NJ-A4YQ-01,Mixed,WT,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,MUT,WT,WT,WT,WT,WT,WT,WT
TCGA-NJ-A55A-01,Acinar,WT,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
TCGA-NJ-A55R-01,Solid,WT,MUT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
TCGA-NJ-A7XG-01,Unknown,WT,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT


In [None]:
# Every column after the first is treated as a gene (the first column is assumed to be the label).
gene_cols = df_clean.columns[1:]

# Count, per sample, how many genes are flagged 'not profiled'; a sample passes with at most 10.
mask = df_clean[gene_cols].eq('not profiled').sum(axis=1).le(10)

# Keep only the samples that pass.
df_clean = df_clean.loc[mask]

In [None]:

# Keep a gene only if none of the remaining samples reports it as 'not profiled'.
genes_to_keep = [
    gene for gene in df_clean.columns[1:]
    if not df_clean[gene].eq('not profiled').any()
]

# Label column first, followed by the retained genes in their original order.
df_clean = df_clean.loc[:, ["Predominant Histologic Pattern", *genes_to_keep]]


In [None]:

# Exclude samples whose pattern label is "Unknown" or "Mixed".
df_filtered = df_clean.loc[
    lambda frame: ~frame["Predominant Histologic Pattern"].isin(["Unknown", "Mixed"])
]

In [None]:

# Encode gene features (MUT=1, WT=0)
status_to_int = {"MUT": 1, "WT": 0}
gene_status = df_filtered.drop(columns="Predominant Histologic Pattern")

# Fail early, and name the offending values, if anything other than MUT/WT is still present
# (for example a missing value or a leftover status string).
unexpected = set(pd.unique(gene_status.to_numpy().ravel())) - set(status_to_int)
if unexpected:
    raise ValueError(
        f"Unexpected mutation status value(s): {sorted(unexpected, key=str)}"
    )

# An explicit per-column map is used instead of DataFrame.replace, whose implicit
# object-to-int downcasting is deprecated in recent pandas releases.
X = gene_status.apply(lambda col: col.map(status_to_int)).astype(int)

# Encode labels as consecutive integers; `le.classes_` keeps the original pattern names.
le = LabelEncoder()
y = le.fit_transform(df_filtered["Predominant Histologic Pattern"])


In [None]:

# --- Define the pipeline ---
# The gene features are binary (0/1), so they are scored as discrete variables. The default
# ('auto') treats a dense matrix as continuous and perturbs it with random noise, which makes
# the selected feature set vary from run to run. The seed is fixed as well for reproducibility.
mi_score = partial(mutual_info_classif, discrete_features=True, random_state=42)

pipeline = Pipeline([
    # Never request more features than are available after the gene filtering steps.
    ("feature_selection", SelectKBest(mi_score, k=min(30, X.shape[1]))),
    # `use_label_encoder` is omitted: it is a deprecated XGBoost argument and the labels
    # are already integer-encoded by `le`.
    ("classifier", XGBClassifier(
        eval_metric='mlogloss',
        objective='multi:softmax',
        num_class=len(le.classes_),
        random_state=42
    ))
])

# --- 5-fold CV setup ---
# Shuffled, stratified folds with a fixed seed so the splits are reproducible.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold macro metrics and the pooled true/predicted labels across all folds.
precisions, recalls, f1s, accuracies = [], [], [], []
all_y_true, all_y_pred = [], []

In [None]:

# Start from empty accumulators so that re-running this cell does not append a second
# set of fold results to the previous ones (which would skew the averages' inputs and
# double the counts in the confusion matrix).
precisions, recalls, f1s, accuracies = [], [], [], []
all_y_true, all_y_pred = [], []

for train_idx, test_idx in skf.split(X, y):
    # X is a DataFrame (positional .iloc); y is a NumPy array (plain indexing).
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # The whole pipeline (feature selection + classifier) is refit on each training fold.
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # Macro averaging weights every class equally; zero_division=0 scores a class with
    # no predicted samples as 0 instead of emitting a warning.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, y_pred, average="macro", zero_division=0
    )
    acc = accuracy_score(y_test, y_pred)

    precisions.append(precision)
    recalls.append(recall)
    f1s.append(f1)
    accuracies.append(acc)

    # Pool the held-out labels and predictions for the overall confusion matrix.
    all_y_true.extend(y_test)
    all_y_pred.extend(y_pred)

# --- Print average results ---
print("Average over 5 folds:")
print(f"Precision (macro): {np.mean(precisions):.2f}")
print(f"Recall (macro):    {np.mean(recalls):.2f}")
print(f"F1-score (macro):  {np.mean(f1s):.2f}")
print(f"Accuracy:          {np.mean(accuracies):.2f}")

# --- Confusion matrix ---
# Explicit labels keep the matrix shape aligned with the tick labels taken from `le.classes_`.
cm = confusion_matrix(all_y_true, all_y_pred, labels=np.arange(len(le.classes_)))
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=le.classes_, yticklabels=le.classes_, ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
ax.set_title("Confusion Matrix - XGBoost (5-fold CV)")
fig.tight_layout()
plt.show()
