In [3]:
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from xgboost import XGBClassifier


class MISupervisedSelector(BaseEstimator, TransformerMixin):
    """Keep a fixed list of columns plus the top-``k_other`` columns by mutual information.

    During ``fit`` every column is median-imputed (for scoring only) and
    scored with ``mutual_info_classif`` against ``y``. The columns named in
    ``always_keep`` are always selected; among the remaining columns the
    ``k_other`` highest-scoring ones are added. ``transform`` returns the
    selected columns in their original order and does not impute anything.

    Parameters
    ----------
    k_other : int or None, default=40
        Number of columns to select in addition to ``always_keep``. It is
        used as a slice bound, so ``None`` selects every candidate column.
    always_keep : list of column labels or None, default=None
        Columns that are selected regardless of their score. Labels that do
        not occur in ``X`` are ignored. ``None`` means "no forced columns".
    random_state : int, default=42
        Forwarded to ``mutual_info_classif``.

    Attributes
    ----------
    feature_names_in_ : ndarray
        Column labels of the data seen in ``fit``.
    selected_mask_ : ndarray of bool
        Mask over ``feature_names_in_`` marking the selected columns.
    selected_features_ : ndarray
        Labels of the selected columns, in input order.
    """

    def __init__(self, k_other=40, always_keep=None, random_state=42):
        # scikit-learn convention: constructor arguments are stored untouched
        # so get_params / set_params / clone round-trip exactly. Defaults are
        # resolved in fit() instead.
        self.k_other = k_other
        self.always_keep = always_keep
        self.random_state = random_state

    def fit(self, X, y):
        """Score all columns against ``y`` and record which ones to keep.

        Parameters
        ----------
        X : DataFrame or 2-D array-like
            Training features; wrapped in a ``DataFrame`` for column handling.
        y : array-like
            Class labels, passed straight to ``mutual_info_classif``.

        Returns
        -------
        self
        """
        from sklearn.feature_selection import mutual_info_classif

        X = pd.DataFrame(X)
        self.feature_names_in_ = np.array(X.columns)

        always_keep = [] if self.always_keep is None else self.always_keep
        keep_mask = np.isin(self.feature_names_in_, always_keep)

        # SimpleImputer(strategy="median") drops columns that have no observed
        # value at all, which would make the score vector shorter than the
        # column list and shift every index after the dropped column. Score
        # only the columns that contain data and write the scores back at
        # their original positions; empty columns keep a score of 0.
        has_data = X.notna().any(axis=0).to_numpy()
        mi = np.zeros(X.shape[1], dtype=float)
        if has_data.any():
            X_imp = SimpleImputer(strategy="median").fit_transform(
                X.iloc[:, np.flatnonzero(has_data)]
            )
            mi[has_data] = mutual_info_classif(
                X_imp, y, random_state=self.random_state
            )

        # Candidates are the non-forced columns that carry at least one value;
        # an all-missing column cannot be informative, so it never takes one
        # of the k_other slots.
        candidate_idx = np.flatnonzero(~keep_mask & has_data)
        ranked = candidate_idx[np.argsort(mi[candidate_idx])[::-1]]
        top_idx = ranked[: self.k_other]

        self.selected_mask_ = keep_mask.copy()
        self.selected_mask_[top_idx] = True
        self.selected_features_ = self.feature_names_in_[self.selected_mask_]

        return self

    def transform(self, X):
        """Return only the columns chosen in ``fit``.

        A ``DataFrame`` is sliced by column label; anything else is sliced
        positionally with the boolean mask, so it must have the same column
        layout as the training data.
        """
        if isinstance(X, pd.DataFrame):
            return X.loc[:, self.selected_features_]
        return X[:, self.selected_mask_]


# Excel workbooks loaded with pd.read_excel further down in this cell.
TRAIN_FILE = "TrainDataset2025.xls"
TEST_FILE = "FinalTestDataset2025.xls"

# Column labels. ID_COL is removed from the features and copied into the
# prediction file; TARGET_PCR is the binary label the model is trained on;
# TARGET_RFS is only removed from the features in this cell.
ID_COL = "ID"
TARGET_PCR = "pCR (outcome)"
TARGET_RFS = "RelapseFreeSurvival (outcome)"

# Columns handed to MISupervisedSelector(always_keep=...), i.e. selected
# regardless of their mutual-information score.
ALWAYS_KEEP = ["ER", "HER2", "Gene"]

# Cut-off applied to the predicted positive-class probability when producing
# 0/1 predictions. NOTE(review): how this value was chosen is not shown here.
BEST_THRESHOLD = 0.13


def build_final_model(scale_pos_weight):
    """Assemble the unfitted pipeline: MI selection -> impute -> scale -> XGBoost.

    Parameters
    ----------
    scale_pos_weight : float
        Forwarded unchanged to ``XGBClassifier(scale_pos_weight=...)``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with the steps ``feat_sel``, ``imputer``,
        ``scaler`` and ``xgb``, in that order.
    """
    booster_params = dict(
        objective="binary:logistic",
        eval_metric="logloss",
        n_estimators=600,
        learning_rate=0.03,
        max_depth=3,
        subsample=1.0,
        colsample_bytree=1.0,
        scale_pos_weight=scale_pos_weight,
        tree_method="hist",
        random_state=42,
        n_jobs=-1,
    )
    classifier = XGBClassifier(**booster_params)

    # Selection runs first, so the imputer and scaler only see the kept columns.
    selector = MISupervisedSelector(k_other=40, always_keep=ALWAYS_KEEP)
    stages = [
        ("feat_sel", selector),
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
        ("xgb", classifier),
    ]
    return Pipeline(steps=stages)


# --- Load both workbooks; the value 999 is read as missing ------------------
df_train, df_test = (
    pd.read_excel(path, na_values=[999]) for path in (TRAIN_FILE, TEST_FILE)
)

# --- Clean the label: force numeric, blank out any 999, keep only 0/1 rows ---
df_train[TARGET_PCR] = pd.to_numeric(df_train[TARGET_PCR], errors="coerce")
is_sentinel = df_train[TARGET_PCR] == 999
df_train.loc[is_sentinel, TARGET_PCR] = np.nan
has_valid_label = df_train[TARGET_PCR].isin([0, 1])
df_train = df_train[has_valid_label].dropna(subset=[TARGET_PCR])

# --- Split into label vector and feature matrix ------------------------------
y = df_train[TARGET_PCR].astype(int)
X = df_train.drop(columns=[ID_COL, TARGET_PCR, TARGET_RFS])

# --- Class-imbalance weight: negatives per positive (1.0 if no positives) ----
n_pos = (y == 1).sum()
n_neg = (y == 0).sum()
if n_pos > 0:
    scale_pos_weight = n_neg / n_pos
else:
    scale_pos_weight = 1.0

# --- Fit on the full training set --------------------------------------------
model = build_final_model(scale_pos_weight)
model.fit(X, y)

# --- Score the test set using the training column layout ---------------------
X_test = df_test[X.columns]
proba_test = model.predict_proba(X_test)[:, 1]
pred_test = np.greater_equal(proba_test, BEST_THRESHOLD).astype(int)

# --- Write one row per test ID with its 0/1 prediction -----------------------
out_df = pd.DataFrame({ID_COL: df_test[ID_COL].values, "PCR": pred_test})
out_df.to_csv("PCRPrediction.csv", index=False)

# Last expression of the cell: the notebook renders the first rows.
out_df.head()


Unnamed: 0,ID,PCR
0,TRG002219,0
1,TRG002222,1
2,TRG002223,0
3,TRG002235,0
4,TRG002240,1
