In [1]:
import sys
sys.path.append('/host/d/Github')
import os
import numpy as np
import pandas as pd
import nibabel as nb
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
import Osteosarcoma.functions_collection as ff
import Osteosarcoma.Build_lists.Build_list as Build_list

import radiomics
from radiomics import (
    featureextractor,  # This module is used for interaction with pyradiomics
)

from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

### feature selection step 1: ICC calculation for radimoics features from reader 1 and reader 2

In [19]:
df_reader1 = pd.read_excel('/host/d/projects/Habitats/radiomics/radiomics_measurements_normalized.xlsx')
df_reader2 = pd.read_excel('/host/d/projects/Habitats/radiomics/radiomics_measurements_normalized_reader2.xlsx')

# we only keep the rows in df_reader1 that are also in df_reader2 based on Patient_index
df_reader1 = df_reader1[df_reader1['Patient_index'].isin(df_reader2['Patient_index'])]
print(f'Number of cases in reader 1 after matching: {len(df_reader1)}')

non_feature_cols = ['Patient_index', 'Image_filepath', 'Mask_filepath']
feature_cols = [col for col in df_reader1.columns if col not in non_feature_cols]

# we need to calculate the ICC for each feature between reader 1 and reader 2, if it's >0.75, we keep it
# calculate ICC, import packages 

icc_rows = []
for f in feature_cols:
    x = df_reader1[f].values
    y = df_reader2[f].values
    icc = ff.icc2_1(x, y)
    if icc<0.75:
        print('feature:', f, ' ICC:', icc)
    icc_rows.append({'Feature': f, 'ICC': icc})

# only keep features with ICC > 0.75
icc_df = pd.DataFrame(icc_rows)
selected_features = icc_df[icc_df['ICC'] > 0.75]['Feature'].tolist()
print('original number of features:', len(feature_cols))
print(f'Number of features with ICC > 0.75: {len(selected_features)}')

# dropped features
dropped_features = icc_df[icc_df['ICC'] <= 0.75]['Feature'].tolist()
# save dropped features to excel, file name: dropped_features.xlsx, sheet_name: 'inter_reader_icc'
dropped_df = pd.DataFrame({'dropped_feature': dropped_features})

with pd.ExcelWriter('/host/d/projects/Habitats/radiomics/dropped_features.xlsx', engine='openpyxl') as writer:
    dropped_df.to_excel(writer, sheet_name='inter_reader_icc', index=False)

# now we create "df" for reader 1 with only selected features
df_reader1 = pd.read_excel('/host/d/projects/Habitats/radiomics/radiomics_measurements_normalized.xlsx')
df_reader1_selected = df_reader1[['Patient_index', 'Image_filepath', 'Mask_filepath'] + selected_features]
print('Shape of df_reader1_selected:', df_reader1_selected.shape)

Number of cases in reader 1 after matching: 28
feature: wavelet-LLH_glszm_GrayLevelNonUniformityNormalized  ICC: 0.6277989207823124
feature: wavelet-LLH_glszm_LargeAreaEmphasis  ICC: 0.5325832832741472
feature: wavelet-LLH_glszm_LargeAreaLowGrayLevelEmphasis  ICC: 0.3769028391923343
feature: wavelet-LLH_glszm_ZoneEntropy  ICC: 0.7020581753592479
feature: wavelet-LLH_glszm_ZoneVariance  ICC: 0.5432978276331522
feature: wavelet-LLH_ngtdm_Busyness  ICC: 0.475098495423173
feature: wavelet-LHH_glrlm_LongRunLowGrayLevelEmphasis  ICC: 0.6849483059280537
feature: wavelet-LHH_glszm_LargeAreaLowGrayLevelEmphasis  ICC: 0.5590709216948252
feature: wavelet-LHH_ngtdm_Busyness  ICC: 0.3105004200724539
feature: wavelet-HLH_glcm_ClusterShade  ICC: -0.06956898791435204
feature: wavelet-HLH_ngtdm_Busyness  ICC: 0.5519783007725854
original number of features: 1106
Number of features with ICC > 0.75: 1095
Shape of df_reader1_selected: (81, 1098)


### feature selection step 2: PCC


In [20]:
df =    df_reader1_selected.copy()
# df = pd.read_excel('/host/d/projects/Habitats/radiomics/radiomics_measurements_normalized.xlsx')

non_feature_cols = ["Patient_index", "Image_filepath", "Mask_filepath"]
feature_cols = [c for c in df.columns if c not in non_feature_cols]

X = df[feature_cols].copy()

corr = X.corr(method='pearson').abs()

upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))

# 5) 找到需要删除的列：如果该列与任何其他列相关性 > 阈值，就删它
threshold = 0.90
to_drop = [col for col in upper.columns if (upper[col] > threshold).any()]

print(f"Total features: {len(feature_cols)}")
print(f"Dropped due to PCC > {threshold}: {len(to_drop)}")
print(f"Remaining: {len(feature_cols) - len(to_drop)}")

Total features: 1095
Dropped due to PCC > 0.9: 844
Remaining: 251


In [21]:
X_selected = X.drop(columns=to_drop)

df_pcc = pd.concat([df[non_feature_cols], X_selected], axis=1)
df_pcc.to_excel('/host/d/projects/Habitats/radiomics/radiomics_measurements_filtered.xlsx', index=False)

# save dropped features to excel, file name: dropped_features.xlsx, sheet_name: 'pcc'
dropped_df = pd.DataFrame({'dropped_feature': to_drop})
with pd.ExcelWriter('/host/d/projects/Habitats/radiomics/dropped_features.xlsx', engine='openpyxl', mode='a') as writer:
    dropped_df.to_excel(writer, sheet_name='pcc', index=False)

### Feature selection step 3A: LASSO

In [44]:
radiomics_df = pd.read_excel('/host/d/projects/Habitats/radiomics/radiomics_measurements_filtered.xlsx')
non_feature_cols = ["Patient_index", "Image_filepath", "Mask_filepath"]
feature_cols = [c for c in radiomics_df.columns if c not in non_feature_cols]


label_df = pd.read_excel('/host/d/Data/Habitats/Jishuitan/Patient_lists/labels_with_image_info_included.xlsx')
label_df = label_df[label_df['Include'] == 'Yes']
y_col = 'Pathologic_Response_Necrosis_gt90pct'

X = radiomics_df[feature_cols].values
y = label_df[y_col].values

print(f'Feature matrix shape: {X.shape}', f'Label vector shape: {y.shape}')


cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=30)
model = Pipeline([
    ("scaler", StandardScaler()),
    ("lasso", LogisticRegressionCV(
        Cs=30, cv=cv, penalty="l1", solver="liblinear",
        scoring="roc_auc", max_iter=5000, refit=True, n_jobs=-1
    ))
])

model.fit(X, y)
lasso_cv = model.named_steps["lasso"]

Feature matrix shape: (81, 251) Label vector shape: (81,)


In [45]:
print("Best C (1/lambda):", lasso_cv.C_[0])
print("Best mean CV AUC:", lasso_cv.scores_[1].mean(axis=0).max())

coef = lasso_cv.coef_.ravel()          # (n_features,)
selected_idx = np.where(coef != 0)[0]  # 哪些feature被选中
selected_features = [feature_cols[i] for i in selected_idx]
selected_coef = coef[selected_idx]

print("Total features:", len(feature_cols))
print("Selected (non-zero):", len(selected_features))

# save radimoics features selected by LASSO
df_lasso = radiomics_df[non_feature_cols + selected_features]
# df_lasso.to_excel('/host/d/projects/Habitats/radiomics/radiomics_measurements_lasso_selected.xlsx', index=False)

Best C (1/lambda): 0.38566204211634725
Best mean CV AUC: 0.5591666666666667
Total features: 251
Selected (non-zero): 25


### Feature selection step 3B: RFE for different ML models

#### for SVM

In [48]:
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV

radiomics_df = pd.read_excel('/host/d/projects/Habitats/radiomics/radiomics_measurements_filtered.xlsx')
non_feature_cols = ["Patient_index", "Image_filepath", "Mask_filepath"]
feature_cols = [c for c in radiomics_df.columns if c not in non_feature_cols]


label_df = pd.read_excel('/host/d/Data/Habitats/Jishuitan/Patient_lists/labels_with_image_info_included.xlsx')
label_df = label_df[label_df['Include'] == 'Yes']
y_col = 'Pathologic_Response_Necrosis_gt90pct'

X = radiomics_df[feature_cols].values
y = label_df[y_col].values

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=30)

svm = SVC(
    kernel="linear",   # 关键！
    C=1.0,             # 你指定的
)

rfecv = RFECV(
    estimator=svm,
    step=1,                 # 每次删 1 个（稳，慢）
    cv=cv,
    scoring="roc_auc",
    min_features_to_select=1,
    n_jobs=-1
)

rfecv.fit(X, y)

In [55]:
support = rfecv.support_      # True = 被选中
ranking = rfecv.ranking_      # 1 = 最终保留的特征
best_N = rfecv.n_features_

selected_features = [f for f, keep in zip(feature_cols, support) if keep]

print("Best N =", best_N)
print("Selected features:")
for f in selected_features:
    print("  ", f)

df_selected = radiomics_df[non_feature_cols + selected_features]
df_selected.to_excel('/host/d/projects/Habitats/radiomics/radiomics_measurements_svm_selected.xlsx', index=False)


Best N = 9
Selected features:
   original_gldm_LargeDependenceHighGrayLevelEmphasis
   log-sigma-6-0-mm-3D_firstorder_Kurtosis
   log-sigma-6-0-mm-3D_glcm_ClusterShade
   wavelet-LLH_glcm_Idn
   wavelet-LLH_glrlm_LongRunLowGrayLevelEmphasis
   wavelet-LHH_firstorder_Skewness
   wavelet-HLH_glcm_ClusterProminence
   wavelet-HHH_ngtdm_Strength
   wavelet-LLL_glszm_ZoneEntropy


### for XGBoost

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from xgboost import XGBClassifier

radiomics_df = pd.read_excel('/host/d/projects/Habitats/radiomics/radiomics_measurements_filtered.xlsx')
non_feature_cols = ["Patient_index", "Image_filepath", "Mask_filepath"]
feature_cols = [c for c in radiomics_df.columns if c not in non_feature_cols]


label_df = pd.read_excel('/host/d/Data/Habitats/Jishuitan/Patient_lists/labels_with_image_info_included.xlsx')
label_df = label_df[label_df['Include'] == 'Yes']
y_col = 'Pathologic_Response_Necrosis_gt90pct'

X = radiomics_df[feature_cols].values
y = label_df[y_col].values

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=32)


# ---------- 3. XGBoost 模型（你指定的超参数） ----------
xgb = XGBClassifier(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.001,
    subsample=1.0,
    colsample_bytree=1.0,
    eval_metric="logloss",
    use_label_encoder=False,
    random_state=30
)

rfecv = RFECV(
    estimator=xgb,
    step=1,                 # 每次删 1 个（稳，慢）
    cv=cv,
    scoring="roc_auc",
    min_features_to_select=1,
    n_jobs=-1
)

rfecv.fit(X, y)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


In [57]:
support = rfecv.support_      # True = 被选中
ranking = rfecv.ranking_      # 1 = 最终保留的特征
best_N = rfecv.n_features_

selected_features = [f for f, keep in zip(feature_cols, support) if keep]

print("Best N =", best_N)
print("Selected features:")
for f in selected_features:
    print("  ", f)

df_selected = radiomics_df[non_feature_cols + selected_features]
df_selected.to_excel('/host/d/projects/Habitats/radiomics/radiomics_measurements_XGBoost_selected.xlsx', index=False)


Best N = 2
Selected features:
   original_gldm_LargeDependenceHighGrayLevelEmphasis
   log-sigma-2-0-mm-3D_glszm_ZoneEntropy
