In [1]:
import sys
sys.path.append('/host/d/Github')
import os
import numpy as np
import pandas as pd
import nibabel as nb
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
import Osteosarcoma.functions_collection as ff
import Osteosarcoma.Build_lists.Build_list as Build_list
import Osteosarcoma.Data_processing as Data_processing

from __future__ import annotations

import os  # needed navigate the system to get the input data

import radiomics
from radiomics import (
    featureextractor,  # This module is used for interaction with pyradiomics
)

  from .autonotebook import tqdm as notebook_tqdm


### patchify image

In [14]:
# Root folder that holds one sub-folder of resampled volumes per patient.
resampled_path = '/host/d/Data/Habitats/Jishuitan/resampled_data/'

# Build the per-case lists from the patient spreadsheet.
patient_list_file = os.path.join('/host/d/Data/Habitats/Jishuitan/Patient_lists', 'labels_with_image_info_included.xlsx')
build = Build_list.Build(patient_list_file)
(batch_list,
 patient_index_list,
 label_list,
 image_path_list,
 mask_path_list) = build.__build__()
print(f'Number of cases to process: {len(image_path_list)}')

Number of cases to process: 81


In [3]:
# Patchify settings are identical for every patient, so they are defined once up front.
habitats_out_path = '/host/d/Data/Habitats/Jishuitan/habitats/'
patch_size_mm = 10.0               # patch edge length in mm (1 cm)
min_tumor_fraction = 0.1          # permissive on purpose; patches are filtered later using the patch table

# NOTE: only the first case is processed; the full range is kept in the trailing comment.
for index in range(0,1):#len(patient_index_list)):
    patient_index = patient_index_list[index]

    # Resampled inputs for this patient.
    patient_resampled_dir = os.path.join(resampled_path, str(patient_index))
    image_path = os.path.join(patient_resampled_dir, 'image_resampled.nii.gz')
    label_path = os.path.join(patient_resampled_dir, 'label_resampled.nii.gz')
    print('image path:', image_path, '\nlabel path:', label_path)

    # Per-patient output folder.
    patient_out_dir = os.path.join(habitats_out_path, str(patient_index))
    ff.make_folder([patient_out_dir])

    # Load the label volume; its affine/header are reused when saving the bbox volume.
    label_nii = nb.load(label_path)
    label_arr = label_nii.get_fdata()
    affine = label_nii.affine
    header = label_nii.header

    # Bounding box around the label, with a buffer along each axis, saved as a NIfTI volume.
    bbox_out_path = os.path.join(patient_out_dir, 'tumor_bbox.nii.gz')
    bbox_arr, x0, x1, y0, y1, z0, z1 = Data_processing.bbox3d(label_arr, buffer_x=5, buffer_y = 5, buffer_z = 1)
    nb.save(nb.Nifti1Image(bbox_arr, affine=affine, header=header), bbox_out_path)

    # Patch extraction: patch files go to `out_dir`, the per-patch table to `patch_table_path`.
    out_dir = os.path.join(patient_out_dir, "patches")
    os.makedirs(out_dir, exist_ok=True)
    patch_table_path = os.path.join(patient_out_dir, "patch_table.xlsx")

    # The previous step produced bbox_arr (shape = (X,Y,Z)) and the bbox coordinates
    # x0,x1,y0,y1,z0,z1; label_path points to label_resampled.nii.gz.
    count_included, total_patches = Data_processing.patchify(
        label_arr, label_nii,
        x0, x1, y0, y1, z0, z1,
        patch_size_mm, min_tumor_fraction,
        out_dir, patch_table_path,
    )
    print(f'Patient {patient_index}: Included patches {count_included} out of {total_patches} total patches.')



image path: /host/d/Data/Habitats/Jishuitan/resampled_data/5/image_resampled.nii.gz 
label path: /host/d/Data/Habitats/Jishuitan/resampled_data/5/label_resampled.nii.gz
Tight bbox (x,y,z): (188, 353) (165, 331) (39, 101)
Buffered bbox: x[183:358], y[160:336], z[38:102]
Spacing (mm): 0.390625 0.3906250298023224 1.0
Patch size (vox): 26 26 10
Grid start: (183, 160, 38)
Grid end  : (364, 341, 107)
Grid n patches: (7, 7, 7)
Patient 5: Included patches 198 out of 343 total patches.


### initiate extractor

In [None]:
# # define which features we need
# # 1. Read the Excel file
# df = pd.read_excel('/host/d/projects/Habitats/radiomics/radiomics_measurements_svm_selected.xlsx')

# non_feature_cols = ["Patient_index","Image_filepath","Mask_filepath"]

# # 3. Extract the radiomics feature columns
# radiomics_features = [c for c in df.columns if c not in non_feature_cols]

# # 4. Print the results
# print("Radiomics features list:")
# for f in radiomics_features:
#     print(f)

# print("\nTotal number of radiomics features:", len(radiomics_features))

In [2]:
# Create the pyradiomics extractor from the YAML settings file, then echo its configuration
# so the enabled image types and feature classes are recorded in the notebook output.
paramPath = '/host/d/Github/Osteosarcoma/radiomics_settings/MR_setting_habitats_patch_SVM_selected.yaml'
extractor = featureextractor.RadiomicsFeatureExtractor(paramPath)

for heading, value in (
    ('Extraction parameters:\n\t', extractor.settings),
    ('Enabled filters:\n\t', extractor.enabledImagetypes),
    ('Enabled features:\n\t', extractor.enabledFeatures),
):
    print(heading, value)

Extraction parameters:
	 {'minimumROIDimensions': 2, 'minimumROISize': None, 'normalize': True, 'normalizeScale': 100, 'removeOutliers': None, 'resampledPixelSpacing': [1, 1, 1], 'interpolator': 'sitkBSpline', 'preCrop': False, 'padDistance': 5, 'distances': [1], 'force2D': False, 'force2Ddimension': 0, 'resegmentRange': None, 'label': 1, 'additionalInfo': True, 'binWidth': 10, 'voxelArrayShift': 300, 'geometryTolerance': 0.0001}
Enabled filters:
	 {'Original': {}, 'LoG': {'sigma': [6.0]}, 'Wavelet': {}}
Enabled features:
	 {'firstorder': ['Kurtosis', 'Skewness'], 'glcm': ['ClusterProminence', 'ClusterShade', 'Imc1', 'Idn'], 'glrlm': ['LongRunLowGrayLevelEmphasis', 'LongRunHighGrayLevelEmphasis'], 'glszm': ['ZoneEntropy'], 'gldm': ['LargeDependenceHighGrayLevelEmphasis', 'DependenceEntropy'], 'ngtdm': ['Strength']}


### define patient list

In [3]:
# Rebuild the per-case lists from the patient spreadsheet.
patient_list_dir = '/host/d/Data/Habitats/Jishuitan/Patient_lists'
build = Build_list.Build(os.path.join(patient_list_dir, 'labels_with_image_info_included.xlsx'))
build_outputs = build.__build__()
batch_list, patient_index_list, label_list, image_path_list, mask_path_list = build_outputs
print(f'Number of cases to process: {len(image_path_list)}')

Number of cases to process: 81


### extract patch features

In [None]:
# Extract radiomics features for every patch whose tumor fraction reaches the threshold.
# One Excel file is written per patient, and a patient that already has that file is skipped.
#
# The file doubles as the "done" marker, so it is written exactly once, after all patches
# of the patient have been processed, via a temporary file that is renamed into place.
# Rewriting it after every patch meant an interrupted run left a partial file that the
# skip check then treated as complete.
rows = []  # kept so `rows` exists even when every patient is skipped
tumor_fraction_threshold = 0.5
habitats_root = '/host/d/Data/Habitats/Jishuitan/habitats/'
features_root = '/host/d/projects/Habitats/radiomics/habitats'
front_cols = ["Patient_index", "patch_id","tumor_fraction", "Image_filepath", "Mask_filepath"]

for i in range(0, len(patient_index_list)):
    patient_index = patient_index_list[i]
    print('Processing patient index:', patient_index)
    # NOTE(review): this reads 'img_resampled.nii.gz' while the patchify cell reads
    # 'image_resampled.nii.gz' from the same folder; confirm both files exist.
    img_p = os.path.join(resampled_path, str(patient_index), 'img_resampled.nii.gz')

    # Per-patient output folder and result file.
    save_folder = os.path.join(features_root, str(patient_index))
    ff.make_folder([save_folder])
    features_file = os.path.join(save_folder, 'radiomics_features_patches.xlsx')
    partial_file = os.path.join(save_folder, 'radiomics_features_patches.partial.xlsx')

    if os.path.exists(features_file):
        print('  Features already extracted for this patient. Skipping...')
        continue

    rows = []  # feature records of the current patient only

    # Keep only the patches with tumor_fraction >= tumor_fraction_threshold.
    patches_dir = os.path.join(habitats_root, str(patient_index), 'patches')
    patch_table_raw = pd.read_excel(os.path.join(habitats_root, str(patient_index), 'patch_table.xlsx'))
    patch_table = patch_table_raw[patch_table_raw['tumor_fraction'] >= tumor_fraction_threshold].reset_index(drop=True)
    print(f'  Number of patches to process: {patch_table.shape[0]}', ' (out of ', patch_table_raw.shape[0], ' total patches )')

    for j in range(0, patch_table.shape[0]):
        patch_id = patch_table.loc[j, 'patch_id']
        tumor_fraction = patch_table.loc[j, 'tumor_fraction']
        patch_p = os.path.join(patches_dir, f'patch_{patch_id:04d}.nii.gz')
        print('  Processing patch:', patch_p)
        msk_p = patch_p  # the patch file itself is passed to the extractor as the mask
        result = extractor.execute(img_p,msk_p) # important!
        # Keep only radiomics features (drop diagnostics)
        feats = {k: v for k, v in result.items() if not k.startswith("diagnostics_")}
        feats["Patient_index"] = patient_index
        feats["patch_id"] = patch_id
        feats["tumor_fraction"] = tumor_fraction
        feats["Image_filepath"] = img_p
        feats["Mask_filepath"] = msk_p
        rows.append(feats)

    if not rows:
        # No patch passed the threshold: nothing to save, as before.
        print('  No patches above the tumor fraction threshold. Nothing to save.')
        continue

    # Build the table once per patient, identification columns first.
    df = pd.DataFrame(rows)
    other_cols = [col for col in df.columns if col not in front_cols]
    df = df[front_cols + other_cols]

    # Write to a temporary name, then rename, so the final file never exists half-written.
    df.to_excel(partial_file, index=False)
    os.replace(partial_file, features_file)

Processing patient index: 5
  Features already extracted for this patient. Skipping...
Processing patient index: 7
  Features already extracted for this patient. Skipping...
Processing patient index: 8
  Features already extracted for this patient. Skipping...
Processing patient index: 11
  Features already extracted for this patient. Skipping...
Processing patient index: 15
  Features already extracted for this patient. Skipping...
Processing patient index: 18
  Features already extracted for this patient. Skipping...
Processing patient index: 19
  Number of patches to process: 160  (out of  237  total patches )
  Processing patch: /host/d/Data/Habitats/Jishuitan/habitats/19/patches/patch_0007.nii.gz
  Processing patch: /host/d/Data/Habitats/Jishuitan/habitats/19/patches/patch_0008.nii.gz


### normalize features

#### normalize patch features using the min and max values from the whole-tumor features

In [31]:
# Normalize the patch-level features of every patient with the per-feature min/max stored
# in the feature list, then save (a) all normalized features and (b) only the SVM-selected
# ones. For the SVM-selected features, report how many normalized values fall outside [-1, 2].
habitat_features_root = '/host/d/projects/Habitats/radiomics/habitats'
patch_id_cols = ["Patient_index", "patch_id","tumor_fraction", "Image_filepath", "Mask_filepath"]

scale_df = pd.read_excel('/host/d/projects/Habitats/radiomics/radiomics_features_list.xlsx')
build = Build_list.Build(os.path.join('/host/d/Data/Habitats/Jishuitan/Patient_lists', 'labels_with_image_info_included.xlsx'))
svm_selected_features = pd.read_excel('/host/d/projects/Habitats/radiomics/radiomics_measurements_svm_selected.xlsx')
non_feature_cols = ["Patient_index","Image_filepath","Mask_filepath"]
svm_selected_radiomics_features = [c for c in svm_selected_features.columns if c not in non_feature_cols]

# Look up each feature's min and range once instead of once per feature per patient.
# The first row of each feature name is used, as the previous `.values[0]` lookup did.
scale_lookup = scale_df.drop_duplicates(subset='feature_name').set_index('feature_name')
feature_min_by_name = scale_lookup['feature_min']
feature_range_by_name = scale_lookup['feature_max'] - scale_lookup['feature_min']
# A constant feature has max == min, and dividing by that zero range fills the column with
# inf/NaN. Use a range of 1 instead, which is how scikit-learn's MinMaxScaler (the source of
# these min/max values) handles a zero data range.
feature_range_by_name = feature_range_by_name.mask(feature_range_by_name == 0, 1.0)

batch_list, patient_index_list, label_list, image_path_list, mask_path_list = build.__build__()
print(f'Number of cases to process: {len(image_path_list)}')

for i in range(0,len(patient_index_list)):
    patient_index = patient_index_list[i]
    patient_folder = os.path.join(habitat_features_root, str(patient_index))

    # Patients without extracted patch features are silently skipped.
    feature_file = os.path.join(patient_folder, 'radiomics_features_patches.xlsx')
    if not os.path.isfile(feature_file):
        continue
    df = pd.read_excel(feature_file)

    # Min-max normalize every feature that is both in the scale table and in this patient's table.
    for feature in scale_lookup.index:
        if feature in df.columns:
            df[feature] = (df[feature] - feature_min_by_name.loc[feature]) / feature_range_by_name.loc[feature]

    # Save the normalized features
    df.to_excel(os.path.join(patient_folder, 'radiomics_features_patches_normalized.xlsx'), index=False)

    # Keep only the identification columns plus the SVM-selected features.
    df_svm = df[patch_id_cols + svm_selected_radiomics_features]

    # Patch values are scaled with externally supplied min/max, so they can leave [0, 1];
    # report, per feature, how many fall below -1 or above 2.
    for feature in svm_selected_radiomics_features:
        out_of_bounds = ((df_svm[feature] < -1) | (df_svm[feature] > 2)).sum()
        if out_of_bounds > 0:
            print(f'Feature {feature} has {out_of_bounds} out-of-bounds elements out of {df_svm.shape[0]} , percent: {out_of_bounds/df_svm.shape[0]*100:.2f}% for patient {patient_index}')

    df_svm.to_excel(os.path.join(patient_folder, 'radiomics_features_patches_normalized_svm_selected.xlsx'), index=False)


Number of cases to process: 81
Feature wavelet-LLH_glrlm_LongRunLowGrayLevelEmphasis has 13 out-of-bounds elements out of 127 , percent: 10.24% for patient 5
Feature wavelet-HHL_glcm_Imc1 has 2 out-of-bounds elements out of 127 , percent: 1.57% for patient 5
Feature wavelet-LLL_glszm_ZoneEntropy has 52 out-of-bounds elements out of 127 , percent: 40.94% for patient 5
Feature log-sigma-6-0-mm-3D_firstorder_Kurtosis has 1 out-of-bounds elements out of 262 , percent: 0.38% for patient 7
Feature wavelet-LLH_glrlm_LongRunLowGrayLevelEmphasis has 86 out-of-bounds elements out of 262 , percent: 32.82% for patient 7
Feature wavelet-HHL_glcm_Imc1 has 3 out-of-bounds elements out of 262 , percent: 1.15% for patient 7
Feature wavelet-LLL_glszm_ZoneEntropy has 133 out-of-bounds elements out of 262 , percent: 50.76% for patient 7
Feature wavelet-LLL_glszm_ZoneEntropy has 8 out-of-bounds elements out of 26 , percent: 30.77% for patient 8
Feature wavelet-LLH_glrlm_LongRunLowGrayLevelEmphasis has 94 o

#### normalize for reader 1

In [None]:
### Normalize the reader 1 features to [0, 1] with a MinMaxScaler.
from sklearn.preprocessing import MinMaxScaler

# Load the measurements explicitly. Without this the cell silently used whatever `df` was
# left in the kernel (after the patch-normalization loop, that is the last patient's
# already-normalized patch table).
df = pd.read_excel('/host/d/projects/Habitats/radiomics/radiomics_measurements.xlsx')

# Every column that is not an identifier/path is treated as a radiomics feature.
non_feature_cols = ["Patient_index", "Image_filepath", "Mask_filepath"]
feature_cols = [c for c in df.columns if c not in non_feature_cols]
df_features = df[feature_cols]

# Fit on these features; `scaler` keeps data_min_/data_max_ for reuse in a later cell.
scaler = MinMaxScaler()
df_features_scaled = pd.DataFrame(
    scaler.fit_transform(df_features),
    columns=feature_cols,
    index=df.index,  # keep rows aligned with `df` when concatenating below
)

# Re-attach the identifier columns and save.
df_scaled = pd.concat([df[non_feature_cols], df_features_scaled], axis=1)
df_scaled.to_excel('/host/d/projects/Habitats/radiomics/radiomics_measurements_normalized.xlsx', index=False)

In [None]:
# Persist the per-feature min/max learned by the fitted `scaler` into the feature list,
# overwriting the spreadsheet in place.
# NOTE(review): the values are attached by position, so this assumes the rows of the
# feature list are in the same order as the columns the scaler was fitted on. Confirm.
# NOTE(review): this file lives under 'results/', while the cells that read the min/max
# load 'radiomics/radiomics_features_list.xlsx'. Confirm which location is intended.
feature_list_file = '/host/d/projects/Habitats/results/radiomics_features_list.xlsx'

feature_min = scaler.data_min_
feature_max = scaler.data_max_

feature_table = pd.read_excel(feature_list_file).assign(
    feature_min=feature_min,
    feature_max=feature_max,
)
feature_table.to_excel(feature_list_file, index=False)

#### normalize for reader 2, using data_min and data_max from the reader 1 normalization

In [None]:
# Normalize the reader 2 features with the per-feature min/max saved from the reader 1
# normalization, so both readers share the same scale.
# NOTE(review): `df` is not loaded in this cell; it must already hold the reader 2
# measurements when this cell runs. Confirm.
scale_df = pd.read_excel('/host/d/projects/Habitats/radiomics/radiomics_features_list.xlsx')

# Look up min and range by feature name once (first row per name, as `.values[0]` did).
scale_lookup = scale_df.drop_duplicates(subset='feature_name').set_index('feature_name')
scale_min = scale_lookup['feature_min']
scale_range = scale_lookup['feature_max'] - scale_lookup['feature_min']
# A constant feature has max == min, and dividing by that zero range would produce inf/NaN.
# Use a range of 1 instead, which is how MinMaxScaler handles a zero data range.
scale_range = scale_range.mask(scale_range == 0, 1.0)

df_reader2 = df.copy()
feature_names = [c for c in df_reader2.columns if c not in non_feature_cols]
for feature in feature_names:
    # A feature missing from the scale table raises a KeyError that names it.
    f_min = scale_min.loc[feature]
    f_range = scale_range.loc[feature]
    df_reader2[feature] = (df_reader2[feature] - f_min) / f_range
df_reader2.to_excel('/host/d/projects/Habitats/radiomics/radiomics_measurements_normalized_reader2.xlsx', index=False)

