In [26]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [27]:
def wprint(info, style=0):
    '''
    :description: Information print with separator.
    :param info: 'str', The information which want to print. 
    :param style: 'int', 0: Only one line info.
    '''
    if len(info) < 57:
        tot_len = 60
    elif len(info) < 77:
        tot_len = 80
    else:
        tot_len = len(info)+6
    space_len = (tot_len - len(info) - 6) // 2
    print('='*tot_len)
    print('=',' '*space_len, f'\033[1;97;95m{info}\033[0m', ' '*(tot_len-space_len-6-len(info)),'=')
    print('='*tot_len)
# 缺失值大于cut,则丢弃此列
def drop_col(df, cutoff=0.5):
    n = len(df)
    for col_name in df.columns:
        cnt = df[col_name].isnull().sum()
        if (float(cnt) / n) > cutoff:
            df.drop([col_name], axis=1, inplace=True)
    return df

def features_preprocess(df):
    # 去除重复项
    df = df.drop_duplicates()
    # 丢掉缺失值比例超过50%的列
    # df = drop_col(df)
    # 用众数填充缺失值
    #most_frequency = pd.Series(df.value_counts().index[0], index=df.columns)
    # 用中位数填充缺失值
    most_frequency = df.median()
    #most_frequency = 0.
    df = df.fillna(most_frequency)
    return df

def features_norm(df):
    # 标准化
    scale_features = list(df.select_dtypes(include=[float]))
    ss = StandardScaler()
    df[scale_features] = ss.fit_transform(df[scale_features])
    return df

# Add suffix for the colunms' name for the df.
def feas_name_addsux(df, sux, exclude_columns=[]):
    newcolumn = [name+str(sux) if name not in exclude_columns else name for name in list(df.columns) ]
    df_copy = df.copy()
    df_copy.columns=newcolumn
    return df_copy

def df_parallel_fusion(df_list, series_index, pid='pid', *nonfeas):
    # The list which are not features' columns
    no_feas_list = list(nonfeas)+[series_index, pid]
    # Chage name for the df.columns.
    df_list1 = [feas_name_addsux(df, str(df.at[0, series_index]), no_feas_list) for df in df_list]
    # Remain the 'pid' for the first df. 
    df_fusion = df_list1[0].drop(list(nonfeas)+[series_index], axis=1)
    for i in range(len(df_list)-1):
        feas_list = [col for col in list(df_list1[i+1]) if col not in list(nonfeas)+[series_index]]
        df_fusion = pd.merge(df_fusion, df_list1[i+1][feas_list], on=pid, how='inner')
    return df_fusion

# According to the index('str'), to select the df1[add_list] to concate with df.
def df_added(df, df1, index, add_cols, how='inner'):
    cols_copy = add_cols.copy()
    cols_copy.insert(0, index)
    new_df1 = df1[cols_copy]
    new_df = pd.merge(new_df1, df, on=index, how=how)  # 'inner' 表示只取交集
    return new_df

## Internal test

In [28]:
cwd = os.getcwd()
traindir = os.path.join(cwd, 'Feas_data')
datadir = os.path.join(cwd, 'Feas_data_test')
random_state = 2022 # random seed
sequence_id = [2, 3, 4]                 # The digital id to label T2, DWI, T1CE sequences.
tag_cols = ['pid', 'image','series', 'mask', 'label']

### SSM selection

In [29]:
# ---------------------------------------Select the features in train data extractor ----------------------------------------
df_slist = [pd.read_excel(os.path.join(datadir, 'test_feas_scale.xlsx'), sheet_name=f'sequence{num}') for num in
            sequence_id]
# Read the train output files and get the extracted features' name.
df_train_slist = [pd.read_excel(os.path.join(traindir, 'feas_mrmr_sel.xlsx'), sheet_name=f'sequence{num}') for num in
# df_train_slist = [pd.read_excel(os.path.join(traindir, 'feas_lasso.xlsx'), sheet_name=f'sequence{num}') for num in
            sequence_id]
df_columns_list = [df.columns for df in df_train_slist]
df_slist = [df[col] for df,col in zip(df_slist, df_columns_list)]
df_slist = [features_norm(df) for df in df_slist]
# Save and print information
pwriter = pd.ExcelWriter(os.path.join(datadir, 'SSM_test.xlsx'))
for seq_, df in enumerate(df_slist):
    df.to_excel(pwriter, f'sequence{seq_ + 2}', index=False)
pwriter.save()

### DSM selection

In [30]:
df_slist = [pd.read_excel(os.path.join(datadir, 'SSM_test.xlsx'), sheet_name=f'sequence{num}') for num in sequence_id]
tag_df = df_slist[0][['pid', 'label']]

## Any two sequence fusion.
dfuse_list = []
for i in range(len(df_slist)):
    fuse_df_list = [df_slist[k] for k in range(len(df_slist)) if k != i]
    fuse_df = df_parallel_fusion(fuse_df_list, 'series', *tag_cols)
    fuse_df = pd.merge(tag_df, fuse_df, on='pid', how='inner')
    dfuse_list.append(fuse_df)

writer = pd.ExcelWriter(os.path.join(datadir, 'DSM_feas.xlsx'))
for seq_, df in enumerate(dfuse_list):
    df.to_excel(writer, f'no_sequence{seq_ + 2}', index=False)
writer.save()
info = f'{dfuse_list[0].shape[1]-2}, {dfuse_list[1].shape[1]-2} and {dfuse_list[2].shape[1]-2} \
                features for no_sequence 2, 3 and 4 fusion.'
wprint(info)
dfuse_list[1].describe()

=   [1;97;95m7, 7 and 10                 features for no_sequence 2, 3 and 4 fusion.[0m    =


Unnamed: 0,pid,label,glszm_SmallAreaEmphasis_logarithm2,glcm_InverseVariance_exponential2,glszm_GrayLevelNonUniformity_wavelet-HHH2,firstorder_Skewness_logarithm2,glcm_Correlation_log-sigma-3-0-mm-3D2,glrlm_ShortRunLowGrayLevelEmphasis_square4,glszm_ZoneEntropy_exponential4
count,70.0,70.0,70.0,70.0,70.0,70.0,70.0,70.0,70.0
mean,122.142857,0.657143,1.015061e-16,-1.9032390000000002e-17,-1.5860330000000003e-17,4.758099e-18,4.282289e-17,5.471813e-17,6.344132000000001e-17
std,69.343983,0.478091,1.00722,1.00722,1.00722,1.00722,1.00722,1.00722,1.00722
min,5.0,0.0,-3.379121,-1.768737,-0.7987017,-4.015094,-2.687189,-0.999822,-4.497317
25%,66.25,0.0,-0.4858936,-0.8705739,-0.6908957,-0.3893325,-0.6817374,-0.7330122,-0.2639472
50%,116.0,1.0,0.1263628,-0.01191104,-0.3162051,0.3272211,0.004738335,-0.2858058,0.3237146
75%,181.75,1.0,0.5752702,0.8855897,0.08397974,0.6365412,0.6708849,0.2604686,0.6395031
max,233.0,1.0,2.019223,1.769255,3.661935,1.356856,2.011847,3.674148,1.03214


In [31]:
# ---------------------------------------Select the features in train data extractor ----------------------------------------
df_slist = [pd.read_excel(os.path.join(datadir, 'DSM_feas.xlsx'), sheet_name=f'no_sequence{num}') for num in
            sequence_id]
# Read the train output files and get the extracted features' name.
df_train_slist = [pd.read_excel(os.path.join(traindir, 'DSM_feas_mrmr_sel.xlsx'), sheet_name=f'no_sequence{num}') for num in
            sequence_id]
df_columns_list = [df.columns for df in df_train_slist]
df_slist = [df[col] for df,col in zip(df_slist, df_columns_list)]
df_slist = [features_norm(df) for df in df_slist]
# Save and print information
pwriter = pd.ExcelWriter(os.path.join(datadir, 'DSM_test.xlsx'))
for seq_, df in enumerate(df_slist):
    df.to_excel(pwriter, f'no_sequence{seq_ + 2}', index=False)
pwriter.save()
feas_num = [(df.shape[1]-2) for df in df_slist]
print(feas_num)

[6, 7, 7]


### ASM selection.

In [32]:
## All sequences fusion.
df_slist = [pd.read_excel(os.path.join(datadir, 'SSM_test.xlsx'), sheet_name=f'sequence{num}') for num in sequence_id]
df_label = df_slist[0][['pid', 'label']]
afuse_df = df_parallel_fusion(df_slist, 'series', 'pid', 'mask', 'image','label')
afuse_df = pd.merge(df_label, afuse_df, on='pid', how='inner')
afuse_df = afuse_df.sample(frac=1.0, random_state=random_state)
afuse_df.to_csv(os.path.join(datadir, 'ASM_feas.csv'), index=0)

In [33]:
df = pd.read_csv(os.path.join(datadir, 'ASM_feas.csv'))
# Read the train output files and get the extracted features' name.
df_train = pd.read_csv(os.path.join(traindir, 'ASM_mrmr_feas.csv'))
df_columns = df_train.columns 
print(df_columns)
df = df[df_columns] 
info = f'{df.shape[1]-2} features selected'
wprint(info)
df.to_csv(os.path.join(datadir, 'ASM_test.csv'), index=0)

Index(['pid', 'label', 'gldm_DependenceVariance_wavelet-LLH3',
       'ngtdm_Contrast_wavelet-HHL3', 'glcm_Correlation_log-sigma-3-0-mm-3D2',
       'glszm_ZoneEntropy_exponential4', 'glcm_InverseVariance_exponential2',
       'glcm_Imc2_wavelet-HHH3', 'glszm_GrayLevelNonUniformity_wavelet-HHH2',
       'glszm_SmallAreaEmphasis_logarithm2', 'glrlm_RunEntropy_exponential3',
       'firstorder_Skewness_log-sigma-2-0-mm-3D3',
       'glrlm_ShortRunLowGrayLevelEmphasis_square4'],
      dtype='object')
=                   [1;97;95m11 features selected[0m                   =


### Clinical selection.

In [10]:
# 临床特征查看验证标签
clinical_df = pd.read_csv('../DataPreprocess/dataset_info/clinical_data.csv')
compare_list = pd.read_excel(os.path.join(datadir, 'test_feas_scale.xlsx'), sheet_name='sequence2')
train_num = len(compare_list)
clinical_df = df_added(clinical_df, compare_list, index='pid', add_cols=['label'], how='inner')
consis_label_num =(clinical_df['label_x']==clinical_df['label_y']).sum()
if consis_label_num == train_num:
    wprint('Clinical features consistent with radiomics')
else:
    wprint('Clinical features not consistent with radiomics, please cheak!!!')
clinical_df.insert(1, 'label', clinical_df['label_x'])
clinical_df = clinical_df.drop(['label_x', 'label_y'], axis=1)
#clinical_df['mass_feature'] = clinical_df['mass_feature'].astype(float)
clinical_df = features_preprocess(clinical_df)
clinical_df = features_norm(clinical_df)
clinical_df.to_csv(os.path.join(datadir, 'clinical_test_feas.csv'), index=0)
info = f'Clinical features\' number: {clinical_df.shape[1]-2}'
clinical_df.describe()
wprint(info)

=       [1;97;95mClinical features consistent with radiomics[0m        =


Unnamed: 0,pid,label,age,unilateral_or_bilateral,mass_feature,diaphram_nodule,relationship_on_T1_dual_echo_images,peritoneum_mesentery_nodules,parenchymal_organs,CA125,HE4,LDH,NLR,ASA,ascites_amount
count,70.0,70.0,70.0,70.0,70.0,70.0,70.0,70.0,70.0,70.0,70.0,70.0,70.0,70.0,70.0
mean,122.142857,0.657143,1.300547e-16,1.5,2.528571,0.328571,1.771429,0.842857,0.071429,9.278292e-17,-1.9032390000000002e-17,1.054712e-16,-2.438526e-16,2.257143,1.542857
std,69.343983,0.478091,1.00722,0.50361,0.630652,0.473085,1.009684,0.366563,0.259399,1.00722,1.00722,1.00722,1.00722,0.440215,0.501757
min,5.0,0.0,-2.242827,1.0,1.0,0.0,0.0,0.0,0.0,-0.3093531,-0.7441307,-1.2278,-0.9719123,2.0,1.0
25%,66.25,0.0,-0.7449341,1.0,2.0,0.0,1.0,1.0,0.0,-0.2778139,-0.6061668,-0.5697525,-0.6252849,2.0,1.0
50%,116.0,1.0,-0.1832244,1.5,3.0,0.0,2.0,1.0,0.0,-0.2223852,-0.3726124,-0.3745099,-0.2495805,2.0,2.0
75%,181.75,1.0,0.9167905,2.0,3.0,1.0,2.75,1.0,0.0,-0.09257017,0.006368678,0.1813337,0.06261375,2.75,2.0
max,233.0,1.0,2.250851,2.0,3.0,1.0,3.0,1.0,1.0,7.801511,4.676612,4.500577,5.265851,3.0,2.0


=              [1;97;95mClinical features' number: 13[0m               =


In [11]:
df = pd.read_csv(os.path.join(datadir, 'clinical_test_feas.csv'))
# Read the train output files and get the extracted features' name.
df_train = pd.read_csv(os.path.join(traindir, 'clinical_lasso_sel.csv'))
df_columns = df_train.columns 
df = df[df_columns] 
info = f'{df.shape[1]-2} features selected'
wprint(info)
df.to_csv(os.path.join(datadir, 'clinical_test.csv'), index=0)
df_train.describe()
df.head()
df.describe()

=                   [1;97;95m10 features selected[0m                   =


Unnamed: 0,pid,label,mass_feature,NLR,diaphram_nodule,CA125,parenchymal_organs,HE4,ascites_amount,relationship_on_T1_dual_echo_images,peritoneum_mesentery_nodules,LDH
count,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0
mean,116.550633,0.651899,2.329114,0.0,0.253165,9.837419e-18,0.06962,-5.621382e-18,1.626582,1.740506,0.721519,5.621382e-18
std,68.790881,0.477883,0.785563,1.00318,0.436207,1.00318,0.255315,1.00318,0.48525,1.059866,0.449677,1.00318
min,1.0,0.0,1.0,-1.444844,0.0,-0.6226793,0.0,-0.5885402,1.0,0.0,0.0,-1.202434
25%,53.25,0.0,2.0,-0.702642,0.0,-0.5357474,0.0,-0.492863,1.0,1.0,0.0,-0.5212775
50%,118.5,1.0,3.0,-0.315323,0.0,-0.3537746,0.0,-0.302882,2.0,2.0,1.0,-0.3258107
75%,175.75,1.0,3.0,0.435465,0.75,0.1141483,0.0,0.03015706,2.0,3.0,1.0,0.1238642
max,235.0,1.0,3.0,4.546197,1.0,8.508225,1.0,6.831249,2.0,3.0,1.0,7.296381


Unnamed: 0,pid,label,mass_feature,NLR,diaphram_nodule,CA125,parenchymal_organs,HE4,ascites_amount,relationship_on_T1_dual_echo_images,peritoneum_mesentery_nodules,LDH
0,5,1,2,1.317512,0,-0.08024,0,-0.653564,2,0,1,0.053828
1,6,1,2,1.384848,0,-0.063505,0,-0.361334,2,2,1,1.492248
2,7,1,3,-0.166941,0,-0.229288,0,0.324561,2,3,1,4.500577
3,17,1,3,-0.368949,0,-0.284774,0,-0.682545,2,2,1,-0.364549
4,19,1,2,1.565431,1,-0.050291,0,0.562451,2,3,1,1.336851


Unnamed: 0,pid,label,mass_feature,NLR,diaphram_nodule,CA125,parenchymal_organs,HE4,ascites_amount,relationship_on_T1_dual_echo_images,peritoneum_mesentery_nodules,LDH
count,70.0,70.0,70.0,70.0,70.0,70.0,70.0,70.0,70.0,70.0,70.0,70.0
mean,122.142857,0.657143,2.528571,-2.298261e-16,0.328571,9.992007e-17,0.071429,-3.1720660000000005e-17,1.542857,1.771429,0.842857,1.134014e-16
std,69.343983,0.478091,0.630652,1.00722,0.473085,1.00722,0.259399,1.00722,0.501757,1.009684,0.366563,1.00722
min,5.0,0.0,1.0,-0.9719123,0.0,-0.3093531,0.0,-0.7441307,1.0,0.0,0.0,-1.2278
25%,66.25,0.0,2.0,-0.6252849,0.0,-0.2778139,0.0,-0.6061668,1.0,1.0,1.0,-0.5697525
50%,116.0,1.0,3.0,-0.2495805,0.0,-0.2223852,0.0,-0.3726124,2.0,2.0,1.0,-0.3745099
75%,181.75,1.0,3.0,0.06261375,1.0,-0.09257017,0.0,0.006368678,2.0,2.75,1.0,0.1813337
max,233.0,1.0,3.0,5.265851,1.0,7.801511,1.0,4.676612,2.0,3.0,1.0,4.500577


## External test

In [25]:
sequence_id1 = [2, 3, 4]
ex_df = pd.read_csv('../DataPreprocess/dataset_info/external_feas_original.csv')
# ex_df_na = ex_df[ex_df['shape_Elongation_original'].isna()]
# nan_pid = ex_df_na['pid'].values
# ex_df = ex_df[~ex_df['pid'].isin(nan_pid)]
ex_df.head()
# Select the test dataset
ex_df = ex_df.drop(['RD'], axis=1)
ex_df_slist = [ex_df[ex_df['series'] == num] for num in [2,3, 4]]
ex_df_slist = [features_preprocess(df) for df in ex_df_slist]
ex_df_slist = [features_norm(df) for df in ex_df_slist]
ex_df_slist = [df.sort_values(['pid'], ascending=[True]) for df in ex_df_slist]
# ex_df0 = ex_df[ex_df['label']==0]
# ex_df1 = ex_df[ex_df['label']==1]

# ex_df_slist = [ex_df0[ex_df0['series'] == num] for num in [2,4]]
# ex_df_slist = [features_preprocess(df) for df in ex_df_slist]
# ex_df_slist = [features_norm(df) for df in ex_df_slist]
# ex_df_slist = [df.sort_values(['pid'], ascending=[True]) for df in ex_df_slist]
# 
# ex_df_slist1 = [ex_df1[ex_df1['series'] == num] for num in [2,4]]
# ex_df_slist1 = [features_preprocess(df) for df in ex_df_slist1]
# ex_df_slist1 = [features_norm(df) for df in ex_df_slist1]
# ex_df_slist1 = [df.sort_values(['pid'], ascending=[True]) for df in ex_df_slist1]

# ex_df_slist = [pd.concat([df0,df1], axis=0) for df0, df1 in zip(ex_df_slist,ex_df_slist1)]

# Save and print information
pwriter = pd.ExcelWriter('./Feas_data_test/external_feas_scale.xlsx')
for seq_, df in zip([2,3, 4],ex_df_slist):
    df.to_excel(pwriter,f'sequence{seq_}', index=0)
pwriter.save()

Unnamed: 0,pid,label,series,RD,mask,image,shape_Elongation_original,shape_Flatness_original,shape_LeastAxisLength_original,shape_MajorAxisLength_original,...,glszm_SmallAreaHighGrayLevelEmphasis_exponential,glszm_SmallAreaLowGrayLevelEmphasis_exponential,glszm_ZoneEntropy_exponential,glszm_ZonePercentage_exponential,glszm_ZoneVariance_exponential,ngtdm_Busyness_exponential,ngtdm_Coarseness_exponential,ngtdm_Complexity_exponential,ngtdm_Contrast_exponential,ngtdm_Strength_exponential
0,1,1,2,R2,/media/tx-deepocean/Data/2022/chongfu1/Dataset...,/media/tx-deepocean/Data/2022/chongfu1/Dataset...,0.1941,0.035149,5.567816,158.405448,...,3.342207,0.099761,3.010434,0.006092,514221.5,12.326277,0.01366,0.361439,0.000181,0.19488
1,1,1,4,R2,/media/tx-deepocean/Data/2022/chongfu1/Dataset...,/media/tx-deepocean/Data/2022/chongfu1/Dataset...,0.837888,0.779085,24.473828,31.413538,...,4.816801,0.083049,3.574558,0.011355,149964.7,36.426232,0.005033,1.421613,0.005755,0.056022
2,2,0,2,R0,/media/tx-deepocean/Data/2022/chongfu1/Dataset...,/media/tx-deepocean/Data/2022/chongfu1/Dataset...,0.113828,0.025387,27.517706,1083.946699,...,21.116616,0.057424,5.814968,0.014091,4505141.0,452.113424,3.3e-05,97.446421,0.013427,0.010267
3,2,0,4,R0,/media/tx-deepocean/Data/2022/chongfu1/Dataset...,/media/tx-deepocean/Data/2022/chongfu1/Dataset...,,,,,...,,,,,,,,,,
4,3,0,2,R0,/media/tx-deepocean/Data/2022/chongfu1/Dataset...,/media/tx-deepocean/Data/2022/chongfu1/Dataset...,0.782079,0.417705,24.519569,58.700707,...,34.203468,0.046023,5.532083,0.067723,11146.55,12.684849,0.001794,185.874065,0.045435,0.845809




ValueError: Found array with 0 sample(s) (shape=(0, 1454)) while a minimum of 1 is required by StandardScaler.

### SSM selection

In [23]:
# ---------------------------------------Select the features in train data extractor ----------------------------------------
df_slist = [pd.read_excel(os.path.join(datadir, 'external_feas_scale.xlsx'), sheet_name=f'sequence{num}') for num in
            sequence_id]
# Read the train output files and get the extracted features' name.
df_train_slist = [pd.read_excel(os.path.join(traindir, 'feas_mrmr_sel.xlsx'), sheet_name=f'sequence{num}') for num in
# df_train_slist = [pd.read_excel(os.path.join(traindir, 'feas_lasso.xlsx'), sheet_name=f'sequence{num}') for num in
            sequence_id]
df_columns_list = [df.columns for df in df_train_slist]
df_slist = [df[col] for df,col in zip(df_slist, df_columns_list)]
df_slist = [features_norm(df) for df in df_slist]
# Save and print information
pwriter = pd.ExcelWriter(os.path.join(datadir, 'SSM_test_external.xlsx'))
for seq_, df in zip(sequence_id,df_slist):
    df.to_excel(pwriter, f'sequence{seq_}', index=False)
pwriter.save()
print(sequence_id)

ValueError: Worksheet named 'sequence3' not found

### DSM selection

In [14]:
df_slist = [pd.read_excel(os.path.join(datadir, 'SSM_test_external.xlsx'), sheet_name=f'sequence{num}') for num in sequence_id1]
tag_df = df_slist[0][['pid', 'label']]

## Any two sequence fusion.
fuse_df = df_parallel_fusion(df_slist, 'series', *tag_cols)
fuse_df = pd.merge(tag_df, fuse_df, on='pid', how='inner')

fuse_df.to_csv(os.path.join(datadir, 'DSM_feas_external.csv'), index=0)
info = f'{fuse_df.shape[1]-2} features for DSM.'
wprint(info)

=                   [1;97;95m7 features for DSM.[0m                    =


In [15]:
# ---------------------------------------Select the features in train data extractor ----------------------------------------
df_DSM = pd.read_csv(os.path.join(datadir, 'DSM_feas_external.csv'))
# Read the train output files and get the extracted features' name.
df_train_slist = [pd.read_excel(os.path.join(traindir, 'DSM_feas_mrmr_sel.xlsx'), sheet_name=f'no_sequence{num}') for num in
            sequence_id]
df_columns_list = df_train_slist[1].columns 
df_DSM = df_DSM[df_columns_list]
# Save and print information
df_DSM.to_csv(os.path.join(datadir, 'DSM_test_external.csv'), index=0)
print(f'feas num is {df_DSM.shape[1]-2}')

feas num is 7


### ASM selection

In [20]:
## All sequences fusion.
df_slist = [pd.read_excel(os.path.join(datadir, 'SSM_test_external.xlsx'), sheet_name=f'sequence{num}') for num in sequence_id]
df_label = df_slist[0][['pid', 'label']]
afuse_df = df_parallel_fusion(df_slist, 'series', 'pid', 'mask', 'image','label')
afuse_df = pd.merge(df_label, afuse_df, on='pid', how='inner')
afuse_df = afuse_df.sample(frac=1.0, random_state=random_state)
afuse_df.to_csv(os.path.join(datadir, 'ASM_feas.csv'), index=0)

ValueError: Worksheet named 'sequence4' not found

In [None]:
df = pd.read_csv(os.path.join(datadir, 'ASM_feas.csv'))
# Read the train output files and get the extracted features' name.
df_train = pd.read_csv(os.path.join(traindir, 'ASM_mrmr_feas.csv'))
df_columns = df_train.columns 
print(df_columns)
df = df[df_columns] 
info = f'{df.shape[1]-2} features selected'
wprint(info)
df.to_csv(os.path.join(datadir, 'ASM_test_external.csv'), index=0)

### Clinical selection.

In [None]:
# 临床特征查看验证标签
ex_clinical_df = pd.read_csv('../DataPreprocess/dataset_info/external_clinical.csv')
compare_list = pd.read_excel(os.path.join(datadir, 'external_feas_scale.xlsx'), sheet_name='sequence2')
train_num = len(compare_list)
ex_clinical_df = df_added(ex_clinical_df, compare_list, index='pid', add_cols=['label'], how='inner')
consis_label_num =(ex_clinical_df['label_x']==ex_clinical_df['label_y']).sum()
if consis_label_num == train_num:
    wprint('ex_clinical features consistent with radiomics')
else:
    wprint('ex_clinical features not consistent with radiomics, please cheak!!!')
ex_clinical_df.insert(1, 'label', ex_clinical_df['label_x'])
ex_clinical_df = ex_clinical_df.drop(['label_x', 'label_y'], axis=1)
ex_clinical_df.to_csv(os.path.join(datadir, 'clinical_external_feas.csv'), index=0)
info = f'ex_clinical features\' number: {ex_clinical_df.shape[1]-2}'
wprint(info)
ex_clinical_df.describe()

In [None]:
df = pd.read_csv(os.path.join(datadir, 'clinical_external_feas.csv'))
df['CA125'] = df['CA125'].astype(float)
df['HE4'] = df['HE4'].astype(float)
df = features_preprocess(df)
df = features_norm(df)
# Read the train output files and get the extracted features' name.
df_train = pd.read_csv(os.path.join(traindir, 'clinical_lasso_sel.csv'))
df_columns = df_train.columns 
df = df[df_columns] 
info = f'{df.shape[1]-2} features selected'
wprint(info)
df.to_csv(os.path.join(datadir, 'clinical_test_external.csv'), index=0)
df.describe()
df.head()