## Train the models

In [2]:
import random
import joblib
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pprint import pprint
from sklearn import metrics
from openpyxl import load_workbook
from sklearn.tree import export_graphviz
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

## (1) Random Forest

+ 重要参数:
  1. n_estimators: 
  2. max_depth
  3. max_features
  4. min_samples_split
  5. min_samples_leaf
  6. bootstrap

In [3]:
from sklearn.metrics import confusion_matrix    #导入计算混淆矩阵的包
def specificity_score(y_true, y_pred):
    C = confusion_matrix(y_true, y_pred)
    TP = C[1,1]
    FP = C[0,1]
    TN = C[0,0]
    FN = C[1,0]
    specificity = TN/(TN+FP)
    return specificity

def classification_evaluation(y_true, y_pred, y_score):
    precision = metrics.precision_score(y_true, y_pred)
    recall = metrics.recall_score(y_true, y_pred)
    accuracy = metrics.accuracy_score(y_true, y_pred)
    auc = metrics.roc_auc_score(y_true, y_score)
    f1 = metrics.f1_score(y_true, y_pred)
    specificity = specificity_score(y_true, y_pred)
    evaluation = {'accuracy':accuracy, 'recall':recall, 'precision':precision, 'f1':f1, 'auc':auc, 'specificity':specificity}
    return evaluation

### Global params setting and load data

In [4]:
cwd = os.getcwd()
traindir = '../Feature_filter/Feas_data'
testdir = '../Feature_filter/Feas_data_test'
imgdir = os.path.join(cwd, 'IMG')
modeldir = os.path.join(cwd, 'Model')
tag_cols = ['pid', 'label', 'series', 'image', 'mask']
sequence_id = [2, 3, 4]
# Generate the random seed
random_state = random.randint(1,10000)

# Load the SSM features.
SSM_train_slist = [pd.read_excel(os.path.join(traindir, 'feas_mrmr_sel.xlsx'), sheet_name=f'sequence{num}') for num in sequence_id]
SSM_test_slist = [pd.read_excel(os.path.join(testdir, 'SSM_test.xlsx'), sheet_name=f'sequence{num}') for num in sequence_id]

In [5]:
# Print features
SSM_features_list = [df.columns for df in SSM_train_slist]
print(SSM_features_list[0].to_list())
print(SSM_features_list[1].to_list())
print(SSM_features_list[2].to_list())
print(len(SSM_features_list[0])-4, len(SSM_features_list[1])-4, len(SSM_features_list[2])-4)

['pid', 'label', 'series', 'image', 'mask', 'glszm_SmallAreaEmphasis_logarithm', 'glcm_InverseVariance_exponential', 'glszm_GrayLevelNonUniformity_wavelet-HHH', 'firstorder_Skewness_logarithm', 'glcm_Correlation_log-sigma-3-0-mm-3D']
['pid', 'label', 'series', 'image', 'mask', 'gldm_DependenceVariance_wavelet-LLH', 'ngtdm_Contrast_wavelet-HHL', 'firstorder_Skewness_log-sigma-2-0-mm-3D', 'glcm_Imc2_wavelet-HHH', 'glrlm_RunEntropy_exponential']
['pid', 'label', 'series', 'image', 'mask', 'glrlm_ShortRunLowGrayLevelEmphasis_square', 'glszm_ZoneEntropy_exponential']
6 6 3


In [7]:
# Train data
standardscaler = StandardScaler()
SSM_train_slist = [df.sample(frac=1.0, random_state=random_state) for df in SSM_train_slist]
SSM_train_y = [df['label'] for df in SSM_train_slist]
ssm_train_pid = SSM_train_slist[0]['pid']
SSM_train_x = [df.drop(tag_cols, axis=1) for df in SSM_train_slist]
SSM_train_x = [standardscaler.fit_transform(df) for df in SSM_train_x]
# Test data
SSM_test_slist = [df.sample(frac=1.0, random_state=random_state) for df in SSM_test_slist]
SSM_test_y = [df['label'] for df in SSM_test_slist]
ssm_test_pid = SSM_test_slist[0]['pid']
SSM_test_x = [df.drop(tag_cols, axis=1) for df in SSM_test_slist]
SSM_test_x = [standardscaler.fit_transform(df) for df in SSM_test_x]

SSM2_train_y, SSM3_train_y, SSM4_train_y = (y_.to_list() for y_ in SSM_train_y)             
SSM2_train_x, SSM3_train_x, SSM4_train_x = (x_ for x_ in SSM_train_x)
SSM2_test_y, SSM3_test_y, SSM4_test_y = (y_.to_list() for y_ in SSM_test_y)
SSM2_test_x, SSM3_test_x, SSM4_test_x = (x_ for x_ in SSM_test_x)
SSM2_model, SSM3_model, SSM4_model = (f'RandomForest_SSM{i+2}.model' for i in range(3))

### 随机森林系统调参

### SSM2
#### Get data

#### 超参数随机匹配择优
+ 按照排列组合来计算的话，会有很多很多种组合方式，如果要一一尝试未免也太麻烦了。因此，我们用到RandomizedSearchCV这一功能——其将随机匹配每一种超参数组合，并输出最优的组合。换句话说，我们用RandomizedSearchCV来进行随机的排列，而不是对所有的超参数排列组合方法进行遍历。这样子确实可以节省很多时间。

In [6]:
# Search optimal hyperparameter
random_seed=44
random_forest_seed=np.random.randint(low=1,high=230)
n_estimators_range=[int(x) for x in np.linspace(start=50,stop=3000,num=60)]
max_features_range=['auto','sqrt']
max_depth_range=[int(x) for x in np.linspace(10,500,num=50)]
max_depth_range.append(None)
min_samples_split_range=[2,5,10]
min_samples_leaf_range=[1,2,4,8]
bootstrap_range=[True,False]

random_forest_hp_range={'n_estimators':n_estimators_range,
                        'max_features':max_features_range,
                        'max_depth':max_depth_range,
                        'min_samples_split':min_samples_split_range,
                        'min_samples_leaf':min_samples_leaf_range
                        # 'bootstrap':bootstrap_range
                        }

# Random search
random_forest_model_test_base=RandomForestClassifier()
random_forest_model_test_random=RandomizedSearchCV(estimator=random_forest_model_test_base,
                                                   param_distributions=random_forest_hp_range,
                                                   n_iter=300,
                                                   n_jobs=-1,
                                                   cv=3,
                                                   verbose=1,
                                                   random_state=random_forest_seed,
                                                   scoring='roc_auc'
                                                   )
random_forest_model_test_random.fit(SSM2_train_x, SSM2_train_y)

best_hp_now=random_forest_model_test_random.best_params_
pprint(best_hp_now)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


KeyboardInterrupt: 

#### 超参数遍历匹配择优
+ 依据上述所得到的随机最优匹配结果，进行遍历全部组合的匹配择优。

In [None]:
# Grid Search

random_forest_hp_range_2={'n_estimators':[900, 1100, 1200],
                          'max_features':[2, 3, 4],
                          'max_depth':[300, 490, 600],
                          'min_samples_split':[2, 3, 4], # Greater than 1
                          'min_samples_leaf':[6, 8, 10]
                          # 'bootstrap':bootstrap_range
                          }
random_forest_model_test_2_base=RandomForestClassifier()
random_forest_model_test_2_random=GridSearchCV(estimator=random_forest_model_test_2_base,
                                               param_grid=random_forest_hp_range_2,
                                               cv=4,
                                               verbose=1,
                                               n_jobs=-1,
                                               scoring='roc_auc'
                                              )
random_forest_model_test_2_random.fit(SSM2_train_x, SSM2_train_y)

best_hp_now_2=random_forest_model_test_2_random.best_params_
pprint(best_hp_now_2)

In [None]:
# Grid Search

random_forest_hp_range_3={'n_estimators':[700,800],
                          'max_features':[2],
                          'max_depth':[400,500],
                          'min_samples_split':[3], # Greater than 1
                          'min_samples_leaf':[12, 13]
                          }
random_forest_model_test_3_base=RandomForestClassifier()
random_forest_model_test_3_random=GridSearchCV(estimator=random_forest_model_test_3_base,
                                               param_grid=random_forest_hp_range_3,
                                               cv=6,
                                               verbose=1,
                                               n_jobs=-1,
                                              scoring='accuracy'
                                              )
random_forest_model_test_3_random.fit(SSM2_train_x, SSM2_train_y)

best_hp_now_3=random_forest_model_test_3_random.best_params_
pprint(best_hp_now_3)

#### Build (select) and save model

In [7]:
# Build RF regression model with optimal hyperparameters
SSM2=random_forest_model_test_3_random.best_estimator_
# Predict test set data
random_forest_predict=SSM2.predict(SSM2_test_x)
score = SSM2.score(SSM2_test_x, SSM2_test_y)
print(score)
joblib.dump(SSM2, os.path.join(modeldir, SSM2_model))

NameError: name 'random_forest_model_test_3_random' is not defined

#### Model evaluation

In [19]:
model_forest = joblib.load(os.path.join(modeldir, SSM2_model)) 
predict_label = model_forest.predict(SSM2_train_x) #预测的标签
predict_score = model_forest.predict_proba(SSM2_train_x)#得到标签0 (y_predict[:,0])和1 (y_predict[:,1])对应的概率
label = SSM2_train_y  #真实标签
tmp_df =  pd.DataFrame({'pid':ssm_train_pid, 'dataset':'train', 'y_true':label, 'y_pred': predict_label, 'y_score0': predict_score[:,0], 'y_score1':predict_score[:,1]})
train_df = tmp_df.copy()
eva_dic = classification_evaluation(label, predict_label, predict_score[:,1])
eva_dic

{'accuracy': 0.7721518987341772,
 'recall': 0.9029126213592233,
 'precision': 0.7815126050420168,
 'f1': 0.8378378378378379,
 'auc': 0.853486319505737,
 'specificity': 0.5272727272727272}

In [20]:
model_forest = joblib.load(os.path.join(modeldir, SSM2_model)) 
predict_label = model_forest.predict(SSM2_test_x) #预测的标签
predict_score = model_forest.predict_proba(SSM2_test_x)#得到标签0 (y_predict[:,0])和1 (y_predict[:,1])对应的概率
label = SSM2_test_y  #真实标签
tmp_df =  pd.DataFrame({'pid':ssm_test_pid, 'dataset':'test', 'y_true':label, 'y_pred': predict_label, 'y_score0': predict_score[:,0], 'y_score1':predict_score[:,1]})
test_df = tmp_df.copy()
eva_dic = classification_evaluation(label, predict_label, predict_score[:,1])
eva_dic

{'accuracy': 0.7285714285714285,
 'recall': 0.9347826086956522,
 'precision': 0.7288135593220338,
 'f1': 0.819047619047619,
 'auc': 0.6567028985507246,
 'specificity': 0.3333333333333333}

In [21]:
ssm2_df = pd.concat([train_df, test_df])
ssm2_df.to_csv('./radiomics_result_20230406/ssm2.csv', index=0)

### SSM3

#### 超参数随机匹配择优

In [None]:
# Search optimal hyperparameter
random_seed=44
random_forest_seed=np.random.randint(low=1,high=230)
n_estimators_range=[int(x) for x in np.linspace(start=50,stop=3000,num=60)]
max_features_range=['auto','sqrt']
max_depth_range=[int(x) for x in np.linspace(10,500,num=50)]
max_depth_range.append(None)
min_samples_split_range=[2,5,10]
min_samples_leaf_range=[1,2,4,8]
bootstrap_range=[True,False]

random_forest_hp_range={'n_estimators':n_estimators_range,
                        'max_features':max_features_range,
                        'max_depth':max_depth_range,
                        'min_samples_split':min_samples_split_range,
                        'min_samples_leaf':min_samples_leaf_range
                        }

# Random search
random_forest_model_test_base=RandomForestClassifier()
random_forest_model_test_random=RandomizedSearchCV(estimator=random_forest_model_test_base,
                                                   param_distributions=random_forest_hp_range,
                                                   n_iter=300,
                                                   n_jobs=-1,
                                                   cv=4,
                                                   verbose=1,
                                                   random_state=random_forest_seed,
                                                   scoring='roc_auc'
                                                   )
random_forest_model_test_random.fit(SSM3_train_x, SSM3_train_y)

best_hp_now=random_forest_model_test_random.best_params_
pprint(best_hp_now)

#### 超参数遍历匹配择优
+ 依据上述所得到的随机最优匹配结果，进行遍历全部组合的匹配择优。

In [None]:
# Grid Search

random_forest_hp_range_2={'n_estimators':[400,450,500],
                          'max_features':['auto'],
                          'max_depth':[100, 120, 200],
                          'min_samples_split':[4,5,8], # Greater than 1
                          'min_samples_leaf':[1,2]
                          }
random_forest_model_test_2_base=RandomForestClassifier()
random_forest_model_test_2_random=GridSearchCV(estimator=random_forest_model_test_2_base,
                                               param_grid=random_forest_hp_range_2,
                                               cv=4,
                                               verbose=1,
                                               n_jobs=-1)
random_forest_model_test_2_random.fit(SSM3_train_x, SSM3_train_y)

best_hp_now_2=random_forest_model_test_2_random.best_params_
pprint(best_hp_now_2)

In [None]:
# Grid Search

random_forest_hp_range_3={'n_estimators':[1600],
                          'max_features':[4,5],
                          'max_depth':[200, 210],
                          'min_samples_split':[2], # Greater than 1
                          'min_samples_leaf':[8]
                          # 'bootstrap':bootstrap_range
                          }
random_forest_model_test_3_base=RandomForestClassifier()
random_forest_model_test_3_random=GridSearchCV(estimator=random_forest_model_test_3_base,
                                               param_grid=random_forest_hp_range_3,
                                               cv=5,
                                               verbose=1,
                                               n_jobs=-1,
                                               scoring='roc_auc'
                                              )
random_forest_model_test_3_random.fit(SSM3_train_x, SSM3_train_y)

best_hp_now_3=random_forest_model_test_3_random.best_params_
pprint(best_hp_now_3)

#### Build (select) and save model

In [10]:
# Build RF regression model with optimal hyperparameters
SSM3=random_forest_model_test_3_random.best_estimator_
# Predict test set data
random_forest_predict=SSM3.predict(SSM3_test_x)
score = SSM3.score(SSM3_test_x, SSM3_test_y)
print(score)
joblib.dump(SSM3, os.path.join(modeldir, SSM3_model))

NameError: name 'random_forest_model_test_3_random' is not defined

#### Model evaluation

In [15]:
model_forest = joblib.load(os.path.join(modeldir, SSM3_model))
predict_label = model_forest.predict(SSM3_train_x) #预测的标签
predict_score = model_forest.predict_proba(SSM3_train_x)#得到标签0 (y_predict[:,0])和1 (y_predict[:,1])对应的概率
label = SSM3_train_y  #真实标签
tmp_df =  pd.DataFrame({'pid':ssm_train_pid, 'dataset':'train', 'y_true':label, 'y_pred': predict_label, 'y_score0': predict_score[:,0], 'y_score1':predict_score[:,1]})
train_df = tmp_df.copy()
eva_dic = classification_evaluation(label, predict_label, predict_score[:,1])
eva_dic

{'accuracy': 0.8227848101265823,
 'recall': 0.9514563106796117,
 'precision': 0.8099173553719008,
 'f1': 0.8749999999999999,
 'auc': 0.9353927625772286,
 'specificity': 0.5818181818181818}

In [16]:
model_forest = joblib.load(os.path.join(modeldir, SSM3_model))
predict_label = model_forest.predict(SSM3_test_x) #预测的标签
predict_score = model_forest.predict_proba(SSM3_test_x)#得到标签0 (y_predict[:,0])和1 (y_predict[:,1])对应的概率
label = SSM3_test_y  #真实标签
tmp_df =  pd.DataFrame({'pid':ssm_test_pid, 'dataset':'test', 'y_true':label, 'y_pred': predict_label, 'y_score0': predict_score[:,0], 'y_score1':predict_score[:,1]})
test_df = tmp_df.copy()
eva_dic = classification_evaluation(label, predict_label, predict_score[:,1])
eva_dic

{'accuracy': 0.7,
 'recall': 0.8695652173913043,
 'precision': 0.7272727272727273,
 'f1': 0.792079207920792,
 'auc': 0.6639492753623188,
 'specificity': 0.375}

In [18]:
ssm3_df = pd.concat([train_df, test_df])
ssm3_df.to_csv('./radiomics_result_20230406/ssm3.csv', index=0)

### SSM4

#### 超参数随机匹配择优
+ 按照排列组合来计算的话，会有很多很多种组合方式，如果要一一尝试未免也太麻烦了。因此，我们用到RandomizedSearchCV这一功能——其将随机匹配每一种超参数组合，并输出最优的组合。换句话说，我们用RandomizedSearchCV来进行随机的排列，而不是对所有的超参数排列组合方法进行遍历。这样子确实可以节省很多时间。

In [None]:
# Search optimal hyperparameter
random_seed=44
random_forest_seed=np.random.randint(low=1,high=230)
n_estimators_range=[int(x) for x in np.linspace(start=50,stop=3000,num=60)]
max_features_range=['auto','sqrt']
max_depth_range=[int(x) for x in np.linspace(10,500,num=50)]
max_depth_range.append(None)
min_samples_split_range=[2,5,10]
min_samples_leaf_range=[1,2,4,8]
bootstrap_range=[True,False]

random_forest_hp_range={'n_estimators':n_estimators_range,
                        'max_features':max_features_range,
                        'max_depth':max_depth_range,
                        'min_samples_split':min_samples_split_range,
                        'min_samples_leaf':min_samples_leaf_range
                        # 'bootstrap':bootstrap_range
                        }

# Random search
random_forest_model_test_base=RandomForestClassifier()
random_forest_model_test_random=RandomizedSearchCV(estimator=random_forest_model_test_base,
                                                   param_distributions=random_forest_hp_range,
                                                   n_iter=300,
                                                   n_jobs=-1,
                                                   cv=4,
                                                   verbose=1,
                                                   random_state=random_forest_seed
                                                   )
random_forest_model_test_random.fit(SSM4_train_x, SSM4_train_y)

best_hp_now=random_forest_model_test_random.best_params_
pprint(best_hp_now)

#### 超参数遍历匹配择优
+ 依据上述所得到的随机最优匹配结果，进行遍历全部组合的匹配择优。

In [None]:
# Grid Search

random_forest_hp_range_2={'n_estimators':[2500, 2750],
                          'max_features':['auto'],
                          'max_depth':[300,400,500],
                          'min_samples_split':[8,10,12], # Greater than 1
                          'min_samples_leaf':[4,8,12]
                          # 'bootstrap':bootstrap_range
                          }
random_forest_model_test_2_base=RandomForestClassifier()
random_forest_model_test_2_random=GridSearchCV(estimator=random_forest_model_test_2_base,
                                               param_grid=random_forest_hp_range_2,
                                               cv=5,
                                               verbose=1,
                                               n_jobs=-1,
                                               scoring='roc_auc'
                                              )
random_forest_model_test_2_random.fit(SSM4_train_x, SSM4_train_y)

best_hp_now_2=random_forest_model_test_2_random.best_params_
pprint(best_hp_now_2)

In [None]:
# Grid Search

random_forest_hp_range_3={'n_estimators':[2500],
                          'max_features':['auto'],
                          'max_depth':[400],
                          'min_samples_split':[8], # Greater than 1
                          'min_samples_leaf':[8]
                          }
random_forest_model_test_3_base=RandomForestClassifier()
random_forest_model_test_3_random=GridSearchCV(estimator=random_forest_model_test_3_base,
                                               param_grid=random_forest_hp_range_3,
                                               cv=5,
                                               verbose=1,
                                               n_jobs=-1,
                                               scoring='accuracy')
random_forest_model_test_3_random.fit(SSM4_train_x, SSM4_train_y)

best_hp_now_3=random_forest_model_test_3_random.best_params_
pprint(best_hp_now_3)

#### Build (select) and save model

In [None]:
# Build RF regression model with optimal hyperparameters
SSM4=random_forest_model_test_3_random.best_estimator_
# Predict test set data
random_forest_predict=SSM4.predict(SSM4_test_x)
score = SSM4.score(SSM4_test_x, SSM4_test_y)
print(score)
joblib.dump(SSM4, os.path.join(modeldir, SSM4_model))

#### Model evaluation

In [23]:
model_forest = joblib.load(os.path.join(modeldir, SSM4_model)) 
predict_label = model_forest.predict(SSM4_train_x) #预测的标签
predict_score = model_forest.predict_proba(SSM4_train_x)#得到标签0 (y_predict[:,0])和1 (y_predict[:,1])对应的概率
label = SSM4_train_y  #真实标签
tmp_df =  pd.DataFrame({'pid':ssm_train_pid, 'dataset':'train', 'y_true':label, 'y_pred': predict_label, 'y_score0': predict_score[:,0], 'y_score1':predict_score[:,1]})
train_df = tmp_df.copy()
eva_dic = classification_evaluation(label, predict_label, predict_score[:,1])
eva_dic

{'accuracy': 0.7341772151898734,
 'recall': 0.9029126213592233,
 'precision': 0.744,
 'f1': 0.8157894736842106,
 'auc': 0.8317740511915269,
 'specificity': 0.41818181818181815}

In [24]:
model_forest = joblib.load(os.path.join(modeldir, SSM4_model)) 
predict_label = model_forest.predict(SSM4_test_x) #预测的标签
predict_score = model_forest.predict_proba(SSM4_test_x)#得到标签0 (y_predict[:,0])和1 (y_predict[:,1])对应的概率
label = SSM4_test_y  #真实标签
tmp_df =  pd.DataFrame({'pid':ssm_test_pid, 'dataset':'test', 'y_true':label, 'y_pred': predict_label, 'y_score0': predict_score[:,0], 'y_score1':predict_score[:,1]})
test_df = tmp_df.copy()
eva_dic = classification_evaluation(label, predict_label, predict_score[:,1])
eva_dic

{'accuracy': 0.7428571428571429,
 'recall': 0.9347826086956522,
 'precision': 0.7413793103448276,
 'f1': 0.826923076923077,
 'auc': 0.7001811594202899,
 'specificity': 0.375}

In [25]:
ssm4_df = pd.concat([train_df, test_df])
ssm4_df.to_csv('./radiomics_result_20230406/ssm4.csv', index=0)

### DSM

In [37]:
# Load the DSM features.
Dtag_cols = ['pid', 'label']
DSM_train_slist = [pd.read_excel(os.path.join(traindir, 'DSM_feas_mrmr_sel.xlsx'), sheet_name=f'no_sequence{num}') for num in sequence_id]
DSM_train3 = DSM_train_slist[1]
DSM_test_slist = [pd.read_excel(os.path.join(testdir, 'DSM_test.xlsx'), sheet_name=f'no_sequence{num}') for num in sequence_id]
# Train data
standardscaler = StandardScaler()
DSM_train_slist = [df.sample(frac=1.0, random_state=random_state) for df in DSM_train_slist]
DSM_train_y = [df['label'] for df in DSM_train_slist]
DSM_train_x = [df.drop(Dtag_cols, axis=1) for df in DSM_train_slist]
train_pid = list(DSM_train_slist[0]['pid'].apply(lambda x: 'i'+str(x)))
print(train_pid)
#DSM_train_x = [standardscaler.fit_transform(df) for df in DSM_train_x]
# Test data
#DSM_test_slist = [df.sample(frac=1.0, random_state=random_state) for df in DSM_test_slist]
DSM_test_y = [df['label'] for df in DSM_test_slist]
DSM_test_x = [df.drop(Dtag_cols, axis=1) for df in DSM_test_slist]
test_pid = list(DSM_test_slist[0]['pid'].apply(lambda x: 'i'+str(x)))
print(test_pid)
#DSM_test_x = [standardscaler.fit_transform(df) for df in DSM_test_x]

DSM2_train_y, DSM3_train_y, DSM4_train_y = (y_.to_list() for y_ in DSM_train_y)             
DSM2_train_x, DSM3_train_x, DSM4_train_x = (np.array(x_) for x_ in DSM_train_x)
DSM2_test_y, DSM3_test_y, DSM4_test_y = (y_.to_list() for y_ in DSM_test_y)
DSM2_test_x, DSM3_test_x, DSM4_test_x = (np.array(x_) for x_ in DSM_test_x)
DSM2_model, DSM3_model, DSM4_model = (f'RandomForest_DSM{i+2}.model' for i in range(3))

['i91', 'i76', 'i4', 'i124', 'i20', 'i120', 'i25', 'i165', 'i167', 'i219', 'i78', 'i59', 'i215', 'i162', 'i42', 'i118', 'i176', 'i116', 'i101', 'i161', 'i186', 'i86', 'i144', 'i151', 'i72', 'i190', 'i234', 'i198', 'i139', 'i64', 'i82', 'i196', 'i183', 'i156', 'i149', 'i11', 'i84', 'i99', 'i131', 'i71', 'i226', 'i10', 'i50', 'i193', 'i92', 'i35', 'i93', 'i29', 'i199', 'i85', 'i106', 'i200', 'i80', 'i111', 'i230', 'i170', 'i14', 'i39', 'i191', 'i184', 'i164', 'i229', 'i53', 'i49', 'i107', 'i58', 'i206', 'i95', 'i188', 'i45', 'i13', 'i227', 'i34', 'i52', 'i8', 'i70', 'i171', 'i54', 'i22', 'i9', 'i135', 'i160', 'i228', 'i147', 'i102', 'i148', 'i46', 'i113', 'i159', 'i195', 'i112', 'i231', 'i2', 'i16', 'i209', 'i32', 'i163', 'i36', 'i205', 'i175', 'i189', 'i174', 'i15', 'i69', 'i197', 'i179', 'i178', 'i18', 'i152', 'i143', 'i220', 'i109', 'i150', 'i203', 'i77', 'i97', 'i137', 'i44', 'i40', 'i81', 'i61', 'i169', 'i31', 'i211', 'i104', 'i1', 'i142', 'i154', 'i202', 'i140', 'i12', 'i3', 'i155'

In [38]:
DSM_test3 = DSM_test_slist[1]
DSM_test3.head()
print(Dtag_cols)
DSM_train3_y = DSM_train3['label'].to_list() 
DSM_train3_x = DSM_train3.drop(Dtag_cols, axis=1)
DSM_train3_x = np.array(DSM_train3_x)
print(DSM_train3_x[0])
DSM_train3.head()

['pid', 'label']
[ 1.44708148  0.7578142  -0.45226493  0.49498292 -1.01138155  0.9196928
 -0.30742702]


Unnamed: 0,pid,label,glszm_SmallAreaEmphasis_logarithm2,glcm_InverseVariance_exponential2,glrlm_ShortRunLowGrayLevelEmphasis_square4,glszm_GrayLevelNonUniformity_wavelet-HHH2,glcm_Correlation_log-sigma-3-0-mm-3D2,glszm_ZoneEntropy_exponential4,firstorder_Skewness_logarithm2
0,1,0,1.447081,0.757814,-0.452265,0.494983,-1.011382,0.919693,-0.307427
1,2,1,-0.145508,-1.29819,-0.19061,-0.276422,-0.294524,0.616358,1.33294
2,3,1,0.691198,-0.340941,-0.438346,-0.121146,-0.288731,-1.451869,0.788919
3,4,0,0.542103,1.250093,-0.7252,0.342291,-0.524326,0.675756,0.528886
4,8,1,-0.124139,0.967292,-0.238149,-0.411194,-0.570905,0.180514,1.294495


In [45]:
# 过采样
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.over_sampling import ADASYN 
from imblearn.under_sampling import TomekLinks
from imblearn.under_sampling import ClusterCentroids 
train_df = DSM_train_slist[1]
train_df = train_df.sample(frac=1.0, random_state=123)
train_df = train_df.drop(['pid'], axis=1)
train_X, train_Y = train_df.loc[:, train_df.columns != 'label'], train_df.loc[:, train_df.columns == 'label']

smote = SMOTE(k_neighbors=5, random_state=2022)
X_smote, Y_smote = smote.fit_resample(train_X, train_Y)
df_smote = pd.concat([X_smote, Y_smote], axis=1)
DSM3_train_x = np.array(df_smote.drop(['label'], axis=1))
DSM3_train_y = np.array(df_smote['label'].to_list())

bsmote = BorderlineSMOTE(k_neighbors=2, random_state=2022)
X_smote, Y_smote = bsmote.fit_resample(train_X, train_Y)
df_smote = pd.concat([X_smote, Y_smote], axis=1)
DSM3_train_x = np.array(df_smote.drop(['label'], axis=1))
DSM3_train_y = np.array(df_smote['label'].to_list())

adasyn = ADASYN(n_neighbors=3, random_state=42)
X_smote, Y_smote = adasyn.fit_resample(train_X, train_Y)
df_smote = pd.concat([X_smote, Y_smote], axis=1)
DSM3_train_x = np.array(df_smote.drop(['label'], axis=1))
DSM3_train_y = np.array(df_smote['label'].to_list())

tl = TomekLinks()
X_smote, Y_smote = tl.fit_resample(train_X, train_Y)
df_smote = pd.concat([X_smote, Y_smote], axis=1)
DSM3_train_x = np.array(df_smote.drop(['label'], axis=1))
DSM3_train_y = np.array(df_smote['label'].to_list())

cc = ClusterCentroids(random_state=42)
X_smote, Y_smote = cc.fit_resample(train_X, train_Y)
df_smote = pd.concat([X_smote, Y_smote], axis=1)

DSM3_train_x = np.array(df_smote.drop(['label'], axis=1))
DSM3_train_y = np.array(df_smote['label'].to_list())
print(df_smote.groupby('label').count())

       glszm_SmallAreaEmphasis_logarithm2  glcm_InverseVariance_exponential2  \
label                                                                          
0                                      55                                 55   
1                                      55                                 55   

       glrlm_ShortRunLowGrayLevelEmphasis_square4  \
label                                               
0                                              55   
1                                              55   

       glszm_GrayLevelNonUniformity_wavelet-HHH2  \
label                                              
0                                             55   
1                                             55   

       glcm_Correlation_log-sigma-3-0-mm-3D2  glszm_ZoneEntropy_exponential4  \
label                                                                          
0                                         55                              55   
1                

In [46]:
print(DSM3_train_y)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


In [47]:
print(len(DSM3_train_x))
np.array(DSM3_train_y).sum()

110


55

In [48]:
# external test
DSM_test_external = pd.read_csv(os.path.join(testdir, 'DSM_test_external.csv'))
DSM3_extest_y = DSM_test_external['label']
extest_pid = list(DSM_test_external['pid'].apply(lambda x: 'w'+str(x)))
print(extest_pid)
DSM3_extest_x = DSM_test_external.drop(Dtag_cols, axis=1)
DSM3_extest_x = DSM3_extest_x
DSM_test_external.head()

['w1', 'w2', 'w3', 'w4', 'w5', 'w6', 'w7', 'w8', 'w9', 'w10', 'w11', 'w12', 'w13', 'w14', 'w15', 'w16', 'w17', 'w18', 'w19', 'w20', 'w21', 'w22', 'w23', 'w24', 'w25', 'w26', 'w27', 'w28', 'w29', 'w30', 'w31', 'w32', 'w33', 'w34', 'w35', 'w36', 'w37', 'w38', 'w39', 'w40', 'w41', 'w42', 'w43', 'w44', 'w45']


Unnamed: 0,pid,label,glszm_SmallAreaEmphasis_logarithm2,glcm_InverseVariance_exponential2,glrlm_ShortRunLowGrayLevelEmphasis_square4,glszm_GrayLevelNonUniformity_wavelet-HHH2,glcm_Correlation_log-sigma-3-0-mm-3D2,glszm_ZoneEntropy_exponential4,firstorder_Skewness_logarithm2
0,1,1,-0.082571,-1.467897,-0.371897,-0.403232,-0.163733,-0.537051,-0.044727
1,2,0,0.343019,1.377483,-0.326643,2.612101,0.663478,0.0465,-1.606117
2,3,0,1.248603,0.698196,-0.326643,-0.536871,-2.005267,0.0465,-0.960588
3,4,1,0.044844,-0.316359,-1.180298,0.074876,0.596528,-0.746848,1.090658
4,5,1,-1.067449,2.091546,0.837405,-0.577231,0.388151,1.330975,-2.159078


In [49]:
DSM_extest3 = DSM_test_external.copy()
print(len(DSM_extest3))
DSM_extest3.head()

45


Unnamed: 0,pid,label,glszm_SmallAreaEmphasis_logarithm2,glcm_InverseVariance_exponential2,glrlm_ShortRunLowGrayLevelEmphasis_square4,glszm_GrayLevelNonUniformity_wavelet-HHH2,glcm_Correlation_log-sigma-3-0-mm-3D2,glszm_ZoneEntropy_exponential4,firstorder_Skewness_logarithm2
0,1,1,-0.082571,-1.467897,-0.371897,-0.403232,-0.163733,-0.537051,-0.044727
1,2,0,0.343019,1.377483,-0.326643,2.612101,0.663478,0.0465,-1.606117
2,3,0,1.248603,0.698196,-0.326643,-0.536871,-2.005267,0.0465,-0.960588
3,4,1,0.044844,-0.316359,-1.180298,0.074876,0.596528,-0.746848,1.090658
4,5,1,-1.067449,2.091546,0.837405,-0.577231,0.388151,1.330975,-2.159078


In [50]:
# Search optimal hyperparameter
random_seed=44
random_forest_seed=np.random.randint(low=1,high=230)
n_estimators_range=[int(x) for x in np.linspace(start=50,stop=3000,num=60)]
max_features_range=['auto','sqrt']
max_depth_range=[int(x) for x in np.linspace(10,500,num=50)]
max_depth_range.append(None)
min_samples_split_range=[2,5,10]
min_samples_leaf_range=[1,2,4,8]
bootstrap_range=[True,False]

random_forest_hp_range={'n_estimators':n_estimators_range,
                        'max_features':max_features_range,
                        'max_depth':max_depth_range,
                        'min_samples_split':min_samples_split_range,
                        'min_samples_leaf':min_samples_leaf_range
                        # 'bootstrap':bootstrap_range
                        }

# Random search
random_forest_model_test_base=RandomForestClassifier()
random_forest_model_test_random=RandomizedSearchCV(estimator=random_forest_model_test_base,
                                                   param_distributions=random_forest_hp_range,
                                                   n_iter=300,
                                                   n_jobs=-1,
                                                   cv=5,
                                                   verbose=1,
                                                   random_state=random_forest_seed,
                                                   scoring='balanced_accuracy'
                                                   )
random_forest_model_test_random.fit(DSM3_train_x, DSM3_train_y)

best_hp_now=random_forest_model_test_random.best_params_
pprint(best_hp_now)

Fitting 5 folds for each of 300 candidates, totalling 1500 fits


KeyboardInterrupt: 

#### 超参数遍历匹配择优
+ 依据上述所得到的随机最优匹配结果，进行遍历全部组合的匹配择优。

In [None]:
# Grid Search

random_forest_hp_range_2={'n_estimators':[150,200,250],
                          'max_features':[2,3,4],
                          'max_depth':[10, 20, 30],
                          'min_samples_split':[8,10,12], # Greater than 1
                          'min_samples_leaf':[4,8,16]
                          # 'bootstrap':bootstrap_range
                          }
random_forest_model_test_2_base=RandomForestClassifier()
random_forest_model_test_2_random=GridSearchCV(estimator=random_forest_model_test_2_base,
                                               param_grid=random_forest_hp_range_2,
                                               cv=3,
                                               verbose=1,
                                               n_jobs=-1,
                                               #scoring='balanced_accuracy'
                                               scoring='neg_log_loss'
                                              )
random_forest_model_test_2_random.fit(DSM3_train_x, DSM3_train_y)

best_hp_now_2=random_forest_model_test_2_random.best_params_
pprint(best_hp_now_2)

In [None]:
print(len(DSM3_train_x))
t1 = np.array(DSM3_extest_x)
print(len(DSM3_extest_x))
all_train_x = np.concatenate((DSM3_train_x, t1),axis=0)
# print(len(all_train_x))

#print(DSM3_extest_x)
print(DSM3_train_x.shape)
print(t1.shape)
print(all_train_x.shape)

print(DSM3_train_y)
print(list(DSM3_extest_y))
all_train_y = DSM3_train_y + list(DSM3_extest_y)
print(len(all_train_y))


In [None]:
# Grid Search

random_forest_hp_range_3={'n_estimators':[160,240,300],
                          'max_features':[2,3,4,5],
                          'max_depth':[20,22],
                          'min_samples_split':[7,8,10], # Greater than 1
                          'min_samples_leaf':[16,32]
                          }
random_forest_model_test_3_base=RandomForestClassifier()
random_forest_model_test_3_random=GridSearchCV(estimator=random_forest_model_test_3_base,
                                               param_grid=random_forest_hp_range_3,
                                               cv=10,
                                               verbose=1,
                                               n_jobs=-1,
                                               scoring='recall_macro')
                                               #scoring='neg_log_loss')
random_forest_model_test_3_random.fit(all_train_x, all_train_y)

best_hp_now_3=random_forest_model_test_3_random.best_params_
pprint(best_hp_now_3)

In [None]:
import sklearn
sorted(sklearn.metrics.SCORERS.keys())

#### Build (select) and save model

In [None]:
# Build RF regression model with optimal hyperparameters
DSM3=random_forest_model_test_3_random.best_estimator_
# Predict test set data
score = DSM3.score(DSM3_test_x, DSM3_test_y)
print(score)
joblib.dump(DSM3, os.path.join(modeldir, DSM3_model))

#### Model evaluation

In [39]:
model= joblib.load(os.path.join(modeldir, DSM3_model)) 
model= joblib.load(os.path.join('./Model_old/', DSM3_model)) 
predict_label = model.predict(DSM_train3_x) #预测的标签
predict_score = model.predict_proba(DSM_train3_x) #得到标签0 (y_predict[:,0])和1 (y_predict[:,1])对应的概率
label = DSM_train3_y  #真实标签
tmp_df =  pd.DataFrame({'pid':train_pid, 'dataset':'traning', 'y_true':label, 'y_pred': predict_label, 'y_score0': predict_score[:,0], 'y_score1':predict_score[:,1]})
print(tmp_df.head())
# tmp_df['dataset'] = 'training'
train_df = tmp_df.copy()
eva_dic = classification_evaluation(label, predict_label, predict_score[:,1])
eva_dic

    pid  dataset  y_true  y_pred  y_score0  y_score1
0   i91  traning       0       0  0.595680  0.404320
1   i76  traning       1       1  0.336130  0.663870
2    i4  traning       1       1  0.448695  0.551305
3  i124  traning       0       0  0.662772  0.337228
4   i20  traning       1       0  0.513240  0.486760


{'accuracy': 0.8164556962025317,
 'recall': 0.8349514563106796,
 'precision': 0.8775510204081632,
 'f1': 0.8557213930348259,
 'auc': 0.8875551632833187,
 'specificity': 0.7818181818181819}

In [40]:
nomogram_train = DSM_train3.copy()
nomogram_train['radiomics_score'] = predict_score[:,1]
nomogram_train.head(2)
nomogram_train['dataset']='training'

In [41]:
model_forest = joblib.load(os.path.join(modeldir, DSM3_model)) 
model_forest= joblib.load(os.path.join('./Model_old/', DSM3_model)) 
predict_label = model_forest.predict(DSM3_test_x) #预测的标签
predict_score = model_forest.predict_proba(DSM3_test_x)#得到标签0 (y_predict[:,0])和1 (y_predict[:,1])对应的概率
label = DSM3_test_y  #真实标签
tmp_df =  pd.DataFrame({'pid':test_pid, 'dataset':'test', 'y_true':label, 'y_pred': predict_label, 'y_score0': predict_score[:,0], 'y_score1':predict_score[:,1]})
test_df = tmp_df.copy()
tmp_df.to_csv('../20230108/test_radio.csv',index=0)
eva_dic = classification_evaluation(label, predict_label, predict_score[:,1])
eva_dic

{'accuracy': 0.6857142857142857,
 'recall': 0.782608695652174,
 'precision': 0.75,
 'f1': 0.7659574468085107,
 'auc': 0.720108695652174,
 'specificity': 0.5}

In [42]:
print(DSM3_test_x[0])
print(DSM_test3.head(2))
nomogram_test = DSM_test3.copy()
nomogram_test['radiomics_score'] = predict_score[:,1]
nomogram_test.head(2)
nomogram_test['dataset']='test'

[ 0.79708496  0.5927037  -0.55790057 -0.66888717 -0.92396195 -0.0422019
  0.36268925]
   pid  label  glszm_SmallAreaEmphasis_logarithm2  \
0    5      1                            0.797085   
1    6      1                            0.710836   

   glcm_InverseVariance_exponential2  \
0                           0.592704   
1                           0.190043   

   glrlm_ShortRunLowGrayLevelEmphasis_square4  \
0                                   -0.557901   
1                                   -0.354496   

   glszm_GrayLevelNonUniformity_wavelet-HHH2  \
0                                  -0.668887   
1                                   0.938193   

   glcm_Correlation_log-sigma-3-0-mm-3D2  glszm_ZoneEntropy_exponential4  \
0                              -0.923962                       -0.042202   
1                              -0.438992                        0.703784   

   firstorder_Skewness_logarithm2  
0                        0.362689  
1                        0.044257  


In [43]:
model= joblib.load(os.path.join(modeldir, DSM3_model)) 
# model= joblib.load(os.path.join('./Model_old/', DSM3_model)) 
predict_label = model.predict(DSM3_extest_x) #预测的标签
predict_score = model.predict_proba(DSM3_extest_x)#得到标签0 (y_predict[:,0])和1 (y_predict[:,1])对应的概率
label = DSM3_extest_y  #真实标签
tmp_df =  pd.DataFrame({'pid':extest_pid, 'dataset':'external', 'y_true':label, 'y_pred': predict_label, 'y_score0': predict_score[:,0], 'y_score1':predict_score[:,1]})
tmp_df.to_csv('../20230108/extest_radio.csv',index=0)
extest_df = tmp_df.copy()
label = DSM3_extest_y  #真实标签
eva_dic = classification_evaluation(label, predict_label, predict_score[:,1])
eva_dic

NameError: name 'DSM3_extest_x' is not defined

In [58]:
print(len(DSM3_extest_x))
nomogram_extest = DSM_extest3.copy()
print(len(nomogram_extest))
print(len(DSM3_extest_x))
nomogram_extest['radiomics_score'] = predict_score[:,1]
nomogram_extest.head(2)
nomogram_extest['dataset']='external'

45
45
45


In [59]:
all_df = pd.concat([train_df, test_df, extest_df], axis=0)
all_df.to_csv('../20230108/all_radio.csv', index=0)
nomogram_all = pd.concat([nomogram_train, nomogram_test, nomogram_extest])
nomogram_all.to_csv('./out_data/nomogram_feas_radiomics.csv',index=0)

### ASM

In [30]:
# Load the ASM features.
Atag_cols = ['pid', 'label']
ASM_train = pd.read_csv(os.path.join(traindir, 'ASM_mrmr_feas.csv')) 
ASM_test = pd.read_csv(os.path.join(testdir, 'ASM_test.csv')) 
print(testdir)
# Train data
standardscaler = StandardScaler()
# ASM_train = ASM_train.sample(frac=1.0, random_state=random_state) 
train_pid = ASM_train['pid']
ASM_train_y = ASM_train['label'] 

ASM_train_x = ASM_train.drop(Atag_cols, axis=1) 
ASM_train_x = standardscaler.fit_transform(ASM_train_x) 
# Test data
# ASM_test = ASM_test.sample(frac=1.0, random_state=random_state)
ASM_test_y = ASM_test['label']
test_pid = ASM_test['pid']
ASM_test_x =ASM_test.drop(Atag_cols, axis=1)
ASM_test_x = standardscaler.fit_transform(ASM_test_x)

ASM_model = f'RandomForest_ASM.model'

../Feature_filter/Feas_data_test


In [None]:
# Search optimal hyperparameter
random_seed=44
random_forest_seed=np.random.randint(low=1,high=230)
n_estimators_range=[int(x) for x in np.linspace(start=50,stop=3000,num=60)]
max_features_range=['auto','sqrt']
max_depth_range=[int(x) for x in np.linspace(10,500,num=50)]
max_depth_range.append(None)
min_samples_split_range=[2,5,10]
min_samples_leaf_range=[1,2,4,8]
bootstrap_range=[True,False]

random_forest_hp_range={'n_estimators':n_estimators_range,
                        'max_features':max_features_range,
                        'max_depth':max_depth_range,
                        'min_samples_split':min_samples_split_range,
                        'min_samples_leaf':min_samples_leaf_range
                        # 'bootstrap':bootstrap_range
                        }

# Random search
random_forest_model_test_base=RandomForestClassifier()
random_forest_model_test_random=RandomizedSearchCV(estimator=random_forest_model_test_base,
                                                   param_distributions=random_forest_hp_range,
                                                   n_iter=300,
                                                   n_jobs=-1,
                                                   cv=4,
                                                   verbose=1,
                                                   random_state=random_forest_seed
                                                   )
random_forest_model_test_random.fit(ASM_train_x, ASM_train_y)

best_hp_now=random_forest_model_test_random.best_params_
pprint(best_hp_now)

#### 超参数遍历匹配择优
+ 依据上述所得到的随机最优匹配结果，进行遍历全部组合的匹配择优。

In [None]:
# Grid Search

random_forest_hp_range_2={'n_estimators':[100, 150, 200],
                          'max_features':['auto'],
                          'max_depth':[200, 220, 300],
                          'min_samples_split':[4,5,6], # Greater than 1
                          'min_samples_leaf':[4,8],
                          }
random_forest_model_test_2_base=RandomForestClassifier()
random_forest_model_test_2_random=GridSearchCV(estimator=random_forest_model_test_2_base,
                                               param_grid=random_forest_hp_range_2,
                                               cv=4,
                                               verbose=1,
                                               n_jobs=-1,
                                               scoring='roc_auc')
random_forest_model_test_2_random.fit(ASM_train_x, ASM_train_y)

best_hp_now_2=random_forest_model_test_2_random.best_params_
pprint(best_hp_now_2)

In [None]:
# Grid Search

random_forest_hp_range_3={'n_estimators':[200, 300],
                          'max_features':['auto'],
                          'max_depth':[200],
                          'min_samples_split':[5], # Greater than 1
                          'min_samples_leaf':[8]
                          # 'bootstrap':bootstrap_range
                          }
random_forest_model_test_3_base=RandomForestClassifier()
random_forest_model_test_3_random=GridSearchCV(estimator=random_forest_model_test_3_base,
                                               param_grid=random_forest_hp_range_3,
                                               cv=5,
                                               verbose=1,
                                               n_jobs=-1,
                                               scoring='accuracy')
random_forest_model_test_3_random.fit(ASM_train_x, ASM_train_y)

best_hp_now_3=random_forest_model_test_3_random.best_params_
pprint(best_hp_now_3)

#### Build (select) and save model

In [None]:
# Build RF regression model with optimal hyperparameters
ASM=random_forest_model_test_3_random.best_estimator_
# Predict test set data
score = ASM.score(ASM_test_x, ASM_test_y)
print(score)
joblib.dump(ASM, os.path.join(modeldir, ASM_model))

#### Model evaluation

In [33]:
model_forest = joblib.load(os.path.join(modeldir, ASM_model)) 
predict_label = model_forest.predict(ASM_train_x) #预测的标签
predict_score = model_forest.predict_proba(ASM_train_x)#得到标签0 (y_predict[:,0])和1 (y_predict[:,1])对应的概率
label = ASM_train_y  #真实标签
tmp_df =  pd.DataFrame({'pid':train_pid, 'dataset':'traning', 'y_true':label, 'y_pred': predict_label, 'y_score0': predict_score[:,0], 'y_score1':predict_score[:,1]})
print(tmp_df.head())
train_df = tmp_df.copy()
eva_dic = classification_evaluation(label, predict_label, predict_score[:,1])
eva_dic

   pid  dataset  y_true  y_pred  y_score0  y_score1
0   25  traning       1       1  0.150769  0.849231
1  230  traning       1       1  0.381310  0.618690
2  171  traning       1       1  0.101922  0.898078
3   61  traning       1       1  0.110356  0.889644
4   40  traning       1       1  0.294703  0.705297


{'accuracy': 0.8227848101265823,
 'recall': 0.9514563106796117,
 'precision': 0.8099173553719008,
 'f1': 0.8749999999999999,
 'auc': 0.9330979699911739,
 'specificity': 0.5818181818181818}

In [34]:
model_forest = joblib.load(os.path.join(modeldir, ASM_model)) 
predict_label = model_forest.predict(ASM_test_x) #预测的标签
predict_score = model_forest.predict_proba(ASM_test_x)#得到标签0 (y_predict[:,0])和1 (y_predict[:,1])对应的概率
label = ASM_test_y  #真实标签
tmp_df =  pd.DataFrame({'pid':test_pid, 'dataset':'traning', 'y_true':label, 'y_pred': predict_label, 'y_score0': predict_score[:,0], 'y_score1':predict_score[:,1]})
print(tmp_df.head())
train_df = tmp_df.copy()
eva_dic = classification_evaluation(label, predict_label, predict_score[:,1])
eva_dic

   pid  dataset  y_true  y_pred  y_score0  y_score1
0  224  traning       1       1  0.404221  0.595779
1   30  traning       1       0  0.658133  0.341867
2  157  traning       0       1  0.399216  0.600784
3  117  traning       1       1  0.336892  0.663108
4   24  traning       1       1  0.298579  0.701421


{'accuracy': 0.7571428571428571,
 'recall': 0.9565217391304348,
 'precision': 0.7457627118644068,
 'f1': 0.8380952380952381,
 'auc': 0.6838768115942029,
 'specificity': 0.375}

In [35]:
asm_df = pd.concat([train_df, test_df])
asm_df.to_csv('./radiomics_result_20230406/asm.csv', index=0)

### Clinical

In [None]:
# Load the Clinical features.
Atag_cols = ['pid', 'label']
Clinical_train = pd.read_csv(os.path.join(traindir, 'clinical_lasso_sel.csv')) 
Clinical_test = pd.read_csv(os.path.join(testdir, 'clinical_test.csv')) 
# Train data
standardscaler = StandardScaler()
Clinical_train = Clinical_train.sample(frac=1.0, random_state=random_state) 
Clinical_train_y = Clinical_train['label'] 
Clinical_train_x = Clinical_train.drop(Atag_cols, axis=1) 
#Clinical_train_x = standardscaler.fit_transform(Clinical_train_x) 
# Test data
#Clinical_test = Clinical_test.sample(frac=1.0, random_state=random_state)
Clinical_test_y = np.array(Clinical_test['label'])
Clinical_test_x = np.array(Clinical_test.drop(Dtag_cols, axis=1))
#Clinical_test_x = standardscaler.fit_transform(Clinical_test_x)

Clinical_model = f'RandomForest_Clinical.model'

In [None]:
# 过采样
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.over_sampling import ADASYN 
from imblearn.under_sampling import TomekLinks
from imblearn.under_sampling import ClusterCentroids 
Clinical_train = Clinical_train.sample(frac=1.0, random_state=123)
Clinical_train = Clinical_train.drop(['pid'], axis=1)
train_X, train_Y = Clinical_train.loc[:, Clinical_train.columns != 'label'], Clinical_train.loc[:, Clinical_train.columns == 'label']

smote = SMOTE(k_neighbors=3, random_state=2022)
X_smote, Y_smote = smote.fit_resample(train_X, train_Y)
df_smote = pd.concat([X_smote, Y_smote], axis=1)
Clinical_train_x = np.array(df_smote.drop(['label'], axis=1))
Clinical_train_y = np.array(df_smote['label'].to_list())

bsmote = BorderlineSMOTE(k_neighbors=4, random_state=2022)
X_smote, Y_smote = bsmote.fit_resample(train_X, train_Y)
df_smote = pd.concat([X_smote, Y_smote], axis=1)
Clinical_train_x = np.array(df_smote.drop(['label'], axis=1))
Clinical_train_y = np.array(df_smote['label'].to_list())

adasyn = ADASYN(n_neighbors=3, random_state=42)
X_smote, Y_smote = adasyn.fit_resample(train_X, train_Y)
df_smote = pd.concat([X_smote, Y_smote], axis=1)
Clinical_train_x = np.array(df_smote.drop(['label'], axis=1))
Clinical_train_y = np.array(df_smote['label'].to_list())

tl = TomekLinks()
X_smote, Y_smote = tl.fit_resample(train_X, train_Y)
df_smote = pd.concat([X_smote, Y_smote], axis=1)
Clinical_train_x = np.array(df_smote.drop(['label'], axis=1))
Clinical_train_y = np.array(df_smote['label'].to_list())

cc = ClusterCentroids(random_state=42)
X_smote, Y_smote = cc.fit_resample(train_X, train_Y)
df_smote = pd.concat([X_smote, Y_smote], axis=1)
Clinical_train_x = np.array(df_smote.drop(['label'], axis=1))
Clinical_train_y = np.array(df_smote['label'].to_list())

print(df_smote.groupby('label').count())

In [None]:
# Search optimal hyperparameter
random_seed=44
random_forest_seed=np.random.randint(low=1,high=230)
n_estimators_range=[int(x) for x in np.linspace(start=50,stop=3000,num=60)]
max_features_range=['auto','sqrt']
max_depth_range=[int(x) for x in np.linspace(10,500,num=50)]
max_depth_range.append(None)
min_samples_split_range=[2,5,10]
min_samples_leaf_range=[1,2,4,8]
bootstrap_range=[True,False]

random_forest_hp_range={'n_estimators':n_estimators_range,
                        'max_features':max_features_range,
                        'max_depth':max_depth_range,
                        'min_samples_split':min_samples_split_range,
                        'min_samples_leaf':min_samples_leaf_range
                        # 'bootstrap':bootstrap_range
                        }

# Random search
random_forest_model_test_base=RandomForestClassifier()
random_forest_model_test_random=RandomizedSearchCV(estimator=random_forest_model_test_base,
                                                   param_distributions=random_forest_hp_range,
                                                   n_iter=300,
                                                   n_jobs=-1,
                                                   cv=4,
                                                   verbose=1,
                                                   random_state=random_forest_seed,
                                                   scoring='roc_auc'
                                                   )
random_forest_model_test_random.fit(Clinical_train_x, Clinical_train_y)

best_hp_now=random_forest_model_test_random.best_params_
pprint(best_hp_now)

#### 超参数遍历匹配择优
+ 依据上述所得到的随机最优匹配结果，进行遍历全部组合的匹配择优。

In [None]:
# Grid Search

random_forest_hp_range_2={'n_estimators':[40, 50, 60],
                          'max_features':[3,4,5],
                          'max_depth':[30, 40, 50],
                          'min_samples_split':[2,3,4], # Greater than 1
                          'min_samples_leaf':[1, 2, 3],
                          }
random_forest_model_test_2_base=RandomForestClassifier()
random_forest_model_test_2_random=GridSearchCV(estimator=random_forest_model_test_2_base,
                                               param_grid=random_forest_hp_range_2,
                                               cv=4,
                                               verbose=1,
                                               n_jobs=-1,
                                               scoring='balanced_accuracy')
random_forest_model_test_2_random.fit(Clinical_train_x, Clinical_train_y)

best_hp_now_2=random_forest_model_test_2_random.best_params_
pprint(best_hp_now_2)

In [None]:
# Grid Search

random_forest_hp_range_3={'n_estimators':[30,40],
                          'max_features':[5,6],
                          'max_depth':[20,30],
                          'min_samples_split':[3,4,5], # Greater than 1
                          'min_samples_leaf':[2,4,8]
                          # 'bootstrap':bootstrap_range
                          }
random_forest_model_test_3_base=RandomForestClassifier()
random_forest_model_test_3_random=GridSearchCV(estimator=random_forest_model_test_3_base,
                                               param_grid=random_forest_hp_range_3,
                                               cv=5,
                                               verbose=1,
                                               n_jobs=-1,
                                               scoring='balanced_accuracy')
random_forest_model_test_3_random.fit(Clinical_train_x, Clinical_train_y)

best_hp_now_3=random_forest_model_test_3_random.best_params_
pprint(best_hp_now_3)

#### Build (select) and save model

In [None]:
# Build RF regression model with optimal hyperparameters
Clinical=random_forest_model_test_3_random.best_estimator_
# Predict test set data
score = Clinical.score(Clinical_test_x, Clinical_test_y)
print(score)
joblib.dump(Clinical, os.path.join(modeldir, Clinical_model))

#### Model evaluation

In [None]:
model_forest = joblib.load(os.path.join(modeldir, Clinical_model)) 
predict_label = model_forest.predict(Clinical_test_x) #预测的标签
predict_score = model_forest.predict_proba(Clinical_test_x)#得到标签0 (y_predict[:,0])和1 (y_predict[:,1])对应的概率
label = Clinical_test_y  #真实标签
eva_dic = classification_evaluation(label, predict_label, predict_score[:,1])
eva_dic