## XGBoost

+ 重要参数:
XGBoost的参数分为三种：

通用参数：（两种类型的booster，因为tree的性能比线性回归好得多，因此我们很少用线性回归。）

booster:使用哪个弱学习器训练，默认gbtree，可选gbtree，gblinear 或dart
nthread：用于运行XGBoost的并行线程数，默认为最大可用线程数
verbosity：打印消息的详细程度。有效值为0（静默），1（警告），2（信息），3（调试）。
eta（learning_rate）：learning_rate，在更新中使用步长收缩以防止过度拟合，默认= 0.3，范围：[0,1]；典型值一般设置为：0.01-0.2
gamma（min_split_loss）：默认= 0，分裂节点时，损失函数减小值只有大于等于gamma节点才分裂，gamma值越大，算法越保守，越不容易过拟合，但性能就不一定能保证，需要平衡。范围：[0，∞]
max_depth：默认= 6，一棵树的最大深度。增加此值将使模型更复杂，并且更可能过度拟合。范围：[0，∞]
min_child_weight：默认值= 1，如果新分裂的节点的样本权重和小于min_child_weight则停止分裂 。这个可以用来减少过拟合，但是也不能太高，会导致欠拟合。范围：[0，∞]
max_delta_step：默认= 0，允许每个叶子输出的最大增量步长。如果将该值设置为0，则表示没有约束。如果将其设置为正值，则可以帮助使更新步骤更加保守。通常不需要此参数，但是当类极度不平衡时，它可能有助于逻辑回归。将其设置为1-10的值可能有助于控制更新。范围：[0，∞]
subsample：默认值= 1，构建每棵树对样本的采样率，如果设置成0.5，XGBoost会随机选择一半的样本作为训练集。范围：（0,1]
sampling_method：默认= uniform，用于对训练实例进行采样的方法。
uniform：每个训练实例的选择概率均等。通常将subsample> = 0.5 设置 为良好的效果。
gradient_based：每个训练实例的选择概率与规则化的梯度绝对值成正比，具体来说就是g 2 + λ h 2 \sqrt{g^2+\lambda h^2} 


In [1]:
import random
import joblib
import os
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
from pprint import pprint
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from openpyxl import load_workbook
from sklearn.tree import export_graphviz
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from xgboost.sklearn import XGBClassifier
warnings.filterwarnings("ignore")

In [2]:
from sklearn.metrics import confusion_matrix    #导入计算混淆矩阵的包
def specificity_score(y_true, y_pred):
    C = confusion_matrix(y_true, y_pred)
    TP = C[1,1]
    FP = C[0,1]
    TN = C[0,0]
    FN = C[1,0]
    specificity = TN/(TN+FP)
    return specificity

def classification_evaluation(y_true, y_pred, y_score):
    precision = metrics.precision_score(y_true, y_pred)
    recall = metrics.recall_score(y_true, y_pred)
    accuracy = metrics.accuracy_score(y_true, y_pred)
    auc = metrics.roc_auc_score(y_true, y_score)
    f1 = metrics.f1_score(y_true, y_pred)
    specificity = specificity_score(y_true, y_pred)
    evaluation = {'accuracy':accuracy, 'recall':recall, 'precision':precision, 'f1':f1, 'auc':auc, 'specificity':specificity}
    return evaluation

### Global params setting and load data

In [3]:
cwd = os.getcwd()
traindir = '../Feature_filter/Feas_data'
testdir = '../Feature_filter/Feas_data_test'
imgdir = os.path.join(cwd, 'IMG')
modeldir = os.path.join(cwd, 'Model')
tag_cols = ['pid', 'label', 'series', 'image', 'mask']
sequence_id = [2, 3, 4]
# Generate the random seed
random_state = random.randint(1,10000)

# Load the SSM features.
SSM_train_slist = [pd.read_excel(os.path.join(traindir, 'feas_mrmr_sel.xlsx'), sheet_name=f'sequence{num}') for num in sequence_id]
SSM_test_slist = [pd.read_excel(os.path.join(testdir, 'SSM_test.xlsx'), sheet_name=f'sequence{num}') for num in sequence_id]

In [4]:
# Print features
SSM_features_list = [df.columns for df in SSM_train_slist]
print(SSM_features_list[0].to_list())
print(SSM_features_list[1].to_list())
print(SSM_features_list[2].to_list())
print(len(SSM_features_list[0])-4, len(SSM_features_list[1])-4, len(SSM_features_list[2])-4)

['pid', 'label', 'series', 'image', 'mask', 'glszm_SmallAreaEmphasis_logarithm', 'glcm_InverseVariance_exponential', 'glszm_GrayLevelNonUniformity_wavelet-HHH', 'firstorder_Skewness_logarithm', 'glcm_Correlation_log-sigma-3-0-mm-3D']
['pid', 'label', 'series', 'image', 'mask', 'gldm_DependenceVariance_wavelet-LLH', 'ngtdm_Contrast_wavelet-HHL', 'firstorder_Skewness_log-sigma-2-0-mm-3D', 'glcm_Imc2_wavelet-HHH', 'glrlm_RunEntropy_exponential']
['pid', 'label', 'series', 'image', 'mask', 'glrlm_ShortRunLowGrayLevelEmphasis_square', 'glszm_ZoneEntropy_exponential']
6 6 3


In [5]:
# Train data
standardscaler = StandardScaler()
SSM_train_slist = [df.sample(frac=1.0, random_state=random_state) for df in SSM_train_slist]
SSM_train_y = [df['label'] for df in SSM_train_slist]
SSM_train_x = [df.drop(tag_cols, axis=1) for df in SSM_train_slist]
SSM_train_x = [standardscaler.fit_transform(df) for df in SSM_train_x]
# Test data
SSM_test_slist = [df.sample(frac=1.0, random_state=random_state) for df in SSM_test_slist]
SSM_test_y = [df['label'] for df in SSM_test_slist]
SSM_test_x = [df.drop(tag_cols, axis=1) for df in SSM_test_slist]
SSM_test_x = [standardscaler.fit_transform(df) for df in SSM_test_x]

SSM2_train_y, SSM3_train_y, SSM4_train_y = (y_.to_list() for y_ in SSM_train_y)             
SSM2_train_x, SSM3_train_x, SSM4_train_x = (x_ for x_ in SSM_train_x)
SSM2_test_y, SSM3_test_y, SSM4_test_y = (y_.to_list() for y_ in SSM_test_y)
SSM2_test_x, SSM3_test_x, SSM4_test_x = (x_ for x_ in SSM_test_x)
SSM2_model, SSM3_model, SSM4_model = (f'XGBoost_SSM{i+2}.model' for i in range(3))

### 系统调参

### SSM2

#### 超参数遍历匹配择优
+ 依据上述所得到的随机最优匹配结果，进行遍历全部组合的匹配择优。

In [6]:
# Search optimal hyperparameter
random_seed = np.random.randint(low=1,high=230)

hparam = {
              'max_depth': [5, 10, 15, 20, 25],
              'learning_rate': [0.01, 0.02, 0.05, 0.1, 0.15],
              'n_estimators': [500, 1000, 2000, 3000, 5000],
              'min_child_weight': [0, 2, 5, 10, 20],
              'max_delta_step': [0, 0.2, 0.6, 1, 2],
              'subsample': [0.6, 0.7, 0.8, 0.85, 0.95],
              'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9],
              'reg_alpha': [0, 0.25, 0.5, 0.75, 1],
              'reg_lambda': [0.2, 0.4, 0.6, 0.8, 1],
              'scale_pos_weight': [0.2, 0.4, 0.6, 0.8, 1]

}

model_base = xgb.XGBClassifier()
model_random = RandomizedSearchCV(estimator=model_base,
                                  param_distributions=hparam,
                                  n_iter=100,
                                  n_jobs=-1,
                                  cv=4,
                                  verbose=1,
                                  scoring='roc_auc',
                                  random_state=random_seed
                                 )
model_random.fit(SSM2_train_x, SSM2_train_y)



Fitting 4 folds for each of 100 candidates, totalling 400 fits


NameError: name 'model_random1' is not defined

In [7]:

best_SSM2 = model_random.best_params_
pprint(best_SSM2)

{'colsample_bytree': 0.6,
 'learning_rate': 0.02,
 'max_delta_step': 0.2,
 'max_depth': 5,
 'min_child_weight': 5,
 'n_estimators': 1000,
 'reg_alpha': 0,
 'reg_lambda': 0.4,
 'scale_pos_weight': 0.6,
 'subsample': 0.8}


In [8]:
hparam1 = {
              'colsample_bytree': [0.5, 0.6, 0.7],
              'learning_rate': [0.01,0.02,0.03],
              'max_delta_step': [0.1, 0.2, 0.3],
              'max_depth': [4, 7, 8],
              'min_child_weight': [4,5,6],
              'n_estimators': [800, 1000, 1200],
              'reg_alpha': [0., 0.2],
              'reg_lambda': [0.3,0.4,0.5],
              'scale_pos_weight': [0.5, 0.6, 0.7],
              'subsample': [0.7,0.8, 0.85],
}
model_base = xgb.XGBClassifier()
model_random1 = RandomizedSearchCV(estimator=model_base,
                                  param_distributions=hparam1,
                                  n_iter=100,
                                  n_jobs=-1,
                                  cv=3,
                                  verbose=1,
                                  scoring='roc_auc',
                                  random_state=random_seed
                                 )
model_random1.fit(SSM2_train_x, SSM2_train_y)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=3,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           callbacks=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None,
                                           early_stopping_rounds=None,
                                           enable_categorical=False,
                                           eval_metric=None, gamma=None,
                                           gpu_id=None, grow_policy=None,
                                           importance_type=None,
                                           interaction_constraints=None,
                                           learning_rate=None, max_bin=None,...
                   param_distributions={'colsample_bytree': [0.5, 0.6, 0.7],
                                        'learning_rate': [0.0

In [9]:
best_SSM2 = model_random1.best_params_
pprint(best_SSM2)

{'colsample_bytree': 0.6,
 'learning_rate': 0.01,
 'max_delta_step': 0.1,
 'max_depth': 4,
 'min_child_weight': 5,
 'n_estimators': 1200,
 'reg_alpha': 0.2,
 'reg_lambda': 0.5,
 'scale_pos_weight': 0.7,
 'subsample': 0.8}


In [25]:
hparam_grid = {
              'colsample_bytree': [0.6],
              'learning_rate': [0.01],
              'max_delta_step': [0.1],
              'max_depth': [8],
              'min_child_weight': [5],
              'n_estimators': [1300],
              'reg_alpha': [0.2],
              'reg_lambda': [0.5],
              'scale_pos_weight': [0.8],
              'subsample': [0.8],
}
model_base = xgb.XGBClassifier()
model_grid = GridSearchCV(estimator=model_base,
                          param_grid=hparam_grid,
                          n_jobs=-1,
                          cv=2,
                          verbose=1,
                          scoring='roc_auc'
)
model_grid.fit(SSM2_train_x, SSM2_train_y)
best_SSM2 = model_grid.best_params_
pprint(best_SSM2)

Fitting 2 folds for each of 1 candidates, totalling 2 fits
{'colsample_bytree': 0.6,
 'learning_rate': 0.01,
 'max_delta_step': 0.1,
 'max_depth': 8,
 'min_child_weight': 5,
 'n_estimators': 1300,
 'reg_alpha': 0.2,
 'reg_lambda': 0.5,
 'scale_pos_weight': 0.8,
 'subsample': 0.8}


In [26]:
# Build RF regression model with optimal hyperparameters
SSM2=model_grid.best_estimator_
# Predict test set data
score = SSM2.score(SSM2_test_x, SSM2_test_y)
print(score)
joblib.dump(SSM2, os.path.join(modeldir, SSM2_model))

0.6857142857142857


['/media/tx-deepocean/Data/2022/chongfu1/Model/Train/Model/XGBoost_SSM2.model']

In [27]:
model= joblib.load(os.path.join(modeldir, SSM2_model)) 
predict_label = model.predict(SSM2_test_x)#, probability=True) #预测的标签
predict_score = model.predict_proba(SSM2_test_x)#得到标签0 (y_predict[:,0])和1 (y_predict[:,1])对应的概率
label = SSM2_test_y  #真实标签
eva_dic = classification_evaluation(label, predict_label, predict_score[:,1])
eva_dic

{'accuracy': 0.6857142857142857,
 'recall': 0.8043478260869565,
 'precision': 0.74,
 'f1': 0.7708333333333333,
 'auc': 0.6096014492753623,
 'specificity': 0.4583333333333333}

### SSM3

#### 超参数遍历匹配择优
+ 依据上述所得到的随机最优匹配结果，进行遍历全部组合的匹配择优。

In [28]:
# Search optimal hyperparameter
random_seed = np.random.randint(low=1,high=230)

hparam = {
              'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9],
              'learning_rate': [0.01, 0.02, 0.05, 0.1, 0.15],
              'max_delta_step': [0, 0.2, 0.6, 1, 2],
              'max_depth': [5, 10, 15, 20, 25],
              'min_child_weight': [0, 2, 5, 10, 20],
              'n_estimators': [500, 1000, 2000, 3000, 5000],
              'reg_alpha': [0, 0.25, 0.5, 0.75, 1],
              'reg_lambda': [0.2, 0.4, 0.6, 0.8, 1],
              'scale_pos_weight': [0.2, 0.4, 0.6, 0.8, 1],
              'subsample': [0.6, 0.7, 0.8, 0.85, 0.95]

}

model_base = xgb.XGBClassifier()
model_random = RandomizedSearchCV(estimator=model_base,
                                  param_distributions=hparam,
                                  n_iter=100,
                                  n_jobs=-1,
                                  cv=3,
                                  verbose=1,
                                  scoring='roc_auc',
                                  random_state=random_seed
                                 )
model_random.fit(SSM3_train_x, SSM3_train_y)



Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=3,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           callbacks=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None,
                                           early_stopping_rounds=None,
                                           enable_categorical=False,
                                           eval_metric=None, gamma=None,
                                           gpu_id=None, grow_policy=None,
                                           importance_type=None,
                                           interaction_constraints=None,
                                           learning_rate=None, max_bin=None,...
                                        'learning_rate': [0.01, 0.02, 0.05, 0.1,
                                                         

In [29]:
best_SSM3 = model_random.best_params_
pprint(best_SSM3)

{'colsample_bytree': 0.7,
 'learning_rate': 0.02,
 'max_delta_step': 0,
 'max_depth': 20,
 'min_child_weight': 10,
 'n_estimators': 5000,
 'reg_alpha': 0.25,
 'reg_lambda': 0.2,
 'scale_pos_weight': 1,
 'subsample': 0.7}


In [30]:
hparam1 = {
              'colsample_bytree': [0.6, 0.7, 0.8],
              'learning_rate': [0.01, 0.02, 0.03],
              'max_delta_step': [0, 0.1, 0.2],
              'max_depth': [18, 20, 22],
              'min_child_weight': [8, 10, 15],
              'n_estimators': [4500, 5000],
              'reg_alpha': [0.6, 0.75, 0.8],
              'reg_lambda': [0.1, 0.25, 0.3],
              'scale_pos_weight': [0.8,1],
              'subsample': [0.6,0.7,0.8],
}
model_base = xgb.XGBClassifier()
model_random = RandomizedSearchCV(estimator=model_base,
                                  param_distributions=hparam1,
                                  n_iter=100,
                                  n_jobs=-1,
                                  cv=3,
                                  verbose=1,
                                  scoring='roc_auc',
                                  random_state=random_seed
                                 )
model_random.fit(SSM3_train_x, SSM3_train_y)
best_SSM3 = model_random.best_params_
pprint(best_SSM3)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
{'colsample_bytree': 0.7,
 'learning_rate': 0.03,
 'max_delta_step': 0.2,
 'max_depth': 18,
 'min_child_weight': 8,
 'n_estimators': 4500,
 'reg_alpha': 0.8,
 'reg_lambda': 0.1,
 'scale_pos_weight': 0.8,
 'subsample': 0.6}


In [40]:
hparam_grid = {
              'colsample_bytree': [0.7],
              'learning_rate': [0.03],
              'max_delta_step': [0.2],
              'max_depth': [9,10],
              'min_child_weight': [8],
              'n_estimators': [4500],
              'reg_alpha': [0.9],
              'reg_lambda': [0.2],
              'scale_pos_weight': [0.8],
              'subsample': [0.6],
}
model_base = xgb.XGBClassifier()
model_grid = GridSearchCV(estimator=model_base,
                          param_grid=hparam_grid,
                          n_jobs=-1,
                          cv=5,
                          verbose=1,
                          scoring='roc_auc'
)
model_grid.fit(SSM3_train_x, SSM3_train_y)
best_SSM3 = model_grid.best_params_
pprint(best_SSM3)

Fitting 5 folds for each of 2 candidates, totalling 10 fits
{'colsample_bytree': 0.7,
 'learning_rate': 0.03,
 'max_delta_step': 0.2,
 'max_depth': 9,
 'min_child_weight': 8,
 'n_estimators': 4500,
 'reg_alpha': 0.9,
 'reg_lambda': 0.2,
 'scale_pos_weight': 0.8,
 'subsample': 0.6}


In [41]:
# Build RF regression model with optimal hyperparameters
SSM3=model_grid.best_estimator_
# Predict test set data
score = SSM3.score(SSM3_test_x, SSM3_test_y)
print(score)
joblib.dump(SSM3, os.path.join(modeldir, SSM3_model))

0.6571428571428571


['/media/tx-deepocean/Data/2022/chongfu1/Model/Train/Model/XGBoost_SSM3.model']

In [42]:
model= joblib.load(os.path.join(modeldir, SSM3_model)) 
predict_label = model.predict(SSM3_test_x)#, probability=True) #预测的标签
predict_score = model.predict_proba(SSM3_test_x)#得到标签0 (y_predict[:,0])和1 (y_predict[:,1])对应的概率
label = SSM3_test_y  #真实标签
eva_dic = classification_evaluation(label, predict_label, predict_score[:,1])
eva_dic

{'accuracy': 0.6571428571428571,
 'recall': 0.8260869565217391,
 'precision': 0.7037037037037037,
 'f1': 0.76,
 'auc': 0.5955615942028986,
 'specificity': 0.3333333333333333}

### SSM4

#### 超参数遍历匹配择优
+ 依据上述所得到的随机最优匹配结果，进行遍历全部组合的匹配择优。

In [43]:
# Search optimal hyperparameter
random_seed = np.random.randint(low=1,high=230)

hparam = {
              'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9],
              'learning_rate': [0.01, 0.02, 0.05, 0.1, 0.15],
              'max_delta_step': [0, 0.2, 0.6, 1, 2],
              'max_depth': [5, 10, 15, 20, 25],
              'min_child_weight': [0, 2, 5, 10, 20],
              'n_estimators': [500, 1000, 2000, 3000, 5000],
              'reg_alpha': [0, 0.25, 0.5, 0.75, 1],
              'reg_lambda': [0.2, 0.4, 0.6, 0.8, 1],
              'scale_pos_weight': [0.2, 0.4, 0.6, 0.8, 1],
              'subsample': [0.6, 0.7, 0.8, 0.85, 0.95]

}

model_base = xgb.XGBClassifier()
model_random = RandomizedSearchCV(estimator=model_base,
                                  param_distributions=hparam,
                                  n_iter=100,
                                  n_jobs=-1,
                                  cv=2,
                                  verbose=1,
                                  scoring='roc_auc',
                                  random_state=random_seed
                                 )
model_random.fit(SSM4_train_x, SSM4_train_y)

best_SSM4 = model_random.best_params_
pprint(best_SSM4)

Fitting 2 folds for each of 100 candidates, totalling 200 fits
{'colsample_bytree': 0.8,
 'learning_rate': 0.02,
 'max_delta_step': 1,
 'max_depth': 25,
 'min_child_weight': 0,
 'n_estimators': 2000,
 'reg_alpha': 0,
 'reg_lambda': 0.2,
 'scale_pos_weight': 1,
 'subsample': 0.7}


In [44]:
hparam1 = {
              'colsample_bytree': [0.7,0.8,0.9],
              'learning_rate': [0.01, 0.02, 0.03],
              'max_delta_step': [0.8, 1, 1.5],
              'max_depth': [20, 25, 30],
              'min_child_weight': [0,1,2],
              'n_estimators': [1500, 2000],
              'reg_alpha': [0, 0.1],
              'reg_lambda': [0.1,0.2,0.3],
              'scale_pos_weight': [0,8, 1],
              'subsample': [0.6,0.7,0.8],
}
model_base = xgb.XGBClassifier()
model_random1 = RandomizedSearchCV(estimator=model_base,
                                  param_distributions=hparam1,
                                  n_iter=100,
                                  n_jobs=-1,
                                  cv=2,
                                  verbose=1,
                                  scoring='roc_auc',
                                  random_state=random_seed
                                 )
model_random1.fit(SSM4_train_x, SSM4_train_y)
best_SSM4 = model_random1.best_params_
pprint(best_SSM4)

Fitting 2 folds for each of 100 candidates, totalling 200 fits
{'colsample_bytree': 0.9,
 'learning_rate': 0.01,
 'max_delta_step': 1.5,
 'max_depth': 25,
 'min_child_weight': 0,
 'n_estimators': 1500,
 'reg_alpha': 0,
 'reg_lambda': 0.1,
 'scale_pos_weight': 8,
 'subsample': 0.6}


In [48]:
hparam_grid = {
              'colsample_bytree': [0.9,0.95],
              'learning_rate': [0.01],
              'max_delta_step': [2,2.5],
              'max_depth': [25],
              'min_child_weight': [0],
              'n_estimators': [1500],
              'reg_alpha': [0],
              'reg_lambda': [0.1],
              'scale_pos_weight': [1],
              'subsample': [0.6],
}
model_base = xgb.XGBClassifier()
model_grid = GridSearchCV(estimator=model_base,
                          param_grid=hparam_grid,
                          n_jobs=-1,
                          cv=5,
                          verbose=1,
                          scoring='roc_auc'
)
model_grid.fit(SSM4_train_x, SSM4_train_y)
best_SSM4 = model_grid.best_params_
pprint(best_SSM4)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
{'colsample_bytree': 0.9,
 'learning_rate': 0.01,
 'max_delta_step': 2,
 'max_depth': 25,
 'min_child_weight': 0,
 'n_estimators': 1500,
 'reg_alpha': 0,
 'reg_lambda': 0.1,
 'scale_pos_weight': 1,
 'subsample': 0.6}


In [49]:
# Build RF regression model with optimal hyperparameters
SSM4=model_grid.best_estimator_
# Predict test set data
score = SSM4.score(SSM4_test_x, SSM4_test_y)
print(score)
joblib.dump(SSM4, os.path.join(modeldir, SSM4_model))

0.6428571428571429


['/media/tx-deepocean/Data/2022/chongfu1/Model/Train/Model/XGBoost_SSM4.model']

In [50]:
model= joblib.load(os.path.join(modeldir, SSM4_model)) 
predict_label = model.predict(SSM4_test_x)#, probability=True) #预测的标签
predict_score = model.predict_proba(SSM4_test_x)#得到标签0 (y_predict[:,0])和1 (y_predict[:,1])对应的概率
label = SSM4_test_y  #真实标签
eva_dic = classification_evaluation(label, predict_label, predict_score[:,1])
eva_dic

{'accuracy': 0.6428571428571429,
 'recall': 0.7391304347826086,
 'precision': 0.723404255319149,
 'f1': 0.7311827956989247,
 'auc': 0.6322463768115942,
 'specificity': 0.4583333333333333}

### DSM

In [6]:
# Load the DSM features.
Dtag_cols = ['pid', 'label']
DSM_train_slist = [pd.read_excel(os.path.join(traindir, 'DSM_feas_mrmr_sel.xlsx'), sheet_name=f'no_sequence{num}') for num in sequence_id]
DSM_test_slist = [pd.read_excel(os.path.join(testdir, 'DSM_test.xlsx'), sheet_name=f'no_sequence{num}') for num in sequence_id]
# Train data
standardscaler = StandardScaler()
DSM_train_slist = [df.sample(frac=1.0, random_state=random_state) for df in DSM_train_slist]
DSM_train_y = [df['label'] for df in DSM_train_slist]
DSM_train_x = [df.drop(Dtag_cols, axis=1) for df in DSM_train_slist]
DSM_train_x = [standardscaler.fit_transform(df) for df in DSM_train_x]
# Test data
DSM_test_slist = [df.sample(frac=1.0, random_state=random_state) for df in DSM_test_slist]
DSM_test_y = [df['label'] for df in DSM_test_slist]
DSM_test_x = [df.drop(Dtag_cols, axis=1) for df in DSM_test_slist]
DSM_test_x = [standardscaler.fit_transform(df) for df in DSM_test_x]

DSM2_train_y, DSM3_train_y, DSM4_train_y = (y_.to_list() for y_ in DSM_train_y)             
DSM2_train_x, DSM3_train_x, DSM4_train_x = (x_ for x_ in DSM_train_x)
DSM2_test_y, DSM3_test_y, DSM4_test_y = (y_.to_list() for y_ in DSM_test_y)
DSM2_test_x, DSM3_test_x, DSM4_test_x = (x_ for x_ in DSM_test_x)
DSM2_model, DSM3_model, DSM4_model = (f'XGBoost_DSM{i+2}.model' for i in range(3))

In [8]:
# Search optimal hyperparameter
random_seed = np.random.randint(low=1,high=230)

hparam = {
              'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9],
              'learning_rate': [0.01, 0.02, 0.05, 0.1, 0.15],
              'max_delta_step': [0, 0.2, 0.6, 1, 2],
              'max_depth': [5, 10, 15, 20, 25],
              'min_child_weight': [0, 2, 5, 10, 20],
              'n_estimators': [500, 1000, 2000, 3000, 5000],
              'reg_alpha': [0, 0.25, 0.5, 0.75, 1],
              'reg_lambda': [0.2, 0.4, 0.6, 0.8, 1],
              'scale_pos_weight': [0.2, 0.4, 0.6, 0.8, 1],
              'subsample': [0.6, 0.7, 0.8, 0.85, 0.95]

}

model_base = xgb.XGBClassifier()
model_random = RandomizedSearchCV(estimator=model_base,
                                  param_distributions=hparam,
                                  n_iter=50,
                                  n_jobs=-1,
                                  cv=2,
                                  verbose=1,
                                  scoring='roc_auc',
                                  random_state=random_seed
                                 )
model_random.fit(DSM3_train_x, DSM3_train_y)

best_DSM3 = model_random.best_params_
pprint(best_DSM3)

Fitting 2 folds for each of 50 candidates, totalling 100 fits
{'colsample_bytree': 0.6,
 'learning_rate': 0.01,
 'max_delta_step': 0.2,
 'max_depth': 5,
 'min_child_weight': 2,
 'n_estimators': 2000,
 'reg_alpha': 1,
 'reg_lambda': 0.2,
 'scale_pos_weight': 1,
 'subsample': 0.6}


In [9]:
hparam1 = {
              'colsample_bytree': [0.5, 0.6, 0.7],
              'learning_rate': [0.01, 0.02],
              'max_delta_step': [0.1, 0.2, 0.3],
              'max_depth': [5, 8],
              'min_child_weight': [1, 2, 3],
              'n_estimators': [1500, 2000],
              'reg_alpha': [0.8,0.9,1],
              'reg_lambda': [0.1, 0.2, 0.3],
              'scale_pos_weight': [0.8,0.9,1.0],
              'subsample': [0.5, 0.6,0.7],
}
model_base = xgb.XGBClassifier()
model_random1 = RandomizedSearchCV(estimator=model_base,
                                  param_distributions=hparam1,
                                  n_iter=70,
                                  n_jobs=-1,
                                  cv=3,
                                  verbose=1,
                                  scoring='accuracy',
                                  random_state=random_seed
                                 )
model_random1.fit(DSM3_train_x, DSM3_train_y)
best_DSM3 = model_random1.best_params_
pprint(best_DSM3)

Fitting 3 folds for each of 70 candidates, totalling 210 fits
{'colsample_bytree': 0.5,
 'learning_rate': 0.01,
 'max_delta_step': 0.3,
 'max_depth': 8,
 'min_child_weight': 3,
 'n_estimators': 1500,
 'reg_alpha': 0.9,
 'reg_lambda': 0.3,
 'scale_pos_weight': 1.0,
 'subsample': 0.6}


In [25]:
hparam_grid = {
              'colsample_bytree': [0.5],
              'learning_rate': [0.01],
              'max_delta_step': [0.4],
              'max_depth': [8],
              'min_child_weight': [0,1],
              'n_estimators': [500],
              'reg_alpha': [0.9],
              'reg_lambda': [0.3],
              'scale_pos_weight': [1.0],
              'subsample': [0.6],
}
model_base = xgb.XGBClassifier()
model_grid = GridSearchCV(estimator=model_base,
                          param_grid=hparam_grid,
                          n_jobs=-1,
                          cv=6,
                          verbose=1,
                          scoring='roc_auc'
)
model_grid.fit(DSM3_train_x, DSM3_train_y)
best_DSM3 = model_grid.best_params_
pprint(best_DSM3)

Fitting 6 folds for each of 2 candidates, totalling 12 fits
{'colsample_bytree': 0.5,
 'learning_rate': 0.01,
 'max_delta_step': 0.4,
 'max_depth': 8,
 'min_child_weight': 1,
 'n_estimators': 500,
 'reg_alpha': 0.9,
 'reg_lambda': 0.3,
 'scale_pos_weight': 1.0,
 'subsample': 0.6}


In [26]:
# Build RF regression model with optimal hyperparameters
DSM3=model_grid.best_estimator_
# Predict test set data
score = DSM3.score(DSM3_test_x, DSM3_test_y)
print(score)
joblib.dump(DSM3, os.path.join(modeldir, DSM3_model))

0.7571428571428571


['/media/tx-deepocean/Data/2022/chongfu1/Model/Train/Model/XGBoost_DSM3.model']

In [27]:
model= joblib.load(os.path.join(modeldir, DSM3_model)) 
predict_label = model.predict(DSM3_test_x)#, probability=True) #预测的标签
predict_score = model.predict_proba(DSM3_test_x)#得到标签0 (y_predict[:,0])和1 (y_predict[:,1])对应的概率
label = DSM3_test_y  #真实标签
eva_dic = classification_evaluation(label, predict_label, predict_score[:,1])
eva_dic

{'accuracy': 0.7571428571428571,
 'recall': 0.9565217391304348,
 'precision': 0.7457627118644068,
 'f1': 0.8380952380952381,
 'auc': 0.7110507246376813,
 'specificity': 0.375}

### ASM

In [28]:
# Load the ASM features.
Atag_cols = ['pid', 'label']
ASM_train = pd.read_csv(os.path.join(traindir, 'ASM_mrmr_feas.csv')) 
ASM_test = pd.read_csv(os.path.join(testdir, 'ASM_test.csv')) 
# Train data
standardscaler = StandardScaler()
ASM_train = ASM_train.sample(frac=1.0, random_state=random_state) 
ASM_train_y = ASM_train['label'] 
ASM_train_x = ASM_train.drop(Atag_cols, axis=1) 
ASM_train_x = standardscaler.fit_transform(ASM_train_x) 
# Test data
ASM_test = ASM_test.sample(frac=1.0, random_state=random_state)
ASM_test_y = ASM_test['label']
ASM_test_x =ASM_test.drop(Dtag_cols, axis=1)
ASM_test_x = standardscaler.fit_transform(ASM_test_x)

ASM_model = f'XGBoost_ASM.model'

In [29]:
# Search optimal hyperparameter
random_seed = np.random.randint(low=1,high=230)

hparam = {
              'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9],
              'learning_rate': [0.01, 0.02, 0.05, 0.1, 0.15],
              'max_delta_step': [0, 0.2, 0.6, 1, 2],
              'max_depth': [5, 10, 15, 20, 25],
              'min_child_weight': [0, 2, 5, 10, 20],
              'n_estimators': [500, 1000, 2000, 3000, 5000],
              'reg_alpha': [0, 0.25, 0.5, 0.75, 1],
              'reg_lambda': [0.2, 0.4, 0.6, 0.8, 1],
              'scale_pos_weight': [0.2, 0.4, 0.6, 0.8, 1],
              'subsample': [0.6, 0.7, 0.8, 0.85, 0.95]

}

model_base = xgb.XGBClassifier()
model_random = RandomizedSearchCV(estimator=model_base,
                                  param_distributions=hparam,
                                  n_iter=50,
                                  n_jobs=-1,
                                  cv=3,
                                  verbose=1,
                                  scoring='roc_auc',
                                  random_state=random_seed
                                 )
model_random.fit(ASM_train_x, ASM_train_y)

best_ASM = model_random.best_params_
pprint(best_ASM)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
{'colsample_bytree': 0.5,
 'learning_rate': 0.02,
 'max_delta_step': 2,
 'max_depth': 15,
 'min_child_weight': 2,
 'n_estimators': 1000,
 'reg_alpha': 1,
 'reg_lambda': 1,
 'scale_pos_weight': 0.2,
 'subsample': 0.7}


In [30]:
hparam1 = {
              'colsample_bytree': [0.4, 0.5,0.6],
              'learning_rate': [0.01,0.02,0.03],
              'max_delta_step': [1.5,2,2.5],
              'max_depth': [12, 15, 18],
              'min_child_weight': [1,2,3],
              'n_estimators': [500, 1000],
              'reg_alpha': [0.9,1],
              'reg_lambda': [0.9,1],
              'scale_pos_weight': [0.1, 0.2, 0.3],
              'subsample': [0.6, 0.7,0.8],
}
model_base = xgb.XGBClassifier()
model_random1 = RandomizedSearchCV(estimator=model_base,
                                  param_distributions=hparam1,
                                  n_iter=70,
                                  n_jobs=-1,
                                  cv=3,
                                  verbose=1,
                                  scoring='accuracy',
                                  random_state=random_seed
                                 )
model_random1.fit(ASM_train_x, ASM_train_y)
best_ASM = model_random1.best_params_
pprint(best_ASM)

Fitting 3 folds for each of 70 candidates, totalling 210 fits
{'colsample_bytree': 0.5,
 'learning_rate': 0.03,
 'max_delta_step': 2,
 'max_depth': 18,
 'min_child_weight': 1,
 'n_estimators': 1000,
 'reg_alpha': 1,
 'reg_lambda': 1,
 'scale_pos_weight': 0.3,
 'subsample': 0.7}


In [43]:
hparam_grid = {
              'colsample_bytree': [0.5],
              'learning_rate': [0.03,0.05],
              'max_delta_step': [2],
              'max_depth': [18],
              'min_child_weight': [0],
              'n_estimators': [1200],
              'reg_alpha': [1],
              'reg_lambda': [1],
              'scale_pos_weight': [0.5,0.6],
              'subsample': [0.7],
}
model_base = xgb.XGBClassifier()
model_grid = GridSearchCV(estimator=model_base,
                          param_grid=hparam_grid,
                          n_jobs=-1,
                          cv=4,
                          verbose=1,
                          scoring='accuracy'
)
model_grid.fit(ASM_train_x, ASM_train_y)
best_ASM = model_grid.best_params_
pprint(best_ASM)

Fitting 4 folds for each of 4 candidates, totalling 16 fits
{'colsample_bytree': 0.5,
 'learning_rate': 0.05,
 'max_delta_step': 2,
 'max_depth': 18,
 'min_child_weight': 0,
 'n_estimators': 1200,
 'reg_alpha': 1,
 'reg_lambda': 1,
 'scale_pos_weight': 0.5,
 'subsample': 0.7}


In [44]:
# Build RF regression model with optimal hyperparameters
ASM=model_grid.best_estimator_
# Predict test set data
score = ASM.score(ASM_test_x, ASM_test_y)
print(score)
joblib.dump(ASM, os.path.join(modeldir, ASM_model))

0.6428571428571429


['/media/tx-deepocean/Data/2022/chongfu1/Model/Train/Model/XGBoost_ASM.model']

In [45]:
model= joblib.load(os.path.join(modeldir, ASM_model)) 
predict_label = model.predict(ASM_test_x)#, probability=True) #预测的标签
predict_score = model.predict_proba(ASM_test_x)#得到标签0 (y_predict[:,0])和1 (y_predict[:,1])对应的概率
label = ASM_test_y  #真实标签
eva_dic = classification_evaluation(label, predict_label, predict_score[:,1])
eva_dic

{'accuracy': 0.6428571428571429,
 'recall': 0.782608695652174,
 'precision': 0.7058823529411765,
 'f1': 0.7422680412371134,
 'auc': 0.5615942028985508,
 'specificity': 0.375}

### Clinical

In [14]:
# Load the Clinical features.
Atag_cols = ['pid', 'label']
Clinical_train = pd.read_csv(os.path.join(traindir, 'clinical_lasso_sel.csv')) 
Clinical_test = pd.read_csv(os.path.join(testdir, 'clinical_test.csv')) 
# Train data
standardscaler = StandardScaler()
Clinical_train = Clinical_train.sample(frac=1.0, random_state=random_state) 
Clinical_train_y = Clinical_train['label'] 
Clinical_train_x = Clinical_train.drop(Atag_cols, axis=1) 
Clinical_train_x = standardscaler.fit_transform(Clinical_train_x) 
# Test data
Clinical_test = Clinical_test.sample(frac=1.0, random_state=random_state)
Clinical_test_y = Clinical_test['label']
Clinical_test_x =Clinical_test.drop(Dtag_cols, axis=1)
Clinical_test_x = standardscaler.fit_transform(Clinical_test_x)

Clinical_model = f'XGBoost_Clinical.model'

In [15]:
# Search optimal hyperparameter
random_seed = np.random.randint(low=1,high=230)

hparam = {
              'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9],
              'learning_rate': [0.01, 0.02, 0.05, 0.1, 0.15],
              'max_delta_step': [0, 0.2, 0.6, 1, 2],
              'max_depth': [5, 10, 15, 20, 25],
              'min_child_weight': [0, 2, 5, 10, 20],
              'n_estimators': [500, 1000, 2000, 3000, 5000],
              'reg_alpha': [0, 0.25, 0.5, 0.75, 1],
              'reg_lambda': [0.2, 0.4, 0.6, 0.8, 1],
              'scale_pos_weight': [0.2, 0.4, 0.6, 0.8, 1],
              'subsample': [0.6, 0.7, 0.8, 0.85, 0.95]

}

model_base = xgb.XGBClassifier()
model_random = RandomizedSearchCV(estimator=model_base,
                                  param_distributions=hparam,
                                  n_iter=50,
                                  n_jobs=-1,
                                  cv=3,
                                  verbose=1,
                                  scoring='balanced_accuracy',
                                  random_state=random_seed
                                 )
model_random.fit(Clinical_train_x, Clinical_train_y)

best_Clinical = model_random.best_params_
pprint(best_Clinical)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
{'colsample_bytree': 0.5,
 'learning_rate': 0.15,
 'max_delta_step': 2,
 'max_depth': 5,
 'min_child_weight': 0,
 'n_estimators': 1000,
 'reg_alpha': 0.75,
 'reg_lambda': 0.6,
 'scale_pos_weight': 0.8,
 'subsample': 0.6}


In [16]:
hparam1 = {
              'colsample_bytree': [0.4,0.5,0.6],
              'learning_rate': [0.05,0.10,0.15,0.20],
              'max_delta_step': [1,1.5,2],
              'max_depth': [5,6,7],
              'min_child_weight': [0, 1],
              'n_estimators': [800, 1000, 1200],
              'reg_alpha': [0.7, 0.75,0.8],
              'reg_lambda': [0.5,0.6,0.7],
              'scale_pos_weight': [0.7,0.8,0.9],
              'subsample': [0.5,0.60,0.7],
}
model_base = xgb.XGBClassifier()
model_random1 = RandomizedSearchCV(estimator=model_base,
                                  param_distributions=hparam1,
                                  n_iter=70,
                                  n_jobs=-1,
                                  cv=3,
                                  verbose=1,
                                  scoring='balanced_accuracy',
                                  random_state=random_seed
                                 )
model_random1.fit(Clinical_train_x, Clinical_train_y)
best_Clinical = model_random1.best_params_
pprint(best_Clinical)

Fitting 3 folds for each of 70 candidates, totalling 210 fits
{'colsample_bytree': 0.5,
 'learning_rate': 0.1,
 'max_delta_step': 2,
 'max_depth': 7,
 'min_child_weight': 0,
 'n_estimators': 800,
 'reg_alpha': 0.75,
 'reg_lambda': 0.5,
 'scale_pos_weight': 0.8,
 'subsample': 0.7}


In [17]:
hparam_grid = {
              'colsample_bytree': [0.5],
              'learning_rate': [0.1],
              'max_delta_step': [2,3],
              'max_depth': [7,9,11],
              'min_child_weight': [0],
              'n_estimators': [600,800],
              'reg_alpha': [0.75],
              'reg_lambda': [0.4,0.5],
              'scale_pos_weight': [0.8],
              'subsample': [0.7,0.8],
}
model_base = xgb.XGBClassifier()
model_grid = GridSearchCV(estimator=model_base,
                          param_grid=hparam_grid,
                          n_jobs=-1,
                          cv=5,
                          verbose=1,
                          scoring='balanced_accuracy'
)
model_grid.fit(Clinical_train_x, Clinical_train_y)
best_Clinical = model_grid.best_params_
pprint(best_Clinical)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
{'colsample_bytree': 0.5,
 'learning_rate': 0.1,
 'max_delta_step': 2,
 'max_depth': 9,
 'min_child_weight': 0,
 'n_estimators': 600,
 'reg_alpha': 0.75,
 'reg_lambda': 0.5,
 'scale_pos_weight': 0.8,
 'subsample': 0.8}


In [18]:
# Build RF regression model with optimal hyperparameters
Clinical=model_grid.best_estimator_
# Predict test set data
score = Clinical.score(Clinical_test_x, Clinical_test_y)
print(score)
joblib.dump(Clinical, os.path.join(modeldir, Clinical_model))

0.8142857142857143


['/media/tx-deepocean/Data/2022/chongfu1/Model/Train/Model/XGBoost_Clinical.model']

In [19]:
model= joblib.load(os.path.join(modeldir, Clinical_model)) 
predict_label = model.predict(Clinical_test_x)#, probability=True) #预测的标签
predict_score = model.predict_proba(Clinical_test_x)#得到标签0 (y_predict[:,0])和1 (y_predict[:,1])对应的概率
label = Clinical_test_y  #真实标签
eva_dic = classification_evaluation(label, predict_label, predict_score[:,1])
eva_dic

{'accuracy': 0.8142857142857143,
 'recall': 0.8695652173913043,
 'precision': 0.851063829787234,
 'f1': 0.8602150537634409,
 'auc': 0.8614130434782609,
 'specificity': 0.7083333333333334}