## Train the models

In [1]:
import random
import joblib
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pprint import pprint
from sklearn import metrics
from openpyxl import load_workbook
from sklearn.tree import export_graphviz
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

## (1) Random Forest

+ 重要参数:
  1. n_estimators: 
  2. max_depth
  3. max_features
  4. min_samples_split
  5. min_samples_leaf
  6. bootstrap

In [2]:
from sklearn.metrics import confusion_matrix    #导入计算混淆矩阵的包
def specificity_score(y_true, y_pred):
    C = confusion_matrix(y_true, y_pred)
    TP = C[1,1]
    FP = C[0,1]
    TN = C[0,0]
    FN = C[1,0]
    specificity = TN/(TN+FP)
    return specificity

def classification_evaluation(y_true, y_pred, y_score):
    precision = metrics.precision_score(y_true, y_pred)
    recall = metrics.recall_score(y_true, y_pred)
    accuracy = metrics.accuracy_score(y_true, y_pred)
    auc = metrics.roc_auc_score(y_true, y_score)
    f1 = metrics.f1_score(y_true, y_pred)
    specificity = specificity_score(y_true, y_pred)
    evaluation = {'accuracy':accuracy, 'recall':recall, 'precision':precision, 'f1':f1, 'auc':auc, 'specificity':specificity}
    return evaluation

### Global params setting and load data

In [3]:
cwd = os.getcwd()
traindir = '../Feature_filter/Feas_data'
testdir = '../Feature_filter/Feas_data_test'
imgdir = os.path.join(cwd, 'IMG')
modeldir = os.path.join(cwd, 'Model')
tag_cols = ['pid', 'label', 'series', 'image', 'mask']
sequence_id = [2, 3, 4]
# Generate the random seed
random_state = random.randint(1,10000)

# Load the SSM features.
SSM_train_slist = [pd.read_excel(os.path.join(traindir, 'feas_mrmr_sel.xlsx'), sheet_name=f'sequence{num}') for num in sequence_id]
SSM_test_slist = [pd.read_excel(os.path.join(testdir, 'SSM_test.xlsx'), sheet_name=f'sequence{num}') for num in sequence_id]

In [5]:
# Print features
SSM_features_list = [df.columns for df in SSM_train_slist]
print(SSM_features_list[0].to_list())
print(SSM_features_list[1].to_list())
print(SSM_features_list[2].to_list())
print(len(SSM_features_list[0])-4, len(SSM_features_list[1])-4, len(SSM_features_list[2])-4)

['pid', 'label', 'series', 'image', 'mask', 'glszm_SmallAreaEmphasis_logarithm', 'glcm_InverseVariance_exponential', 'glszm_GrayLevelNonUniformity_wavelet-HHH', 'firstorder_Skewness_logarithm', 'glcm_Correlation_log-sigma-3-0-mm-3D']
['pid', 'label', 'series', 'image', 'mask', 'gldm_DependenceVariance_wavelet-LLH', 'ngtdm_Contrast_wavelet-HHL', 'firstorder_Skewness_log-sigma-2-0-mm-3D', 'glcm_Imc2_wavelet-HHH', 'glrlm_RunEntropy_exponential']
['pid', 'label', 'series', 'image', 'mask', 'glrlm_ShortRunLowGrayLevelEmphasis_square', 'glszm_ZoneEntropy_exponential']
6 6 3


In [5]:
# Train data
standardscaler = StandardScaler()
SSM_train_slist = [df.sample(frac=1.0, random_state=random_state) for df in SSM_train_slist]
SSM_train_y = [df['label'] for df in SSM_train_slist]
SSM_train_x = [df.drop(tag_cols, axis=1) for df in SSM_train_slist]
SSM_train_x = [standardscaler.fit_transform(df) for df in SSM_train_x]
# Test data
SSM_test_slist = [df.sample(frac=1.0, random_state=random_state) for df in SSM_test_slist]
SSM_test_y = [df['label'] for df in SSM_test_slist]
SSM_test_x = [df.drop(tag_cols, axis=1) for df in SSM_test_slist]
SSM_test_x = [standardscaler.fit_transform(df) for df in SSM_test_x]

SSM2_train_y, SSM3_train_y, SSM4_train_y = (y_.to_list() for y_ in SSM_train_y)             
SSM2_train_x, SSM3_train_x, SSM4_train_x = (x_ for x_ in SSM_train_x)
SSM2_test_y, SSM3_test_y, SSM4_test_y = (y_.to_list() for y_ in SSM_test_y)
SSM2_test_x, SSM3_test_x, SSM4_test_x = (x_ for x_ in SSM_test_x)
SSM2_model, SSM3_model, SSM4_model = (f'RandomForest_SSM{i+2}.model' for i in range(3))

### 随机森林系统调参

### SSM2
#### Get data

#### 超参数随机匹配择优
+ 按照排列组合来计算的话，会有很多很多种组合方式，如果要一一尝试未免也太麻烦了。因此，我们用到RandomizedSearchCV这一功能——其将随机匹配每一种超参数组合，并输出最优的组合。换句话说，我们用RandomizedSearchCV来进行随机的排列，而不是对所有的超参数排列组合方法进行遍历。这样子确实可以节省很多时间。

In [6]:
# Search optimal hyperparameter
random_seed=44
random_forest_seed=np.random.randint(low=1,high=230)
n_estimators_range=[int(x) for x in np.linspace(start=50,stop=3000,num=60)]
max_features_range=['auto','sqrt']
max_depth_range=[int(x) for x in np.linspace(10,500,num=50)]
max_depth_range.append(None)
min_samples_split_range=[2,5,10]
min_samples_leaf_range=[1,2,4,8]
bootstrap_range=[True,False]

random_forest_hp_range={'n_estimators':n_estimators_range,
                        'max_features':max_features_range,
                        'max_depth':max_depth_range,
                        'min_samples_split':min_samples_split_range,
                        'min_samples_leaf':min_samples_leaf_range
                        # 'bootstrap':bootstrap_range
                        }

# Random search
random_forest_model_test_base=RandomForestClassifier()
random_forest_model_test_random=RandomizedSearchCV(estimator=random_forest_model_test_base,
                                                   param_distributions=random_forest_hp_range,
                                                   n_iter=300,
                                                   n_jobs=-1,
                                                   cv=3,
                                                   verbose=1,
                                                   random_state=random_forest_seed,
                                                   scoring='roc_auc'
                                                   )
random_forest_model_test_random.fit(SSM2_train_x, SSM2_train_y)

best_hp_now=random_forest_model_test_random.best_params_
pprint(best_hp_now)

Fitting 3 folds for each of 300 candidates, totalling 900 fits
{'max_depth': 490,
 'max_features': 'sqrt',
 'min_samples_leaf': 8,
 'min_samples_split': 2,
 'n_estimators': 1100}


#### 超参数遍历匹配择优
+ 依据上述所得到的随机最优匹配结果，进行遍历全部组合的匹配择优。

In [7]:
# Grid Search

random_forest_hp_range_2={'n_estimators':[900, 1100, 1200],
                          'max_features':[2, 3, 4],
                          'max_depth':[300, 490, 600],
                          'min_samples_split':[2, 3, 4], # Greater than 1
                          'min_samples_leaf':[6, 8, 10]
                          # 'bootstrap':bootstrap_range
                          }
random_forest_model_test_2_base=RandomForestClassifier()
random_forest_model_test_2_random=GridSearchCV(estimator=random_forest_model_test_2_base,
                                               param_grid=random_forest_hp_range_2,
                                               cv=4,
                                               verbose=1,
                                               n_jobs=-1,
                                               scoring='roc_auc'
                                              )
random_forest_model_test_2_random.fit(SSM2_train_x, SSM2_train_y)

best_hp_now_2=random_forest_model_test_2_random.best_params_
pprint(best_hp_now_2)

Fitting 4 folds for each of 243 candidates, totalling 972 fits
{'max_depth': 300,
 'max_features': 2,
 'min_samples_leaf': 10,
 'min_samples_split': 3,
 'n_estimators': 900}


In [23]:
# Grid Search

random_forest_hp_range_3={'n_estimators':[700,800],
                          'max_features':[2],
                          'max_depth':[400,500],
                          'min_samples_split':[3], # Greater than 1
                          'min_samples_leaf':[12, 13]
                          }
random_forest_model_test_3_base=RandomForestClassifier()
random_forest_model_test_3_random=GridSearchCV(estimator=random_forest_model_test_3_base,
                                               param_grid=random_forest_hp_range_3,
                                               cv=6,
                                               verbose=1,
                                               n_jobs=-1,
                                              scoring='accuracy'
                                              )
random_forest_model_test_3_random.fit(SSM2_train_x, SSM2_train_y)

best_hp_now_3=random_forest_model_test_3_random.best_params_
pprint(best_hp_now_3)

Fitting 6 folds for each of 8 candidates, totalling 48 fits
{'max_depth': 500,
 'max_features': 2,
 'min_samples_leaf': 12,
 'min_samples_split': 3,
 'n_estimators': 700}


#### Build (select) and save model

In [24]:
# Build RF regression model with optimal hyperparameters
SSM2=random_forest_model_test_3_random.best_estimator_
# Predict test set data
random_forest_predict=SSM2.predict(SSM2_test_x)
score = SSM2.score(SSM2_test_x, SSM2_test_y)
print(score)
joblib.dump(SSM2, os.path.join(modeldir, SSM2_model))

0.7571428571428571


['/media/tx-deepocean/Data/2022/chongfu1/Model/Train/Model/RandomForest_SSM2.model']

#### Model evaluation

In [25]:
model_forest = joblib.load(os.path.join(modeldir, SSM2_model)) 
predict_label = model_forest.predict(SSM2_test_x) #预测的标签
predict_score = model_forest.predict_proba(SSM2_test_x)#得到标签0 (y_predict[:,0])和1 (y_predict[:,1])对应的概率
label = SSM2_test_y  #真实标签
eva_dic = classification_evaluation(label, predict_label, predict_score[:,1])
eva_dic

{'accuracy': 0.7571428571428571,
 'recall': 0.9782608695652174,
 'precision': 0.7377049180327869,
 'f1': 0.8411214953271028,
 'auc': 0.6548913043478262,
 'specificity': 0.3333333333333333}

### SSM3

#### 超参数随机匹配择优

In [26]:
# Search optimal hyperparameter
random_seed=44
random_forest_seed=np.random.randint(low=1,high=230)
n_estimators_range=[int(x) for x in np.linspace(start=50,stop=3000,num=60)]
max_features_range=['auto','sqrt']
max_depth_range=[int(x) for x in np.linspace(10,500,num=50)]
max_depth_range.append(None)
min_samples_split_range=[2,5,10]
min_samples_leaf_range=[1,2,4,8]
bootstrap_range=[True,False]

random_forest_hp_range={'n_estimators':n_estimators_range,
                        'max_features':max_features_range,
                        'max_depth':max_depth_range,
                        'min_samples_split':min_samples_split_range,
                        'min_samples_leaf':min_samples_leaf_range
                        }

# Random search
random_forest_model_test_base=RandomForestClassifier()
random_forest_model_test_random=RandomizedSearchCV(estimator=random_forest_model_test_base,
                                                   param_distributions=random_forest_hp_range,
                                                   n_iter=300,
                                                   n_jobs=-1,
                                                   cv=4,
                                                   verbose=1,
                                                   random_state=random_forest_seed,
                                                   scoring='roc_auc'
                                                   )
random_forest_model_test_random.fit(SSM3_train_x, SSM3_train_y)

best_hp_now=random_forest_model_test_random.best_params_
pprint(best_hp_now)

Fitting 4 folds for each of 300 candidates, totalling 1200 fits
{'max_depth': 120,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 450}


#### 超参数遍历匹配择优
+ 依据上述所得到的随机最优匹配结果，进行遍历全部组合的匹配择优。

In [27]:
# Grid Search

random_forest_hp_range_2={'n_estimators':[400,450,500],
                          'max_features':['auto'],
                          'max_depth':[100, 120, 200],
                          'min_samples_split':[4,5,8], # Greater than 1
                          'min_samples_leaf':[1,2]
                          }
random_forest_model_test_2_base=RandomForestClassifier()
random_forest_model_test_2_random=GridSearchCV(estimator=random_forest_model_test_2_base,
                                               param_grid=random_forest_hp_range_2,
                                               cv=4,
                                               verbose=1,
                                               n_jobs=-1)
random_forest_model_test_2_random.fit(SSM3_train_x, SSM3_train_y)

best_hp_now_2=random_forest_model_test_2_random.best_params_
pprint(best_hp_now_2)

Fitting 4 folds for each of 54 candidates, totalling 216 fits
{'max_depth': 120,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 400}


In [28]:
# Grid Search

random_forest_hp_range_3={'n_estimators':[1600],
                          'max_features':[4,5],
                          'max_depth':[200, 210],
                          'min_samples_split':[2], # Greater than 1
                          'min_samples_leaf':[8]
                          # 'bootstrap':bootstrap_range
                          }
random_forest_model_test_3_base=RandomForestClassifier()
random_forest_model_test_3_random=GridSearchCV(estimator=random_forest_model_test_3_base,
                                               param_grid=random_forest_hp_range_3,
                                               cv=5,
                                               verbose=1,
                                               n_jobs=-1,
                                               scoring='roc_auc'
                                              )
random_forest_model_test_3_random.fit(SSM3_train_x, SSM3_train_y)

best_hp_now_3=random_forest_model_test_3_random.best_params_
pprint(best_hp_now_3)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
{'max_depth': 210,
 'max_features': 5,
 'min_samples_leaf': 8,
 'min_samples_split': 2,
 'n_estimators': 1600}


#### Build (select) and save model

In [29]:
# Build RF regression model with optimal hyperparameters
SSM3=random_forest_model_test_3_random.best_estimator_
# Predict test set data
random_forest_predict=SSM3.predict(SSM3_test_x)
score = SSM3.score(SSM3_test_x, SSM3_test_y)
print(score)
joblib.dump(SSM3, os.path.join(modeldir, SSM3_model))

0.7


['/media/tx-deepocean/Data/2022/chongfu1/Model/Train/Model/RandomForest_SSM3.model']

#### Model evaluation

In [30]:
model_forest = SSM3
predict_label = model_forest.predict(SSM3_test_x) #预测的标签
predict_score = model_forest.predict_proba(SSM3_test_x)#得到标签0 (y_predict[:,0])和1 (y_predict[:,1])对应的概率
label = SSM3_test_y  #真实标签
eva_dic = classification_evaluation(label, predict_label, predict_score[:,1])
eva_dic

{'accuracy': 0.7,
 'recall': 0.8695652173913043,
 'precision': 0.7272727272727273,
 'f1': 0.792079207920792,
 'auc': 0.6639492753623188,
 'specificity': 0.375}

### SSM4

#### 超参数随机匹配择优
+ 按照排列组合来计算的话，会有很多很多种组合方式，如果要一一尝试未免也太麻烦了。因此，我们用到RandomizedSearchCV这一功能——其将随机匹配每一种超参数组合，并输出最优的组合。换句话说，我们用RandomizedSearchCV来进行随机的排列，而不是对所有的超参数排列组合方法进行遍历。这样子确实可以节省很多时间。

In [31]:
# Search optimal hyperparameter
random_seed=44
random_forest_seed=np.random.randint(low=1,high=230)
n_estimators_range=[int(x) for x in np.linspace(start=50,stop=3000,num=60)]
max_features_range=['auto','sqrt']
max_depth_range=[int(x) for x in np.linspace(10,500,num=50)]
max_depth_range.append(None)
min_samples_split_range=[2,5,10]
min_samples_leaf_range=[1,2,4,8]
bootstrap_range=[True,False]

random_forest_hp_range={'n_estimators':n_estimators_range,
                        'max_features':max_features_range,
                        'max_depth':max_depth_range,
                        'min_samples_split':min_samples_split_range,
                        'min_samples_leaf':min_samples_leaf_range
                        # 'bootstrap':bootstrap_range
                        }

# Random search
random_forest_model_test_base=RandomForestClassifier()
random_forest_model_test_random=RandomizedSearchCV(estimator=random_forest_model_test_base,
                                                   param_distributions=random_forest_hp_range,
                                                   n_iter=300,
                                                   n_jobs=-1,
                                                   cv=4,
                                                   verbose=1,
                                                   random_state=random_forest_seed
                                                   )
random_forest_model_test_random.fit(SSM4_train_x, SSM4_train_y)

best_hp_now=random_forest_model_test_random.best_params_
pprint(best_hp_now)

Fitting 4 folds for each of 300 candidates, totalling 1200 fits
{'max_depth': 400,
 'max_features': 'auto',
 'min_samples_leaf': 8,
 'min_samples_split': 10,
 'n_estimators': 2750}


#### 超参数遍历匹配择优
+ 依据上述所得到的随机最优匹配结果，进行遍历全部组合的匹配择优。

In [32]:
# Grid Search

random_forest_hp_range_2={'n_estimators':[2500, 2750],
                          'max_features':['auto'],
                          'max_depth':[300,400,500],
                          'min_samples_split':[8,10,12], # Greater than 1
                          'min_samples_leaf':[4,8,12]
                          # 'bootstrap':bootstrap_range
                          }
random_forest_model_test_2_base=RandomForestClassifier()
random_forest_model_test_2_random=GridSearchCV(estimator=random_forest_model_test_2_base,
                                               param_grid=random_forest_hp_range_2,
                                               cv=5,
                                               verbose=1,
                                               n_jobs=-1,
                                               scoring='roc_auc'
                                              )
random_forest_model_test_2_random.fit(SSM4_train_x, SSM4_train_y)

best_hp_now_2=random_forest_model_test_2_random.best_params_
pprint(best_hp_now_2)

Fitting 5 folds for each of 54 candidates, totalling 270 fits
{'max_depth': 400,
 'max_features': 'auto',
 'min_samples_leaf': 8,
 'min_samples_split': 8,
 'n_estimators': 2500}


In [43]:
# Grid Search

random_forest_hp_range_3={'n_estimators':[2500],
                          'max_features':['auto'],
                          'max_depth':[400],
                          'min_samples_split':[8], # Greater than 1
                          'min_samples_leaf':[8]
                          }
random_forest_model_test_3_base=RandomForestClassifier()
random_forest_model_test_3_random=GridSearchCV(estimator=random_forest_model_test_3_base,
                                               param_grid=random_forest_hp_range_3,
                                               cv=5,
                                               verbose=1,
                                               n_jobs=-1,
                                               scoring='accuracy')
random_forest_model_test_3_random.fit(SSM4_train_x, SSM4_train_y)

best_hp_now_3=random_forest_model_test_3_random.best_params_
pprint(best_hp_now_3)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
{'max_depth': 400,
 'max_features': 'auto',
 'min_samples_leaf': 8,
 'min_samples_split': 8,
 'n_estimators': 2500}


#### Build (select) and save model

In [44]:
# Build RF regression model with optimal hyperparameters
SSM4=random_forest_model_test_3_random.best_estimator_
# Predict test set data
random_forest_predict=SSM4.predict(SSM4_test_x)
score = SSM4.score(SSM4_test_x, SSM4_test_y)
print(score)
joblib.dump(SSM4, os.path.join(modeldir, SSM4_model))

0.7428571428571429


['/media/tx-deepocean/Data/2022/chongfu1/Model/Train/Model/RandomForest_SSM4.model']

#### Model evaluation

In [45]:
model_forest = joblib.load(os.path.join(modeldir, SSM4_model)) 
predict_label = model_forest.predict(SSM4_test_x) #预测的标签
predict_score = model_forest.predict_proba(SSM4_test_x)#得到标签0 (y_predict[:,0])和1 (y_predict[:,1])对应的概率
label = SSM4_test_y  #真实标签
eva_dic = classification_evaluation(label, predict_label, predict_score[:,1])
eva_dic

{'accuracy': 0.7428571428571429,
 'recall': 0.9347826086956522,
 'precision': 0.7413793103448276,
 'f1': 0.826923076923077,
 'auc': 0.7001811594202899,
 'specificity': 0.375}

### DSM

In [238]:
# Load the DSM features.
Dtag_cols = ['pid', 'label']
DSM_train_slist = [pd.read_excel(os.path.join(traindir, 'DSM_feas_mrmr_sel.xlsx'), sheet_name=f'no_sequence{num}') for num in sequence_id]
DSM_test_slist = [pd.read_excel(os.path.join(testdir, 'DSM_test.xlsx'), sheet_name=f'no_sequence{num}') for num in sequence_id]
# Train data
standardscaler = StandardScaler()
DSM_train_slist = [df.sample(frac=1.0, random_state=random_state) for df in DSM_train_slist]
DSM_train_y = [df['label'] for df in DSM_train_slist]
DSM_train_x = [df.drop(Dtag_cols, axis=1) for df in DSM_train_slist]
#DSM_train_x = [standardscaler.fit_transform(df) for df in DSM_train_x]
# Test data
#DSM_test_slist = [df.sample(frac=1.0, random_state=random_state) for df in DSM_test_slist]
DSM_test_y = [df['label'] for df in DSM_test_slist]
DSM_test_x = [df.drop(Dtag_cols, axis=1) for df in DSM_test_slist]
#DSM_test_x = [standardscaler.fit_transform(df) for df in DSM_test_x]

DSM2_train_y, DSM3_train_y, DSM4_train_y = (y_.to_list() for y_ in DSM_train_y)             
DSM2_train_x, DSM3_train_x, DSM4_train_x = (np.array(x_) for x_ in DSM_train_x)
DSM2_test_y, DSM3_test_y, DSM4_test_y = (y_.to_list() for y_ in DSM_test_y)
DSM2_test_x, DSM3_test_x, DSM4_test_x = (np.array(x_) for x_ in DSM_test_x)
DSM2_model, DSM3_model, DSM4_model = (f'RandomForest_DSM{i+2}.model' for i in range(3))

In [239]:
# 过采样
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.over_sampling import ADASYN 
from imblearn.under_sampling import TomekLinks
from imblearn.under_sampling import ClusterCentroids 
train_df = DSM_train_slist[1]
train_df = train_df.sample(frac=1.0, random_state=123)
train_df = train_df.drop(['pid'], axis=1)
train_X, train_Y = train_df.loc[:, train_df.columns != 'label'], train_df.loc[:, train_df.columns == 'label']

smote = SMOTE(k_neighbors=5, random_state=2022)
X_smote, Y_smote = smote.fit_resample(train_X, train_Y)
df_smote = pd.concat([X_smote, Y_smote], axis=1)
DSM3_train_x = np.array(df_smote.drop(['label'], axis=1))
DSM3_train_y = np.array(df_smote['label'].to_list())

bsmote = BorderlineSMOTE(k_neighbors=2, random_state=2022)
X_smote, Y_smote = bsmote.fit_resample(train_X, train_Y)
df_smote = pd.concat([X_smote, Y_smote], axis=1)
DSM3_train_x = np.array(df_smote.drop(['label'], axis=1))
DSM3_train_y = np.array(df_smote['label'].to_list())

adasyn = ADASYN(n_neighbors=3, random_state=42)
X_smote, Y_smote = adasyn.fit_resample(train_X, train_Y)
df_smote = pd.concat([X_smote, Y_smote], axis=1)
DSM3_train_x = np.array(df_smote.drop(['label'], axis=1))
DSM3_train_y = np.array(df_smote['label'].to_list())

tl = TomekLinks()
X_smote, Y_smote = tl.fit_resample(train_X, train_Y)
df_smote = pd.concat([X_smote, Y_smote], axis=1)
DSM3_train_x = np.array(df_smote.drop(['label'], axis=1))
DSM3_train_y = np.array(df_smote['label'].to_list())

cc = ClusterCentroids(random_state=42)
X_smote, Y_smote = cc.fit_resample(train_X, train_Y)
df_smote = pd.concat([X_smote, Y_smote], axis=1)

DSM3_train_x = np.array(df_smote.drop(['label'], axis=1))
DSM3_train_y = np.array(df_smote['label'].to_list())
print(df_smote.groupby('label').count())

       glszm_SmallAreaEmphasis_logarithm2  glcm_InverseVariance_exponential2  \
label                                                                          
0                                      55                                 55   
1                                      55                                 55   

       glrlm_ShortRunLowGrayLevelEmphasis_square4  \
label                                               
0                                              55   
1                                              55   

       glszm_GrayLevelNonUniformity_wavelet-HHH2  \
label                                              
0                                             55   
1                                             55   

       glcm_Correlation_log-sigma-3-0-mm-3D2  glszm_ZoneEntropy_exponential4  \
label                                                                          
0                                         55                              55   
1                

In [240]:
print(DSM3_train_y)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


In [241]:
print(len(DSM3_train_x))
np.array(DSM3_train_y).sum()

110


55

In [261]:
# external test
DSM_test_external = pd.read_csv(os.path.join(testdir, 'DSM_test_external.csv'))
DSM3_extest_y = DSM_test_external['label']
DSM3_extest_x = DSM_test_external.drop(Dtag_cols, axis=1)
DSM3_extest_x = DSM3_extest_x
DSM_test_external.head()

Unnamed: 0,pid,label,glszm_SmallAreaEmphasis_logarithm2,glcm_InverseVariance_exponential2,glrlm_ShortRunLowGrayLevelEmphasis_square4,glszm_GrayLevelNonUniformity_wavelet-HHH2,glcm_Correlation_log-sigma-3-0-mm-3D2,glszm_ZoneEntropy_exponential4,firstorder_Skewness_logarithm2
0,1,1,-0.082571,-1.467897,-0.371897,-0.403232,-0.163733,-0.537051,-0.044727
1,2,0,0.343019,1.377483,-0.326643,2.612101,0.663478,0.0465,-1.606117
2,3,0,1.248603,0.698196,-0.326643,-0.536871,-2.005267,0.0465,-0.960588
3,4,1,0.044844,-0.316359,-1.180298,0.074876,0.596528,-0.746848,1.090658
4,5,1,-1.067449,2.091546,0.837405,-0.577231,0.388151,1.330975,-2.159078


In [243]:
# Search optimal hyperparameter
random_seed=44
random_forest_seed=np.random.randint(low=1,high=230)
n_estimators_range=[int(x) for x in np.linspace(start=50,stop=3000,num=60)]
max_features_range=['auto','sqrt']
max_depth_range=[int(x) for x in np.linspace(10,500,num=50)]
max_depth_range.append(None)
min_samples_split_range=[2,5,10]
min_samples_leaf_range=[1,2,4,8]
bootstrap_range=[True,False]

random_forest_hp_range={'n_estimators':n_estimators_range,
                        'max_features':max_features_range,
                        'max_depth':max_depth_range,
                        'min_samples_split':min_samples_split_range,
                        'min_samples_leaf':min_samples_leaf_range
                        # 'bootstrap':bootstrap_range
                        }

# Random search
random_forest_model_test_base=RandomForestClassifier()
random_forest_model_test_random=RandomizedSearchCV(estimator=random_forest_model_test_base,
                                                   param_distributions=random_forest_hp_range,
                                                   n_iter=300,
                                                   n_jobs=-1,
                                                   cv=5,
                                                   verbose=1,
                                                   random_state=random_forest_seed,
                                                   scoring='balanced_accuracy'
                                                   )
random_forest_model_test_random.fit(DSM3_train_x, DSM3_train_y)

best_hp_now=random_forest_model_test_random.best_params_
pprint(best_hp_now)

Fitting 5 folds for each of 300 candidates, totalling 1500 fits


KeyboardInterrupt: 

#### 超参数遍历匹配择优
+ 依据上述所得到的随机最优匹配结果，进行遍历全部组合的匹配择优。

In [165]:
# Grid Search

random_forest_hp_range_2={'n_estimators':[150,200,250],
                          'max_features':[2,3,4],
                          'max_depth':[10, 20, 30],
                          'min_samples_split':[8,10,12], # Greater than 1
                          'min_samples_leaf':[4,8,16]
                          # 'bootstrap':bootstrap_range
                          }
random_forest_model_test_2_base=RandomForestClassifier()
random_forest_model_test_2_random=GridSearchCV(estimator=random_forest_model_test_2_base,
                                               param_grid=random_forest_hp_range_2,
                                               cv=3,
                                               verbose=1,
                                               n_jobs=-1,
                                               #scoring='balanced_accuracy'
                                               scoring='neg_log_loss'
                                              )
random_forest_model_test_2_random.fit(DSM3_train_x, DSM3_train_y)

best_hp_now_2=random_forest_model_test_2_random.best_params_
pprint(best_hp_now_2)

Fitting 3 folds for each of 243 candidates, totalling 729 fits


KeyboardInterrupt: 

In [244]:
# Grid Search

random_forest_hp_range_3={'n_estimators':[100,150,160],
                          'max_features':[3,4],
                          'max_depth':[20,25],
                          'min_samples_split':[6,7,8], # Greater than 1
                          'min_samples_leaf':[8,16]
                          }
random_forest_model_test_3_base=RandomForestClassifier()
random_forest_model_test_3_random=GridSearchCV(estimator=random_forest_model_test_3_base,
                                               param_grid=random_forest_hp_range_3,
                                               cv=5,
                                               verbose=1,
                                               n_jobs=-1,
                                               scoring='balanced_accuracy')
random_forest_model_test_3_random.fit(DSM3_train_x, DSM3_train_y)

best_hp_now_3=random_forest_model_test_3_random.best_params_
pprint(best_hp_now_3)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
{'max_depth': 25,
 'max_features': 3,
 'min_samples_leaf': 16,
 'min_samples_split': 8,
 'n_estimators': 160}


#### Build (select) and save model

In [235]:
# Build RF regression model with optimal hyperparameters
DSM3=random_forest_model_test_3_random.best_estimator_
# Predict test set data
score = DSM3.score(DSM3_test_x, DSM3_test_y)
print(score)
joblib.dump(DSM3, os.path.join(modeldir, DSM3_model))

0.6857142857142857


['/media/tx-deepocean/Data/2022/chongfu1/Model/Train/Model/RandomForest_DSM3.model']

#### Model evaluation

In [236]:
model_forest = joblib.load(os.path.join(modeldir, DSM3_model)) 
predict_label = model_forest.predict(DSM3_test_x) #预测的标签
predict_score = model_forest.predict_proba(DSM3_test_x)#得到标签0 (y_predict[:,0])和1 (y_predict[:,1])对应的概率
label = DSM3_test_y  #真实标签
eva_dic = classification_evaluation(label, predict_label, predict_score[:,1])
eva_dic

{'accuracy': 0.6857142857142857,
 'recall': 0.782608695652174,
 'precision': 0.75,
 'f1': 0.7659574468085107,
 'auc': 0.720108695652174,
 'specificity': 0.5}

In [237]:
model= joblib.load(os.path.join(modeldir, DSM3_model)) 
predict_label = model.predict(DSM3_extest_x) #预测的标签
predict_score = model.predict_proba(DSM3_extest_x)#得到标签0 (y_predict[:,0])和1 (y_predict[:,1])对应的概率
label = DSM3_extest_y  #真实标签
eva_dic = classification_evaluation(label, predict_label, predict_score[:,1])
eva_dic

{'accuracy': 0.6444444444444445,
 'recall': 0.6923076923076923,
 'precision': 0.6923076923076923,
 'f1': 0.6923076923076923,
 'auc': 0.6153846153846154,
 'specificity': 0.5789473684210527}

### ASM

In [52]:
# Load the ASM features.
Atag_cols = ['pid', 'label']
ASM_train = pd.read_csv(os.path.join(traindir, 'ASM_mrmr_feas.csv')) 
ASM_test = pd.read_csv(os.path.join(testdir, 'ASM_test.csv')) 
# Train data
standardscaler = StandardScaler()
ASM_train = ASM_train.sample(frac=1.0, random_state=random_state) 
ASM_train_y = ASM_train['label'] 
ASM_train_x = ASM_train.drop(Atag_cols, axis=1) 
ASM_train_x = standardscaler.fit_transform(ASM_train_x) 
# Test data
ASM_test = ASM_test.sample(frac=1.0, random_state=random_state)
ASM_test_y = ASM_test['label']
ASM_test_x =ASM_test.drop(Dtag_cols, axis=1)
ASM_test_x = standardscaler.fit_transform(ASM_test_x)

ASM_model = f'RandomForest_ASM.model'

In [102]:
# Search optimal hyperparameter
random_seed=44
random_forest_seed=np.random.randint(low=1,high=230)
n_estimators_range=[int(x) for x in np.linspace(start=50,stop=3000,num=60)]
max_features_range=['auto','sqrt']
max_depth_range=[int(x) for x in np.linspace(10,500,num=50)]
max_depth_range.append(None)
min_samples_split_range=[2,5,10]
min_samples_leaf_range=[1,2,4,8]
bootstrap_range=[True,False]

random_forest_hp_range={'n_estimators':n_estimators_range,
                        'max_features':max_features_range,
                        'max_depth':max_depth_range,
                        'min_samples_split':min_samples_split_range,
                        'min_samples_leaf':min_samples_leaf_range
                        # 'bootstrap':bootstrap_range
                        }

# Random search
random_forest_model_test_base=RandomForestClassifier()
random_forest_model_test_random=RandomizedSearchCV(estimator=random_forest_model_test_base,
                                                   param_distributions=random_forest_hp_range,
                                                   n_iter=300,
                                                   n_jobs=-1,
                                                   cv=4,
                                                   verbose=1,
                                                   random_state=random_forest_seed
                                                   )
random_forest_model_test_random.fit(ASM_train_x, ASM_train_y)

best_hp_now=random_forest_model_test_random.best_params_
pprint(best_hp_now)

Fitting 4 folds for each of 300 candidates, totalling 1200 fits
{'max_depth': 220,
 'max_features': 'auto',
 'min_samples_leaf': 4,
 'min_samples_split': 5,
 'n_estimators': 150}


#### 超参数遍历匹配择优
+ 依据上述所得到的随机最优匹配结果，进行遍历全部组合的匹配择优。

In [106]:
# Grid Search

random_forest_hp_range_2={'n_estimators':[100, 150, 200],
                          'max_features':['auto'],
                          'max_depth':[200, 220, 300],
                          'min_samples_split':[4,5,6], # Greater than 1
                          'min_samples_leaf':[4,8],
                          }
random_forest_model_test_2_base=RandomForestClassifier()
random_forest_model_test_2_random=GridSearchCV(estimator=random_forest_model_test_2_base,
                                               param_grid=random_forest_hp_range_2,
                                               cv=4,
                                               verbose=1,
                                               n_jobs=-1,
                                               scoring='roc_auc')
random_forest_model_test_2_random.fit(ASM_train_x, ASM_train_y)

best_hp_now_2=random_forest_model_test_2_random.best_params_
pprint(best_hp_now_2)

Fitting 4 folds for each of 54 candidates, totalling 216 fits
{'max_depth': 200,
 'max_features': 'auto',
 'min_samples_leaf': 8,
 'min_samples_split': 5,
 'n_estimators': 200}


In [152]:
# Grid Search

random_forest_hp_range_3={'n_estimators':[200, 300],
                          'max_features':['auto'],
                          'max_depth':[200],
                          'min_samples_split':[5], # Greater than 1
                          'min_samples_leaf':[8]
                          # 'bootstrap':bootstrap_range
                          }
random_forest_model_test_3_base=RandomForestClassifier()
random_forest_model_test_3_random=GridSearchCV(estimator=random_forest_model_test_3_base,
                                               param_grid=random_forest_hp_range_3,
                                               cv=5,
                                               verbose=1,
                                               n_jobs=-1,
                                               scoring='accuracy')
random_forest_model_test_3_random.fit(ASM_train_x, ASM_train_y)

best_hp_now_3=random_forest_model_test_3_random.best_params_
pprint(best_hp_now_3)

Fitting 5 folds for each of 2 candidates, totalling 10 fits
{'max_depth': 200,
 'max_features': 'auto',
 'min_samples_leaf': 8,
 'min_samples_split': 5,
 'n_estimators': 300}


#### Build (select) and save model

In [153]:
# Build RF regression model with optimal hyperparameters
ASM=random_forest_model_test_3_random.best_estimator_
# Predict test set data
score = ASM.score(ASM_test_x, ASM_test_y)
print(score)
joblib.dump(ASM, os.path.join(modeldir, ASM_model))

0.7571428571428571


['/media/tx-deepocean/Data/2022/chongfu1/Model/Train/Model/RandomForest_ASM.model']

#### Model evaluation

In [154]:
model_forest = joblib.load(os.path.join(modeldir, ASM_model)) 
predict_label = model_forest.predict(ASM_test_x) #预测的标签
predict_score = model_forest.predict_proba(ASM_test_x)#得到标签0 (y_predict[:,0])和1 (y_predict[:,1])对应的概率
label = ASM_test_y  #真实标签
eva_dic = classification_evaluation(label, predict_label, predict_score[:,1])
eva_dic

{'accuracy': 0.7571428571428571,
 'recall': 0.9565217391304348,
 'precision': 0.7457627118644068,
 'f1': 0.8380952380952381,
 'auc': 0.6838768115942029,
 'specificity': 0.375}

### Clinical

In [255]:
# Load the Clinical features.
Atag_cols = ['pid', 'label']
Clinical_train = pd.read_csv(os.path.join(traindir, 'clinical_lasso_sel.csv')) 
Clinical_test = pd.read_csv(os.path.join(testdir, 'clinical_test.csv')) 
# Train data
standardscaler = StandardScaler()
Clinical_train = Clinical_train.sample(frac=1.0, random_state=random_state) 
Clinical_train_y = Clinical_train['label'] 
Clinical_train_x = Clinical_train.drop(Atag_cols, axis=1) 
#Clinical_train_x = standardscaler.fit_transform(Clinical_train_x) 
# Test data
#Clinical_test = Clinical_test.sample(frac=1.0, random_state=random_state)
Clinical_test_y = np.array(Clinical_test['label'])
Clinical_test_x = np.array(Clinical_test.drop(Dtag_cols, axis=1))
#Clinical_test_x = standardscaler.fit_transform(Clinical_test_x)

Clinical_model = f'RandomForest_Clinical.model'

In [256]:
# 过采样
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.over_sampling import ADASYN 
from imblearn.under_sampling import TomekLinks
from imblearn.under_sampling import ClusterCentroids 
Clinical_train = Clinical_train.sample(frac=1.0, random_state=123)
Clinical_train = Clinical_train.drop(['pid'], axis=1)
train_X, train_Y = Clinical_train.loc[:, Clinical_train.columns != 'label'], Clinical_train.loc[:, Clinical_train.columns == 'label']

smote = SMOTE(k_neighbors=3, random_state=2022)
X_smote, Y_smote = smote.fit_resample(train_X, train_Y)
df_smote = pd.concat([X_smote, Y_smote], axis=1)
Clinical_train_x = np.array(df_smote.drop(['label'], axis=1))
Clinical_train_y = np.array(df_smote['label'].to_list())

bsmote = BorderlineSMOTE(k_neighbors=4, random_state=2022)
X_smote, Y_smote = bsmote.fit_resample(train_X, train_Y)
df_smote = pd.concat([X_smote, Y_smote], axis=1)
Clinical_train_x = np.array(df_smote.drop(['label'], axis=1))
Clinical_train_y = np.array(df_smote['label'].to_list())

adasyn = ADASYN(n_neighbors=3, random_state=42)
X_smote, Y_smote = adasyn.fit_resample(train_X, train_Y)
df_smote = pd.concat([X_smote, Y_smote], axis=1)
Clinical_train_x = np.array(df_smote.drop(['label'], axis=1))
Clinical_train_y = np.array(df_smote['label'].to_list())

tl = TomekLinks()
X_smote, Y_smote = tl.fit_resample(train_X, train_Y)
df_smote = pd.concat([X_smote, Y_smote], axis=1)
Clinical_train_x = np.array(df_smote.drop(['label'], axis=1))
Clinical_train_y = np.array(df_smote['label'].to_list())

cc = ClusterCentroids(random_state=42)
X_smote, Y_smote = cc.fit_resample(train_X, train_Y)
df_smote = pd.concat([X_smote, Y_smote], axis=1)
Clinical_train_x = np.array(df_smote.drop(['label'], axis=1))
Clinical_train_y = np.array(df_smote['label'].to_list())

print(df_smote.groupby('label').count())

       mass_feature  NLR  diaphram_nodule  CA125  parenchymal_organs  HE4  \
label                                                                       
0                55   55               55     55                  55   55   
1                55   55               55     55                  55   55   

       ascites_amount  relationship_on_T1_dual_echo_images  \
label                                                        
0                  55                                   55   
1                  55                                   55   

       peritoneum_mesentery_nodules  LDH  
label                                     
0                                55   55  
1                                55   55  


In [257]:
# Search optimal hyperparameter
random_seed=44
random_forest_seed=np.random.randint(low=1,high=230)
n_estimators_range=[int(x) for x in np.linspace(start=50,stop=3000,num=60)]
max_features_range=['auto','sqrt']
max_depth_range=[int(x) for x in np.linspace(10,500,num=50)]
max_depth_range.append(None)
min_samples_split_range=[2,5,10]
min_samples_leaf_range=[1,2,4,8]
bootstrap_range=[True,False]

random_forest_hp_range={'n_estimators':n_estimators_range,
                        'max_features':max_features_range,
                        'max_depth':max_depth_range,
                        'min_samples_split':min_samples_split_range,
                        'min_samples_leaf':min_samples_leaf_range
                        # 'bootstrap':bootstrap_range
                        }

# Random search
random_forest_model_test_base=RandomForestClassifier()
random_forest_model_test_random=RandomizedSearchCV(estimator=random_forest_model_test_base,
                                                   param_distributions=random_forest_hp_range,
                                                   n_iter=300,
                                                   n_jobs=-1,
                                                   cv=4,
                                                   verbose=1,
                                                   random_state=random_forest_seed,
                                                   scoring='roc_auc'
                                                   )
random_forest_model_test_random.fit(Clinical_train_x, Clinical_train_y)

best_hp_now=random_forest_model_test_random.best_params_
pprint(best_hp_now)

Fitting 4 folds for each of 300 candidates, totalling 1200 fits


KeyboardInterrupt: 

#### 超参数遍历匹配择优
+ 依据上述所得到的随机最优匹配结果，进行遍历全部组合的匹配择优。

In [28]:
# Grid Search

random_forest_hp_range_2={'n_estimators':[40, 50, 60],
                          'max_features':[3,4,5],
                          'max_depth':[30, 40, 50],
                          'min_samples_split':[2,3,4], # Greater than 1
                          'min_samples_leaf':[1, 2, 3],
                          }
random_forest_model_test_2_base=RandomForestClassifier()
random_forest_model_test_2_random=GridSearchCV(estimator=random_forest_model_test_2_base,
                                               param_grid=random_forest_hp_range_2,
                                               cv=4,
                                               verbose=1,
                                               n_jobs=-1,
                                               scoring='balanced_accuracy')
random_forest_model_test_2_random.fit(Clinical_train_x, Clinical_train_y)

best_hp_now_2=random_forest_model_test_2_random.best_params_
pprint(best_hp_now_2)

Fitting 4 folds for each of 243 candidates, totalling 972 fits
{'max_depth': 30,
 'max_features': 5,
 'min_samples_leaf': 2,
 'min_samples_split': 3,
 'n_estimators': 40}


In [258]:
# Grid Search

random_forest_hp_range_3={'n_estimators':[30,40],
                          'max_features':[5,6],
                          'max_depth':[20,30],
                          'min_samples_split':[3,4,5], # Greater than 1
                          'min_samples_leaf':[2,4,8]
                          # 'bootstrap':bootstrap_range
                          }
random_forest_model_test_3_base=RandomForestClassifier()
random_forest_model_test_3_random=GridSearchCV(estimator=random_forest_model_test_3_base,
                                               param_grid=random_forest_hp_range_3,
                                               cv=5,
                                               verbose=1,
                                               n_jobs=-1,
                                               scoring='balanced_accuracy')
random_forest_model_test_3_random.fit(Clinical_train_x, Clinical_train_y)

best_hp_now_3=random_forest_model_test_3_random.best_params_
pprint(best_hp_now_3)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
{'max_depth': 30,
 'max_features': 6,
 'min_samples_leaf': 8,
 'min_samples_split': 3,
 'n_estimators': 30}


#### Build (select) and save model

In [259]:
# Build RF regression model with optimal hyperparameters
Clinical=random_forest_model_test_3_random.best_estimator_
# Predict test set data
score = Clinical.score(Clinical_test_x, Clinical_test_y)
print(score)
joblib.dump(Clinical, os.path.join(modeldir, Clinical_model))

0.7428571428571429


['/media/tx-deepocean/Data/2022/chongfu1/Model/Train/Model/RandomForest_Clinical.model']

#### Model evaluation

In [260]:
model_forest = joblib.load(os.path.join(modeldir, Clinical_model)) 
predict_label = model_forest.predict(Clinical_test_x) #预测的标签
predict_score = model_forest.predict_proba(Clinical_test_x)#得到标签0 (y_predict[:,0])和1 (y_predict[:,1])对应的概率
label = Clinical_test_y  #真实标签
eva_dic = classification_evaluation(label, predict_label, predict_score[:,1])
eva_dic

{'accuracy': 0.7428571428571429,
 'recall': 0.7608695652173914,
 'precision': 0.8333333333333334,
 'f1': 0.7954545454545455,
 'auc': 0.8740942028985507,
 'specificity': 0.7083333333333334}