## NaiveBayes

+ 重要参数:
sklearn GaussianNB（高斯朴素贝叶斯）模型使用RandomSearchCV获取最优参数及可视化

 

GaussianNB是高斯贝叶斯分类器，它假设特征的条件分布概率满足高斯分布，其原型为：

sklearn.naive_bayes.GaussianNB

GaussianNB没有参数，所以不需要调参。

但是：

param_dict = dict(
        priors=[[0.68,0.32],[0.7,0.3],[0.8,0.2]],
        var_smoothing = [1e-9,1e-8,1e-7],
    )

依旧是可以进行设置的。


In [1]:
import random
import joblib
import os
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pprint import pprint
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from openpyxl import load_workbook
from sklearn.tree import export_graphviz
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.naive_bayes import GaussianNB
warnings.filterwarnings("ignore")

In [2]:
from sklearn.metrics import confusion_matrix    #导入计算混淆矩阵的包
def specificity_score(y_true, y_pred):
    C = confusion_matrix(y_true, y_pred)
    TP = C[1,1]
    FP = C[0,1]
    TN = C[0,0]
    FN = C[1,0]
    specificity = TN/(TN+FP)
    return specificity

def classification_evaluation(y_true, y_pred, y_score):
    precision = metrics.precision_score(y_true, y_pred)
    recall = metrics.recall_score(y_true, y_pred)
    accuracy = metrics.accuracy_score(y_true, y_pred)
    auc = metrics.roc_auc_score(y_true, y_score)
    f1 = metrics.f1_score(y_true, y_pred)
    specificity = specificity_score(y_true, y_pred)
    evaluation = {'accuracy':accuracy, 'recall':recall, 'precision':precision, 'f1':f1, 'auc':auc, 'specificity':specificity}
    return evaluation

### Global params setting and load data

In [3]:
cwd = os.getcwd()
traindir = '../Feature_filter/Feas_data'
testdir = '../Feature_filter/Feas_data_test'
imgdir = os.path.join(cwd, 'IMG')
modeldir = os.path.join(cwd, 'Model')
tag_cols = ['pid', 'label', 'series', 'image', 'mask']
sequence_id = [2, 3, 4]
# Generate the random seed
random_state = random.randint(1,10000)

# Load the SSM features.
SSM_train_slist = [pd.read_excel(os.path.join(traindir, 'feas_mrmr_sel.xlsx'), sheet_name=f'sequence{num}') for num in sequence_id]
SSM_test_slist = [pd.read_excel(os.path.join(testdir, 'SSM_test.xlsx'), sheet_name=f'sequence{num}') for num in sequence_id]

In [4]:
# Print features
SSM_features_list = [df.columns for df in SSM_train_slist]
print(SSM_features_list[0].to_list())
print(SSM_features_list[1].to_list())
print(SSM_features_list[2].to_list())
print(len(SSM_features_list[0])-4, len(SSM_features_list[1])-4, len(SSM_features_list[2])-4)

['pid', 'label', 'series', 'image', 'mask', 'glszm_SmallAreaEmphasis_logarithm', 'glcm_InverseVariance_exponential', 'glszm_GrayLevelNonUniformity_wavelet-HHH', 'firstorder_Skewness_logarithm', 'glcm_Correlation_log-sigma-3-0-mm-3D']
['pid', 'label', 'series', 'image', 'mask', 'gldm_DependenceVariance_wavelet-LLH', 'ngtdm_Contrast_wavelet-HHL', 'firstorder_Skewness_log-sigma-2-0-mm-3D', 'glcm_Imc2_wavelet-HHH', 'glrlm_RunEntropy_exponential']
['pid', 'label', 'series', 'image', 'mask', 'glrlm_ShortRunLowGrayLevelEmphasis_square', 'glszm_ZoneEntropy_exponential']
6 6 3


In [5]:
# Train data
standardscaler = StandardScaler()
SSM_train_slist = [df.sample(frac=1.0, random_state=random_state) for df in SSM_train_slist]
SSM_train_y = [df['label'] for df in SSM_train_slist]
SSM_train_x = [df.drop(tag_cols, axis=1) for df in SSM_train_slist]
SSM_train_x = [standardscaler.fit_transform(df) for df in SSM_train_x]
# Test data
SSM_test_slist = [df.sample(frac=1.0, random_state=random_state) for df in SSM_test_slist]
SSM_test_y = [df['label'] for df in SSM_test_slist]
SSM_test_x = [df.drop(tag_cols, axis=1) for df in SSM_test_slist]
SSM_test_x = [standardscaler.fit_transform(df) for df in SSM_test_x]

SSM2_train_y, SSM3_train_y, SSM4_train_y = (y_.to_list() for y_ in SSM_train_y)             
SSM2_train_x, SSM3_train_x, SSM4_train_x = (x_ for x_ in SSM_train_x)
SSM2_test_y, SSM3_test_y, SSM4_test_y = (y_.to_list() for y_ in SSM_test_y)
SSM2_test_x, SSM3_test_x, SSM4_test_x = (x_ for x_ in SSM_test_x)
SSM2_model, SSM3_model, SSM4_model = (f'NaiveBayes_SSM{i+2}.model' for i in range(3))

### 系统调参

### SSM2

#### 超参数遍历匹配择优
+ 依据上述所得到的随机最优匹配结果，进行遍历全部组合的匹配择优。

In [48]:
# Search optimal hyperparameter
hparam = { 
#        'priors':[[0.68,0.32],[0.7,0.3],[0.8,0.2]],
        'var_smoothing':[1e-12, 1e-11, 1e-10, 1e-9,1e-8,1e-7]
}

# Random search
model_base = GaussianNB()
model_grid = GridSearchCV(estimator=model_base,
                          param_grid=hparam,
                          n_jobs=-1,
                          cv=6,
                          verbose=1,
                          scoring='roc_auc'
)
model_grid.fit(SSM2_train_x, SSM2_train_y)

best_SSM2 = model_grid.best_params_
pprint(best_SSM2)

Fitting 6 folds for each of 6 candidates, totalling 36 fits
{'var_smoothing': 1e-12}


In [49]:
# Build RF regression model with optimal hyperparameters
SSM2=model_grid.best_estimator_
# Predict test set data
score = SSM2.score(SSM2_test_x, SSM2_test_y)
print(score)
joblib.dump(SSM2, os.path.join(modeldir, SSM2_model))

0.7142857142857143


['/media/tx-deepocean/Data/2022/chongfu1/Model/Train/Model/NaiveBayes_SSM2.model']

In [50]:
model= joblib.load(os.path.join(modeldir, SSM2_model)) 
predict_label = model.predict(SSM2_test_x)#, probability=True) #预测的标签
predict_score = model.predict_proba(SSM2_test_x)#得到标签0 (y_predict[:,0])和1 (y_predict[:,1])对应的概率
label = SSM2_test_y  #真实标签
eva_dic = classification_evaluation(label, predict_label, predict_score[:,1])
eva_dic

{'accuracy': 0.7142857142857143,
 'recall': 0.8695652173913043,
 'precision': 0.7407407407407407,
 'f1': 0.7999999999999999,
 'auc': 0.6521739130434783,
 'specificity': 0.4166666666666667}

### SSM3

#### 超参数遍历匹配择优
+ 依据上述所得到的随机最优匹配结果，进行遍历全部组合的匹配择优。

In [57]:
# Search optimal hyperparameter
hparam = { 
#        'priors':[[0.68,0.32],[0.7,0.3],[0.8,0.2]],
        'var_smoothing':[1e-12,1e-9,1e-8,1e-7]
}

# Random search
model_base = GaussianNB()
model_grid = GridSearchCV(estimator=model_base,
                          param_grid=hparam,
                          n_jobs=-1,
                          cv=3,
                          verbose=1,
                          scoring='roc_auc'
)
model_grid.fit(SSM3_train_x, SSM3_train_y)

best_SSM3 = model_grid.best_params_
pprint(best_SSM3)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
{'var_smoothing': 1e-12}


In [58]:
# Build RF regression model with optimal hyperparameters
SSM3=model_grid.best_estimator_
# Predict test set data
score = SSM3.score(SSM3_test_x, SSM3_test_y)
print(score)
joblib.dump(SSM3, os.path.join(modeldir, SSM3_model))

0.7285714285714285


['/media/tx-deepocean/Data/2022/chongfu1/Model/Train/Model/NaiveBayes_SSM3.model']

In [59]:
model= joblib.load(os.path.join(modeldir, SSM3_model)) 
predict_label = model.predict(SSM3_test_x)#, probability=True) #预测的标签
predict_score = model.predict_proba(SSM3_test_x)#得到标签0 (y_predict[:,0])和1 (y_predict[:,1])对应的概率
label = SSM3_test_y  #真实标签
eva_dic = classification_evaluation(label, predict_label, predict_score[:,1])
eva_dic

{'accuracy': 0.7285714285714285,
 'recall': 0.8913043478260869,
 'precision': 0.7454545454545455,
 'f1': 0.8118811881188119,
 'auc': 0.6802536231884058,
 'specificity': 0.4166666666666667}

### SSM4

#### 超参数遍历匹配择优
+ 依据上述所得到的随机最优匹配结果，进行遍历全部组合的匹配择优。

In [84]:
# Search optimal hyperparameter
hparam = { 
#        'priors':[[0.68,0.32],[0.7,0.3],[0.8,0.2]],
        'var_smoothing':[1e-10,1e-9,1e-8,1e-7]
}

# Random search
model_base = GaussianNB()
model_grid = GridSearchCV(estimator=model_base,
                          param_grid=hparam,
                          n_jobs=-1,
                          cv=7,
                          verbose=1,
                          scoring='roc_auc'
)
model_grid.fit(SSM4_train_x, SSM4_train_y)

best_SSM4 = model_grid.best_params_
pprint(best_SSM4)

Fitting 7 folds for each of 4 candidates, totalling 28 fits
{'var_smoothing': 1e-10}


In [85]:
# Build RF regression model with optimal hyperparameters
SSM4=model_grid.best_estimator_
# Predict test set data
score = SSM4.score(SSM4_test_x, SSM4_test_y)
print(score)
joblib.dump(SSM4, os.path.join(modeldir, SSM4_model))

0.6714285714285714


['/media/tx-deepocean/Data/2022/chongfu1/Model/Train/Model/NaiveBayes_SSM4.model']

In [86]:
model= joblib.load(os.path.join(modeldir, SSM4_model)) 
predict_label = model.predict(SSM4_test_x)#, probability=True) #预测的标签
predict_score = model.predict_proba(SSM4_test_x)#得到标签0 (y_predict[:,0])和1 (y_predict[:,1])对应的概率
label = SSM4_test_y  #真实标签
eva_dic = classification_evaluation(label, predict_label, predict_score[:,1])
eva_dic

{'accuracy': 0.6714285714285714,
 'recall': 0.8913043478260869,
 'precision': 0.6949152542372882,
 'f1': 0.780952380952381,
 'auc': 0.7047101449275364,
 'specificity': 0.25}

### DSM

In [6]:
# Load the DSM features.
Dtag_cols = ['pid', 'label']
DSM_train_slist = [pd.read_excel(os.path.join(traindir, 'DSM_feas_mrmr_sel.xlsx'), sheet_name=f'no_sequence{num}') for num in sequence_id]
DSM_test_slist = [pd.read_excel(os.path.join(testdir, 'DSM_test.xlsx'), sheet_name=f'no_sequence{num}') for num in sequence_id]
# Train data
standardscaler = StandardScaler()
DSM_train_slist = [df.sample(frac=1.0, random_state=random_state) for df in DSM_train_slist]
DSM_train_y = [df['label'] for df in DSM_train_slist]
DSM_train_x = [df.drop(Dtag_cols, axis=1) for df in DSM_train_slist]
DSM_train_x = [standardscaler.fit_transform(df) for df in DSM_train_x]
# Test data
DSM_test_slist = [df.sample(frac=1.0, random_state=random_state) for df in DSM_test_slist]
DSM_test_y = [df['label'] for df in DSM_test_slist]
DSM_test_x = [df.drop(Dtag_cols, axis=1) for df in DSM_test_slist]
DSM_test_x = [standardscaler.fit_transform(df) for df in DSM_test_x]

DSM2_train_y, DSM3_train_y, DSM4_train_y = (y_.to_list() for y_ in DSM_train_y)             
DSM2_train_x, DSM3_train_x, DSM4_train_x = (x_ for x_ in DSM_train_x)
DSM2_test_y, DSM3_test_y, DSM4_test_y = (y_.to_list() for y_ in DSM_test_y)
DSM2_test_x, DSM3_test_x, DSM4_test_x = (x_ for x_ in DSM_test_x)
DSM2_model, DSM3_model, DSM4_model = (f'NaiveBayes_DSM{i+2}.model' for i in range(3))

In [98]:
# Search optimal hyperparameter
hparam = { 
#        'priors':[[0.68,0.32],[0.7,0.3],[0.8,0.2]],
        'var_smoothing':[1e-12,1e-9,1e-8,1e-7]
}

# Random search
model_base = GaussianNB()
model_grid = GridSearchCV(estimator=model_base,
                          param_grid=hparam,
                          n_jobs=-1,
                          cv=3,
                          verbose=1,
                          scoring='accuracy'
)
model_grid.fit(DSM3_train_x, DSM3_train_y)

best_DSM3 = model_grid.best_params_
pprint(best_DSM3)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
{'var_smoothing': 1e-12}


In [99]:
# Build RF regression model with optimal hyperparameters
DSM3=model_grid.best_estimator_
# Predict test set data
score = DSM3.score(DSM3_test_x, DSM3_test_y)
print(score)
joblib.dump(DSM3, os.path.join(modeldir, DSM3_model))

0.7428571428571429


['/media/tx-deepocean/Data/2022/chongfu1/Model/Train/Model/NaiveBayes_DSM3.model']

In [100]:
model= joblib.load(os.path.join(modeldir, DSM3_model)) 
predict_label = model.predict(DSM3_test_x)#, probability=True) #预测的标签
predict_score = model.predict_proba(DSM3_test_x)#得到标签0 (y_predict[:,0])和1 (y_predict[:,1])对应的概率
label = DSM3_test_y  #真实标签
eva_dic = classification_evaluation(label, predict_label, predict_score[:,1])
eva_dic

{'accuracy': 0.7428571428571429,
 'recall': 0.8478260869565217,
 'precision': 0.78,
 'f1': 0.8125,
 'auc': 0.7028985507246376,
 'specificity': 0.5416666666666666}

### ASM

In [101]:
# Load the ASM features.
Atag_cols = ['pid', 'label']
ASM_train = pd.read_csv(os.path.join(traindir, 'ASM_mrmr_feas.csv')) 
ASM_test = pd.read_csv(os.path.join(testdir, 'ASM_test.csv')) 
# Train data
standardscaler = StandardScaler()
ASM_train = ASM_train.sample(frac=1.0, random_state=random_state) 
ASM_train_y = ASM_train['label'] 
ASM_train_x = ASM_train.drop(Atag_cols, axis=1) 
ASM_train_x = standardscaler.fit_transform(ASM_train_x) 
# Test data
ASM_test = ASM_test.sample(frac=1.0, random_state=random_state)
ASM_test_y = ASM_test['label']
ASM_test_x =ASM_test.drop(Dtag_cols, axis=1)
ASM_test_x = standardscaler.fit_transform(ASM_test_x)

ASM_model = f'NaiveBayes_ASM.model'

In [102]:
# Search optimal hyperparameter
hparam = { 
#        'priors':[[0.68,0.32],[0.7,0.3],[0.8,0.2]],
        'var_smoothing':[1e-12,1e-9,1e-8,1e-7]
}

# Random search
model_base = GaussianNB()
model_grid = GridSearchCV(estimator=model_base,
                          param_grid=hparam,
                          n_jobs=-1,
                          cv=5,
                          verbose=1,
                          scoring='roc_auc'
)
model_grid.fit(ASM_train_x, ASM_train_y)

best_ASM = model_grid.best_params_
pprint(best_ASM)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
{'var_smoothing': 1e-12}


In [103]:
# Build RF regression model with optimal hyperparameters
ASM=model_grid.best_estimator_
# Predict test set data
score = ASM.score(ASM_test_x, ASM_test_y)
print(score)
joblib.dump(ASM, os.path.join(modeldir, ASM_model))

0.7


['/media/tx-deepocean/Data/2022/chongfu1/Model/Train/Model/NaiveBayes_ASM.model']

In [104]:
model= joblib.load(os.path.join(modeldir, ASM_model)) 
predict_label = model.predict(ASM_test_x)#, probability=True) #预测的标签
predict_score = model.predict_proba(ASM_test_x)#得到标签0 (y_predict[:,0])和1 (y_predict[:,1])对应的概率
label = ASM_test_y  #真实标签
eva_dic = classification_evaluation(label, predict_label, predict_score[:,1])
eva_dic

{'accuracy': 0.7,
 'recall': 0.782608695652174,
 'precision': 0.7659574468085106,
 'f1': 0.7741935483870968,
 'auc': 0.6829710144927537,
 'specificity': 0.5416666666666666}

### Clinical

In [20]:
# Load the Clinical features.
Atag_cols = ['pid', 'label']
Clinical_train = pd.read_csv(os.path.join(traindir, 'clinical_lasso_sel.csv')) 
Clinical_test = pd.read_csv(os.path.join(testdir, 'clinical_test.csv')) 
# Train data
standardscaler = StandardScaler()
Clinical_train = Clinical_train.sample(frac=1.0, random_state=random_state) 
Clinical_train_y = Clinical_train['label'] 
Clinical_train_x = Clinical_train.drop(Atag_cols, axis=1) 
Clinical_train_x = standardscaler.fit_transform(Clinical_train_x) 
# Test data
Clinical_test = Clinical_test.sample(frac=1.0, random_state=random_state)
Clinical_test_y = Clinical_test['label']
Clinical_test_x =Clinical_test.drop(Dtag_cols, axis=1)
Clinical_test_x = standardscaler.fit_transform(Clinical_test_x)

Clinical_model = f'NaiveBayes_Clinical.model'

In [22]:
# Search optimal hyperparameter
hparam = { 
#        'priors':[[0.68,0.32],[0.7,0.3],[0.8,0.2]],
        'var_smoothing':[1e-12,1e-9,1e-8,1e-7]
}

# Random search
model_base = GaussianNB()
model_grid = GridSearchCV(estimator=model_base,
                          param_grid=hparam,
                          n_jobs=-1,
                          cv=2,
                          verbose=1,
                          scoring='balanced_accuracy'
)
model_grid.fit(Clinical_train_x, Clinical_train_y)

best_Clinical = model_grid.best_params_
pprint(best_Clinical)

# Build RF regression model with optimal hyperparameters
Clinical=model_grid.best_estimator_
# Predict test set data
score = Clinical.score(Clinical_test_x, Clinical_test_y)
print(score)
joblib.dump(Clinical, os.path.join(modeldir, Clinical_model))

Fitting 2 folds for each of 4 candidates, totalling 8 fits
{'var_smoothing': 1e-07}
0.6571428571428571


['/media/tx-deepocean/Data/2022/chongfu1/Model/Train/Model/NaiveBayes_Clinical.model']

In [23]:
model= joblib.load(os.path.join(modeldir, Clinical_model)) 
predict_label = model.predict(Clinical_test_x)#, probability=True) #预测的标签
predict_score = model.predict_proba(Clinical_test_x)#得到标签0 (y_predict[:,0])和1 (y_predict[:,1])对应的概率
label = Clinical_test_y  #真实标签
eva_dic = classification_evaluation(label, predict_label, predict_score[:,1])
eva_dic

{'accuracy': 0.6571428571428571,
 'recall': 1.0,
 'precision': 0.6571428571428571,
 'f1': 0.7931034482758621,
 'auc': 0.5,
 'specificity': 0.0}