## KNN

+ 重要参数:
以下是 GridSearchCV 方法中常用的超参数。
1. estimator ：创建的算法对象。
2. param_grid ：值为字典或者列表，需要最优化的参数的取值。
3. scoring ：准确度评价标准，默认 None ,这时需要使用 score 函数；或者如 scoring='roc_auc' ，根据所选模型不同，评价准则不同。字符串（函数名）或是可调用对象，需要其函数签名，形如 scorer(estimator, X, y) ；如果是 None ，则使用 estimator 的误差估计函数。
4. n_jobs ：并行数，默认为 1 ， n_jobs = -1 表示跟CPU核数一致。
5. cv ：交叉验证参数，默认 None ，使用三折交叉验证。指定 fold 数量，默认为 3 ，也可以是 yield 训练或测试数据的生成器。
6. verbose ：日志冗长度。 verbose = 0 表示不输出训练过程， verbose = 1 表示偶尔输出， verbose > 1 表示对每个子模型都输出。
GridSearchCV 还内置了一些属性。
1. best_estimator_ ：效果最好的分类器。
2. best_score_ ：成员提供优化过程期间观察到的最好的评分。
3. best_params_ ：描述了已取得最佳结果的参数的组合。
4. best_index_ ：对应于最佳候选参数设置的索引（ cv_results_ 数组的索引）。

sklearn 库中的k-NN方法有很多超参数，常用的超参数如下：
1. weights ：用于分配权重。基本的最近邻回归使用统一的权重，即本地邻域内的每个邻点对查询点的分类贡献一致。在某些环境下，对邻点加权可能是有利的，使得附近点对于回归所作出的贡献多于远处点。默认为 weights = 'uniform' ，表示为所有点分配同等权重。 weights = 'distance' 表示分配的权重与查询点距离呈反比。此外，我们还可以自定义一个距离函数用来计算权重。
2. n_neighbors ：邻居个数。
3. p ： p 参数只有在 weights = 'distance' 时才有。 p 是一个大于或等于1的值。 p = 1 表示曼哈顿距离 (Manhattan Distance)， p = 2 表示欧式距离 (Euclidean Distance)， p = ∞ 表示它是各个坐标距离的最大值。

In [6]:
import random
import joblib
import os
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pprint import pprint
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from openpyxl import load_workbook
from sklearn.tree import export_graphviz
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
warnings.filterwarnings("ignore")

In [7]:
from sklearn.metrics import confusion_matrix    #导入计算混淆矩阵的包
def specificity_score(y_true, y_pred):
    C = confusion_matrix(y_true, y_pred)
    TP = C[1,1]
    FP = C[0,1]
    TN = C[0,0]
    FN = C[1,0]
    specificity = TN/(TN+FP)
    return specificity

def classification_evaluation(y_true, y_pred, y_score):
    precision = metrics.precision_score(y_true, y_pred)
    recall = metrics.recall_score(y_true, y_pred)
    accuracy = metrics.accuracy_score(y_true, y_pred)
    auc = metrics.roc_auc_score(y_true, y_score)
    f1 = metrics.f1_score(y_true, y_pred)
    specificity = specificity_score(y_true, y_pred)
    evaluation = {'accuracy':accuracy, 'recall':recall, 'precision':precision, 'f1':f1, 'auc':auc, 'specificity':specificity}
    return evaluation

### Global params setting and load data

In [8]:
cwd = os.getcwd()
traindir = '../Feature_filter/Feas_data'
testdir = '../Feature_filter/Feas_data_test'
imgdir = os.path.join(cwd, 'IMG')
modeldir = os.path.join(cwd, 'Model')
tag_cols = ['pid', 'label', 'series', 'image', 'mask']
sequence_id = [2, 3, 4]
# Generate the random seed
random_state = random.randint(1,10000)

# Load the SSM features.
# SSM_train_slist = [pd.read_excel(os.path.join(traindir, 'feas_lasso.xlsx'), sheet_name=f'sequence{num}') for num in sequence_id]
SSM_train_slist = [pd.read_excel(os.path.join(traindir,'feas_mrmr_sel.xlsx'), sheet_name=f'sequence{num}') for num in sequence_id]
SSM_test_slist = [pd.read_excel(os.path.join(testdir, 'SSM_test.xlsx'), sheet_name=f'sequence{num}') for num in sequence_id]

In [9]:
# Print features
SSM_features_list = [df.columns for df in SSM_train_slist]
print(SSM_features_list[0].to_list())
print(SSM_features_list[1].to_list())
print(SSM_features_list[2].to_list())
print(len(SSM_features_list[0])-5, len(SSM_features_list[1])-5, len(SSM_features_list[2])-5)

['pid', 'label', 'series', 'image', 'mask', 'glszm_SmallAreaEmphasis_logarithm', 'glcm_InverseVariance_exponential', 'glszm_GrayLevelNonUniformity_wavelet-HHH', 'firstorder_Skewness_logarithm', 'glcm_Correlation_log-sigma-3-0-mm-3D']
['pid', 'label', 'series', 'image', 'mask', 'gldm_DependenceVariance_wavelet-LLH', 'ngtdm_Contrast_wavelet-HHL', 'firstorder_Skewness_log-sigma-2-0-mm-3D', 'glcm_Imc2_wavelet-HHH', 'glrlm_RunEntropy_exponential']
['pid', 'label', 'series', 'image', 'mask', 'glrlm_ShortRunLowGrayLevelEmphasis_square', 'glszm_ZoneEntropy_exponential']
5 5 2


In [10]:
# Train data
standardscaler = StandardScaler()
SSM_train_slist = [df.sample(frac=1.0, random_state=random_state) for df in SSM_train_slist]
SSM_train_y = [df['label'] for df in SSM_train_slist]
SSM_train_x = [df.drop(tag_cols, axis=1) for df in SSM_train_slist]
SSM_train_x = [standardscaler.fit_transform(df) for df in SSM_train_x]
# Test data
SSM_test_slist = [df.sample(frac=1.0, random_state=random_state) for df in SSM_test_slist]
SSM_test_y = [df['label'] for df in SSM_test_slist]
SSM_test_x = [df.drop(tag_cols, axis=1) for df in SSM_test_slist]
SSM_test_x = [standardscaler.fit_transform(df) for df in SSM_test_x]

SSM2_train_y, SSM3_train_y, SSM4_train_y = (y_.to_list() for y_ in SSM_train_y)             
SSM2_train_x, SSM3_train_x, SSM4_train_x = (x_ for x_ in SSM_train_x)
SSM2_test_y, SSM3_test_y, SSM4_test_y = (y_.to_list() for y_ in SSM_test_y)
SSM2_test_x, SSM3_test_x, SSM4_test_x = (x_ for x_ in SSM_test_x)
SSM2_model, SSM3_model, SSM4_model = (f'KNN_SSM{i+2}.model' for i in range(3))

### 系统调参

### SSM2

#### 超参数遍历匹配择优
+ 依据上述所得到的随机最优匹配结果，进行遍历全部组合的匹配择优。

In [133]:
# Search optimal hyperparameter
random_regression_seed=np.random.randint(low=1,high=230)

hparam = [
    {
        'weights': ['uniform'],
        'n_neighbors': [i for i in range(1, 11)]
    },
    {
        'weights': ['distance'],
        'n_neighbors': [i for i in range(1, 11)],
        'p': [i for i in range(1, 6)]
    }
]

# Random search
model_base = KNeighborsClassifier()
model_grid = GridSearchCV(estimator=model_base,
                          param_grid=hparam,
                          n_jobs=-1,
                          cv=3,
                          verbose=1,
                          scoring='f1'
)
model_grid.fit(SSM2_train_x, SSM2_train_y)

best_SSM2 = model_grid.best_params_
pprint(best_SSM2)

Fitting 3 folds for each of 60 candidates, totalling 180 fits
{'n_neighbors': 8, 'p': 1, 'weights': 'distance'}


In [134]:
# Build RF regression model with optimal hyperparameters
SSM2=model_grid.best_estimator_
# Predict test set data
score = SSM2.score(SSM2_test_x, SSM2_test_y)
print(score)
joblib.dump(SSM2, os.path.join(modeldir, SSM2_model))

0.6285714285714286


['/media/tx-deepocean/Data/2022/chongfu1/Model/Train/Model/KNN_SSM2.model']

In [135]:
model= joblib.load(os.path.join(modeldir, SSM2_model)) 
predict_label = model.predict(SSM2_test_x)#, probability=True) #预测的标签
predict_score = model.predict_proba(SSM2_test_x)#得到标签0 (y_predict[:,0])和1 (y_predict[:,1])对应的概率
label = SSM2_test_y  #真实标签
eva_dic = classification_evaluation(label, predict_label, predict_score[:,1])
eva_dic

{'accuracy': 0.6285714285714286,
 'recall': 0.8260869565217391,
 'precision': 0.6785714285714286,
 'f1': 0.7450980392156864,
 'auc': 0.5996376811594203,
 'specificity': 0.25}

### SSM3

#### 超参数遍历匹配择优
+ 依据上述所得到的随机最优匹配结果，进行遍历全部组合的匹配择优。

In [151]:
# Search optimal hyperparameter
random_regression_seed=np.random.randint(low=1,high=230)

hparam = [
    {
        'weights': ['uniform'],
        'n_neighbors': [i for i in range(1, 11)]
    },
    {
        'weights': ['distance'],
        'n_neighbors': [i for i in range(1, 11)],
        'p': [i for i in range(1, 6)]
    }
]

# Random search
model_base = KNeighborsClassifier()
model_grid = GridSearchCV(estimator=model_base,
                          param_grid=hparam,
                          n_jobs=-1,
                          cv=5,
                          verbose=1,
                          scoring='accuracy'
)
model_grid.fit(SSM3_train_x, SSM3_train_y)

best_SSM3 = model_grid.best_params_
pprint(best_SSM3)

Fitting 5 folds for each of 60 candidates, totalling 300 fits
{'n_neighbors': 7, 'p': 3, 'weights': 'distance'}


In [152]:
# Build RF regression model with optimal hyperparameters
SSM3=model_grid.best_estimator_
# Predict test set data
score = SSM3.score(SSM3_test_x, SSM3_test_y)
print(score)
joblib.dump(SSM3, os.path.join(modeldir, SSM3_model))

0.6428571428571429


['/media/tx-deepocean/Data/2022/chongfu1/Model/Train/Model/KNN_SSM3.model']

In [153]:
model= joblib.load(os.path.join(modeldir, SSM3_model)) 
predict_label = model.predict(SSM3_test_x)#, probability=True) #预测的标签
predict_score = model.predict_proba(SSM3_test_x)#得到标签0 (y_predict[:,0])和1 (y_predict[:,1])对应的概率
label = SSM3_test_y  #真实标签
eva_dic = classification_evaluation(label, predict_label, predict_score[:,1])
eva_dic

{'accuracy': 0.6428571428571429,
 'recall': 0.7391304347826086,
 'precision': 0.723404255319149,
 'f1': 0.7311827956989247,
 'auc': 0.6304347826086957,
 'specificity': 0.4583333333333333}

### SSM4

#### 超参数遍历匹配择优
+ 依据上述所得到的随机最优匹配结果，进行遍历全部组合的匹配择优。

In [181]:
# Search optimal hyperparameter
random_regression_seed=np.random.randint(low=1,high=230)

hparam = [
    {
        'weights': ['uniform'],
        'n_neighbors': [i for i in range(1, 11)]
    },
    {
        'weights': ['distance'],
        'n_neighbors': [i for i in range(1, 11)],
        'p': [i for i in range(1, 6)]
    }
]

# Random search
model_base = KNeighborsClassifier()
model_grid = GridSearchCV(estimator=model_base,
                          param_grid=hparam,
                          n_jobs=-1,
                          cv=5,
                          verbose=1,
                          scoring='accuracy'
)
model_grid.fit(SSM4_train_x, SSM4_train_y)

best_SSM4 = model_grid.best_params_
pprint(best_SSM4)

Fitting 5 folds for each of 60 candidates, totalling 300 fits
{'n_neighbors': 10, 'weights': 'uniform'}


In [182]:
# Build RF regression model with optimal hyperparameters
SSM4=model_grid.best_estimator_
# Predict test set data
score = SSM4.score(SSM4_test_x, SSM4_test_y)
print(score)
joblib.dump(SSM4, os.path.join(modeldir, SSM4_model))

0.6571428571428571


['/media/tx-deepocean/Data/2022/chongfu1/Model/Train/Model/KNN_SSM4.model']

In [183]:
model= joblib.load(os.path.join(modeldir, SSM4_model)) 
predict_label = model.predict(SSM4_test_x)#, probability=True) #预测的标签
predict_score = model.predict_proba(SSM4_test_x)#得到标签0 (y_predict[:,0])和1 (y_predict[:,1])对应的概率
label = SSM4_test_y  #真实标签
eva_dic = classification_evaluation(label, predict_label, predict_score[:,1])
eva_dic

{'accuracy': 0.6571428571428571,
 'recall': 0.782608695652174,
 'precision': 0.72,
 'f1': 0.7499999999999999,
 'auc': 0.6413043478260869,
 'specificity': 0.4166666666666667}

### DSM

In [184]:
# Load the DSM features.
Dtag_cols = ['pid', 'label']
DSM_train_slist = [pd.read_excel(os.path.join(traindir, 'DSM_feas_mrmr_sel.xlsx'), sheet_name=f'no_sequence{num}') for num in sequence_id]
DSM_test_slist = [pd.read_excel(os.path.join(testdir, 'DSM_test.xlsx'), sheet_name=f'no_sequence{num}') for num in sequence_id]
# Train data
standardscaler = StandardScaler()
DSM_train_slist = [df.sample(frac=1.0, random_state=random_state) for df in DSM_train_slist]
DSM_train_y = [df['label'] for df in DSM_train_slist]
DSM_train_x = [df.drop(Dtag_cols, axis=1) for df in DSM_train_slist]
DSM_train_x = [standardscaler.fit_transform(df) for df in DSM_train_x]
# Test data
DSM_test_slist = [df.sample(frac=1.0, random_state=random_state) for df in DSM_test_slist]
DSM_test_y = [df['label'] for df in DSM_test_slist]
DSM_test_x = [df.drop(Dtag_cols, axis=1) for df in DSM_test_slist]
DSM_test_x = [standardscaler.fit_transform(df) for df in DSM_test_x]

DSM2_train_y, DSM3_train_y, DSM4_train_y = (y_.to_list() for y_ in DSM_train_y)             
DSM2_train_x, DSM3_train_x, DSM4_train_x = (x_ for x_ in DSM_train_x)
DSM2_test_y, DSM3_test_y, DSM4_test_y = (y_.to_list() for y_ in DSM_test_y)
DSM2_test_x, DSM3_test_x, DSM4_test_x = (x_ for x_ in DSM_test_x)
DSM2_model, DSM3_model, DSM4_model = (f'KNN_DSM{i+2}.model' for i in range(3))

In [257]:
# Search optimal hyperparameter
random_regression_seed=np.random.randint(low=1,high=230)

hparam = [
    {
        'weights': ['uniform'],
        'n_neighbors': [i for i in range(1, 11)]
    },
    {
        'weights': ['distance'],
        'n_neighbors': [i for i in range(1, 11)],
        'p': [i for i in range(1, 6)]
    }
]

# Random search
model_base = KNeighborsClassifier()
model_grid = GridSearchCV(estimator=model_base,
                          param_grid=hparam,
                          n_jobs=-1,
                          cv=4,
                          verbose=1,
                          scoring='f1'
)
model_grid.fit(DSM3_train_x, DSM3_train_y)

best_DSM3 = model_grid.best_params_
pprint(best_DSM3)

Fitting 4 folds for each of 60 candidates, totalling 240 fits
{'n_neighbors': 10, 'p': 1, 'weights': 'distance'}


In [258]:
# Build RF regression model with optimal hyperparameters
DSM3=model_grid.best_estimator_
# Predict test set data
score = DSM3.score(DSM3_test_x, DSM3_test_y)
print(score)
joblib.dump(DSM3, os.path.join(modeldir, DSM3_model))

0.6285714285714286


['/media/tx-deepocean/Data/2022/chongfu1/Model/Train/Model/KNN_DSM3.model']

In [259]:
model= joblib.load(os.path.join(modeldir, DSM3_model)) 
predict_label = model.predict(DSM3_test_x)#, probability=True) #预测的标签
predict_score = model.predict_proba(DSM3_test_x)#得到标签0 (y_predict[:,0])和1 (y_predict[:,1])对应的概率
label = DSM3_test_y  #真实标签
eva_dic = classification_evaluation(label, predict_label, predict_score[:,1])
eva_dic

{'accuracy': 0.6285714285714286,
 'recall': 0.8043478260869565,
 'precision': 0.6851851851851852,
 'f1': 0.74,
 'auc': 0.6268115942028987,
 'specificity': 0.2916666666666667}

### ASM

In [260]:
# Load the ASM features.
Atag_cols = ['pid', 'label']
ASM_train = pd.read_csv(os.path.join(traindir, 'ASM_mrmr_feas.csv')) 
ASM_test = pd.read_csv(os.path.join(testdir, 'ASM_test.csv')) 
# Train data
standardscaler = StandardScaler()
ASM_train = ASM_train.sample(frac=1.0, random_state=random_state) 
ASM_train_y = ASM_train['label'] 
ASM_train_x = ASM_train.drop(Atag_cols, axis=1) 
ASM_train_x = standardscaler.fit_transform(ASM_train_x) 
# Test data
ASM_test = ASM_test.sample(frac=1.0, random_state=random_state)
ASM_test_y = ASM_test['label']
ASM_test_x =ASM_test.drop(Dtag_cols, axis=1)
ASM_test_x = standardscaler.fit_transform(ASM_test_x)

ASM_model = f'KNN_ASM.model'

In [270]:
# Search optimal hyperparameter
random_regression_seed=np.random.randint(low=1,high=230)

hparam = [
    {
        'weights': ['uniform'],
        'n_neighbors': [i for i in range(1, 11)]
    },
    {
        'weights': ['distance'],
        'n_neighbors': [i for i in range(1, 11)],
        'p': [i for i in range(1, 6)]
    }
]

# Random search
model_base = KNeighborsClassifier()
model_grid = GridSearchCV(estimator=model_base,
                          param_grid=hparam,
                          n_jobs=-1,
                          cv=4,
                          verbose=1,
                          scoring='roc_auc'
)
model_grid.fit(ASM_train_x, ASM_train_y)

best_ASM = model_grid.best_params_
pprint(best_ASM)

Fitting 4 folds for each of 60 candidates, totalling 240 fits
{'n_neighbors': 8, 'p': 1, 'weights': 'distance'}


In [271]:
# Build RF regression model with optimal hyperparameters
ASM=model_grid.best_estimator_
# Predict test set data
score = ASM.score(ASM_test_x, ASM_test_y)
print(score)
joblib.dump(ASM, os.path.join(modeldir, ASM_model))

0.6857142857142857


['/media/tx-deepocean/Data/2022/chongfu1/Model/Train/Model/KNN_ASM.model']

In [272]:
model= joblib.load(os.path.join(modeldir, ASM_model)) 
predict_label = model.predict(ASM_test_x)#, probability=True) #预测的标签
predict_score = model.predict_proba(ASM_test_x)#得到标签0 (y_predict[:,0])和1 (y_predict[:,1])对应的概率
label = ASM_test_y  #真实标签
eva_dic = classification_evaluation(label, predict_label, predict_score[:,1])
eva_dic

{'accuracy': 0.6857142857142857,
 'recall': 0.8913043478260869,
 'precision': 0.7068965517241379,
 'f1': 0.7884615384615384,
 'auc': 0.6231884057971014,
 'specificity': 0.2916666666666667}

### Clinical

In [18]:
# Load the Clinical features.
Atag_cols = ['pid', 'label']
Clinical_train = pd.read_csv(os.path.join(traindir, 'clinical_lasso_sel.csv')) 
Clinical_test = pd.read_csv(os.path.join(testdir, 'clinical_test.csv')) 
# Train data
standardscaler = StandardScaler()
Clinical_train = Clinical_train.sample(frac=1.0, random_state=random_state) 
Clinical_train_y = Clinical_train['label'] 
Clinical_train_x = Clinical_train.drop(Atag_cols, axis=1) 
Clinical_train_x = standardscaler.fit_transform(Clinical_train_x) 
# Test data
Clinical_test = Clinical_test.sample(frac=1.0, random_state=random_state)
Clinical_test_y = Clinical_test['label']
Clinical_test_x =Clinical_test.drop(Atag_cols, axis=1)
Clinical_test_x = standardscaler.fit_transform(Clinical_test_x)

Clinical_model = f'KNN_Clinical.model'

In [19]:
# Search optimal hyperparameter
random_regression_seed=np.random.randint(low=1,high=230)

hparam = [
    {
        'weights': ['uniform'],
        'n_neighbors': [i for i in range(1, 11)]
    },
    {
        'weights': ['distance'],
        'n_neighbors': [i for i in range(1, 11)],
        'p': [i for i in range(1, 6)]
    }
]

# Random search
model_base = KNeighborsClassifier()
model_grid = GridSearchCV(estimator=model_base,
                          param_grid=hparam,
                          n_jobs=-1,
                          cv=5,
                          verbose=1,
                          scoring='roc_auc'
)
model_grid.fit(Clinical_train_x, Clinical_train_y)

best_Clinical = model_grid.best_params_
pprint(best_Clinical)

# Build RF regression model with optimal hyperparameters
Clinical=model_grid.best_estimator_
# Predict test set data
score = Clinical.score(Clinical_test_x, Clinical_test_y)
print(score)
joblib.dump(Clinical, os.path.join(modeldir, Clinical_model))

Fitting 5 folds for each of 60 candidates, totalling 300 fits
{'n_neighbors': 7, 'p': 2, 'weights': 'distance'}
0.8285714285714286


['/media/tx-deepocean/Data/2022/chongfu1/Model/Train/Model/KNN_Clinical.model']

In [20]:
model= joblib.load(os.path.join(modeldir, Clinical_model)) 
predict_label = model.predict(Clinical_test_x)#, probability=True) #预测的标签
predict_score = model.predict_proba(Clinical_test_x)#得到标签0 (y_predict[:,0])和1 (y_predict[:,1])对应的概率
label = Clinical_test_y  #真实标签
eva_dic = classification_evaluation(label, predict_label, predict_score[:,1])
eva_dic

{'accuracy': 0.8285714285714286,
 'recall': 0.8913043478260869,
 'precision': 0.8541666666666666,
 'f1': 0.8723404255319149,
 'auc': 0.8858695652173914,
 'specificity': 0.7083333333333334}