In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import confusion_matrix,roc_curve,auc
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,KFold,cross_validate
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score,roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
import sklearn.metrics as sm

In [2]:
dataset = pd.read_csv('wrapper_63.csv')
X= dataset.drop(['label'], axis=1)   # 取出用于建模的特征列X
Y = dataset['label']   # 取出标签列Y

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=420)

In [3]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [4]:
model_GB = GradientBoostingClassifier(random_state = 420)

In [5]:
param_grid = { 'n_estimators': np.arange(50, 130, 10), 'max_depth':  np.arange(4, 20,2 ), 
              'learning_rate': [0.3,0.1,0.08,0.05,0.03 ,0.01,0.001],'min_samples_leaf':[1,2,3,4,5] } 

In [6]:
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(model_GB, param_grid, n_jobs = -1, verbose = 1, cv=10)     
grid_search.fit(X_train, Y_train)   
classifier = grid_search.best_estimator_ 
grid_search.best_params_ 

Fitting 10 folds for each of 2240 candidates, totalling 22400 fits


{'learning_rate': 0.3,
 'max_depth': 4,
 'min_samples_leaf': 3,
 'n_estimators': 70}

In [7]:
classifier.fit(X_train, Y_train)

GradientBoostingClassifier(learning_rate=0.3, max_depth=4, min_samples_leaf=3,
                           n_estimators=70, random_state=420)

In [8]:
y_pred = classifier.predict(X_test)

In [9]:
cp = sm.classification_report(Y_test, y_pred)
print("---------------分类报告\n", cp)

---------------分类报告
               precision    recall  f1-score   support

           0       0.83      0.91      0.87       100
           1       0.83      0.76      0.79        33
           2       0.75      0.79      0.77        53
           3       0.89      0.74      0.81        54

    accuracy                           0.82       240
   macro avg       0.83      0.80      0.81       240
weighted avg       0.83      0.82      0.82       240



In [10]:
print('Accuracy score:', accuracy_score(Y_test, y_pred))
print('Recall:', recall_score(Y_test, y_pred, average='weighted'))
print('F1-score:', f1_score(Y_test, y_pred, average='weighted'))
print('Precision score:', precision_score(Y_test, y_pred, average='weighted'))
# y_test_proba_rfc =classifier.predict_proba(X_test)
# false_positive_rate_rfc, recall_rfc, thresholds_rfc = roc_curve(Y_test, y_test_proba_rfc[:, 1])  
# roc_auc_rfc = auc(false_positive_rate_rfc, recall_rfc)   
# print(roc_auc_rfc )

Accuracy score: 0.825
Recall: 0.825
F1-score: 0.8239675676541924
Precision score: 0.8280676605504588


In [11]:
def bootstrap_auc(y, pred, classes, bootstraps = 100, fold_size = 1000):
    statistics = np.zeros((len(classes), bootstraps))

    for c in range(len(classes)):
        df = pd.DataFrame(columns=['y', 'pred'])
        # df.
        df.loc[:, 'y'] = y
        df.loc[:, 'pred'] = pred
        df_pos = df[df.y == 1]
        df_neg = df[df.y == 0]
        prevalence = len(df_pos) / len(df)
        for i in range(bootstraps):
            pos_sample = df_pos.sample(n = int(fold_size * prevalence), replace=True)
            neg_sample = df_neg.sample(n = int(fold_size * (1-prevalence)), replace=True)

            y_sample = np.concatenate([pos_sample.y.values, neg_sample.y.values])
            pred_sample = np.concatenate([pos_sample.pred.values, neg_sample.pred.values])
            score = roc_auc_score(y_sample, pred_sample)
            statistics[c][i] = score
    return statistics

In [12]:
statistics = bootstrap_auc(Y_test, y_pred,[0,1])
print("均值:",np.mean(statistics,axis=1))
print("最大值:",np.max(statistics,axis=1))
print("最小值:",np.min(statistics,axis=1))

均值: [0.83463679 0.83476852]
最大值: [0.86133157 0.86469356]
最小值: [0.80368166 0.80390212]


In [13]:
import math
from scipy.stats import norm
def calculate_confidence_intervals(tp, fp, tn, fn, confidence=0.95): 
    alpha = 1 - confidence       
# Calculate sensitivity and its confidence interval    
    sensitivity = tp / (tp + fn)   
    se_sensitivity = math.sqrt(sensitivity * (1 - sensitivity) / (tp + fn))  
    z_score = norm.ppf(1 - alpha / 2)   
    ci_sensitivity = (sensitivity - z_score * se_sensitivity, sensitivity + z_score * se_sensitivity)      
# Calculate specificity and its confidence interval    
    specificity = tn / (tn + fp)   
    se_specificity = math.sqrt(specificity * (1 - specificity) / (tn + fp))  
    ci_specificity = (specificity - z_score * se_specificity, specificity + z_score * se_specificity)    
# Calculate PPV and its confidence interval   
    ppv = tp / (tp + fp)  
    se_ppv = math.sqrt(ppv * (1 - ppv) / (tp + fp))   
    ci_ppv = (ppv - z_score * se_ppv, ppv + z_score * se_ppv)        
# Calculate NPV and its confidence interval   

    npv = tn / (tn + fn)    
    se_npv = math.sqrt(npv * (1 - npv) / (tn + fn))    
    ci_npv = (npv - z_score * se_npv, npv + z_score * se_npv)       
    return ci_sensitivity, ci_specificity, ci_ppv, ci_npv

In [14]:
tn, fp, fn, tp = confusion_matrix(Y_test, y_pred).ravel()
ci_sensitivity, ci_specificity, ci_ppv, ci_npv = calculate_confidence_intervals(tp, fp, tn, fn)
print("Sensitivity: {:.2f} (95% CI: {:.2f}, {:.2f})".format(tp / (tp + fn), ci_sensitivity[0], ci_sensitivity[1]))
print("Specificity: {:.2f} (95% CI: {:.2f}, {:.2f})".format(tn / (tn + fp), ci_specificity[0], ci_specificity[1]))
print("PPV: {:.2f} (95% CI: {:.2f}, {:.2f})".format(tp / (tp + fp), ci_ppv[0], ci_ppv[1]))
print("NPV: {:.2f} (95% CI: {:.2f}, {:.2f})".format(tn / (tn + fn), ci_npv[0], ci_npv[1]))

Sensitivity: 0.81 (95% CI: 0.62, 1.00)
Specificity: 0.86 (95% CI: 0.71, 1.01)
PPV: 0.81 (95% CI: 0.62, 1.00)
NPV: 0.86 (95% CI: 0.71, 1.01)
