In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_auc_score
# import seaborn as sns
import sklearn.metrics as sm

In [4]:
dataset = pd.read_csv('wrapper_63.csv')
X= dataset.drop(['label'], axis=1)   # 取出用于建模的特征列X
Y = dataset['label']   # 取出标签列Y

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=420)

In [5]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [6]:
param_grid=[{
           'n_estimators':[i for i in range(80,180,10)],
            'max_depth':  np.arange(4, 20,2 ),
            'min_samples_split':[3,4,5],
            'min_samples_leaf':[1,2,3],
            'max_leaf_nodes':[30,40,50,60,70],            
            },
           ]

In [7]:
classifier = RandomForestClassifier(class_weight="balanced",random_state=420)

In [8]:

from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(classifier, param_grid, n_jobs = -1, verbose = 1, cv=10)     
grid_search.fit(X_train, Y_train)   
classifier = grid_search.best_estimator_ 
grid_search.best_params_ 

Fitting 10 folds for each of 3600 candidates, totalling 36000 fits


{'max_depth': 8,
 'max_leaf_nodes': 60,
 'min_samples_leaf': 1,
 'min_samples_split': 3,
 'n_estimators': 130}

In [9]:
classifier.fit(X_train, Y_train)

RandomForestClassifier(class_weight='balanced', max_depth=8, max_leaf_nodes=60,
                       min_samples_split=3, n_estimators=130, random_state=420)

In [10]:
y_pred = classifier.predict(X_test)

In [11]:
cm = confusion_matrix(Y_test, y_pred)
print(cm)

[[134   3   6   6]
 [  8  41   2   0]
 [  3   6  68   7]
 [ 13   4   7  52]]


In [12]:
cp = sm.classification_report(Y_test, y_pred)
print("---------------分类报告\n", cp)


---------------分类报告
               precision    recall  f1-score   support

           0       0.85      0.90      0.87       149
           1       0.76      0.80      0.78        51
           2       0.82      0.81      0.81        84
           3       0.80      0.68      0.74        76

    accuracy                           0.82       360
   macro avg       0.81      0.80      0.80       360
weighted avg       0.82      0.82      0.82       360



In [13]:
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score,roc_curve, auc 

print('Accuracy score:', accuracy_score(Y_test, y_pred))
print('Recall:', recall_score(Y_test, y_pred, average='weighted'))
print('F1-score:', f1_score(Y_test, y_pred, average='weighted'))
print('Precision score:', precision_score(Y_test, y_pred, average='weighted'))

# y_test_proba_rfc = classifier.predict_proba(X_test)
# false_positive_rate_rfc, recall_rfc, thresholds_rfc = roc_curve(Y_test, y_test_proba_rfc[:, 1])  
# roc_auc_rfc = auc(false_positive_rate_rfc, recall_rfc)   
# print("---------------AUC\n", cp)
# print(roc_auc_rfc )

Accuracy score: 0.8194444444444444
Recall: 0.8194444444444444
F1-score: 0.8176782107881985
Precision score: 0.8186349664951413


In [12]:
empty = np.array([[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0],[0]])
n=1
while n < 51:
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = n)
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    
    classifier = RandomForestClassifier(class_weight='balanced', max_depth=8, max_leaf_nodes=40,
                       min_samples_leaf=2, min_samples_split=5, n_estimators=30,
                       random_state=420)
    classifier.fit(X_train, Y_train)
    
    y_pred = classifier.predict(X_test)   
    y_updatapred = classifier.predict(X_train)
    
    cm = confusion_matrix(Y_test, y_pred)   
    cm_updata = confusion_matrix(Y_train, y_updatapred)
    auc = roc_auc_score(Y_test, y_pred)
    auc_updata = roc_auc_score(Y_train, y_updatapred)
    a = cm[0,0]
    b = cm[0,1]
    c = cm[1,0]
    d = cm[1,1]
    sen = d/(d+c)
    sep = a/(a+b)
    acc = (a+d)/(a+b+c+d)
    pre = a/(a+c)
    F1 = (2*a)/(2*a+c+b)
    
    a_updata = cm_updata[0,0]
    b_updata = cm_updata[0,1]
    c_updata = cm_updata[1,0]
    d_updata = cm_updata[1,1]
    
    sen_updata = d_updata/(d_updata + c_updata)
    sep_updata = a_updata/(a_updata + b_updata)
    acc_updata = (a_updata+d_updata)/(a_updata+b_updata+c_updata+d_updata)
    pre_updata = a_updata/(a_updata + c_updata)
    F1_updata = (2*a_updata)/(2*a_updata+c_updata+b_updata)
    
    first = np.array([[n],[sen],[sep],[acc],[pre],[F1],[auc],[sen_updata],[sep_updata],[acc_updata],[pre_updata],[F1_updata],[auc_updata]])
    second = np.hstack((empty,first))
    empty = second
    n = n + 1
    print(n)
final_par = np.delete(second,0,axis=1)
print (final_par)
final_parT = final_par.T
np.savetxt('RandomForestClassifier_zongcr',final_parT,delimiter=',')

2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
[[1.00000000e+00 2.00000000e+00 3.00000000e+00 4.00000000e+00
  5.00000000e+00 6.00000000e+00 7.00000000e+00 8.00000000e+00
  9.00000000e+00 1.00000000e+01 1.10000000e+01 1.20000000e+01
  1.30000000e+01 1.40000000e+01 1.50000000e+01 1.60000000e+01
  1.70000000e+01 1.80000000e+01 1.90000000e+01 2.00000000e+01
  2.10000000e+01 2.20000000e+01 2.30000000e+01 2.40000000e+01
  2.50000000e+01 2.60000000e+01 2.70000000e+01 2.80000000e+01
  2.90000000e+01 3.00000000e+01 3.10000000e+01 3.20000000e+01
  3.30000000e+01 3.40000000e+01 3.50000000e+01 3.60000000e+01
  3.70000000e+01 3.80000000e+01 3.90000000e+01 4.00000000e+01
  4.10000000e+01 4.20000000e+01 4.30000000e+01 4.40000000e+01
  4.50000000e+01 4.60000000e+01 4.70000000e+01 4.80000000e+01
  4.90000000e+01 5.00000000e+01]
 [1.30434783e-01 4.34782609e-02 8.69565217e-02 4.34782609e-02
  2.17391304e-01 8.