## 网格搜索案例

In [1]:
from sklearn import ensemble,datasets
from sklearn.model_selection import train_test_split,GridSearchCV

In [2]:
iris = datasets.load_iris()
iris

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

In [3]:
rf = ensemble.RandomForestClassifier()
param = {"n_estimators":range(1,11)}
gs = GridSearchCV(rf,param)

In [4]:
gs.fit(iris.data,iris.target)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
           

In [5]:
gs.cv_results_


{'mean_fit_time': array([0.00307941, 0.00624053, 0.0079298 , 0.00742443, 0.00833583,
        0.01254567, 0.01301757, 0.01521564, 0.01379259, 0.02083747]),
 'std_fit_time': array([0.00055379, 0.0005129 , 0.0010706 , 0.00114889, 0.0002229 ,
        0.00273757, 0.00262877, 0.00381227, 0.00024347, 0.00412939]),
 'mean_score_time': array([0.00111564, 0.00086967, 0.00092689, 0.00109458, 0.00096742,
        0.00147939, 0.00166877, 0.00126203, 0.00140222, 0.00147168]),
 'std_score_time': array([1.91993370e-04, 1.12219336e-04, 8.25542094e-05, 1.53153954e-04,
        1.00971199e-05, 5.52673911e-04, 6.44095468e-04, 8.14238912e-05,
        1.92436863e-04, 9.62735548e-05]),
 'param_n_estimators': masked_array(data=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'params': [{'n_estimators': 1},
  {'n_estimators': 2},
  {'n_estimators': 3},
  {'n_estimato

In [6]:
gs.best_score_

0.9666666666666667

In [7]:
# 最优参数
gs.best_params_

{'n_estimators': 5}

## Pipeline的使用

In [8]:
from sklearn import preprocessing
from sklearn.pipeline import Pipeline

In [9]:
parameters = {"rf__n_estimators": range(1,11)}
pipeline = Pipeline([('scaler',preprocessing.StandardScaler()),('rf',rf)])

In [10]:
gs2 = GridSearchCV(estimator=pipeline,param_grid=parameters)

In [11]:
gs2.fit(iris.data,iris.target)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('scaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('rf',
                                        RandomForestClassifier(bootstrap=True,
                                                               class_weight=None,
                                                               criterion='gini',
                                                               max_depth=None,
                                                               max_features='auto',
                                                               max_leaf_nodes=None,
                                                               min_impurity_decrease=0.0,
              

In [12]:
gs2.best_score_

0.96

In [13]:
gs2.best_params_

{'rf__n_estimators': 3}

## 信用卡违约率进行分析

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [15]:
data = pd.read_csv('./data/credit_default-master/UCI_Credit_Card.csv')
data

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,1,20000.0,2,2,1,24,2,2,-1,-1,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,29996,220000.0,1,3,1,39,0,0,0,0,...,88004.0,31237.0,15980.0,8500.0,20000.0,5003.0,3047.0,5000.0,1000.0,0
29996,29997,150000.0,1,3,2,43,-1,-1,-1,-1,...,8979.0,5190.0,0.0,1837.0,3526.0,8998.0,129.0,0.0,0.0,0
29997,29998,30000.0,1,2,2,37,4,3,2,-1,...,20878.0,20582.0,19357.0,0.0,0.0,22000.0,4200.0,2000.0,3100.0,1
29998,29999,80000.0,1,3,1,41,1,-1,0,0,...,52774.0,11855.0,48944.0,85900.0,3409.0,1178.0,1926.0,52964.0,1804.0,1


In [16]:
data.describe()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
count,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,...,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0
mean,15000.5,167484.322667,1.603733,1.853133,1.551867,35.4855,-0.0167,-0.133767,-0.1662,-0.220667,...,43262.948967,40311.400967,38871.7604,5663.5805,5921.163,5225.6815,4826.076867,4799.387633,5215.502567,0.2212
std,8660.398374,129747.661567,0.489129,0.790349,0.52197,9.217904,1.123802,1.197186,1.196868,1.169139,...,64332.856134,60797.15577,59554.107537,16563.280354,23040.87,17606.96147,15666.159744,15278.305679,17777.465775,0.415062
min,1.0,10000.0,1.0,0.0,0.0,21.0,-2.0,-2.0,-2.0,-2.0,...,-170000.0,-81334.0,-339603.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,7500.75,50000.0,1.0,1.0,1.0,28.0,-1.0,-1.0,-1.0,-1.0,...,2326.75,1763.0,1256.0,1000.0,833.0,390.0,296.0,252.5,117.75,0.0
50%,15000.5,140000.0,2.0,2.0,2.0,34.0,0.0,0.0,0.0,0.0,...,19052.0,18104.5,17071.0,2100.0,2009.0,1800.0,1500.0,1500.0,1500.0,0.0
75%,22500.25,240000.0,2.0,2.0,2.0,41.0,0.0,0.0,0.0,0.0,...,54506.0,50190.5,49198.25,5006.0,5000.0,4505.0,4013.25,4031.5,4000.0,0.0
max,30000.0,1000000.0,2.0,6.0,3.0,79.0,8.0,8.0,8.0,8.0,...,891586.0,927171.0,961664.0,873552.0,1684259.0,896040.0,621000.0,426529.0,528666.0,1.0


In [17]:
data.iloc[:,-1].value_counts()

0    23364
1     6636
Name: default.payment.next.month, dtype: int64

In [18]:
X = data.drop(labels=['ID','default.payment.next.month'],axis=1)
X

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
0,20000.0,2,2,1,24,2,2,-1,-1,-2,...,689.0,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0
1,120000.0,2,2,2,26,-1,2,0,0,0,...,2682.0,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0
2,90000.0,2,2,2,34,0,0,0,0,0,...,13559.0,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0
3,50000.0,2,2,1,37,0,0,0,0,0,...,49291.0,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0
4,50000.0,1,2,1,57,-1,0,-1,0,0,...,35835.0,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,220000.0,1,3,1,39,0,0,0,0,0,...,208365.0,88004.0,31237.0,15980.0,8500.0,20000.0,5003.0,3047.0,5000.0,1000.0
29996,150000.0,1,3,2,43,-1,-1,-1,-1,0,...,3502.0,8979.0,5190.0,0.0,1837.0,3526.0,8998.0,129.0,0.0,0.0
29997,30000.0,1,2,2,37,4,3,2,-1,0,...,2758.0,20878.0,20582.0,19357.0,0.0,0.0,22000.0,4200.0,2000.0,3100.0
29998,80000.0,1,3,1,41,1,-1,0,0,0,...,76304.0,52774.0,11855.0,48944.0,85900.0,3409.0,1178.0,1926.0,52964.0,1804.0


In [19]:
y = data.iloc[:,-1]
y

0        1
1        1
2        0
3        0
4        0
        ..
29995    0
29996    0
29997    1
29998    1
29999    1
Name: default.payment.next.month, Length: 30000, dtype: int64

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [21]:
# 对具体的分类器进行GridSearchCV参数调优
def Grid_work(ml,X_train,X_test,y_train,y_test,param_grid,score='accuracy'):
    gs = GridSearchCV(estimator=ml,param_grid=param_grid)
    search = gs.fit(X_train,y_train)
    print("最优参数及score：",search.best_params_,search.best_score_)
    y_pred= gs.predict(X_test)
    print("模型效果：")
    print(metrics.classification_report(y_test,y_pred))
    print("网格搜索的结果参数：")
    print(search.cv_results_)

In [22]:
svm = SVC(random_state=1,kernel='rbf')
parm_svm = {"C":np.linspace(0.1,1,5),"gamma":np.linspace(0,0.05,5)}
Grid_work(svm,X_train,X_test,y_train,y_test,parm_svm)



最优参数及score： {'C': 1.0, 'gamma': 0.0125} 0.7780099502487562
模型效果：
              precision    recall  f1-score   support

           0       0.78      0.99      0.88      7742
           1       0.50      0.02      0.03      2158

    accuracy                           0.78      9900
   macro avg       0.64      0.51      0.46      9900
weighted avg       0.72      0.78      0.69      9900

网格搜索的结果参数：
{'mean_fit_time': array([ 5.29113634, 21.28293626, 20.25249998, 21.12712828, 20.17563343,
        3.42495966, 22.43085837, 22.39392726, 30.68734463, 28.04198798,
        3.27629193, 22.97847136, 22.44687406, 26.83450898, 28.07311996,
        6.67505566, 25.65668694, 28.56415208, 28.2610964 , 31.73350024,
        4.26447765, 28.95501502, 29.29432742, 31.02361663, 28.9999756 ]), 'std_fit_time': array([0.70958977, 1.83114582, 0.65201687, 0.35983345, 0.49153813,
       0.16059718, 1.10794308, 1.2411389 , 6.87827741, 6.24924377,
       0.13082117, 2.31046126, 2.57643144, 4.18414916, 0.87772136,


In [23]:
dtc = DecisionTreeClassifier(random_state=1,)
param_dtc ={"max_depth":[6,9,11]}
Grid_work(dtc,X_train,X_test,y_train,y_test,param_dtc)



最优参数及score： {'max_depth': 6} 0.8163681592039801
模型效果：
              precision    recall  f1-score   support

           0       0.84      0.94      0.89      7742
           1       0.65      0.37      0.47      2158

    accuracy                           0.82      9900
   macro avg       0.75      0.66      0.68      9900
weighted avg       0.80      0.82      0.80      9900

网格搜索的结果参数：
{'mean_fit_time': array([0.12833055, 0.16414897, 0.19922503]), 'std_fit_time': array([0.01063216, 0.00254247, 0.00324091]), 'mean_score_time': array([0.00356897, 0.00307941, 0.00326792]), 'std_score_time': array([0.0006383 , 0.00022744, 0.00029381]), 'param_max_depth': masked_array(data=[6, 9, 11],
             mask=[False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'max_depth': 6}, {'max_depth': 9}, {'max_depth': 11}], 'split0_test_score': array([0.82092225, 0.8140576 , 0.80002985]), 'split1_test_score': array([0.81477612, 0.80447761, 0.79462687]), 'split2_test_score

In [24]:
rf2 = ensemble.RandomForestClassifier(random_state=1)
param_rf2 ={"n_estimators":[5,10,15]}
Grid_work(rf2,X_train,X_test,y_train,y_test,param_rf2)



最优参数及score： {'n_estimators': 15} 0.8080099502487562
模型效果：
              precision    recall  f1-score   support

           0       0.84      0.93      0.88      7742
           1       0.59      0.37      0.45      2158

    accuracy                           0.81      9900
   macro avg       0.71      0.65      0.67      9900
weighted avg       0.79      0.81      0.79      9900

网格搜索的结果参数：
{'mean_fit_time': array([0.24665443, 0.43754053, 0.60728542]), 'std_fit_time': array([0.00532663, 0.03760278, 0.02190647]), 'mean_score_time': array([0.02427944, 0.02372138, 0.02694368]), 'std_score_time': array([0.01703305, 0.00366266, 0.00089642]), 'param_n_estimators': masked_array(data=[5, 10, 15],
             mask=[False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'n_estimators': 5}, {'n_estimators': 10}, {'n_estimators': 15}], 'split0_test_score': array([0.79420982, 0.81226683, 0.81629608]), 'split1_test_score': array([0.78343284, 0.80119403, 0.80373134]), 

In [25]:
knn = KNeighborsClassifier(metric='minkowski')
param_knn ={"n_neighbors":[3,6,9]}
Grid_work(knn,X_train,X_test,y_train,y_test,param_knn)



最优参数及score： {'n_neighbors': 6} 0.7685572139303483
模型效果：
              precision    recall  f1-score   support

           0       0.79      0.96      0.87      7742
           1       0.39      0.10      0.16      2158

    accuracy                           0.77      9900
   macro avg       0.59      0.53      0.51      9900
weighted avg       0.71      0.77      0.71      9900

网格搜索的结果参数：
{'mean_fit_time': array([0.02533189, 0.02353962, 0.02228014]), 'std_fit_time': array([0.00544813, 0.0004014 , 0.00124226]), 'mean_score_time': array([0.9139599 , 1.12794097, 1.18793909]), 'std_score_time': array([0.06514748, 0.0882278 , 0.19447819]), 'param_n_neighbors': masked_array(data=[3, 6, 9],
             mask=[False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'n_neighbors': 3}, {'n_neighbors': 6}, {'n_neighbors': 9}], 'split0_test_score': array([0.73675571, 0.76973586, 0.76466199]), 'split1_test_score': array([0.74686567, 0.76910448, 0.76791045]), 'split2_te