# 本文件说明

- 优浪公司项目
- 预处理及特征值计算

# 基本设置

In [7]:
##load packages, needed
# encoding=utf-8

import jieba
import sys
import re
import time
import string
from sklearn import feature_extraction
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2

from sklearn import metrics

from sklearn.base import BaseEstimator, TransformerMixin

import joblib
%matplotlib inline
import numpy as np
import pandas as pd
import pre_cor
import os
from sklearn.model_selection import train_test_split
from collections import Counter

from jieba import analyse

import warnings
warnings.filterwarnings('ignore')

In [8]:
def getkeywords(X, N = 1000):
    '''
    训练时生成，合并所有记录，取N个关键词
    '''
    textrank = analyse.textrank

    text_combined = ' '.join(X)
    keywords = textrank(text_combined, topK = N)
    print('keywords num: ', len(keywords))
    if len(keywords) < N : 
        N  = len(keywords)

    if keywords:
        f = open("corpus/keywords.txt","w+", encoding='UTF-8')
        for content in keywords:
            content = content.strip()
            if content != ':AB:':
                f.write(content + '\n')
        f.close()

In [9]:
class Statskeywords(BaseEstimator, TransformerMixin):
    
    def __init__(self, topk = 100):
        self.topk = topk
#         print(self.topk)
        self.keywords = set()
        f = open("corpus/keywords.txt","r+", encoding='UTF-8')
        num = 0
        for content in f:
            if num < topk:
                self.keywords.add(content.strip().replace('\n', ''))
            num += 1
        f.close() 
        
        #初始化字典liwc
        self.liwc = {} 
        f2 = open("corpus/scliwc.txt",'r', encoding = 'gb18030')
        for ii in f2:     #ii在scliwc.txt中循环
            i = ii.strip().split() 
            self.liwc[i[0]] = i[1:len(i)]
        f2.close      
        
        self.category = set()
        for i in list(self.liwc.values()):
            for j in i:
                self.category.add(j)        
    
    def fit(self, X, y=None):
        return self 
    
    def transform(self, X):
        '''
        文本中关键词的词频
        '''                        
        data = []
        for x in X:
            words = x.split()
            word_tf = []
            keycnt = 0
            for kw in self.keywords:
                word_tf.append(words.count(kw)) # 各个关键词的词频
                if kw in words:keycnt+=1
            word_tf.append(keycnt) # 关键词的个数
            
            psy = []
            for w in words:
                if w in self.liwc: #是否liwc字典包含分词结果列表words的哪些分词
                    psy += self.liwc[w]  
            cat_tf = []
            for cat in self.category:
                cat_tf.append(psy.count(cat))                
                
            data.append(word_tf + cat_tf)            
        return data        

In [10]:
class StatsFeatures(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        self.neg = set()
        f = open("corpus/neg_words.txt","r+", encoding='UTF-8')
        for content in f:
            self.neg.add(content)
        f.close()       

    def fit(self, X, y=None):
        return self

    def getcnt(self,x): 
        '''词个数'''
        return len(list(set(x.split())))

    def getnegcnt(self,x):
        '''负面词个数'''
        negcnt = 0
        words = x.split()
        for w in words:
            if w in self.neg:
                negcnt = negcnt+1
        return negcnt

    def getrepcnt(self,x):
        '''重复词个数'''
        repcnt =0
        words = x.split()        
        for w in list(set(words)):
            if words.count(w)>1: # 记录重复词汇（词频大于1）
                repcnt += 1
        return repcnt
    
    def transform(self, X):
        '''
        文本长度、词个数、词比例、
        负面词个数、负面词比例、
        重复词个数、重复词比例
        '''
        data = []
        for x in X:
            if len(x) == 0:
                length  = 1
            else :
                length = len(x)
            data.append([len(x),self.getcnt(x),self.getcnt(x)/length,
                         self.getnegcnt(x),self.getnegcnt(x)/length,
                         self.getrepcnt(x),self.getrepcnt(x)/length])            
        return data

# 导入数据

## 预处理后数据

In [11]:
# 诈骗电话
corpus_pos = []
label_pos = []

filename = 'data/pos_pre_20180723.txt'
fid = open(filename, "r+", encoding='UTF-8')
for f in fid:
    corpus_pos.append(f)
    label_pos.append(1)
fid.close()
print(len(corpus_pos))
print(len(label_pos))

3
3


In [12]:
# 非诈骗电话
corpus_neg = []
label_neg = []

filename = 'data/neg_pre_20180723.txt'
fid = open(filename, "r+", encoding='UTF-8')
for f in fid:
    corpus_neg.append(f)
    label_neg.append(0)
fid.close()
print(len(corpus_neg))
print(len(label_neg))

3
3


In [13]:
folder = '20180703'

# 相关数据
corpus_cor = []
label_cor = []

filename = 'data/{0}/corpus_pre_cor_0703.txt'.format(folder)
fid = open(filename, "r+", encoding='UTF-8')
for f in fid:    
    corpus_cor.append(f)
    label_cor.append(1)
fid.close()
print(len(corpus_cor))
print(len(label_cor))

6111
6111


In [14]:
# 不相关数据
corpus_uncor = []
label_uncor = []

filename = 'data/{0}/corpus_pre_uncor_0703.txt'.format(folder)
fid = open(filename, "r+", encoding='UTF-8')
for f in fid:
    corpus_uncor.append(f)
    label_uncor.append(0)
fid.close()
print(len(corpus_uncor))
print(len(label_uncor))

8949
8949


## 分割数据

In [15]:
# corpus = corpus_pos + corpus_neg
# label = label_pos + label_neg
corpus = corpus_cor[:100] + corpus_uncor[:100]
label = label_cor[:100] + label_uncor[:100]

X_train, X_test, y_train, y_test = train_test_split(corpus, label, test_size=0.1, random_state=42)
print('训练集：',len(y_train))
print('训练集-各类数量：',Counter(y_train))
print('测试集：',len(y_test))
print('测试集-各类数量：',Counter(y_test))

训练集： 180
训练集-各类数量： Counter({0: 90, 1: 90})
测试集： 20
测试集-各类数量： Counter({1: 10, 0: 10})


## 生成词典

In [16]:
if 0: getkeywords(corpus, N = 1000)

# 训练模型:RF

## 不调参

In [19]:
combined_features = FeatureUnion([
                                    ('tf_idf', Pipeline([
                                        ('counts', CountVectorizer()),
                                        ('tfidf', TfidfTransformer()),
                                        ('chi', SelectKBest(chi2, k=20000))
                                        ])),
                                    ('tf', Statskeywords(topk = 100)),
                                    ('len_stats', StatsFeatures())
                                ])

In [20]:
pipeline = Pipeline([('features', combined_features), 
                     ('classifier', GradientBoostingClassifier(random_state=0))])

pipeline.fit(X_train, y_train)
print(pipeline.score(X_train, y_train))
pipeline
# clf_xgb = GridSearchCV(pipeline, param_grid=param_grid, verbose=10, cv = 10)
# clf_xgb.fit(X_train, y_train)

1.0


Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('tf_idf', Pipeline(memory=None,
     steps=[('counts', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0...         presort='auto', random_state=0, subsample=1.0, verbose=0,
              warm_start=False))])

In [21]:
y_pred_class = pipeline.predict(X_test)
print('accuracy_score: ', metrics.accuracy_score(y_test, y_pred_class)) # 指所有分类正确的百分比
print(metrics.classification_report(y_test, y_pred_class))
print('confusion_matrix: ')
print( metrics.confusion_matrix(y_test, y_pred_class))

accuracy_score:  0.9
             precision    recall  f1-score   support

          0       0.90      0.90      0.90        10
          1       0.90      0.90      0.90        10

avg / total       0.90      0.90      0.90        20

confusion_matrix: 
[[9 1]
 [1 9]]


## 调参

In [26]:
score_1_best_para = {}
score_2_best_para = {}
cv = 5
score_1 = 'roc_auc'
score_2 = 'recall_macro'

### topk、chi_k

In [29]:
pipeline = Pipeline([('features', combined_features), 
                     ('classifier', GradientBoostingClassifier(random_state=0))])

param_grid_0 = dict(features__tf_idf__chi__k = [2000, 5000, 20000, 40000],
                   features__tf__topk=[50,100, 500, 1000]) 
param_grid_0

{'features__tf__topk': [50, 100, 500, 1000]}

In [30]:
clf = GridSearchCV(pipeline, param_grid=param_grid_0, 
                   cv = cv, scoring=score_1, n_jobs=-1, verbose=10)
clf.fit(X_train, y_train)
score_1_best_para['chi__k'] = clf.best_params_['features__tf_idf__chi__k']
score_1_best_para['topk'] = clf.best_params_['features__tf__topk']
print('score_1_chi__k: ', score_1_best_para['chi__k'])
print('score_1_topk: ', score_1_best_para['topk'])
clf.grid_scores_, clf.best_params_, clf.best_score_

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] features__tf__topk=50 ...........................................
[CV] features__tf__topk=50 ...........................................
[CV] features__tf__topk=50 ...........................................
[CV] features__tf__topk=50 ...........................................
[CV] .. features__tf__topk=50, score=0.9660493827160493, total=   4.0s
[CV] features__tf__topk=50 ...........................................
[CV] .. features__tf__topk=50, score=0.8117283950617283, total=   4.6s
[CV] features__tf__topk=100 ..........................................
[CV] .. features__tf__topk=50, score=0.8209876543209876, total=   3.7s
[CV] features__tf__topk=100 ..........................................
[CV] .. features__tf__topk=50, score=0.9012345679012346, total=   5.2s
[CV] features__tf__topk=100 ..........................................
[CV] .. features__tf__topk=50, score=0.9629629629629629, total=   3.6s
[CV] features__tf

[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   12.7s


[CV] . features__tf__topk=100, score=0.9012345679012346, total=   4.1s
[CV] features__tf__topk=100 ..........................................
[CV] . features__tf__topk=100, score=0.9660493827160493, total=   4.8s
[CV] features__tf__topk=500 ..........................................
[CV] . features__tf__topk=100, score=0.8117283950617283, total=   4.7s
[CV] features__tf__topk=500 ..........................................
[CV] . features__tf__topk=100, score=0.8209876543209876, total=   4.3s
[CV] features__tf__topk=500 ..........................................
[CV] . features__tf__topk=100, score=0.9629629629629629, total=   3.6s
[CV] features__tf__topk=500 ..........................................


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   21.3s


[CV] . features__tf__topk=500, score=0.9012345679012346, total=   4.0s
[CV] features__tf__topk=500 ..........................................
[CV] . features__tf__topk=500, score=0.9660493827160493, total=   5.9s
[CV] features__tf__topk=1000 .........................................
[CV] . features__tf__topk=500, score=0.8117283950617283, total=   4.0s
[CV] features__tf__topk=1000 .........................................
[CV] . features__tf__topk=500, score=0.8209876543209876, total=   5.5s
[CV] features__tf__topk=1000 .........................................
[CV] . features__tf__topk=500, score=0.9629629629629629, total=   4.4s
[CV] features__tf__topk=1000 .........................................
[CV]  features__tf__topk=1000, score=0.9012345679012346, total=   3.6s
[CV] features__tf__topk=1000 .........................................


[Parallel(n_jobs=-1)]: Done  16 out of  20 | elapsed:   31.8s remaining:    8.0s


[CV]  features__tf__topk=1000, score=0.8117283950617283, total=   3.6s
[CV]  features__tf__topk=1000, score=0.9660493827160493, total=   5.1s
[CV]  features__tf__topk=1000, score=0.9629629629629629, total=   3.4s
[CV]  features__tf__topk=1000, score=0.8209876543209876, total=   4.9s


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   37.8s finished


KeyError: 'features__tf_idf__chi__k'

In [31]:
y_pred_class = clf.predict(X_test)
print('accuracy_score: ', metrics.accuracy_score(y_test, y_pred_class)) # 指所有分类正确的百分比
print(metrics.classification_report(y_test, y_pred_class))
print('confusion_matrix: ')
print( metrics.confusion_matrix(y_test, y_pred_class))

accuracy_score:  0.9
             precision    recall  f1-score   support

          0       0.90      0.90      0.90        10
          1       0.90      0.90      0.90        10

avg / total       0.90      0.90      0.90        20

confusion_matrix: 
[[9 1]
 [1 9]]


In [50]:
pipeline = Pipeline([('features', combined_features), 
                     ('classifier', GradientBoostingClassifier(random_state=0))])

param_grid_0 = dict(features__tf_idf__chi__k = [2000, 5000, 20000, 40000],
                   features__tf__topk=[50,100, 500, 1000]) 
param_grid_0

{'features__tf_idf__chi__k': [2000, 5000, 20000, 40000],
 'features__tf__topk': [50, 100, 500, 1000]}

In [51]:
clf = GridSearchCV(pipeline, param_grid=param_grid_0, 
                   cv = cv, scoring=score_2, n_jobs=-1, verbose=10)
clf.fit(X_train, y_train)
score_2_best_para['chi__k'] = clf.best_params_['features__tf_idf__chi__k']
score_2_best_para['topk'] = clf.best_params_['features__tf__topk']
print('score_2_chi__k: ', score_2_best_para['chi__k'])
print('score_2_topk: ', score_2_best_para['topk'])
clf.grid_scores_, clf.best_params_, clf.best_score_

Fitting 10 folds for each of 16 candidates, totalling 160 fits
[CV] features__tf__topk=50, features__tf_idf__chi__k=2000 ............
[CV] features__tf__topk=50, features__tf_idf__chi__k=2000 ............
[CV] features__tf__topk=50, features__tf_idf__chi__k=2000 ............
[CV] features__tf__topk=50, features__tf_idf__chi__k=2000 ............
[CV]  features__tf__topk=50, features__tf_idf__chi__k=2000, score=0.8627427274966499, total= 7.1min
[CV] features__tf__topk=50, features__tf_idf__chi__k=2000 ............
[CV]  features__tf__topk=50, features__tf_idf__chi__k=2000, score=0.8601406028613202, total= 7.3min
[CV] features__tf__topk=50, features__tf_idf__chi__k=2000 ............
[CV]  features__tf__topk=50, features__tf_idf__chi__k=2000, score=0.8703773589169377, total= 7.5min
[CV] features__tf__topk=50, features__tf_idf__chi__k=2000 ............
[CV]  features__tf__topk=50, features__tf_idf__chi__k=2000, score=0.8562142532704515, total= 7.5min
[CV] features__tf__topk=50, features__tf

[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed: 28.7min


[CV] features__tf__topk=50, features__tf_idf__chi__k=2000 ............
[CV]  features__tf__topk=50, features__tf_idf__chi__k=2000, score=0.8612106037426304, total= 8.4min
[CV] features__tf__topk=50, features__tf_idf__chi__k=2000 ............
[CV]  features__tf__topk=50, features__tf_idf__chi__k=2000, score=0.8706733887278175, total= 8.5min
[CV] features__tf__topk=50, features__tf_idf__chi__k=5000 ............
[CV]  features__tf__topk=50, features__tf_idf__chi__k=2000, score=0.8597631741196098, total= 8.7min
[CV] features__tf__topk=50, features__tf_idf__chi__k=5000 ............
[CV]  features__tf__topk=50, features__tf_idf__chi__k=2000, score=0.8622384216443622, total= 7.9min
[CV] features__tf__topk=50, features__tf_idf__chi__k=5000 ............
[CV]  features__tf__topk=50, features__tf_idf__chi__k=2000, score=0.8682769375838684, total= 8.3min


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed: 44.6min


[CV] features__tf__topk=50, features__tf_idf__chi__k=5000 ............
[CV]  features__tf__topk=50, features__tf_idf__chi__k=5000, score=0.8562142532704515, total= 8.2min
[CV] features__tf__topk=50, features__tf_idf__chi__k=5000 ............
[CV]  features__tf__topk=50, features__tf_idf__chi__k=5000, score=0.8749217554793762, total= 8.4min
[CV] features__tf__topk=50, features__tf_idf__chi__k=5000 ............
[CV]  features__tf__topk=50, features__tf_idf__chi__k=5000, score=0.8474734646560744, total= 7.2min
[CV] features__tf__topk=50, features__tf_idf__chi__k=5000 ............
[CV]  features__tf__topk=50, features__tf_idf__chi__k=5000, score=0.8615845956029404, total= 6.7min
[CV] features__tf__topk=50, features__tf_idf__chi__k=5000 ............
[CV]  features__tf__topk=50, features__tf_idf__chi__k=5000, score=0.86145465885389, total= 7.7min
[CV] features__tf__topk=50, features__tf_idf__chi__k=5000 ............
[CV]  features__tf__topk=50, features__tf_idf__chi__k=5000, score=0.85179979

[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed: 72.7min


[CV] features__tf__topk=50, features__tf_idf__chi__k=20000 ...........
[CV]  features__tf__topk=50, features__tf_idf__chi__k=5000, score=0.8600102455300476, total= 7.3min
[CV] features__tf__topk=50, features__tf_idf__chi__k=20000 ...........
[CV]  features__tf__topk=50, features__tf_idf__chi__k=5000, score=0.8431799223878431, total= 7.2min
[CV] features__tf__topk=50, features__tf_idf__chi__k=20000 ...........
[CV]  features__tf__topk=50, features__tf_idf__chi__k=5000, score=0.8624854930548, total= 7.2min
[CV] features__tf__topk=50, features__tf_idf__chi__k=20000 ...........
[CV]  features__tf__topk=50, features__tf_idf__chi__k=20000, score=0.8649448729218594, total= 8.1min
[CV] features__tf__topk=50, features__tf_idf__chi__k=20000 ...........
[CV]  features__tf__topk=50, features__tf_idf__chi__k=20000, score=0.8400049262976161, total= 7.9min
[CV] features__tf__topk=50, features__tf_idf__chi__k=20000 ...........
[CV]  features__tf__topk=50, features__tf_idf__chi__k=20000, score=0.829887

[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed: 88.6min


[CV] features__tf__topk=50, features__tf_idf__chi__k=20000 ...........
[CV]  features__tf__topk=50, features__tf_idf__chi__k=20000, score=0.8552380328254126, total= 7.6min
[CV] features__tf__topk=50, features__tf_idf__chi__k=20000 ...........
[CV]  features__tf__topk=50, features__tf_idf__chi__k=20000, score=0.8514517889465631, total= 7.5min
[CV] features__tf__topk=50, features__tf_idf__chi__k=20000 ...........
[CV]  features__tf__topk=50, features__tf_idf__chi__k=20000, score=0.856100134908242, total= 7.8min
[CV] features__tf__topk=50, features__tf_idf__chi__k=40000 ...........
[CV]  features__tf__topk=50, features__tf_idf__chi__k=20000, score=0.8422891052841548, total= 8.0min
[CV] features__tf__topk=50, features__tf_idf__chi__k=40000 ...........
[CV]  features__tf__topk=50, features__tf_idf__chi__k=20000, score=0.8493181735755994, total= 7.2min
[CV] features__tf__topk=50, features__tf_idf__chi__k=40000 ...........
[CV]  features__tf__topk=50, features__tf_idf__chi__k=40000, score=0.8

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 130.3min


[CV] features__tf__topk=50, features__tf_idf__chi__k=40000 ...........
[CV]  features__tf__topk=50, features__tf_idf__chi__k=40000, score=0.8495456733322335, total= 7.6min
[CV] features__tf__topk=50, features__tf_idf__chi__k=40000 ...........
[CV]  features__tf__topk=50, features__tf_idf__chi__k=40000, score=0.8488135079984542, total= 7.2min
[CV] features__tf__topk=50, features__tf_idf__chi__k=40000 ...........
[CV]  features__tf__topk=50, features__tf_idf__chi__k=40000, score=0.8574763345634012, total= 7.5min
[CV] features__tf__topk=50, features__tf_idf__chi__k=40000 ...........
[CV]  features__tf__topk=50, features__tf_idf__chi__k=40000, score=0.8432250979045157, total= 7.6min
[CV] features__tf__topk=100, features__tf_idf__chi__k=2000 ...........
[CV]  features__tf__topk=50, features__tf_idf__chi__k=40000, score=0.8551844195408552, total= 7.8min
[CV] features__tf__topk=100, features__tf_idf__chi__k=2000 ...........
[CV]  features__tf__topk=50, features__tf_idf__chi__k=40000, score=0.

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 159.2min


[CV] features__tf__topk=100, features__tf_idf__chi__k=2000 ...........
[CV]  features__tf__topk=100, features__tf_idf__chi__k=2000, score=0.8562142532704515, total= 7.3min
[CV] features__tf__topk=100, features__tf_idf__chi__k=2000 ...........
[CV]  features__tf__topk=100, features__tf_idf__chi__k=2000, score=0.8601406028613202, total= 7.8min
[CV] features__tf__topk=100, features__tf_idf__chi__k=2000 ...........
[CV]  features__tf__topk=100, features__tf_idf__chi__k=2000, score=0.8706733887278175, total= 7.3min
[CV] features__tf__topk=100, features__tf_idf__chi__k=2000 ...........
[CV]  features__tf__topk=100, features__tf_idf__chi__k=2000, score=0.8612106037426304, total= 6.9min
[CV] features__tf__topk=100, features__tf_idf__chi__k=2000 ...........
[CV]  features__tf__topk=100, features__tf_idf__chi__k=2000, score=0.8606705188204906, total= 7.6min
[CV] features__tf__topk=100, features__tf_idf__chi__k=5000 ...........
[CV]  features__tf__topk=100, features__tf_idf__chi__k=2000, score=0.

[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed: 202.7min


[CV] features__tf__topk=100, features__tf_idf__chi__k=5000 ...........
[CV]  features__tf__topk=100, features__tf_idf__chi__k=5000, score=0.8474734646560744, total= 8.3min
[CV] features__tf__topk=100, features__tf_idf__chi__k=5000 ...........
[CV]  features__tf__topk=100, features__tf_idf__chi__k=5000, score=0.8517997934570632, total= 8.4min
[CV] features__tf__topk=100, features__tf_idf__chi__k=5000 ...........
[CV]  features__tf__topk=100, features__tf_idf__chi__k=5000, score=0.8615845956029404, total= 7.9min
[CV] features__tf__topk=100, features__tf_idf__chi__k=5000 ...........
[CV]  features__tf__topk=100, features__tf_idf__chi__k=5000, score=0.8600102455300476, total= 7.0min
[CV] features__tf__topk=100, features__tf_idf__chi__k=20000 ..........
[CV]  features__tf__topk=100, features__tf_idf__chi__k=5000, score=0.8786300373087952, total= 7.1min
[CV] features__tf__topk=100, features__tf_idf__chi__k=20000 ..........
[CV]  features__tf__topk=100, features__tf_idf__chi__k=5000, score=0.

[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed: 232.1min


[CV] features__tf__topk=100, features__tf_idf__chi__k=20000 ..........
[CV]  features__tf__topk=100, features__tf_idf__chi__k=20000, score=0.8552380328254126, total= 8.1min
[CV] features__tf__topk=100, features__tf_idf__chi__k=20000 ..........
[CV]  features__tf__topk=100, features__tf_idf__chi__k=20000, score=0.856100134908242, total= 7.9min
[CV] features__tf__topk=100, features__tf_idf__chi__k=20000 ..........
[CV]  features__tf__topk=100, features__tf_idf__chi__k=20000, score=0.8422891052841548, total= 7.9min
[CV] features__tf__topk=100, features__tf_idf__chi__k=40000 ..........
[CV]  features__tf__topk=100, features__tf_idf__chi__k=20000, score=0.8514517889465631, total= 8.0min
[CV] features__tf__topk=100, features__tf_idf__chi__k=40000 ..........
[CV]  features__tf__topk=100, features__tf_idf__chi__k=20000, score=0.8439234033293439, total= 7.7min
[CV]  features__tf__topk=100, features__tf_idf__chi__k=20000, score=0.8493181735755994, total= 8.0min
[CV] features__tf__topk=100, featu

[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed: 287.6min


[CV] features__tf__topk=500, features__tf_idf__chi__k=2000 ...........
[CV]  features__tf__topk=100, features__tf_idf__chi__k=40000, score=0.8551844195408552, total= 7.6min
[CV] features__tf__topk=500, features__tf_idf__chi__k=2000 ...........
[CV]  features__tf__topk=100, features__tf_idf__chi__k=40000, score=0.857015921372357, total= 7.2min
[CV] features__tf__topk=500, features__tf_idf__chi__k=2000 ...........
[CV]  features__tf__topk=100, features__tf_idf__chi__k=40000, score=0.841992166249592, total= 7.3min
[CV] features__tf__topk=500, features__tf_idf__chi__k=2000 ...........
[CV]  features__tf__topk=500, features__tf_idf__chi__k=2000, score=0.8703773589169377, total= 7.5min
[CV] features__tf__topk=500, features__tf_idf__chi__k=2000 ...........
[CV]  features__tf__topk=500, features__tf_idf__chi__k=2000, score=0.8562142532704515, total= 7.2min
[CV] features__tf__topk=500, features__tf_idf__chi__k=2000 ...........
[CV]  features__tf__topk=500, features__tf_idf__chi__k=2000, score=0

[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed: 330.5min


[CV] features__tf__topk=500, features__tf_idf__chi__k=5000 ...........
[CV]  features__tf__topk=500, features__tf_idf__chi__k=2000, score=0.8622384216443622, total= 7.8min
[CV] features__tf__topk=500, features__tf_idf__chi__k=5000 ...........
[CV]  features__tf__topk=500, features__tf_idf__chi__k=5000, score=0.8562142532704515, total= 7.4min
[CV] features__tf__topk=500, features__tf_idf__chi__k=5000 ...........
[CV]  features__tf__topk=500, features__tf_idf__chi__k=5000, score=0.8474734646560744, total= 7.8min
[CV] features__tf__topk=500, features__tf_idf__chi__k=5000 ...........
[CV]  features__tf__topk=500, features__tf_idf__chi__k=5000, score=0.8517997934570632, total= 7.7min
[CV] features__tf__topk=500, features__tf_idf__chi__k=5000 ...........
[CV]  features__tf__topk=500, features__tf_idf__chi__k=5000, score=0.86145465885389, total= 7.7min
[CV] features__tf__topk=500, features__tf_idf__chi__k=5000 ...........
[CV]  features__tf__topk=500, features__tf_idf__chi__k=5000, score=0.86

[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed: 386.4min


[CV] features__tf__topk=500, features__tf_idf__chi__k=20000 ..........
[CV]  features__tf__topk=500, features__tf_idf__chi__k=20000, score=0.856100134908242, total= 7.3min
[CV] features__tf__topk=500, features__tf_idf__chi__k=20000 ..........
[CV]  features__tf__topk=500, features__tf_idf__chi__k=20000, score=0.8422891052841548, total= 7.4min
[CV]  features__tf__topk=500, features__tf_idf__chi__k=20000, score=0.8514517889465631, total= 7.7min
[CV] features__tf__topk=500, features__tf_idf__chi__k=40000 ..........
[CV] features__tf__topk=500, features__tf_idf__chi__k=40000 ..........
[CV]  features__tf__topk=500, features__tf_idf__chi__k=20000, score=0.8493181735755994, total= 7.6min
[CV] features__tf__topk=500, features__tf_idf__chi__k=40000 ..........
[CV]  features__tf__topk=500, features__tf_idf__chi__k=20000, score=0.8439234033293439, total= 7.7min
[CV] features__tf__topk=500, features__tf_idf__chi__k=40000 ..........
[CV]  features__tf__topk=500, features__tf_idf__chi__k=40000, sco

[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed: 430.2min


[CV] features__tf__topk=1000, features__tf_idf__chi__k=2000 ..........
[CV]  features__tf__topk=1000, features__tf_idf__chi__k=2000, score=0.8703773589169377, total= 7.6min
[CV] features__tf__topk=1000, features__tf_idf__chi__k=2000 ..........
[CV]  features__tf__topk=1000, features__tf_idf__chi__k=2000, score=0.8562142532704515, total= 7.2min
[CV] features__tf__topk=1000, features__tf_idf__chi__k=2000 ..........
[CV]  features__tf__topk=1000, features__tf_idf__chi__k=2000, score=0.8627427274966499, total= 7.1min
[CV] features__tf__topk=1000, features__tf_idf__chi__k=2000 ..........
[CV]  features__tf__topk=1000, features__tf_idf__chi__k=2000, score=0.8601406028613202, total= 7.2min
[CV] features__tf__topk=1000, features__tf_idf__chi__k=2000 ..........
[CV]  features__tf__topk=1000, features__tf_idf__chi__k=2000, score=0.8606705188204906, total= 7.6min
[CV] features__tf__topk=1000, features__tf_idf__chi__k=2000 ..........
[CV]  features__tf__topk=1000, features__tf_idf__chi__k=2000, sc

[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed: 498.6min


[CV] features__tf__topk=1000, features__tf_idf__chi__k=20000 .........
[CV]  features__tf__topk=1000, features__tf_idf__chi__k=5000, score=0.8600102455300476, total= 7.3min
[CV] features__tf__topk=1000, features__tf_idf__chi__k=20000 .........
[CV]  features__tf__topk=1000, features__tf_idf__chi__k=5000, score=0.8431799223878431, total= 7.1min
[CV] features__tf__topk=1000, features__tf_idf__chi__k=20000 .........
[CV]  features__tf__topk=1000, features__tf_idf__chi__k=5000, score=0.8624854930548, total= 7.4min
[CV] features__tf__topk=1000, features__tf_idf__chi__k=20000 .........
[CV]  features__tf__topk=1000, features__tf_idf__chi__k=20000, score=0.8649448729218594, total= 7.4min
[CV] features__tf__topk=1000, features__tf_idf__chi__k=20000 .........
[CV]  features__tf__topk=1000, features__tf_idf__chi__k=20000, score=0.8400049262976161, total= 7.7min
[CV] features__tf__topk=1000, features__tf_idf__chi__k=20000 .........
[CV]  features__tf__topk=1000, features__tf_idf__chi__k=20000, sc

[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed: 570.9min finished


score_2_chi__k:  2000
score_2_topk:  50


([mean: 0.86323, std: 0.00463, params: {'features__tf__topk': 50, 'features__tf_idf__chi__k': 2000},
  mean: 0.85978, std: 0.01049, params: {'features__tf__topk': 50, 'features__tf_idf__chi__k': 5000},
  mean: 0.84887, std: 0.00953, params: {'features__tf__topk': 50, 'features__tf_idf__chi__k': 20000},
  mean: 0.85033, std: 0.00819, params: {'features__tf__topk': 50, 'features__tf_idf__chi__k': 40000},
  mean: 0.86323, std: 0.00463, params: {'features__tf__topk': 100, 'features__tf_idf__chi__k': 2000},
  mean: 0.85978, std: 0.01049, params: {'features__tf__topk': 100, 'features__tf_idf__chi__k': 5000},
  mean: 0.84887, std: 0.00953, params: {'features__tf__topk': 100, 'features__tf_idf__chi__k': 20000},
  mean: 0.85033, std: 0.00819, params: {'features__tf__topk': 100, 'features__tf_idf__chi__k': 40000},
  mean: 0.86323, std: 0.00463, params: {'features__tf__topk': 500, 'features__tf_idf__chi__k': 2000},
  mean: 0.85978, std: 0.01049, params: {'features__tf__topk': 500, 'features__tf_i

In [None]:
y_pred_class = clf.predict(X_test)
print('accuracy_score: ', metrics.accuracy_score(y_test, y_pred_class)) # 指所有分类正确的百分比
print(metrics.classification_report(y_test, y_pred_class))
print('confusion_matrix: ')
print( metrics.confusion_matrix(y_test, y_pred_class))

### combined_features

In [32]:
combined_features_1 = FeatureUnion([
                                    ('tf_idf', Pipeline([
                                        ('counts', CountVectorizer()),
                                        ('tfidf', TfidfTransformer()),
                                        ('chi', SelectKBest(chi2, k=score_1_best_para['chi__k']))
                                        ])),
                                    ('tf', Statskeywords(topk = score_1_best_para['topk'])),
                                    ('len_stats', StatsFeatures())
                                ])

KeyError: 'chi__k'

In [53]:
combined_features_2 = FeatureUnion([
                                    ('tf_idf', Pipeline([
                                        ('counts', CountVectorizer()),
                                        ('tfidf', TfidfTransformer()),
                                        ('chi', SelectKBest(chi2, k=score_2_best_para['chi__k']))
                                        ])),
                                    ('tf', Statskeywords(topk = score_2_best_para['topk'])),
                                    ('len_stats', StatsFeatures())
                                ])

### n_estimators
- n_estimators
    - 弱学习器的最大迭代次数，或者说最大的弱学习器的个数。
    - 一般来说n_estimators太小，容易欠拟合，n_estimators太大，又容易过拟合，一般选择一个适中的数值。默认是100。
    - 在实际调参的过程中，我们常常将n_estimators和参数learning_rate一起考虑。
- learning_rate    
    - 每个弱学习器的权重缩减系数ν，也称作步长，取值范围为0<ν≤11。
    - 对于同样的训练集拟合效果，较小的ν 意味着我们需要更多的弱学习器的迭代次数。
    - 通常我们用步长和迭代最大次数一起来决定算法的拟合效果。所以这两个参数n_estimators和learning_rate要一起调参。
    - 一般来说，可以从一个小一点的ν开始调参，默认是1。    

In [54]:
pipeline = Pipeline([('features', combined_features_1), 
                     ('classifier', GradientBoostingClassifier(learning_rate = 0.1, random_state=0))])

param_grid_1 = dict(classifier__n_estimators = range(10,71,10)) # 迭代次数/分类器个数
param_grid_1

{'classifier__n_estimators': range(10, 71, 10)}

In [55]:
clf = GridSearchCV(pipeline, param_grid=param_grid_1, 
                   cv = cv, scoring=score_1, n_jobs=-1)
clf.fit(X_train, y_train)
score_1_best_para['n_estimators'] = clf.best_params_['classifier__n_estimators']
print('score_1_n_estimators: ', score_1_best_para['n_estimators'])
clf.grid_scores_, clf.best_params_, clf.best_score_

score_1_n_estimators:  70


([mean: 0.94152, std: 0.00668, params: {'classifier__n_estimators': 10},
  mean: 0.95314, std: 0.00413, params: {'classifier__n_estimators': 20},
  mean: 0.95705, std: 0.00432, params: {'classifier__n_estimators': 30},
  mean: 0.95916, std: 0.00407, params: {'classifier__n_estimators': 40},
  mean: 0.96038, std: 0.00391, params: {'classifier__n_estimators': 50},
  mean: 0.96138, std: 0.00365, params: {'classifier__n_estimators': 60},
  mean: 0.96215, std: 0.00356, params: {'classifier__n_estimators': 70}],
 {'classifier__n_estimators': 70},
 0.9621511471661051)

In [None]:
y_pred_class = clf.predict(X_test)
print('accuracy_score: ', metrics.accuracy_score(y_test, y_pred_class)) # 指所有分类正确的百分比
print(metrics.classification_report(y_test, y_pred_class))
print('confusion_matrix: ')
print( metrics.confusion_matrix(y_test, y_pred_class))

In [56]:
pipeline = Pipeline([('features', combined_features_2), 
                     ('classifier', GradientBoostingClassifier(learning_rate = 0.1, random_state=0))])

param_grid_1 = dict(classifier__n_estimators = range(10,71,10)) # 迭代次数/分类器个数
param_grid_1

{'classifier__n_estimators': range(10, 71, 10)}

In [57]:
clf = GridSearchCV(pipeline, param_grid=param_grid_1, 
                   cv = cv, scoring=score_2, n_jobs=-1)
clf.fit(X_train, y_train)
score_2_best_para['n_estimators'] = clf.best_params_['classifier__n_estimators']
print('score_2_n_estimators: ', score_2_best_para['n_estimators'])
clf.grid_scores_, clf.best_params_, clf.best_score_

score_2_n_estimators:  70


([mean: 0.85901, std: 0.00820, params: {'classifier__n_estimators': 10},
  mean: 0.87585, std: 0.00496, params: {'classifier__n_estimators': 20},
  mean: 0.87948, std: 0.00652, params: {'classifier__n_estimators': 30},
  mean: 0.88215, std: 0.00786, params: {'classifier__n_estimators': 40},
  mean: 0.88283, std: 0.00622, params: {'classifier__n_estimators': 50},
  mean: 0.88432, std: 0.00678, params: {'classifier__n_estimators': 60},
  mean: 0.88619, std: 0.00861, params: {'classifier__n_estimators': 70}],
 {'classifier__n_estimators': 70},
 0.8861906589379215)

In [None]:
y_pred_class = clf.predict(X_test)
print('accuracy_score: ', metrics.accuracy_score(y_test, y_pred_class)) # 指所有分类正确的百分比
print(metrics.classification_report(y_test, y_pred_class))
print('confusion_matrix: ')
print( metrics.confusion_matrix(y_test, y_pred_class))

### max_depth、min_samples_split
- 决策树最大深度max_depth: 
    - 默认可以不输入，如果不输入的话，决策树在建立子树的时候不会限制子树的深度。
    - 一般来说，数据少或者特征少的时候可以不管这个值。如果模型样本量多，特征也多的情况下，推荐限制这个最大深度，具体的取值取决于数据的分布。
    - 常用的可以取值10-100之间。
- 内部节点再划分所需最小样本数min_samples_split: 
    - 这个值限制了子树继续划分的条件，如果某节点的样本数少于min_samples_split，则不会继续再尝试选择最优特征来进行划分。 
    - 默认是2.如果样本量不大，不需要管这个值。如果样本量数量级非常大，则推荐增大这个值。

In [58]:
pipeline = Pipeline([('features', combined_features_1), 
                     ('classifier', GradientBoostingClassifier(n_estimators = score_1_best_para['n_estimators'],
                                                               learning_rate = 0.1, 
                                                           random_state=0))])

param_grid_2 = dict(classifier__max_depth=range(3,14,2), # 决策树最大深度
                    classifier__min_samples_split=range(50,201,20)) # 内部节点再划分所需最小样本数)
param_grid_2

{'classifier__max_depth': range(3, 14, 2),
 'classifier__min_samples_split': range(50, 201, 20)}

In [None]:
clf = GridSearchCV(pipeline, param_grid=param_grid_2, 
                   cv = cv, scoring=score_1, n_jobs=-1)
clf.fit(X_train, y_train)
score_1_best_para['max_depth'] = clf.best_params_['classifier__max_depth']
score_1_best_para['min_samples_split'] = clf.best_params_['classifier__min_samples_split']
print('score_1_max_depth: ', score_1_best_para['max_depth'])
print('score_1_min_samples_split: ', score_1_best_para['min_samples_split'])
clf.grid_scores_, clf.best_params_, clf.best_score_

In [None]:
y_pred_class = clf.predict(X_test)
print('accuracy_score: ', metrics.accuracy_score(y_test, y_pred_class)) # 指所有分类正确的百分比
print(metrics.classification_report(y_test, y_pred_class))
print('confusion_matrix: ')
print( metrics.confusion_matrix(y_test, y_pred_class))

In [None]:
pipeline = Pipeline([('features', combined_features_2), 
                     ('classifier', GradientBoostingClassifier(n_estimators = score_1_best_para['n_estimators'],
                                                               learning_rate = 0.1, 
                                                           random_state=0))])

param_grid_2 = dict(classifier__max_depth=range(3,14,2), # 决策树最大深度
                    classifier__min_samples_split=range(50,201,20)) # 内部节点再划分所需最小样本数)
param_grid_2

In [None]:
clf = GridSearchCV(pipeline, param_grid=param_grid_2, 
                   cv = cv, scoring=score_2, n_jobs=-1)
clf.fit(X_train, y_train)
score_2_best_para['max_depth'] = clf.best_params_['classifier__max_depth']
score_2_best_para['min_samples_split'] = clf.best_params_['classifier__min_samples_split']
print('score_2_max_depth: ', score_2_best_para['max_depth'])
print('score_2_min_samples_split: ', score_2_best_para['min_samples_split'])
clf.grid_scores_, clf.best_params_, clf.best_score_

In [None]:
y_pred_class = clf.predict(X_test)
print('accuracy_score: ', metrics.accuracy_score(y_test, y_pred_class)) # 指所有分类正确的百分比
print(metrics.classification_report(y_test, y_pred_class))
print('confusion_matrix: ')
print( metrics.confusion_matrix(y_test, y_pred_class))

### min_samples_leaf、min_samples_split
- 叶子节点最少样本数min_samples_leaf: 
    - 这个值限制了叶子节点最少的样本数，如果某叶子节点数目小于样本数，则会和兄弟节点一起被剪枝。 
    - 默认是1,可以输入最少的样本数的整数，或者最少样本数占样本总数的百分比。
    - 如果样本量不大，不需要管这个值。如果样本量数量级非常大，则推荐增大这个值。

In [None]:
pipeline = Pipeline([('features', combined_features_1), 
                     ('classifier', GradientBoostingClassifier(n_estimators = score_1_best_para['n_estimators'], 
                                                           max_depth = score_1_best_para['max_depth'], 
                                                               learning_rate = 0.1, 
                                                           random_state=0))])

param_grid_3 = dict(classifier__min_samples_leaf=range(10,60,10), 
                    classifier__min_samples_split=range(50,201,20)) # 内部节点再划分所需最小样本数)
param_grid_3

In [None]:
clf = GridSearchCV(pipeline, param_grid=param_grid_3, 
                   cv = cv, scoring=score_1, n_jobs=-1)
clf.fit(X_train, y_train)
score_1_best_para['min_samples_leaf'] = clf.best_params_['classifier__min_samples_leaf']
score_1_best_para['min_samples_split'] = clf.best_params_['classifier__min_samples_split']
print('score_1_min_samples_leaf: ', score_1_best_para['min_samples_leaf'])
print('score_1_min_samples_split: ', score_1_best_para['min_samples_split'])
clf.grid_scores_, clf.best_params_, clf.best_score_

In [None]:
y_pred_class = clf.predict(X_test)
print('accuracy_score: ', metrics.accuracy_score(y_test, y_pred_class)) # 指所有分类正确的百分比
print(metrics.classification_report(y_test, y_pred_class))
print('confusion_matrix: ')
print( metrics.confusion_matrix(y_test, y_pred_class))

In [None]:
pipeline = Pipeline([('features', combined_features_2), 
                     ('classifier', GradientBoostingClassifier(n_estimators = score_1_best_para['n_estimators'], 
                                                           max_depth = score_1_best_para['max_depth'], 
                                                               learning_rate = 0.1, 
                                                           random_state=0))])

param_grid_3 = dict(classifier__min_samples_leaf=range(10,60,10), 
                    classifier__min_samples_split=range(50,201,20)) # 内部节点再划分所需最小样本数)
param_grid_3

In [None]:
clf = GridSearchCV(pipeline, param_grid=param_grid_3, 
                   cv = cv, scoring=score_2, n_jobs=-1)
clf.fit(X_train, y_train)
score_2_best_para['min_samples_leaf'] = clf.best_params_['classifier__min_samples_leaf']
score_2_best_para['min_samples_split'] = clf.best_params_['classifier__min_samples_split']
print('score_2_min_samples_leaf: ', score_2_best_para['min_samples_leaf'])
print('score_2_min_samples_split: ', score_2_best_para['min_samples_split'])
clf.grid_scores_, clf.best_params_, clf.best_score_

In [None]:
y_pred_class = clf.predict(X_test)
print('accuracy_score: ', metrics.accuracy_score(y_test, y_pred_class)) # 指所有分类正确的百分比
print(metrics.classification_report(y_test, y_pred_class))
print('confusion_matrix: ')
print( metrics.confusion_matrix(y_test, y_pred_class))

### max_features
- 划分时考虑的最大特征数max_features: 
    - 可以使用很多种类型的值，默认是"None",意味着划分时考虑所有的特征数；
    - 如果是"log2"意味着划分时最多考虑log 2 N log2N个特征；
    - 如果是"sqrt"或者"auto"意味着划分时最多考虑N − −  √  N个特征。
    - 如果是整数，代表考虑的特征绝对数。
    - 如果是浮点数，代表考虑特征百分比，即考虑（百分比xN）取整后的特征数。其中N为样本总特征数。
    - 一般来说，如果样本特征数不多，比如小于50，我们用默认的"None"就可以了，如果特征数非常多，我们可以灵活使用刚才描述的其他取值来控制划分时考虑的最大特征数，以控制决策树的生成时间。

In [None]:
pipeline = Pipeline([('features', combined_features_1), 
                     ('classifier', GradientBoostingClassifier(n_estimators = score_1_best_para['n_estimators'], 
                                                           max_depth = score_1_best_para['max_depth'],  
                                                           min_samples_leaf = score_1_best_para['min_samples_leaf'],
                                                           min_samples_split = score_1_best_para['min_samples_split'],
                                                               learning_rate = 0.1, 
                                                           random_state=0))])

param_grid_4 = dict(classifier__max_features=[0.25, 0.5, 0.75, 1.0])
param_grid_4

In [None]:
clf = GridSearchCV(pipeline, param_grid=param_grid_4, 
                   cv = cv, scoring=score_1, n_jobs=-1)
clf.fit(X_train, y_train)
score_1_best_para['max_features'] = clf.best_params_['classifier__max_features']
print('score_1_max_features: ', score_1_best_para['max_features'])
clf.grid_scores_, clf.best_params_, clf.best_score_

In [None]:
y_pred_class = clf.predict(X_test)
print('accuracy_score: ', metrics.accuracy_score(y_test, y_pred_class)) # 指所有分类正确的百分比
print(metrics.classification_report(y_test, y_pred_class))
print('confusion_matrix: ')
print( metrics.confusion_matrix(y_test, y_pred_class))

In [None]:
pipeline = Pipeline([('features', combined_features_2), 
                     ('classifier', GradientBoostingClassifier(n_estimators = score_1_best_para['n_estimators'], 
                                                           max_depth = score_1_best_para['max_depth'],  
                                                           min_samples_leaf = score_1_best_para['min_samples_leaf'],
                                                           min_samples_split = score_1_best_para['min_samples_split'],
                                                               learning_rate = 0.1, 
                                                           random_state=0))])

param_grid_4 = dict(classifier__max_features=[0.25, 0.5, 0.75, 1.0])
param_grid_4

In [None]:
clf = GridSearchCV(pipeline, param_grid=param_grid_4, 
                   cv = cv, scoring=score_2, n_jobs=-1)
clf.fit(X_train, y_train)
score_2_best_para['max_features'] = clf.best_params_['classifier__max_features']
print('score_2_max_features: ', score_2_best_para['max_features'])
clf.grid_scores_, clf.best_params_, clf.best_score_

In [None]:
y_pred_class = clf.predict(X_test)
print('accuracy_score: ', metrics.accuracy_score(y_test, y_pred_class)) # 指所有分类正确的百分比
print(metrics.classification_report(y_test, y_pred_class))
print('confusion_matrix: ')
print( metrics.confusion_matrix(y_test, y_pred_class))

### subsample
- 子采样的比例

In [None]:
pipeline = Pipeline([('features', combined_features_1), 
                     ('classifier', GradientBoostingClassifier(n_estimators = score_1_best_para['n_estimators'], 
                                                           max_depth = score_1_best_para['max_depth'],  
                                                           min_samples_leaf = score_1_best_para['min_samples_leaf'],
                                                           min_samples_split = score_1_best_para['min_samples_split'],
                                                           max_features = score_1_best_para['max_features'],
                                                               learning_rate = 0.1, 
                                                           random_state=0))])

param_grid_5 = dict(classifier__subsample=[0.25, 0.5, 0.75, 1.0])
param_grid_5

In [None]:
clf = GridSearchCV(pipeline, param_grid=param_grid_5, 
                   cv = cv, scoring=score_1, n_jobs=-1)
clf.fit(X_train, y_train)
score_1_best_para['subsample'] = clf.best_params_['classifier__subsample']
print('score_1_subsample: ', score_1_best_para['subsample'])
clf.grid_scores_, clf.best_params_, clf.best_score_

In [None]:
y_pred_class = clf.predict(X_test)
print('accuracy_score: ', metrics.accuracy_score(y_test, y_pred_class)) # 指所有分类正确的百分比
print(metrics.classification_report(y_test, y_pred_class))
print('confusion_matrix: ')
print( metrics.confusion_matrix(y_test, y_pred_class))

In [None]:
pipeline = Pipeline([('features', combined_features_2), 
                     ('classifier', GradientBoostingClassifier(n_estimators = score_2_best_para['n_estimators'], 
                                                           max_depth = score_2_best_para['max_depth'],  
                                                           min_samples_leaf = score_2_best_para['min_samples_leaf'],
                                                           min_samples_split = score_2_best_para['min_samples_split'],
                                                           max_features = score_2_best_para['max_features'],
                                                               learning_rate = 0.1, 
                                                           random_state=0))])

param_grid_5 = dict(classifier__subsample=[0.25, 0.5, 0.75, 1.0])
param_grid_5

In [None]:
clf = GridSearchCV(pipeline, param_grid=param_grid_5, 
                   cv = cv, scoring=score_2, n_jobs=-1)
clf.fit(X_train, y_train)
score_2_best_para['subsample'] = clf.best_params_['classifier__subsample']
print('score_2_subsample: ', score_2_best_para['subsample'])
clf.grid_scores_, clf.best_params_, clf.best_score_

In [None]:
y_pred_class = clf.predict(X_test)
print('accuracy_score: ', metrics.accuracy_score(y_test, y_pred_class)) # 指所有分类正确的百分比
print(metrics.classification_report(y_test, y_pred_class))
print('confusion_matrix: ')
print( metrics.confusion_matrix(y_test, y_pred_class))

### 最终模型

In [None]:
score_1_best_para

In [None]:
score_2_best_para

In [None]:
pipeline = Pipeline([
    ('features', FeatureUnion([
        ('tf_idf', Pipeline([
            ('counts', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('chi', SelectKBest(chi2, k=score_1_best_para['chi__k']))
        ])),
        ('tf', Statskeywords(topk = score_1_best_para['topk'])),
        ('len_stats', StatsFeatures())
    ])),
    ('classifier', GradientBoostingClassifier( n_estimators = score_1_best_para['n_estimators'], 
                                          max_depth = score_1_best_para['max_depth'],  
                                          min_samples_leaf = score_1_best_para['min_samples_leaf'],
                                          min_samples_split = score_1_best_para['min_samples_split'],
                                          max_features = score_1_best_para['max_features'],
                                          subsample = score_1_best_para['subsample'],
                                              learning_rate = 0.1, 
                                          random_state=0))
])

pipeline.fit(X_train, y_train)
print(pipeline.score(X_train, y_train))
pipeline

In [None]:
y_pred_class = pipeline.predict(X_test)
print('accuracy_score: ', metrics.accuracy_score(y_test, y_pred_class)) # 指所有分类正确的百分比
print(metrics.classification_report(y_test, y_pred_class))
print('confusion_matrix: ')
print( metrics.confusion_matrix(y_test, y_pred_class))

In [None]:
pipeline = Pipeline([
    ('features', FeatureUnion([
        ('tf_idf', Pipeline([
            ('counts', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('chi', SelectKBest(chi2, k=score_2_best_para['chi__k']))
        ])),
        ('tf', Statskeywords(topk = score_2_best_para['topk'])),
        ('len_stats', StatsFeatures())
    ])),
    ('classifier', GradientBoostingClassifier( n_estimators = score_2_best_para['n_estimators'], 
                                          max_depth = score_2_best_para['max_depth'],  
                                          min_samples_leaf = score_2_best_para['min_samples_leaf'],
                                          min_samples_split = score_2_best_para['min_samples_split'],
                                          max_features = score_2_best_para['max_features'],
                                          subsample = score_2_best_para['subsample'],
                                              learning_rate = 0.1, 
                                          random_state=0))
])

pipeline.fit(X_train, y_train)
print(pipeline.score(X_train, y_train))
pipeline

In [None]:
y_pred_class = pipeline.predict(X_test)
print('accuracy_score: ', metrics.accuracy_score(y_test, y_pred_class)) # 指所有分类正确的百分比
print(metrics.classification_report(y_test, y_pred_class))
print('confusion_matrix: ')
print( metrics.confusion_matrix(y_test, y_pred_class))