# 本文件说明

- 优浪公司项目
- 预处理及特征值计算

# 基本设置

In [21]:
import os 
file_name = os.listdir(r"model/")
file_name

['youlang_model.pkl.z']

In [1]:
import jieba
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2

from sklearn import metrics
import xgboost as xgb
from xgboost import XGBClassifier

from sklearn.base import BaseEstimator, TransformerMixin

import joblib
%matplotlib inline
import numpy as np
import pandas as pd
import pre_cor
import os
from sklearn.model_selection import train_test_split
from collections import Counter
from jieba import analyse

import warnings
warnings.filterwarnings('ignore')

In [2]:
def getkeywords(X, N = 1000):
    '''
    训练时生成，合并所有记录，取N个关键词
    '''
    textrank = analyse.textrank

    text_combined = ' '.join(X)
    keywords = textrank(text_combined, topK = N)
    print('keywords num: ', len(keywords))
    if len(keywords) < N : 
        N  = len(keywords)

    if keywords:
        f = open("corpus/keywords.txt","w+", encoding='UTF-8')
        for content in keywords:
            content = content.strip()
            if content != ':AB:':
                f.write(content + '\n')
        f.close()

In [3]:
class Statskeywords(BaseEstimator, TransformerMixin):
    
    def __init__(self, topk = 100):
        self.topk = topk
#         print(self.topk)
        self.keywords = set()
        f = open("corpus/keywords.txt","r+", encoding='UTF-8')
        num = 0
        for content in f:
            if num < topk:
                self.keywords.add(content.strip().replace('\n', ''))
            num += 1
        f.close() 
        
        #初始化字典liwc
        self.liwc = {} 
        f2 = open("corpus/scliwc.txt",'r', encoding = 'gb18030')
        for ii in f2:     #ii在scliwc.txt中循环
            i = ii.strip().split() 
            self.liwc[i[0]] = i[1:len(i)]
        f2.close      
        
        self.category = set()
        for i in list(self.liwc.values()):
            for j in i:
                self.category.add(j)        
    
    def fit(self, X, y=None):
        return self 
    
    def transform(self, X):
        '''
        文本中关键词的词频
        '''                        
        data = []
        for x in X:
            words = x.split()
            word_tf = []
            keycnt = 0
            for kw in self.keywords:
                word_tf.append(words.count(kw)) # 各个关键词的词频
                if kw in words:keycnt+=1
            word_tf.append(keycnt) # 关键词的个数
            
            psy = []
            for w in words:
                if w in self.liwc: #是否liwc字典包含分词结果列表words的哪些分词
                    psy += self.liwc[w]  
            cat_tf = []
            for cat in self.category:
                cat_tf.append(psy.count(cat))                
                
            data.append(word_tf + cat_tf)            
        return data        

In [4]:
class StatsFeatures(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        self.neg = set()
        f = open("corpus/neg_words.txt","r+", encoding='UTF-8')
        for content in f:
            self.neg.add(content)
        f.close()       

    def fit(self, X, y=None):
        return self

    def getcnt(self,x): 
        '''词个数'''
        return len(list(set(x.split())))

    def getnegcnt(self,x):
        '''负面词个数'''
        negcnt = 0
        words = x.split()
        for w in words:
            if w in self.neg:
                negcnt = negcnt+1
        return negcnt

    def getrepcnt(self,x):
        '''重复词个数'''
        repcnt =0
        words = x.split()        
        for w in list(set(words)):
            if words.count(w)>1: # 记录重复词汇（词频大于1）
                repcnt += 1
        return repcnt
    
    def transform(self, X):
        '''
        文本长度、词个数、词比例、
        负面词个数、负面词比例、
        重复词个数、重复词比例
        '''
        data = []
        for x in X:
            if len(x) == 0:
                length  = 1
            else :
                length = len(x)
            data.append([len(x),self.getcnt(x),self.getcnt(x)/length,
                         self.getnegcnt(x),self.getnegcnt(x)/length,
                         self.getrepcnt(x),self.getrepcnt(x)/length])            
        return data

# 导入数据

## 预处理后数据

In [5]:
# 诈骗电话
corpus_pos = []
label_pos = []

filename = 'data/pos_pre_20180723.txt'
fid = open(filename, "r+", encoding='UTF-8')
for f in fid:
    corpus_pos.append(f)
    label_pos.append(1)
fid.close()
print(len(corpus_pos))
print(len(label_pos))

3
3


In [6]:
# 非诈骗电话
corpus_neg = []
label_neg = []

filename = 'data/neg_pre_20180723.txt'
fid = open(filename, "r+", encoding='UTF-8')
for f in fid:
    corpus_neg.append(f)
    label_neg.append(0)
fid.close()
print(len(corpus_neg))
print(len(label_neg))

3
3


In [7]:
folder = '20180703'

# 相关数据
corpus_cor = []
label_cor = []

filename = 'data/{0}/corpus_pre_cor_0703.txt'.format(folder)
fid = open(filename, "r+", encoding='UTF-8')
for f in fid:    
    corpus_cor.append(f)
    label_cor.append(1)
fid.close()
print(len(corpus_cor))
print(len(label_cor))

6111
6111


In [8]:
# 不相关数据
corpus_uncor = []
label_uncor = []

filename = 'data/{0}/corpus_pre_uncor_0703.txt'.format(folder)
fid = open(filename, "r+", encoding='UTF-8')
for f in fid:
    corpus_uncor.append(f)
    label_uncor.append(0)
fid.close()
print(len(corpus_uncor))
print(len(label_uncor))

8949
8949


## 分割数据

In [9]:
# corpus = corpus_pos + corpus_neg
# label = label_pos + label_neg
corpus = corpus_cor[:1000] + corpus_uncor[:1000]
label = label_cor[:1000] + label_uncor[:1000]

X_train, X_test, y_train, y_test = train_test_split(corpus, label, test_size=0.1, random_state=42)
print('训练集：',len(y_train))
print('训练集-各类数量：',Counter(y_train))
print('测试集：',len(y_test))
print('测试集-各类数量：',Counter(y_test))

训练集： 1800
训练集-各类数量： Counter({1: 901, 0: 899})
测试集： 200
测试集-各类数量： Counter({0: 101, 1: 99})


## 生成词典

In [103]:
if 0: getkeywords(corpus, N = 1000)

# 训练模型:xgboost

## 不调参

In [10]:
combined_features = FeatureUnion([
                                    ('tf_idf', Pipeline([
                                        ('counts', CountVectorizer()),
                                        ('tfidf', TfidfTransformer()),
                                        ('chi', SelectKBest(chi2, k=200))
                                        ])),
                                    ('tf', Statskeywords(topk = 100)),
                                    ('len_stats', StatsFeatures())
                                ])

In [11]:
pipeline = Pipeline([('features', combined_features), 
                     ('classifier', XGBClassifier(nthread = 4,# cpu 线程数
                                                  objective='multi:softmax', num_class=2))])

pipeline.fit(X_train, y_train)
print(pipeline.score(X_train, y_train))
pipeline
# clf_xgb = GridSearchCV(pipeline, param_grid=param_grid, verbose=10, cv = 10)
# clf_xgb.fit(X_train, y_train)

0.9872222222222222


Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('tf_idf', Pipeline(memory=None,
     steps=[('counts', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0...tate=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1))])

In [12]:
y_pred_class = pipeline.predict(X_test)
print('accuracy_score: ', metrics.accuracy_score(y_test, y_pred_class)) # 指所有分类正确的百分比
print(metrics.classification_report(y_test, y_pred_class))
print('confusion_matrix: ')
print( metrics.confusion_matrix(y_test, y_pred_class))

accuracy_score:  0.965
             precision    recall  f1-score   support

          0       0.94      0.99      0.97       101
          1       0.99      0.94      0.96        99

avg / total       0.97      0.96      0.96       200

confusion_matrix: 
[[100   1]
 [  6  93]]


# 模型保存

In [14]:
from sklearn.externals import joblib
joblib.dump(pipeline, "model/youlang_model.pkl.z")

['model/youlang_model.pkl.z']

## 调参

In [110]:
score_1_best_para = {}
score_2_best_para = {}
cv = 5
score_1 = 'roc_auc'
score_2 = 'recall_macro'

### topk、chi_k

In [111]:
pipeline = Pipeline([('features', combined_features), 
                     ('classifier', XGBClassifier(max_depth = 7, gamma = 0,
                                                  objective='multi:softmax', num_class=2))])

param_grid_0 = dict(features__tf_idf__chi__k = [100, 200],# [2000, 5000, 20000, 40000],
                   features__tf__topk=[50,100, 200]) 
param_grid_0

{'features__tf_idf__chi__k': [100, 200], 'features__tf__topk': [50, 100, 200]}

In [112]:
clf = GridSearchCV(pipeline, param_grid=param_grid_0, 
                   cv = cv, scoring=score_1, n_jobs=-1, verbose=10)
clf.fit(X_train, y_train)
score_1_best_para['chi__k'] = clf.best_params_['features__tf_idf__chi__k']
score_1_best_para['topk'] = clf.best_params_['features__tf__topk']
print('score_1_chi__k: ', score_1_best_para['chi__k'])
print('score_1_topk: ', score_1_best_para['topk'])
clf.grid_scores_, clf.best_params_, clf.best_score_

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] features__tf__topk=50, features__tf_idf__chi__k=100 .............
[CV] features__tf__topk=50, features__tf_idf__chi__k=100 .............
[CV] features__tf__topk=50, features__tf_idf__chi__k=100 .............
[CV] features__tf__topk=50, features__tf_idf__chi__k=200 .............
[CV]  features__tf__topk=50, features__tf_idf__chi__k=100, score=0.9499219620958752, total= 1.3min
[CV] features__tf__topk=50, features__tf_idf__chi__k=200 .............
[CV]  features__tf__topk=50, features__tf_idf__chi__k=100, score=0.9600000000000001, total= 1.3min
[CV] features__tf__topk=50, features__tf_idf__chi__k=200 .............
[CV]  features__tf__topk=50, features__tf_idf__chi__k=200, score=0.9567607973421925, total= 1.3min
[CV] features__tf__topk=100, features__tf_idf__chi__k=100 ............
[CV]  features__tf__topk=50, features__tf_idf__chi__k=100, score=0.9584274640088594, total= 1.3min
[CV] features__tf__topk=100, features__tf_idf__

[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  4.1min


[CV]  features__tf__topk=50, features__tf_idf__chi__k=200, score=0.9566666666666667, total= 1.3min
[CV] features__tf__topk=100, features__tf_idf__chi__k=200 ............
[CV]  features__tf__topk=100, features__tf_idf__chi__k=100, score=0.9600000000000001, total= 1.3min
[CV] features__tf__topk=100, features__tf_idf__chi__k=200 ............
[CV]  features__tf__topk=100, features__tf_idf__chi__k=100, score=0.9584274640088594, total= 1.3min
[CV] features__tf__topk=100, features__tf_idf__chi__k=200 ............
[CV]  features__tf__topk=100, features__tf_idf__chi__k=100, score=0.9499219620958752, total= 1.3min
[CV] features__tf__topk=200, features__tf_idf__chi__k=100 ............
[CV]  features__tf__topk=100, features__tf_idf__chi__k=200, score=0.9482552954292085, total= 1.3min
[CV] features__tf__topk=200, features__tf_idf__chi__k=100 ............
[CV]  features__tf__topk=100, features__tf_idf__chi__k=200, score=0.9566666666666667, total= 1.4min


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  6.3min


[CV] features__tf__topk=200, features__tf_idf__chi__k=100 ............
[CV]  features__tf__topk=100, features__tf_idf__chi__k=200, score=0.9567607973421925, total= 1.4min
[CV] features__tf__topk=200, features__tf_idf__chi__k=200 ............
[CV]  features__tf__topk=200, features__tf_idf__chi__k=100, score=0.9584274640088594, total= 1.3min
[CV] features__tf__topk=200, features__tf_idf__chi__k=200 ............


[Parallel(n_jobs=-1)]: Done  13 out of  18 | elapsed:  8.2min remaining:  3.2min


[CV]  features__tf__topk=200, features__tf_idf__chi__k=100, score=0.9600000000000001, total= 1.3min
[CV] features__tf__topk=200, features__tf_idf__chi__k=200 ............
[CV]  features__tf__topk=200, features__tf_idf__chi__k=100, score=0.9499219620958752, total= 1.3min


[Parallel(n_jobs=-1)]: Done  15 out of  18 | elapsed:  8.3min remaining:  1.7min


[CV]  features__tf__topk=200, features__tf_idf__chi__k=200, score=0.9567607973421925, total= 1.3min
[CV]  features__tf__topk=200, features__tf_idf__chi__k=200, score=0.9482552954292085, total= 1.1min
[CV]  features__tf__topk=200, features__tf_idf__chi__k=200, score=0.9566666666666667, total= 1.1min


[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed: 10.1min finished


score_1_chi__k:  100
score_1_topk:  50


([mean: 0.95612, std: 0.00443, params: {'features__tf__topk': 50, 'features__tf_idf__chi__k': 100},
  mean: 0.95390, std: 0.00399, params: {'features__tf__topk': 50, 'features__tf_idf__chi__k': 200},
  mean: 0.95612, std: 0.00443, params: {'features__tf__topk': 100, 'features__tf_idf__chi__k': 100},
  mean: 0.95390, std: 0.00399, params: {'features__tf__topk': 100, 'features__tf_idf__chi__k': 200},
  mean: 0.95612, std: 0.00443, params: {'features__tf__topk': 200, 'features__tf_idf__chi__k': 100},
  mean: 0.95390, std: 0.00399, params: {'features__tf__topk': 200, 'features__tf_idf__chi__k': 200}],
 {'features__tf__topk': 50, 'features__tf_idf__chi__k': 100},
 0.9561212006470854)

In [None]:
y_pred_class = clf.predict(X_test)
print('accuracy_score: ', metrics.accuracy_score(y_test, y_pred_class)) # 指所有分类正确的百分比
print(metrics.classification_report(y_test, y_pred_class))
print('confusion_matrix: ')
print( metrics.confusion_matrix(y_test, y_pred_class))

In [113]:
pipeline = Pipeline([('features', combined_features), 
                     ('classifier', XGBClassifier(max_depth = 7, gamma = 0, 
                                                  objective='multi:softmax', num_class=2))])

param_grid_0 = dict(features__tf_idf__chi__k = [100, 200, 300],# [2000, 5000, 20000, 40000],
                   features__tf__topk=[50,100, 200]) 
param_grid_0

{'features__tf_idf__chi__k': [100, 200, 300],
 'features__tf__topk': [50, 100, 200]}

In [114]:
clf = GridSearchCV(pipeline, param_grid=param_grid_0, 
                   cv = cv, scoring=score_2, n_jobs=-1, verbose=10)
clf.fit(X_train, y_train)
score_2_best_para['chi__k'] = clf.best_params_['features__tf_idf__chi__k']
score_2_best_para['topk'] = clf.best_params_['features__tf__topk']
print('score_2_chi__k: ', score_2_best_para['chi__k'])
print('score_2_topk: ', score_2_best_para['topk'])
clf.grid_scores_, clf.best_params_, clf.best_score_

Fitting 10 folds for each of 9 candidates, totalling 90 fits
[CV] features__tf__topk=50, features__tf_idf__chi__k=100 .............
[CV] features__tf__topk=50, features__tf_idf__chi__k=100 .............
[CV] features__tf__topk=50, features__tf_idf__chi__k=100 .............
[CV] features__tf__topk=50, features__tf_idf__chi__k=100 .............
[CV]  features__tf__topk=50, features__tf_idf__chi__k=100, score=0.95, total= 1.7min
[CV] features__tf__topk=50, features__tf_idf__chi__k=100 .............
[CV]  features__tf__topk=50, features__tf_idf__chi__k=100, score=0.9833333333333334, total= 1.7min
[CV] features__tf__topk=50, features__tf_idf__chi__k=100 .............
[CV]  features__tf__topk=50, features__tf_idf__chi__k=100, score=0.9611111111111111, total= 1.7min
[CV] features__tf__topk=50, features__tf_idf__chi__k=100 .............
[CV]  features__tf__topk=50, features__tf_idf__chi__k=100, score=0.9559218559218559, total= 1.7min
[CV] features__tf__topk=50, features__tf_idf__chi__k=100 ...

[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  5.8min


[CV]  features__tf__topk=50, features__tf_idf__chi__k=100, score=0.961111111111111, total= 1.6min
[CV] features__tf__topk=50, features__tf_idf__chi__k=100 .............
[CV]  features__tf__topk=50, features__tf_idf__chi__k=100, score=0.9722222222222222, total= 1.6min
[CV] features__tf__topk=50, features__tf_idf__chi__k=200 .............
[CV]  features__tf__topk=50, features__tf_idf__chi__k=100, score=0.9722222222222223, total= 1.6min
[CV] features__tf__topk=50, features__tf_idf__chi__k=200 .............
[CV]  features__tf__topk=50, features__tf_idf__chi__k=100, score=0.9722222222222222, total= 1.5min
[CV] features__tf__topk=50, features__tf_idf__chi__k=200 .............
[CV]  features__tf__topk=50, features__tf_idf__chi__k=100, score=0.9609862671660424, total= 1.5min
[CV] features__tf__topk=50, features__tf_idf__chi__k=200 .............


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  8.4min


[CV]  features__tf__topk=50, features__tf_idf__chi__k=200, score=0.9503663003663003, total= 1.5min
[CV] features__tf__topk=50, features__tf_idf__chi__k=200 .............
[CV]  features__tf__topk=50, features__tf_idf__chi__k=200, score=0.9833333333333334, total= 1.5min
[CV] features__tf__topk=50, features__tf_idf__chi__k=200 .............
[CV]  features__tf__topk=50, features__tf_idf__chi__k=200, score=0.9555555555555556, total= 1.4min
[CV] features__tf__topk=50, features__tf_idf__chi__k=200 .............
[CV]  features__tf__topk=50, features__tf_idf__chi__k=200, score=0.95, total= 1.4min
[CV] features__tf__topk=50, features__tf_idf__chi__k=200 .............
[CV]  features__tf__topk=50, features__tf_idf__chi__k=200, score=0.9666666666666667, total= 1.4min
[CV] features__tf__topk=50, features__tf_idf__chi__k=200 .............
[CV]  features__tf__topk=50, features__tf_idf__chi__k=200, score=0.9333333333333333, total= 1.4min
[CV] features__tf__topk=50, features__tf_idf__chi__k=200 ........

[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed: 13.3min


[CV]  features__tf__topk=50, features__tf_idf__chi__k=200, score=0.9666666666666667, total= 1.4min
[CV] features__tf__topk=50, features__tf_idf__chi__k=300 .............
[CV]  features__tf__topk=50, features__tf_idf__chi__k=200, score=0.95, total= 1.4min
[CV] features__tf__topk=50, features__tf_idf__chi__k=300 .............
[CV]  features__tf__topk=50, features__tf_idf__chi__k=200, score=0.949812734082397, total= 1.4min
[CV] features__tf__topk=50, features__tf_idf__chi__k=300 .............
[CV]  features__tf__topk=50, features__tf_idf__chi__k=300, score=0.9559218559218559, total= 1.4min
[CV] features__tf__topk=50, features__tf_idf__chi__k=300 .............
[CV]  features__tf__topk=50, features__tf_idf__chi__k=300, score=0.9611111111111111, total= 1.4min
[CV] features__tf__topk=50, features__tf_idf__chi__k=300 .............
[CV]  features__tf__topk=50, features__tf_idf__chi__k=300, score=0.9444444444444444, total= 1.4min
[CV] features__tf__topk=50, features__tf_idf__chi__k=300 .........

[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed: 15.8min


[CV]  features__tf__topk=50, features__tf_idf__chi__k=300, score=0.961111111111111, total= 1.3min
[CV] features__tf__topk=50, features__tf_idf__chi__k=300 .............
[CV]  features__tf__topk=50, features__tf_idf__chi__k=300, score=0.9722222222222223, total= 1.4min
[CV] features__tf__topk=50, features__tf_idf__chi__k=300 .............
[CV]  features__tf__topk=50, features__tf_idf__chi__k=300, score=0.95, total= 1.5min
[CV] features__tf__topk=100, features__tf_idf__chi__k=100 ............
[CV]  features__tf__topk=50, features__tf_idf__chi__k=300, score=0.9722222222222222, total= 1.5min
[CV] features__tf__topk=100, features__tf_idf__chi__k=100 ............
[CV]  features__tf__topk=50, features__tf_idf__chi__k=300, score=0.949812734082397, total= 1.5min
[CV] features__tf__topk=100, features__tf_idf__chi__k=100 ............
[CV]  features__tf__topk=100, features__tf_idf__chi__k=100, score=0.9559218559218559, total= 1.5min
[CV] features__tf__topk=100, features__tf_idf__chi__k=100 ........

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 23.9min


[CV] features__tf__topk=100, features__tf_idf__chi__k=100 ............
[CV]  features__tf__topk=100, features__tf_idf__chi__k=100, score=0.9611111111111111, total= 1.6min
[CV] features__tf__topk=100, features__tf_idf__chi__k=100 ............
[CV]  features__tf__topk=100, features__tf_idf__chi__k=100, score=0.9722222222222223, total= 1.7min
[CV]  features__tf__topk=100, features__tf_idf__chi__k=100, score=0.9388888888888889, total= 1.7min
[CV] features__tf__topk=100, features__tf_idf__chi__k=100 ............
[CV] features__tf__topk=100, features__tf_idf__chi__k=100 ............
[CV]  features__tf__topk=100, features__tf_idf__chi__k=100, score=0.9722222222222222, total= 1.6min
[CV] features__tf__topk=100, features__tf_idf__chi__k=200 ............
[CV]  features__tf__topk=100, features__tf_idf__chi__k=100, score=0.961111111111111, total= 1.6min
[CV] features__tf__topk=100, features__tf_idf__chi__k=200 ............
[CV]  features__tf__topk=100, features__tf_idf__chi__k=100, score=0.9609862

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 29.3min


[CV]  features__tf__topk=100, features__tf_idf__chi__k=200, score=0.9555555555555556, total= 1.6min
[CV]  features__tf__topk=100, features__tf_idf__chi__k=200, score=0.95, total= 1.5min
[CV] features__tf__topk=100, features__tf_idf__chi__k=200 ............
[CV] features__tf__topk=100, features__tf_idf__chi__k=200 ............
[CV]  features__tf__topk=100, features__tf_idf__chi__k=200, score=0.9666666666666667, total= 1.4min
[CV] features__tf__topk=100, features__tf_idf__chi__k=200 ............
[CV]  features__tf__topk=100, features__tf_idf__chi__k=200, score=0.9333333333333333, total= 1.4min
[CV] features__tf__topk=100, features__tf_idf__chi__k=200 ............
[CV]  features__tf__topk=100, features__tf_idf__chi__k=200, score=0.95, total= 1.3min
[CV] features__tf__topk=100, features__tf_idf__chi__k=300 ............
[CV]  features__tf__topk=100, features__tf_idf__chi__k=200, score=0.9777777777777777, total= 1.4min
[CV] features__tf__topk=100, features__tf_idf__chi__k=300 ............
[C

[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed: 36.5min


[CV]  features__tf__topk=100, features__tf_idf__chi__k=300, score=0.9444444444444444, total= 1.4min
[CV] features__tf__topk=100, features__tf_idf__chi__k=300 ............
[CV]  features__tf__topk=100, features__tf_idf__chi__k=300, score=0.9722222222222223, total= 1.4min
[CV] features__tf__topk=100, features__tf_idf__chi__k=300 ............
[CV]  features__tf__topk=100, features__tf_idf__chi__k=300, score=0.95, total= 1.4min
[CV] features__tf__topk=100, features__tf_idf__chi__k=300 ............
[CV]  features__tf__topk=100, features__tf_idf__chi__k=300, score=0.9722222222222222, total= 1.4min
[CV] features__tf__topk=200, features__tf_idf__chi__k=100 ............
[CV]  features__tf__topk=100, features__tf_idf__chi__k=300, score=0.961111111111111, total= 1.4min
[CV] features__tf__topk=200, features__tf_idf__chi__k=100 ............
[CV]  features__tf__topk=100, features__tf_idf__chi__k=300, score=0.9555555555555555, total= 1.5min
[CV] features__tf__topk=200, features__tf_idf__chi__k=100 ..

[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed: 42.5min


[CV]  features__tf__topk=200, features__tf_idf__chi__k=100, score=0.9722222222222223, total= 1.7min
[CV] features__tf__topk=200, features__tf_idf__chi__k=100 ............
[CV]  features__tf__topk=200, features__tf_idf__chi__k=100, score=0.9388888888888889, total= 1.7min
[CV] features__tf__topk=200, features__tf_idf__chi__k=100 ............
[CV]  features__tf__topk=200, features__tf_idf__chi__k=100, score=0.9722222222222222, total= 1.6min
[CV] features__tf__topk=200, features__tf_idf__chi__k=200 ............
[CV]  features__tf__topk=200, features__tf_idf__chi__k=100, score=0.961111111111111, total= 1.6min
[CV] features__tf__topk=200, features__tf_idf__chi__k=200 ............
[CV]  features__tf__topk=200, features__tf_idf__chi__k=100, score=0.9609862671660424, total= 1.6min
[CV]  features__tf__topk=200, features__tf_idf__chi__k=100, score=0.9722222222222222, total= 1.5min
[CV] features__tf__topk=200, features__tf_idf__chi__k=200 ............
[CV] features__tf__topk=200, features__tf_idf_

[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed: 53.1min


[CV]  features__tf__topk=200, features__tf_idf__chi__k=200, score=0.9666666666666667, total= 1.3min
[CV] features__tf__topk=200, features__tf_idf__chi__k=300 ............
[CV]  features__tf__topk=200, features__tf_idf__chi__k=200, score=0.9777777777777777, total= 1.4min
[CV] features__tf__topk=200, features__tf_idf__chi__k=300 ............
[CV]  features__tf__topk=200, features__tf_idf__chi__k=200, score=0.949812734082397, total= 1.4min
[CV] features__tf__topk=200, features__tf_idf__chi__k=300 ............
[CV]  features__tf__topk=200, features__tf_idf__chi__k=300, score=0.9833333333333334, total= 1.4min
[CV] features__tf__topk=200, features__tf_idf__chi__k=300 ............
[CV]  features__tf__topk=200, features__tf_idf__chi__k=300, score=0.9559218559218559, total= 1.4min
[CV] features__tf__topk=200, features__tf_idf__chi__k=300 ............
[CV]  features__tf__topk=200, features__tf_idf__chi__k=300, score=0.9611111111111111, total= 1.4min
[CV] features__tf__topk=200, features__tf_idf_

[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed: 60.3min finished


score_2_chi__k:  100
score_2_topk:  50


([mean: 0.96280, std: 0.01216, params: {'features__tf__topk': 50, 'features__tf_idf__chi__k': 100},
  mean: 0.95835, std: 0.01432, params: {'features__tf__topk': 50, 'features__tf_idf__chi__k': 200},
  mean: 0.96058, std: 0.01151, params: {'features__tf__topk': 50, 'features__tf_idf__chi__k': 300},
  mean: 0.96280, std: 0.01216, params: {'features__tf__topk': 100, 'features__tf_idf__chi__k': 100},
  mean: 0.95835, std: 0.01432, params: {'features__tf__topk': 100, 'features__tf_idf__chi__k': 200},
  mean: 0.96058, std: 0.01151, params: {'features__tf__topk': 100, 'features__tf_idf__chi__k': 300},
  mean: 0.96280, std: 0.01216, params: {'features__tf__topk': 200, 'features__tf_idf__chi__k': 100},
  mean: 0.95835, std: 0.01432, params: {'features__tf__topk': 200, 'features__tf_idf__chi__k': 200},
  mean: 0.96058, std: 0.01151, params: {'features__tf__topk': 200, 'features__tf_idf__chi__k': 300}],
 {'features__tf__topk': 50, 'features__tf_idf__chi__k': 100},
 0.9627991098580987)

In [None]:
y_pred_class = clf.predict(X_test)
print('accuracy_score: ', metrics.accuracy_score(y_test, y_pred_class)) # 指所有分类正确的百分比
print(metrics.classification_report(y_test, y_pred_class))
print('confusion_matrix: ')
print( metrics.confusion_matrix(y_test, y_pred_class))

### combined_features

In [115]:
combined_features_1 = FeatureUnion([
                                    ('tf_idf', Pipeline([
                                        ('counts', CountVectorizer()),
                                        ('tfidf', TfidfTransformer()),
                                        ('chi', SelectKBest(chi2, k=score_1_best_para['chi__k']))
                                        ])),
                                    ('tf', Statskeywords(topk = score_1_best_para['topk'])),
                                    ('len_stats', StatsFeatures())
                                ])

In [116]:
combined_features_2 = FeatureUnion([
                                    ('tf_idf', Pipeline([
                                        ('counts', CountVectorizer()),
                                        ('tfidf', TfidfTransformer()),
                                        ('chi', SelectKBest(chi2, k=score_2_best_para['chi__k']))
                                        ])),
                                    ('tf', Statskeywords(topk = score_2_best_para['topk'])),
                                    ('len_stats', StatsFeatures())
                                ])

### n_estimators、learning_rate
- 选择对应于此学习速率的理想决策树数量。
- learning_rate
    - 通过减少每一步的权重，可以提高模型的鲁棒性。 取值范围为：[0,1]。缺省值为0.3。
    - 选择对应于此学习速率的理想决策树数量。

In [117]:
pipeline = Pipeline([('features', combined_features_1), 
                     ('classifier', XGBClassifier(max_depth = 7, gamma = 0, 
                                                  objective='multi:softmax', num_class=2))])

param_grid_1 = dict(classifier__n_estimators = range(10,71,10), 
                    classifier__learning_rate=[0.01, 0.1, 0.3]) # 理想的学习速率有时候会在0.05到0.3之间波动
param_grid_1

{'classifier__n_estimators': range(10, 71, 10),
 'classifier__learning_rate': [0.01, 0.1, 0.3]}

In [None]:
clf = GridSearchCV(pipeline, param_grid=param_grid_1, 
                   cv = cv, scoring=score_1, n_jobs=-1)
clf.fit(X_train, y_train)
score_1_best_para['n_estimators'] = clf.best_params_['classifier__n_estimators']
score_1_best_para['learning_rate'] = clf.best_params_['classifier__learning_rate']
print('score_1_n_estimators: ', score_1_best_para['n_estimators'])
print('score_1_learning_rate: ', score_1_best_para['learning_rate'])
clf.grid_scores_, clf.best_params_, clf.best_score_

In [None]:
y_pred_class = clf.predict(X_test)
print('accuracy_score: ', metrics.accuracy_score(y_test, y_pred_class)) # 指所有分类正确的百分比
print(metrics.classification_report(y_test, y_pred_class))
print('confusion_matrix: ')
print( metrics.confusion_matrix(y_test, y_pred_class))

In [None]:
pipeline = Pipeline([('features', combined_features_2), 
                     ('classifier', XGBClassifier(max_depth = 7, gamma = 0, 
                                                  objective='multi:softmax', num_class=2))])

param_grid_1 = dict(classifier__n_estimators = range(10,71,10), # 迭代次数/分类器个数
                   classifier__learning_rate=[0.01, 0.1, 0.3]) 
param_grid_1

In [None]:
clf = GridSearchCV(pipeline, param_grid=param_grid_1, 
                   cv = cv, scoring=score_2, n_jobs=-1)
clf.fit(X_train, y_train)
score_2_best_para['n_estimators'] = clf.best_params_['classifier__n_estimators']
score_2_best_para['learning_rate'] = clf.best_params_['classifier__learning_rate']
print('score_2_n_estimators: ', score_2_best_para['n_estimators'])
print('score_2_learning_rate: ', score_2_best_para['learning_rate'])
clf.grid_scores_, clf.best_params_, clf.best_score_

In [None]:
y_pred_class = clf.predict(X_test)
print('accuracy_score: ', metrics.accuracy_score(y_test, y_pred_class)) # 指所有分类正确的百分比
print(metrics.classification_report(y_test, y_pred_class))
print('confusion_matrix: ')
print( metrics.confusion_matrix(y_test, y_pred_class))

### max_depth、min_child_weight
- max_depth：树的最大深度
    - 用来避免过拟合的。max_depth越大，模型会学到更具体更局部的样本。 
    - 需要使用CV函数来进行调优。 典型值：3-10。起始值在4-6之间都是不错的选择。取值范围为：[1,∞]
    - 树的深度越大，则对数据的拟合程度越高（过拟合程度也越高）。即该参数也是控制过拟合
- min_child_weight：最小样本权重的和
    - 用于避免过拟合。当它的值较大时，可以避免模型学习到局部的特殊样本。 但是如果这个值过高，会导致欠拟合。
    - 这个参数需要使用CV来调整。取值范围为: [0,∞]
    - 如果一个叶子节点的样本权重和小于min_child_weight则拆分过程结束。在现行回归模型中，这个参数是指建立每个模型所需要的最小样本数。该成熟越大算法越conservative。即调大这个参数能够控制过拟合。

In [None]:
pipeline = Pipeline([('features', combined_features_1), 
                     ('classifier', XGBClassifier(n_estimators = score_1_best_para['n_estimators'],
                                                  learning_rate = score_1_best_para['learning_rate'],
                                                  objective='multi:softmax', num_class=2))])

param_grid_2 = dict(classifier__max_depth=range(3,14,2), # 决策树最大深度
                    classifier__min_child_weight=[4, 5, 6]) 
param_grid_2

In [None]:
clf = GridSearchCV(pipeline, param_grid=param_grid_2, 
                   cv = cv, scoring=score_1, n_jobs=-1)
clf.fit(X_train, y_train)
score_1_best_para['max_depth'] = clf.best_params_['classifier__max_depth']
score_1_best_para['min_child_weight'] = clf.best_params_['classifier__min_child_weight']
print('score_1_max_depth: ', score_1_best_para['max_depth'])
print('score_1_min_child_weight: ', score_1_best_para['min_child_weight'])
clf.grid_scores_, clf.best_params_, clf.best_score_

In [None]:
y_pred_class = clf.predict(X_test)
print('accuracy_score: ', metrics.accuracy_score(y_test, y_pred_class)) # 指所有分类正确的百分比
print(metrics.classification_report(y_test, y_pred_class))
print('confusion_matrix: ')
print( metrics.confusion_matrix(y_test, y_pred_class))

In [None]:
pipeline = Pipeline([('features', combined_features_2), 
                     ('classifier', XGBClassifier(n_estimators = score_2_best_para['n_estimators'],
                                                  learning_rate = score_2_best_para['learning_rate'],
                                                  objective='multi:softmax', num_class=2))])

param_grid_2 = dict(classifier__max_depth=range(3,14,2), # 决策树最大深度
                    classifier__min_child_weight=[4, 5, 6]) 
param_grid_2

In [None]:
clf = GridSearchCV(pipeline, param_grid=param_grid_2, 
                   cv = cv, scoring=score_2, n_jobs=-1)
clf.fit(X_train, y_train)
score_2_best_para['max_depth'] = clf.best_params_['classifier__max_depth']
score_2_best_para['min_child_weight'] = clf.best_params_['classifier__min_child_weight']
print('score_2_max_depth: ', score_2_best_para['max_depth'])
print('score_2_min_child_weight: ', score_2_best_para['min_child_weight'])
clf.grid_scores_, clf.best_params_, clf.best_score_

In [None]:
y_pred_class = clf.predict(X_test)
print('accuracy_score: ', metrics.accuracy_score(y_test, y_pred_class)) # 指所有分类正确的百分比
print(metrics.classification_report(y_test, y_pred_class))
print('confusion_matrix: ')
print( metrics.confusion_matrix(y_test, y_pred_class))

### gamma
- 在节点分裂时，只有分裂后损失函数的值下降了，才会分裂这个节点。Gamma指定了节点分裂所需的最小损失函数下降值。 这个参数的值越大，算法越保守。
- range: [0,∞]
- 模型在默认情况下，对于一个节点的划分只有在其loss function 得到结果大于0的情况下才进行，而gamma 给定了所需的最低loss function的值gamma值使得算法更conservation，且其值依赖于loss function ，在模型中应该进行调参。

In [None]:
pipeline = Pipeline([('features', combined_features_1), 
                     ('classifier', XGBClassifier(n_estimators = score_1_best_para['n_estimators'],
                                                  learning_rate = score_1_best_para['learning_rate'],
                                                  max_depth = score_1_best_para['max_depth'],
                                                  min_child_weight = score_1_best_para['min_child_weight'],                                                  
                                                  objective='multi:softmax', num_class=2))])

param_grid_2 = dict(classifier__gamma=[i/10.0 for i in range(0,5)]) 
param_grid_2

In [None]:
clf = GridSearchCV(pipeline, param_grid=param_grid_2, 
                   cv = cv, scoring=score_1, n_jobs=-1)
clf.fit(X_train, y_train)
score_1_best_para['gamma'] = clf.best_params_['classifier__gamma']
print('score_1_gamma: ', score_1_best_para['gamma'])
clf.grid_scores_, clf.best_params_, clf.best_score_

In [None]:
y_pred_class = clf.predict(X_test)
print('accuracy_score: ', metrics.accuracy_score(y_test, y_pred_class)) # 指所有分类正确的百分比
print(metrics.classification_report(y_test, y_pred_class))
print('confusion_matrix: ')
print( metrics.confusion_matrix(y_test, y_pred_class))

In [None]:
pipeline = Pipeline([('features', combined_features_2), 
                     ('classifier', XGBClassifier(n_estimators = score_2_best_para['n_estimators'],
                                                  learning_rate = score_2_best_para['learning_rate'],
                                                  max_depth = score_2_best_para['max_depth'],
                                                  min_child_weight = score_2_best_para['min_child_weight'],                                                  
                                                  objective='multi:softmax', num_class=2))])

param_grid_2 = dict(classifier__gamma=[i/10.0 for i in range(0,5)]) 
param_grid_2

In [None]:
clf = GridSearchCV(pipeline, param_grid=param_grid_2, 
                   cv = cv, scoring=score_2, n_jobs=-1)
clf.fit(X_train, y_train)
score_2_best_para['gamma'] = clf.best_params_['classifier__gamma']
print('score_2_gamma: ', score_2_best_para['gamma'])
clf.grid_scores_, clf.best_params_, clf.best_score_

In [None]:
y_pred_class = clf.predict(X_test)
print('accuracy_score: ', metrics.accuracy_score(y_test, y_pred_class)) # 指所有分类正确的百分比
print(metrics.classification_report(y_test, y_pred_class))
print('confusion_matrix: ')
print( metrics.confusion_matrix(y_test, y_pred_class))

### subsample、colsample_bytree
- subsample
    - 控制对于每棵树，随机采样的比例。 减小这个参数的值，算法会更加保守，避免过拟合。但是，如果这个值设置得过小，它可能会导致欠拟合。 典型值：0.5-1
    - 如果设置为0.5则意味着XGBoost将随机的从整个样本集合中抽取出50%的子样本建立树模型，这能够防止过拟合。
- colsample_bytree
    - 和GBM里面的max_features参数类似。用来控制每棵随机采样的列数的占比(每一列是一个特征)。 
    - 典型值：0.5-1。取值范围为：(0,1]。

In [None]:
pipeline = Pipeline([('features', combined_features_1), 
                     ('classifier', XGBClassifier(n_estimators = score_1_best_para['n_estimators'],
                                                  learning_rate = score_1_best_para['learning_rate'],
                                                  max_depth = score_1_best_para['max_depth'],
                                                  min_child_weight = score_1_best_para['min_child_weight'],      
                                                  gamma = score_1_best_para['gamma'],
                                                  objective='multi:softmax', num_class=2))])

param_grid_3 = dict(classifier__subsample=[0.6,0.8,1.0], 
                    classifier__colsample_bytree=[0.6,0.8,1.0]) # 内部节点再划分所需最小样本数)
param_grid_3

In [None]:
clf = GridSearchCV(pipeline, param_grid=param_grid_3, 
                   cv = cv, scoring=score_1, n_jobs=-1)
clf.fit(X_train, y_train)
score_1_best_para['subsample'] = clf.best_params_['classifier__subsample']
score_1_best_para['colsample_bytree'] = clf.best_params_['classifier__colsample_bytree']
print('score_1_subsample: ', score_1_best_para['subsample'])
print('score_1_colsample_bytree: ', score_1_best_para['colsample_bytree'])
clf.grid_scores_, clf.best_params_, clf.best_score_

In [None]:
y_pred_class = clf.predict(X_test)
print('accuracy_score: ', metrics.accuracy_score(y_test, y_pred_class)) # 指所有分类正确的百分比
print(metrics.classification_report(y_test, y_pred_class))
print('confusion_matrix: ')
print( metrics.confusion_matrix(y_test, y_pred_class))

In [None]:
pipeline = Pipeline([('features', combined_features_2), 
                     ('classifier', XGBClassifier(n_estimators = score_2_best_para['n_estimators'],
                                                  learning_rate = score_2_best_para['learning_rate'],
                                                  max_depth = score_2_best_para['max_depth'],
                                                  min_child_weight = score_2_best_para['min_child_weight'],      
                                                  gamma = score_2_best_para['gamma'],
                                                  objective='multi:softmax', num_class=2))])

param_grid_3 = dict(classifier__subsample=[0.6,0.8,1.0], 
                    classifier__colsample_bytree=[0.6,0.8,1.0]) # 内部节点再划分所需最小样本数)
param_grid_3

In [None]:
clf = GridSearchCV(pipeline, param_grid=param_grid_3, 
                   cv = cv, scoring=score_2, n_jobs=-1)
clf.fit(X_train, y_train)
score_2_best_para['subsample'] = clf.best_params_['classifier__subsample']
score_2_best_para['colsample_bytree'] = clf.best_params_['classifier__colsample_bytree']
print('score_2_subsample: ', score_2_best_para['subsample'])
print('score_2_colsample_bytree: ', score_2_best_para['colsample_bytree'])
clf.grid_scores_, clf.best_params_, clf.best_score_

In [None]:
y_pred_class = clf.predict(X_test)
print('accuracy_score: ', metrics.accuracy_score(y_test, y_pred_class)) # 指所有分类正确的百分比
print(metrics.classification_report(y_test, y_pred_class))
print('confusion_matrix: ')
print( metrics.confusion_matrix(y_test, y_pred_class))

### 最终模型

In [None]:
score_1_best_para

In [None]:
score_2_best_para

In [None]:
pipeline = Pipeline([('features', combined_features_1), 
                     ('classifier', XGBClassifier(n_estimators = score_1_best_para['n_estimators'],
                                                  learning_rate = score_1_best_para['learning_rate'],
                                                  max_depth = score_1_best_para['max_depth'],
                                                  min_child_weight = score_1_best_para['min_child_weight'],      
                                                  gamma = score_1_best_para['gamma'],
                                                  subsample = score_1_best_para['subsample'],      
                                                  colsample_bytree = score_1_best_para['colsample_bytree'],                                                  
                                                  objective='multi:softmax', num_class=2))])

pipeline.fit(X_train, y_train)
print(pipeline.score(X_train, y_train))
pipeline

In [None]:
y_pred_class = pipeline.predict(X_test)
print('accuracy_score: ', metrics.accuracy_score(y_test, y_pred_class)) # 指所有分类正确的百分比
print(metrics.classification_report(y_test, y_pred_class))
print('confusion_matrix: ')
print( metrics.confusion_matrix(y_test, y_pred_class))

In [None]:
pipeline = Pipeline([('features', combined_features_2), 
                     ('classifier', XGBClassifier(n_estimators = score_2_best_para['n_estimators'],
                                                  learning_rate = score_2_best_para['learning_rate'],
                                                  max_depth = score_2_best_para['max_depth'],
                                                  min_child_weight = score_2_best_para['min_child_weight'],      
                                                  gamma = score_2_best_para['gamma'],
                                                  subsample = score_2_best_para['subsample'],      
                                                  colsample_bytree = score_2_best_para['colsample_bytree'],                                                  
                                                  objective='multi:softmax', num_class=2))])

pipeline.fit(X_train, y_train)
print(pipeline.score(X_train, y_train))
pipeline

In [None]:
y_pred_class = pipeline.predict(X_test)
print('accuracy_score: ', metrics.accuracy_score(y_test, y_pred_class)) # 指所有分类正确的百分比
print(metrics.classification_report(y_test, y_pred_class))
print('confusion_matrix: ')
print( metrics.confusion_matrix(y_test, y_pred_class))