# 本文件说明

- 优浪公司项目
- 预处理及特征值计算

# 基本设置

In [1]:
##load packages, needed
# encoding=utf-8

import jieba
import sys
import re
import time
import string
from sklearn import feature_extraction
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2

from sklearn import metrics

from sklearn.base import BaseEstimator, TransformerMixin

import joblib
%matplotlib inline
import numpy as np
import pandas as pd
import pre_cor
import os
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine
from pandas.io import sql
from collections import Counter

from jieba import analyse

import warnings
warnings.filterwarnings('ignore')

In [2]:
def getkeywords(X, N = 1000):
    '''
    训练时生成，合并所有记录，取N个关键词
    '''
    textrank = analyse.textrank

    text_combined = ' '.join(X)
    keywords = textrank(text_combined, topK = N)
    print('keywords num: ', len(keywords))
    if len(keywords) < N : 
        N  = len(keywords)

    if keywords:
        f = open("corpus/keywords.txt","w+", encoding='UTF-8')
        for content in keywords:
            content = content.strip()
            if content != ':AB:':
                f.write(content + '\n')
        f.close()

In [3]:
class Statskeywords(BaseEstimator, TransformerMixin):
    
    def __init__(self, topk = 100):
        self.topk = topk
#         print(self.topk)
        self.keywords = set()
        f = open("corpus/keywords.txt","r+", encoding='UTF-8')
        num = 0
        for content in f:
            if num < topk:
                self.keywords.add(content.strip().replace('\n', ''))
            num += 1
        f.close() 
    
    def fit(self, X, y=None):
        return self 
    
    def transform(self, X):
        '''
        文本中关键词的词频
        '''                        
        data = []
        for x in X:
            words = x.split()
            word_tf = []
            keycnt = 0
            for kw in self.keywords:
                word_tf.append(words.count(kw)) # 各个关键词的词频
                if kw in words:keycnt+=1
            word_tf.append(keycnt) # 关键词的个数
            data.append(word_tf)            
        return data        

In [4]:
class StatsFeatures(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        self.neg = set()
        f = open("corpus/neg_words.txt","r+", encoding='UTF-8')
        for content in f:
            self.neg.add(content)
        f.close()       

    def fit(self, X, y=None):
        return self

    def getcnt(self,x): 
        '''词个数'''
        return len(list(set(x.split())))

    def getnegcnt(self,x):
        '''负面词个数'''
        negcnt = 0
        words = x.split()
        for w in words:
            if w in self.neg:
                negcnt = negcnt+1
        return negcnt

    def getrepcnt(self,x):
        '''重复词个数'''
        repcnt =0
        words = x.split()        
        for w in list(set(words)):
            if words.count(w)>1: # 记录重复词汇（词频大于1）
                repcnt += 1
        return repcnt
    
    def transform(self, X):
        '''
        文本长度、词个数、词比例、
        负面词个数、负面词比例、
        重复词个数、重复词比例
        '''
        data = []
        for x in X:
            if len(x) == 0:
                length  = 1
            else :
                length = len(x)
            data.append([len(x),self.getcnt(x),self.getcnt(x)/length,
                         self.getnegcnt(x),self.getnegcnt(x)/length,
                         self.getrepcnt(x),self.getrepcnt(x)/length])            
        return data

# 导入数据

## 预处理后数据

In [5]:
# 诈骗电话
corpus_pos = []
label_pos = []

filename = 'data/pos_pre_20180723.txt'
fid = open(filename, "r+", encoding='UTF-8')
for f in fid:
    corpus_pos.append(f)
    label_pos.append(1)
fid.close()
print(len(corpus_pos))
print(len(label_pos))

3
3


In [6]:
# 非诈骗电话
corpus_neg = []
label_neg = []

filename = 'data/neg_pre_20180723.txt'
fid = open(filename, "r+", encoding='UTF-8')
for f in fid:
    corpus_neg.append(f)
    label_neg.append(0)
fid.close()
print(len(corpus_neg))
print(len(label_neg))

3
3


In [7]:
folder = '20180703'

# 相关数据
corpus_cor = []
label_cor = []

filename = 'data/{0}/corpus_pre_cor_0703.txt'.format(folder)
fid = open(filename, "r+", encoding='UTF-8')
for f in fid:    
    corpus_cor.append(f)
    label_cor.append(1)
fid.close()
print(len(corpus_cor))
print(len(label_cor))

6111
6111


In [8]:
# 不相关数据
corpus_uncor = []
label_uncor = []

filename = 'data/{0}/corpus_pre_uncor_0703.txt'.format(folder)
fid = open(filename, "r+", encoding='UTF-8')
for f in fid:
    corpus_uncor.append(f)
    label_uncor.append(0)
fid.close()
print(len(corpus_uncor))
print(len(label_uncor))

8949
8949


## 分割数据

In [9]:
# corpus = corpus_pos + corpus_neg
# label = label_pos + label_neg
corpus = corpus_cor[:100] + corpus_uncor[:100]
label = label_cor[:100] + label_uncor[:100]

X_train, X_test, y_train, y_test = train_test_split(corpus, label, test_size=0.1, random_state=42)
print('训练集：',len(y_train))
print('训练集-各类数量：',Counter(y_train))
print('测试集：',len(y_test))
print('测试集-各类数量：',Counter(y_test))

训练集： 180
训练集-各类数量： Counter({0: 90, 1: 90})
测试集： 20
测试集-各类数量： Counter({1: 10, 0: 10})


## 生成词典

In [10]:
if 0: getkeywords(corpus, N = 1000)

# 训练模型:RF

In [11]:
def train_print(pipeline, param_grid):
    train_res = []
    scores = ['roc_auc', 'precision_macro', 'recall_macro', 'f1_macro']
    for score in scores:
        print("### Tuning hyper-parameters for %s" % score)
        
        t0 = datetime.datetime.now()
        clf = GridSearchCV(pipeline, param_grid, cv=3, # n_jobs = 2, 
                           scoring = score, iid=False)
        clf.fit(X_train, y_train)
        t1 = datetime.datetime.now()
        print ('  耗时： %s s'%(t1 - t0).seconds)

        print("---- Best parameters set found on development set:")
        print(clf.best_params_)
        print("---- Grid scores on development set:")
        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']
        
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            print("    %0.3f (+/-%0.03f) for %r"% (mean, std * 2, params))

        print("---- Detailed classification report:")
        y_true, y_pred = y_test, clf.predict(X_test)
        print(classification_report(y_true, y_pred))
        print()

        train_res.append([score,clf.cv_results_,  clf.grid_scores_, 
                          clf.best_params_, clf.best_score_])
        
    return train_res

## 不调参

In [69]:
pipeline = Pipeline([
    ('features', FeatureUnion([
        ('tf_idf', Pipeline([
            ('counts', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('chi', SelectKBest(chi2, k=200))
        ])),
        ('tf', Statskeywords(topk = 50)),
        ('len_stats', StatsFeatures())
    ])),
    ('classifier', RandomForestClassifier( random_state=0))
])

pipeline.fit(X_train, y_train)
print(pipeline.score(X_train, y_train))
pipeline
# clf_xgb = GridSearchCV(pipeline, param_grid=param_grid, verbose=10, cv = 10)
# clf_xgb.fit(X_train, y_train)

0.9944444444444445


Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('tf_idf', Pipeline(memory=None,
     steps=[('counts', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0...estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False))])

In [70]:
y_pred_class = pipeline.predict(X_test)
print('accuracy_score: ', metrics.accuracy_score(y_test, y_pred_class)) # 指所有分类正确的百分比
print(metrics.classification_report(y_test, y_pred_class))
print('confusion_matrix: ')
print( metrics.confusion_matrix(y_test, y_pred_class))

accuracy_score:  0.95
             precision    recall  f1-score   support

          0       1.00      0.90      0.95        10
          1       0.91      1.00      0.95        10

avg / total       0.95      0.95      0.95        20

confusion_matrix: 
[[ 9  1]
 [ 0 10]]


## 调参

In [46]:
auc_best_para = {}
recall_best_para = {}

### n_estimators

In [47]:
pipeline = Pipeline([
    ('features', FeatureUnion([
        ('tf_idf', Pipeline([
            ('counts', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('chi', SelectKBest(chi2, k=200))
        ])),
        ('tf', Statskeywords(topk = 50)),
        ('len_stats', StatsFeatures())
    ])),
    ('classifier', RandomForestClassifier( random_state=0))
])

param_grid_1 = dict(classifier__n_estimators = range(10,71,10)) # 迭代次数/分类器个数)
param_grid_1

{'classifier__n_estimators': range(10, 71, 10)}

In [48]:
clf = GridSearchCV(pipeline, param_grid=param_grid_1, cv = 3, scoring='roc_auc')
clf.fit(X_train, y_train)
auc_best_para['n_estimators'] = clf.best_params_['classifier__n_estimators']
print('auc_n_estimators: ', auc_best_para['n_estimators'])
clf.grid_scores_, clf.best_params_, clf.best_score_

auc_n_estimators:  70


([mean: 0.88222, std: 0.05346, params: {'classifier__n_estimators': 10},
  mean: 0.87630, std: 0.06468, params: {'classifier__n_estimators': 20},
  mean: 0.87741, std: 0.07103, params: {'classifier__n_estimators': 30},
  mean: 0.88259, std: 0.05725, params: {'classifier__n_estimators': 40},
  mean: 0.88315, std: 0.06275, params: {'classifier__n_estimators': 50},
  mean: 0.88222, std: 0.06802, params: {'classifier__n_estimators': 60},
  mean: 0.88611, std: 0.06859, params: {'classifier__n_estimators': 70}],
 {'classifier__n_estimators': 70},
 0.8861111111111111)

In [49]:
pipeline = Pipeline([
    ('features', FeatureUnion([
        ('tf_idf', Pipeline([
            ('counts', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('chi', SelectKBest(chi2, k=200))
        ])),
        ('tf', Statskeywords(topk = 50)),
        ('len_stats', StatsFeatures())
    ])),
    ('classifier', RandomForestClassifier( random_state=0))
])

param_grid_1 = dict(classifier__n_estimators = range(10,71,10))  # 迭代次数/分类器个数)
param_grid_1

{'classifier__n_estimators': range(10, 71, 10)}

In [50]:
clf = GridSearchCV(pipeline, param_grid=param_grid_1, cv = 3, scoring='recall_macro')
clf.fit(X_train, y_train)
recall_best_para['n_estimators'] = clf.best_params_['classifier__n_estimators']
print('recall_n_estimators: ', recall_best_para['n_estimators'])
clf.grid_scores_, clf.best_params_, clf.best_score_

recall_n_estimators:  40


([mean: 0.81111, std: 0.04374, params: {'classifier__n_estimators': 10},
  mean: 0.81111, std: 0.08315, params: {'classifier__n_estimators': 20},
  mean: 0.80556, std: 0.07495, params: {'classifier__n_estimators': 30},
  mean: 0.82778, std: 0.06136, params: {'classifier__n_estimators': 40},
  mean: 0.80556, std: 0.06983, params: {'classifier__n_estimators': 50},
  mean: 0.80000, std: 0.06804, params: {'classifier__n_estimators': 60},
  mean: 0.82222, std: 0.06431, params: {'classifier__n_estimators': 70}],
 {'classifier__n_estimators': 40},
 0.8277777777777777)

### max_depth、min_samples_split

In [51]:
pipeline = Pipeline([
    ('features', FeatureUnion([
        ('tf_idf', Pipeline([
            ('counts', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('chi', SelectKBest(chi2, k=200))
        ])),
        ('tf', Statskeywords(topk = 50)),
        ('len_stats', StatsFeatures())
    ])),
    ('classifier', RandomForestClassifier( n_estimators = auc_best_para['n_estimators'], random_state=0))
])

param_grid_2 = dict(classifier__max_depth=range(3,14,2), # 决策树最大深度
                    classifier__min_samples_split=range(50,201,20)) # 内部节点再划分所需最小样本数)
param_grid_2

{'classifier__max_depth': range(3, 14, 2),
 'classifier__min_samples_split': range(50, 201, 20)}

In [52]:
clf = GridSearchCV(pipeline, param_grid=param_grid_2, cv = 3, scoring='roc_auc')
clf.fit(X_train, y_train)
auc_best_para['max_depth'] = clf.best_params_['classifier__max_depth']
auc_best_para['min_samples_split'] = clf.best_params_['classifier__min_samples_split']
print('auc_max_depth: ', auc_best_para['max_depth'])
print('auc_min_samples_split: ', auc_best_para['min_samples_split'])
clf.grid_scores_, clf.best_params_, clf.best_score_

auc_max_depth:  11
auc_min_samples_split:  50


([mean: 0.87222, std: 0.07238, params: {'classifier__max_depth': 3, 'classifier__min_samples_split': 50},
  mean: 0.85111, std: 0.06919, params: {'classifier__max_depth': 3, 'classifier__min_samples_split': 70},
  mean: 0.50000, std: 0.00000, params: {'classifier__max_depth': 3, 'classifier__min_samples_split': 90},
  mean: 0.50000, std: 0.00000, params: {'classifier__max_depth': 3, 'classifier__min_samples_split': 110},
  mean: 0.50000, std: 0.00000, params: {'classifier__max_depth': 3, 'classifier__min_samples_split': 130},
  mean: 0.50000, std: 0.00000, params: {'classifier__max_depth': 3, 'classifier__min_samples_split': 150},
  mean: 0.50000, std: 0.00000, params: {'classifier__max_depth': 3, 'classifier__min_samples_split': 170},
  mean: 0.50000, std: 0.00000, params: {'classifier__max_depth': 3, 'classifier__min_samples_split': 190},
  mean: 0.87574, std: 0.07249, params: {'classifier__max_depth': 5, 'classifier__min_samples_split': 50},
  mean: 0.85111, std: 0.06919, params: {'

In [53]:
pipeline = Pipeline([
    ('features', FeatureUnion([
        ('tf_idf', Pipeline([
            ('counts', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('chi', SelectKBest(chi2, k=200))
        ])),
        ('tf', Statskeywords(topk = 50)),
        ('len_stats', StatsFeatures())
    ])),
    ('classifier', RandomForestClassifier( n_estimators = recall_best_para['n_estimators'], random_state=0))
])

param_grid_2 = dict(classifier__max_depth=range(3,14,2), # 决策树最大深度
                    classifier__min_samples_split=range(50,201,20)) # 内部节点再划分所需最小样本数)
param_grid_2

{'classifier__max_depth': range(3, 14, 2),
 'classifier__min_samples_split': range(50, 201, 20)}

In [54]:
clf = GridSearchCV(pipeline, param_grid=param_grid_2, cv = 3, scoring='recall_macro')
clf.fit(X_train, y_train)
recall_best_para['max_depth'] = clf.best_params_['classifier__max_depth']
recall_best_para['min_samples_split'] = clf.best_params_['classifier__min_samples_split']
print('recall_max_depth: ', recall_best_para['max_depth'])
print('recall_min_samples_split: ', recall_best_para['min_samples_split'])
clf.grid_scores_, clf.best_params_, clf.best_score_

recall_max_depth:  5
recall_min_samples_split:  50


([mean: 0.78333, std: 0.05443, params: {'classifier__max_depth': 3, 'classifier__min_samples_split': 50},
  mean: 0.77778, std: 0.05500, params: {'classifier__max_depth': 3, 'classifier__min_samples_split': 70},
  mean: 0.50000, std: 0.00000, params: {'classifier__max_depth': 3, 'classifier__min_samples_split': 90},
  mean: 0.50000, std: 0.00000, params: {'classifier__max_depth': 3, 'classifier__min_samples_split': 110},
  mean: 0.50000, std: 0.00000, params: {'classifier__max_depth': 3, 'classifier__min_samples_split': 130},
  mean: 0.50000, std: 0.00000, params: {'classifier__max_depth': 3, 'classifier__min_samples_split': 150},
  mean: 0.50000, std: 0.00000, params: {'classifier__max_depth': 3, 'classifier__min_samples_split': 170},
  mean: 0.50000, std: 0.00000, params: {'classifier__max_depth': 3, 'classifier__min_samples_split': 190},
  mean: 0.79444, std: 0.04374, params: {'classifier__max_depth': 5, 'classifier__min_samples_split': 50},
  mean: 0.77778, std: 0.05500, params: {'

### min_samples_leaf、min_samples_split

In [56]:
pipeline = Pipeline([
    ('features', FeatureUnion([
        ('tf_idf', Pipeline([
            ('counts', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('chi', SelectKBest(chi2, k=200))
        ])),
        ('tf', Statskeywords(topk = 50)),
        ('len_stats', StatsFeatures())
    ])),
    ('classifier', RandomForestClassifier( n_estimators = auc_best_para['n_estimators'], 
                                          max_depth = auc_best_para['max_depth'],                                          
                                          random_state=0))
])

param_grid_3 = dict(classifier__min_samples_leaf=range(10,60,10), 
                    classifier__min_samples_split=range(50,201,20)) # 内部节点再划分所需最小样本数)
param_grid_3

{'classifier__min_samples_leaf': range(10, 60, 10),
 'classifier__min_samples_split': range(50, 201, 20)}

In [58]:
clf = GridSearchCV(pipeline, param_grid=param_grid_3, cv = 3, scoring='roc_auc')
clf.fit(X_train, y_train)
auc_best_para['min_samples_leaf'] = clf.best_params_['classifier__min_samples_leaf']
auc_best_para['min_samples_split'] = clf.best_params_['classifier__min_samples_split']
print('auc_min_samples_leaf: ', auc_best_para['min_samples_leaf'])
print('auc_min_samples_split: ', auc_best_para['min_samples_split'])
clf.grid_scores_, clf.best_params_, clf.best_score_

auc_min_samples_leaf:  10
auc_min_samples_split:  50


([mean: 0.85093, std: 0.06932, params: {'classifier__min_samples_leaf': 10, 'classifier__min_samples_split': 50},
  mean: 0.83852, std: 0.07765, params: {'classifier__min_samples_leaf': 10, 'classifier__min_samples_split': 70},
  mean: 0.50000, std: 0.00000, params: {'classifier__min_samples_leaf': 10, 'classifier__min_samples_split': 90},
  mean: 0.50000, std: 0.00000, params: {'classifier__min_samples_leaf': 10, 'classifier__min_samples_split': 110},
  mean: 0.50000, std: 0.00000, params: {'classifier__min_samples_leaf': 10, 'classifier__min_samples_split': 130},
  mean: 0.50000, std: 0.00000, params: {'classifier__min_samples_leaf': 10, 'classifier__min_samples_split': 150},
  mean: 0.50000, std: 0.00000, params: {'classifier__min_samples_leaf': 10, 'classifier__min_samples_split': 170},
  mean: 0.50000, std: 0.00000, params: {'classifier__min_samples_leaf': 10, 'classifier__min_samples_split': 190},
  mean: 0.76185, std: 0.07366, params: {'classifier__min_samples_leaf': 20, 'classi

In [59]:
pipeline = Pipeline([
    ('features', FeatureUnion([
        ('tf_idf', Pipeline([
            ('counts', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('chi', SelectKBest(chi2, k=200))
        ])),
        ('tf', Statskeywords(topk = 50)),
        ('len_stats', StatsFeatures())
    ])),
    ('classifier', RandomForestClassifier( n_estimators = recall_best_para['n_estimators'], 
                                          max_depth = recall_best_para['max_depth'], 
                                          random_state=0))
])

param_grid_3 = dict(classifier__min_samples_leaf=range(10,60,10), 
                    classifier__min_samples_split=range(50,201,20)) # 内部节点再划分所需最小样本数)
param_grid_3

{'classifier__min_samples_leaf': range(10, 60, 10),
 'classifier__min_samples_split': range(50, 201, 20)}

In [60]:
clf = GridSearchCV(pipeline, param_grid=param_grid_3, cv = 3, scoring='recall_macro')
clf.fit(X_train, y_train)
recall_best_para['min_samples_leaf'] = clf.best_params_['classifier__min_samples_leaf']
recall_best_para['min_samples_split'] = clf.best_params_['classifier__min_samples_split']
print('recall_min_samples_leaf: ', recall_best_para['min_samples_leaf'])
print('recall_min_samples_split: ', recall_best_para['min_samples_split'])
clf.grid_scores_, clf.best_params_, clf.best_score_

recall_min_samples_leaf:  10
recall_min_samples_split:  50


([mean: 0.77222, std: 0.04157, params: {'classifier__min_samples_leaf': 10, 'classifier__min_samples_split': 50},
  mean: 0.75000, std: 0.05443, params: {'classifier__min_samples_leaf': 10, 'classifier__min_samples_split': 70},
  mean: 0.50000, std: 0.00000, params: {'classifier__min_samples_leaf': 10, 'classifier__min_samples_split': 90},
  mean: 0.50000, std: 0.00000, params: {'classifier__min_samples_leaf': 10, 'classifier__min_samples_split': 110},
  mean: 0.50000, std: 0.00000, params: {'classifier__min_samples_leaf': 10, 'classifier__min_samples_split': 130},
  mean: 0.50000, std: 0.00000, params: {'classifier__min_samples_leaf': 10, 'classifier__min_samples_split': 150},
  mean: 0.50000, std: 0.00000, params: {'classifier__min_samples_leaf': 10, 'classifier__min_samples_split': 170},
  mean: 0.50000, std: 0.00000, params: {'classifier__min_samples_leaf': 10, 'classifier__min_samples_split': 190},
  mean: 0.66667, std: 0.04907, params: {'classifier__min_samples_leaf': 20, 'classi

### max_features

In [61]:
pipeline = Pipeline([
    ('features', FeatureUnion([
        ('tf_idf', Pipeline([
            ('counts', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('chi', SelectKBest(chi2, k=200))
        ])),
        ('tf', Statskeywords(topk = 50)),
        ('len_stats', StatsFeatures())
    ])),
    ('classifier', RandomForestClassifier( n_estimators = auc_best_para['n_estimators'], 
                                          max_depth = auc_best_para['max_depth'],  
                                          min_samples_leaf = auc_best_para['min_samples_leaf'],
                                          min_samples_split = auc_best_para['min_samples_split'],
                                          random_state=0))
])

param_grid_4 = dict(classifier__max_features=[0.25, 0.5, 0.75, 1.0])
param_grid_4

{'classifier__max_features': [0.25, 0.5, 0.75, 1.0]}

In [62]:
clf = GridSearchCV(pipeline, param_grid=param_grid_4, cv = 3, scoring='roc_auc')
clf.fit(X_train, y_train)
auc_best_para['max_features'] = clf.best_params_['classifier__max_features']
print('auc_max_features: ', auc_best_para['max_features'])
clf.grid_scores_, clf.best_params_, clf.best_score_

auc_max_features:  0.5


([mean: 0.84000, std: 0.05644, params: {'classifier__max_features': 0.25},
  mean: 0.84111, std: 0.04914, params: {'classifier__max_features': 0.5},
  mean: 0.82648, std: 0.03532, params: {'classifier__max_features': 0.75},
  mean: 0.82241, std: 0.02688, params: {'classifier__max_features': 1.0}],
 {'classifier__max_features': 0.5},
 0.8411111111111111)

In [63]:
pipeline = Pipeline([
    ('features', FeatureUnion([
        ('tf_idf', Pipeline([
            ('counts', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('chi', SelectKBest(chi2, k=200))
        ])),
        ('tf', Statskeywords(topk = 50)),
        ('len_stats', StatsFeatures())
    ])),
    ('classifier', RandomForestClassifier( n_estimators = recall_best_para['n_estimators'], 
                                          max_depth = recall_best_para['max_depth'],  
                                          min_samples_leaf = recall_best_para['min_samples_leaf'],
                                          min_samples_split = recall_best_para['min_samples_split'],
                                          random_state=0))
])

param_grid_4 = dict(classifier__max_features=[0.25, 0.5, 0.75, 1.0])
param_grid_4

{'classifier__max_features': [0.25, 0.5, 0.75, 1.0]}

In [64]:
clf = GridSearchCV(pipeline, param_grid=param_grid_4, cv = 3, scoring='recall_macro')
clf.fit(X_train, y_train)
recall_best_para['max_features'] = clf.best_params_['classifier__max_features']
print('recall_max_features: ', recall_best_para['max_features'])
clf.grid_scores_, clf.best_params_, clf.best_score_

recall_max_features:  1.0


([mean: 0.73333, std: 0.03600, params: {'classifier__max_features': 0.25},
  mean: 0.73333, std: 0.06236, params: {'classifier__max_features': 0.5},
  mean: 0.73333, std: 0.04714, params: {'classifier__max_features': 0.75},
  mean: 0.73889, std: 0.03425, params: {'classifier__max_features': 1.0}],
 {'classifier__max_features': 1.0},
 0.7388888888888889)

### 最终模型

In [71]:
auc_best_para

{'max_depth': 11,
 'max_features': 0.5,
 'min_samples_leaf': 10,
 'min_samples_split': 50,
 'n_estimators': 70}

In [72]:
recall_best_para

{'max_depth': 5,
 'max_features': 1.0,
 'min_samples_leaf': 10,
 'min_samples_split': 50,
 'n_estimators': 40}

In [65]:
pipeline = Pipeline([
    ('features', FeatureUnion([
        ('tf_idf', Pipeline([
            ('counts', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('chi', SelectKBest(chi2, k=200))
        ])),
        ('tf', Statskeywords(topk = 50)),
        ('len_stats', StatsFeatures())
    ])),
    ('classifier', RandomForestClassifier( n_estimators = auc_best_para['n_estimators'], 
                                          max_depth = auc_best_para['max_depth'],  
                                          min_samples_leaf = auc_best_para['min_samples_leaf'],
                                          min_samples_split = auc_best_para['min_samples_split'],
                                          max_features = auc_best_para['max_features'],
                                          random_state=0))
])

pipeline.fit(X_train, y_train)
print(pipeline.score(X_train, y_train))
pipeline

0.8555555555555555


Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('tf_idf', Pipeline(memory=None,
     steps=[('counts', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0...estimators=70, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False))])

In [66]:
y_pred_class = pipeline.predict(X_test)
print('accuracy_score: ', metrics.accuracy_score(y_test, y_pred_class)) # 指所有分类正确的百分比
print(metrics.classification_report(y_test, y_pred_class))
print('confusion_matrix: ')
print( metrics.confusion_matrix(y_test, y_pred_class))

accuracy_score:  0.95
             precision    recall  f1-score   support

          0       0.91      1.00      0.95        10
          1       1.00      0.90      0.95        10

avg / total       0.95      0.95      0.95        20

confusion_matrix: 
[[10  0]
 [ 1  9]]


In [67]:
pipeline = Pipeline([
    ('features', FeatureUnion([
        ('tf_idf', Pipeline([
            ('counts', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('chi', SelectKBest(chi2, k=200))
        ])),
        ('tf', Statskeywords(topk = 50)),
        ('len_stats', StatsFeatures())
    ])),
    ('classifier', RandomForestClassifier( n_estimators = recall_best_para['n_estimators'], 
                                          max_depth = recall_best_para['max_depth'],  
                                          min_samples_leaf = recall_best_para['min_samples_leaf'],
                                          min_samples_split = recall_best_para['min_samples_split'],
                                          max_features = recall_best_para['max_features'],
                                          random_state=0))
])

pipeline.fit(X_train, y_train)
print(pipeline.score(X_train, y_train))
pipeline

0.85


Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('tf_idf', Pipeline(memory=None,
     steps=[('counts', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0...estimators=40, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False))])

In [68]:
y_pred_class = pipeline.predict(X_test)
print('accuracy_score: ', metrics.accuracy_score(y_test, y_pred_class)) # 指所有分类正确的百分比
print(metrics.classification_report(y_test, y_pred_class))
print('confusion_matrix: ')
print( metrics.confusion_matrix(y_test, y_pred_class))

accuracy_score:  0.95
             precision    recall  f1-score   support

          0       0.91      1.00      0.95        10
          1       1.00      0.90      0.95        10

avg / total       0.95      0.95      0.95        20

confusion_matrix: 
[[10  0]
 [ 1  9]]
