# 本文件说明

- 优浪公司项目
- 预处理及特征值计算

# 基本设置

In [9]:
##load packages, needed
# encoding=utf-8

import jieba
import sys
import re
import time
import string
from sklearn import feature_extraction
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2

from sklearn import metrics

from sklearn.base import BaseEstimator, TransformerMixin

import joblib
%matplotlib inline
import numpy as np
import pandas as pd
import pre_cor
import os
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine
from pandas.io import sql
from collections import Counter

from jieba import analyse

import warnings
warnings.filterwarnings('ignore')

In [10]:
def getkeywords(X, N = 1000):
    '''
    Build the keyword dictionary used at training time.

    All records in X are joined with single spaces into one text, jieba's
    TextRank extractor is asked for at most N keywords, and the result is
    written to corpus/keywords.txt, one keyword per line (the file is
    overwritten). The literal token ':AB:' is never written
    (presumably a placeholder inserted by preprocessing -- TODO confirm).

    Parameters
    ----------
    X : iterable of str
        Pre-processed text records.
    N : int
        Maximum number of keywords requested from TextRank.

    Returns
    -------
    None. When TextRank yields no keywords the existing file is left untouched.
    '''
    text_combined = ' '.join(X)
    keywords = analyse.textrank(text_combined, topK = N)
    print('keywords num: ', len(keywords))

    if not keywords:
        return

    # Context manager guarantees the handle is closed even if a write fails.
    with open("corpus/keywords.txt", "w", encoding='UTF-8') as f:
        for content in keywords:
            content = content.strip()
            if content != ':AB:':
                f.write(content + '\n')

In [11]:
class Statskeywords(BaseEstimator, TransformerMixin):
    '''
    Keyword term-frequency features for whitespace-tokenised text.

    The keyword list is read from corpus/keywords.txt (one keyword per line);
    only the first `topk` lines are used. For every record, transform() emits
    the count of each keyword followed by the number of distinct keywords
    that occur in the record.

    Parameters
    ----------
    topk : int
        Number of leading lines of the keyword file to use.

    Attributes
    ----------
    keywords : list of str
        De-duplicated keywords in file order. A list (not a set) is used so
        that the feature column order is deterministic across processes.
    '''

    def __init__(self, topk = 100):
        self.topk = topk
        # Loaded here as well so that transform() keeps working without fit().
        self.keywords = self._load_keywords(topk)

    @staticmethod
    def _load_keywords(topk, path = "corpus/keywords.txt"):
        '''Return the unique keywords found in the first `topk` lines of `path`, in file order.'''
        seen = set()
        ordered = []
        with open(path, "r", encoding='UTF-8') as f:
            for num, content in enumerate(f):
                if num >= topk:
                    break
                kw = content.strip()
                if kw not in seen:
                    seen.add(kw)
                    ordered.append(kw)
        return ordered

    def fit(self, X, y=None):
        '''
        Reload the keyword list for the current value of `topk`.

        set_params() (used by GridSearchCV) changes `self.topk` without
        re-running __init__, so the list must be refreshed here for a tuned
        `topk` to take effect.
        '''
        self.keywords = self._load_keywords(self.topk)
        return self

    def transform(self, X):
        '''
        Keyword frequencies per record.

        Returns a list with one row per record: the count of every keyword
        (in the order of `self.keywords`) plus, as the last element, the
        number of keywords that appear at least once.
        '''
        data = []
        for x in X:
            # One pass over the tokens instead of list.count() per keyword.
            tf = Counter(x.split())
            word_tf = [tf[kw] for kw in self.keywords]  # frequency of each keyword
            word_tf.append(sum(1 for c in word_tf if c > 0))  # number of keywords present
            data.append(word_tf)
        return data

In [12]:
class StatsFeatures(BaseEstimator, TransformerMixin):
    '''
    Simple length / vocabulary statistics for whitespace-tokenised text.

    The negative-word lexicon is read from corpus/neg_words.txt (one word per
    line) when the object is constructed.

    Attributes
    ----------
    neg : set of str
        Negative words with surrounding whitespace removed.
    '''

    def __init__(self):
        with open("corpus/neg_words.txt", "r", encoding='UTF-8') as f:
            # Strip the trailing newline: tokens produced by str.split() never
            # contain one, so unstripped entries could not match any token.
            self.neg = {w for w in (line.strip() for line in f) if w}

    def fit(self, X, y=None):
        '''Stateless transformer: nothing to learn.'''
        return self

    def getcnt(self,x):
        '''Number of distinct tokens in x.'''
        return len(set(x.split()))

    def getnegcnt(self,x):
        '''Number of tokens in x (repeats included) that are in the negative lexicon.'''
        return sum(1 for w in x.split() if w in self.neg)

    def getrepcnt(self,x):
        '''Number of distinct tokens that occur more than once in x.'''
        return sum(1 for c in Counter(x.split()).values() if c > 1)

    def transform(self, X):
        '''
        Per-record statistics.

        Each row is [text length in characters, distinct-token count and its
        ratio, negative-token count and its ratio, repeated-token count and
        its ratio]. Ratios are divided by the character length of the record
        (1 is used for an empty record to avoid division by zero).
        '''
        data = []
        for x in X:
            length = len(x) or 1
            cnt = self.getcnt(x)
            negcnt = self.getnegcnt(x)
            repcnt = self.getrepcnt(x)
            data.append([len(x), cnt, cnt/length,
                         negcnt, negcnt/length,
                         repcnt, repcnt/length])
        return data

# 导入数据

## 预处理后数据

In [13]:
# Fraud calls: one pre-processed record per line, every record labelled 1.
filename = 'data/pos_pre_20180723.txt'
# Read-only access inside a context manager: no write permission needed,
# and the handle is closed even if reading fails.
with open(filename, "r", encoding='UTF-8') as fid:
    corpus_pos = list(fid)  # lines keep their trailing newline
label_pos = [1] * len(corpus_pos)
print(len(corpus_pos))
print(len(label_pos))

3
3


In [14]:
# Non-fraud calls: one pre-processed record per line, every record labelled 0.
filename = 'data/neg_pre_20180723.txt'
# Read-only access inside a context manager: no write permission needed,
# and the handle is closed even if reading fails.
with open(filename, "r", encoding='UTF-8') as fid:
    corpus_neg = list(fid)  # lines keep their trailing newline
label_neg = [0] * len(corpus_neg)
print(len(corpus_neg))
print(len(label_neg))

3
3


In [15]:
folder = '20180703'

# Relevant records: one pre-processed record per line, every record labelled 1.
filename = 'data/{0}/corpus_pre_cor_0703.txt'.format(folder)
# Read-only access inside a context manager: no write permission needed,
# and the handle is closed even if reading fails.
with open(filename, "r", encoding='UTF-8') as fid:
    corpus_cor = list(fid)  # lines keep their trailing newline
label_cor = [1] * len(corpus_cor)
print(len(corpus_cor))
print(len(label_cor))

6111
6111


In [16]:
# Irrelevant records: one pre-processed record per line, every record labelled 0.
filename = 'data/{0}/corpus_pre_uncor_0703.txt'.format(folder)
# Read-only access inside a context manager: no write permission needed,
# and the handle is closed even if reading fails.
with open(filename, "r", encoding='UTF-8') as fid:
    corpus_uncor = list(fid)  # lines keep their trailing newline
label_uncor = [0] * len(corpus_uncor)
print(len(corpus_uncor))
print(len(label_uncor))

8949
8949


## 分割数据

In [17]:
# Alternative dataset (disabled): corpus_pos + corpus_neg with label_pos + label_neg.
# Current dataset: the first 100 relevant and the first 100 irrelevant records.
SAMPLES_PER_CLASS = 100
corpus = corpus_cor[:SAMPLES_PER_CLASS] + corpus_uncor[:SAMPLES_PER_CLASS]
label = label_cor[:SAMPLES_PER_CLASS] + label_uncor[:SAMPLES_PER_CLASS]

# Hold out 10% of the records for testing; the seed keeps the split reproducible.
split_parts = train_test_split(corpus, label, test_size=0.1, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Size and class balance of each split.
train_size = len(y_train)
print('训练集：',train_size)
train_classes = Counter(y_train)
print('训练集-各类数量：',train_classes)
test_size_actual = len(y_test)
print('测试集：',test_size_actual)
test_classes = Counter(y_test)
print('测试集-各类数量：',test_classes)

训练集： 180
训练集-各类数量： Counter({0: 90, 1: 90})
测试集： 20
测试集-各类数量： Counter({1: 10, 0: 10})


## 生成词典

In [18]:
# Keyword dictionary generation is switched off; set the flag to True to
# rebuild corpus/keywords.txt from `corpus`.
REGENERATE_KEYWORDS = False
if REGENERATE_KEYWORDS:
    getkeywords(corpus, N = 1000)

# 训练模型:GBDT

In [19]:
def train_print(pipeline, param_grid):
    '''
    Grid-search `pipeline` over `param_grid` once per scoring metric and print a report.

    For each of 'roc_auc', 'precision_macro', 'recall_macro' and 'f1_macro'
    a 3-fold GridSearchCV is fitted on the module-level X_train / y_train,
    the best parameters and all grid scores are printed, and a classification
    report for the module-level X_test / y_test is shown.

    Parameters
    ----------
    pipeline : estimator
        Estimator passed to GridSearchCV.
    param_grid : dict
        Parameter grid passed to GridSearchCV.

    Returns
    -------
    list
        One entry per metric:
        [score, cv_results_, grid_scores_ (None if the installed scikit-learn
        no longer provides it), best_params_, best_score_].
    '''
    # Local imports: neither module is imported at file level.
    import datetime
    import inspect

    # `iid` no longer exists in newer scikit-learn releases; pass it only
    # where the installed GridSearchCV still accepts it.
    extra_kwargs = {}
    if 'iid' in inspect.signature(GridSearchCV.__init__).parameters:
        extra_kwargs['iid'] = False

    train_res = []
    scores = ['roc_auc', 'precision_macro', 'recall_macro', 'f1_macro']
    for score in scores:
        print("### Tuning hyper-parameters for %s" % score)

        t0 = datetime.datetime.now()
        clf = GridSearchCV(pipeline, param_grid, cv=3, # n_jobs = 2,
                           scoring = score, **extra_kwargs)
        clf.fit(X_train, y_train)
        t1 = datetime.datetime.now()
        # total_seconds() also covers runs longer than a day (.seconds drops the days).
        print ('  耗时： %s s' % int((t1 - t0).total_seconds()))

        print("---- Best parameters set found on development set:")
        print(clf.best_params_)
        print("---- Grid scores on development set:")
        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']

        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            print("    %0.3f (+/-%0.03f) for %r"% (mean, std * 2, params))

        print("---- Detailed classification report:")
        y_true, y_pred = y_test, clf.predict(X_test)
        # Only the `metrics` module is imported in this file.
        print(metrics.classification_report(y_true, y_pred))
        print()

        # `grid_scores_` is absent in newer scikit-learn; keep the slot so the
        # shape of each result entry stays the same.
        train_res.append([score, clf.cv_results_, getattr(clf, 'grid_scores_', None),
                          clf.best_params_, clf.best_score_])

    return train_res

In [25]:
# Text branch: token counts -> TF-IDF weighting -> 200 best features by chi-square.
tfidf_branch = Pipeline([
    ('counts', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('chi', SelectKBest(chi2, k=200)),
])

# All feature branches are computed side by side and concatenated.
feature_union = FeatureUnion([
    ('tf_idf', tfidf_branch),
    ('tf', Statskeywords(topk = 50)),
    ('len_stats', StatsFeatures()),
])

# Full model: features followed by a gradient boosting classifier with a fixed seed.
gbdt_classifier = GradientBoostingClassifier( random_state=0)
pipeline = Pipeline([
    ('features', feature_union),
    ('classifier', gbdt_classifier),
])

# clf_xgb = GridSearchCV(pipeline, param_grid=param_grid, verbose=10, cv = 10)
# clf_xgb.fit(X_train, y_train)

## 不调参

In [26]:
# Fit with default hyper-parameters and report the score on the training data itself.
pipeline.fit(X_train, y_train)
train_score = pipeline.score(X_train, y_train)
print(train_score)
pipeline

0.9944444444444445


In [27]:
# Evaluate the fitted pipeline on the held-out test split.
y_pred_class = pipeline.predict(X_test)

test_accuracy = metrics.accuracy_score(y_test, y_pred_class)  # share of all samples classified correctly
print('accuracy_score: ', test_accuracy)

test_report = metrics.classification_report(y_test, y_pred_class)
print(test_report)

print('confusion_matrix: ')
test_confusion = metrics.confusion_matrix(y_test, y_pred_class)
print( test_confusion)

accuracy_score:  0.9
             precision    recall  f1-score   support

          0       0.90      0.90      0.90        10
          1       0.90      0.90      0.90        10

avg / total       0.90      0.90      0.90        20

confusion_matrix: 
[[9 1]
 [1 9]]


## 调参

In [28]:
# Hyper-parameter grid; keys follow the <step>__<sub-step>__<parameter> naming of the pipeline.
param_grid = {
    # 'features__tf_idf__chi__k': [2000, 20000, 40000],  (disabled)
    'features__tf__topk': [50, 100, 500],
    'classifier__max_depth': [5, 7, 9, 11],                   # maximum depth of each tree
    'classifier__min_samples_split': range(100, 801, 200),    # minimum samples needed to split an internal node
    'classifier__min_samples_leaf': [3, 5, 6, 7, 9],          # minimum samples in a leaf
    'classifier__max_features': [0.25, 0.5, 0.75, 1.0],       # fraction of features considered
    'classifier__n_estimators': [50, 100, 200, 500],          # number of boosting iterations / trees
    'classifier__learning_rate': [0.01, 0.1, 0.3],            # learning rate
}
param_grid

{'classifier__learning_rate': [0.01, 0.1, 0.3],
 'classifier__max_depth': [5, 7, 9, 11],
 'classifier__max_features': [0.25, 0.5, 0.75, 1.0],
 'classifier__min_samples_leaf': [3, 5, 6, 7, 9],
 'classifier__min_samples_split': range(100, 801, 200),
 'classifier__n_estimators': [50, 100, 200, 500],
 'features__tf__topk': [50, 100, 500]}

In [None]:
# Run the grid search once per scoring metric (see train_print) and keep the per-metric results.
# NOTE(review): param_grid spans 11,520 parameter combinations; with cv=3 and
# four metrics this is a very long run.
train_res = train_print(pipeline, param_grid)

# Tuning hyper-parameters for roc_auc
