# 本文件说明

- 优浪公司项目
- 预处理及特征值计算

# 基本设置

In [20]:
##load packages, needed
# encoding=utf-8

import jieba
import sys
import re
import time
import string
from sklearn import feature_extraction
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2

import xgboost as xgb
from xgboost import XGBClassifier
from sklearn import metrics

from sklearn.base import BaseEstimator, TransformerMixin
from collections import defaultdict
from sklearn.metrics import classification_report
import pickle

import joblib
%matplotlib inline
import numpy as np
import pandas as pd
import pre_cor
import os
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine
from pandas.io import sql
from collections import Counter

from jieba import analyse
import datetime

import warnings
warnings.filterwarnings('ignore')

In [2]:
def getkeywords(X, N = 1000):
    '''
    Build the keyword dictionary used at training time.

    All records in ``X`` are joined with single spaces into one document, up
    to ``N`` keywords are extracted from it with ``jieba.analyse.textrank``,
    and the keywords are written one per line to ``corpus/keywords.txt``.
    The file is only (re)written when at least one keyword was extracted.

    Parameters
    ----------
    X : iterable of str
        Text records to merge before keyword extraction.
    N : int, default 1000
        Maximum number of keywords requested from TextRank (``topK``).

    Returns
    -------
    None
        The result is the file written to disk; the number of extracted
        keywords is printed.
    '''
    text_combined = ' '.join(X)
    keywords = analyse.textrank(text_combined, topK = N)
    print('keywords num: ', len(keywords))

    if keywords:
        # Context manager guarantees the file is closed even if a write fails.
        with open("corpus/keywords.txt", "w", encoding='UTF-8') as f:
            for content in keywords:
                content = content.strip()
                # The ':AB:' token is never written to the dictionary.
                if content != ':AB:':
                    f.write(content + '\n')

In [3]:
class Statskeywords(BaseEstimator, TransformerMixin):
    '''
    Transformer producing keyword term-frequency features.

    For every whitespace-tokenised input text it emits one count per keyword
    (in keyword-file order) followed by the number of distinct keywords that
    occur in the text.

    Parameters
    ----------
    topk : int, default 100
        Number of leading lines of the keyword file to use.
    keywords_path : str, default "corpus/keywords.txt"
        Keyword file, one keyword per line.

    Notes
    -----
    The keyword file is read in ``fit`` (or lazily on first use), not in
    ``__init__``. GridSearchCV applies grid values with ``set_params`` after
    ``clone``; loading in the constructor would therefore ignore every
    ``topk`` value from the grid.
    '''

    def __init__(self, topk = 100, keywords_path = "corpus/keywords.txt"):
        # Only store constructor arguments here (scikit-learn estimator contract).
        self.topk = topk
        self.keywords_path = keywords_path

    def _load_keywords(self):
        '''Read the first ``topk`` lines of the keyword file, de-duplicated, in file order.'''
        ordered = []
        seen = set()
        with open(self.keywords_path, "r", encoding='UTF-8') as f:
            for num, content in enumerate(f):
                if num >= self.topk:
                    break
                kw = content.strip()
                if kw not in seen:
                    seen.add(kw)
                    ordered.append(kw)
        # A list (not a set) keeps the feature column order stable across processes.
        self.keywords_ = ordered
        # Remember which parameters produced the cache so later set_params() calls
        # are detected.
        self._keywords_source = (self.topk, self.keywords_path)
        return ordered

    def _ensure_keywords(self):
        '''Return the keyword list, (re)loading it if the parameters changed.'''
        if getattr(self, '_keywords_source', None) != (self.topk, self.keywords_path):
            return self._load_keywords()
        return self.keywords_

    @property
    def keywords(self):
        '''Set view of the active keywords (kept for backward compatibility).'''
        return set(self._ensure_keywords())

    def fit(self, X, y=None):
        '''Load the keyword list for the current parameters; ``X`` and ``y`` are unused.'''
        self._load_keywords()
        return self

    def transform(self, X):
        '''
        Compute keyword frequencies for each text.

        Parameters
        ----------
        X : iterable of str
            Whitespace-separated token strings.

        Returns
        -------
        list of list of int
            One row per text: the count of every keyword, then the number of
            distinct keywords present in that text.
        '''
        keywords = self._ensure_keywords()
        data = []
        for x in X:
            # One pass over the tokens instead of one list.count() per keyword.
            token_counts = Counter(x.split())
            word_tf = [token_counts[kw] for kw in keywords]
            word_tf.append(sum(1 for tf in word_tf if tf > 0))
            data.append(word_tf)
        return data

In [4]:
class StatsFeatures(BaseEstimator, TransformerMixin):
    '''
    Transformer producing simple length / vocabulary statistics per text.

    The negative-word lexicon is read from ``corpus/neg_words.txt`` (one word
    per line) when the transformer is constructed.
    '''

    def __init__(self):
        self.neg = set()
        with open("corpus/neg_words.txt", "r", encoding='UTF-8') as f:
            for content in f:
                # Strip the line terminator: tokens produced by str.split() never
                # contain whitespace, so unstripped entries could never match.
                word = content.strip()
                if word:
                    self.neg.add(word)

    def fit(self, X, y=None):
        '''Nothing is learned from the data; returns ``self``.'''
        return self

    def getcnt(self,x):
        '''Number of distinct whitespace-separated tokens in ``x``.'''
        return len(set(x.split()))

    def getnegcnt(self,x):
        '''Number of tokens in ``x`` (counted with repetition) found in the negative lexicon.'''
        return sum(1 for w in x.split() if w in self.neg)

    def getrepcnt(self,x):
        '''Number of distinct tokens that occur more than once in ``x``.'''
        return sum(1 for freq in Counter(x.split()).values() if freq > 1)

    def transform(self, X):
        '''
        Compute seven statistics for each text.

        Parameters
        ----------
        X : iterable of str
            Whitespace-separated token strings.

        Returns
        -------
        list of list
            One row per text:
            ``[len(x), distinct tokens, distinct tokens / len(x),
            negative tokens, negative tokens / len(x),
            repeated tokens, repeated tokens / len(x)]``.
            ``len(x)`` is the character length of the raw string; the ratios
            use 1 as denominator for an empty string.
        '''
        data = []
        for x in X:
            length = len(x) or 1  # avoid division by zero on empty text
            cnt = self.getcnt(x)
            negcnt = self.getnegcnt(x)
            repcnt = self.getrepcnt(x)
            data.append([len(x), cnt, cnt/length,
                         negcnt, negcnt/length,
                         repcnt, repcnt/length])
        return data

# 导入数据

## 预处理后数据

In [5]:
# Fraud calls (label 1)
filename = 'data/pos_pre_20180723.txt'
# Read-only mode is enough here; "r+" would fail on read-only files. The
# context manager closes the file even if reading raises.
with open(filename, "r", encoding='UTF-8') as fid:
    # Lines are kept verbatim, including their trailing newline.
    corpus_pos = list(fid)
label_pos = [1] * len(corpus_pos)
print(len(corpus_pos))
print(len(label_pos))

3
3


In [6]:
# Non-fraud calls (label 0)
filename = 'data/neg_pre_20180723.txt'
# Read-only mode is enough here; "r+" would fail on read-only files. The
# context manager closes the file even if reading raises.
with open(filename, "r", encoding='UTF-8') as fid:
    # Lines are kept verbatim, including their trailing newline.
    corpus_neg = list(fid)
label_neg = [0] * len(corpus_neg)
print(len(corpus_neg))
print(len(label_neg))

3
3


In [7]:
folder = '20180703'

# Relevant records (label 1)
filename = 'data/{0}/corpus_pre_cor_0703.txt'.format(folder)
# Read-only mode is enough here; "r+" would fail on read-only files. The
# context manager closes the file even if reading raises.
with open(filename, "r", encoding='UTF-8') as fid:
    # Lines are kept verbatim, including their trailing newline.
    corpus_cor = list(fid)
label_cor = [1] * len(corpus_cor)
print(len(corpus_cor))
print(len(label_cor))

6111
6111


In [8]:
# Irrelevant records (label 0)
filename = 'data/{0}/corpus_pre_uncor_0703.txt'.format(folder)
# Read-only mode is enough here; "r+" would fail on read-only files. The
# context manager closes the file even if reading raises.
with open(filename, "r", encoding='UTF-8') as fid:
    # Lines are kept verbatim, including their trailing newline.
    corpus_uncor = list(fid)
label_uncor = [0] * len(corpus_uncor)
print(len(corpus_uncor))
print(len(label_uncor))

8949
8949


## 分割数据

In [9]:
# corpus = corpus_pos + corpus_neg
# label = label_pos + label_neg
# Use only the first SAMPLE_PER_CLASS records of each class.
SAMPLE_PER_CLASS = 100
corpus = corpus_cor[:SAMPLE_PER_CLASS] + corpus_uncor[:SAMPLE_PER_CLASS]
label = label_cor[:SAMPLE_PER_CLASS] + label_uncor[:SAMPLE_PER_CLASS]

# Hold out 10% of the sampled records as the test set (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    corpus, label, test_size=0.1, random_state=42
)

# Report the size and class balance of each split.
for split_name, split_labels in (('训练集', y_train), ('测试集', y_test)):
    print(split_name + '：', len(split_labels))
    print(split_name + '-各类数量：', Counter(split_labels))

训练集： 180
训练集-各类数量： Counter({0: 90, 1: 90})
测试集： 20
测试集-各类数量： Counter({1: 10, 0: 10})


## 生成词典

In [10]:
# Keyword extraction is disabled by default; set the flag to True to run
# getkeywords on the sampled corpus.
REBUILD_KEYWORDS = False
if REBUILD_KEYWORDS:
    getkeywords(corpus, N = 1000)

# 训练模型-xgboost

In [30]:
def train_print(pipeline, param_grid):
    '''
    Grid-search ``pipeline`` once per scoring metric and print a report.

    For each of ``roc_auc``, ``precision_macro``, ``recall_macro`` and
    ``f1_macro`` a 3-fold ``GridSearchCV`` is fitted on the notebook-level
    ``X_train`` / ``y_train``; the best parameters, the per-candidate mean and
    spread of the CV score, and a classification report on the notebook-level
    ``X_test`` / ``y_test`` are printed.

    Parameters
    ----------
    pipeline : estimator
        Estimator (pipeline) to tune.
    param_grid : dict
        Parameter grid passed to ``GridSearchCV``.

    Returns
    -------
    list of list
        One entry per metric:
        ``[score, cv_results_, grid_scores_, best_params_, best_score_]``.
        ``grid_scores_`` is ``None`` when the installed scikit-learn no longer
        provides that attribute.
    '''
    import inspect

    # `iid` is not accepted by every scikit-learn release; pass it only when the
    # installed GridSearchCV still has the parameter, so the call does not raise
    # TypeError elsewhere.
    search_kwargs = {}
    if 'iid' in inspect.signature(GridSearchCV).parameters:
        search_kwargs['iid'] = False

    train_res = []
    scores = ['roc_auc', 'precision_macro', 'recall_macro', 'f1_macro']
    for score in scores:
        print("### Tuning hyper-parameters for %s" % score)

        t0 = datetime.datetime.now()
        clf = GridSearchCV(pipeline, param_grid, cv=3, # n_jobs = 2,
                           scoring = score, **search_kwargs)
        clf.fit(X_train, y_train)
        t1 = datetime.datetime.now()
        # total_seconds() also accounts for whole days, unlike `.seconds`.
        print ('  耗时： %s s' % int((t1 - t0).total_seconds()))

        print("---- Best parameters set found on development set:")
        print(clf.best_params_)
        print("---- Grid scores on development set:")
        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']

        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            print("    %0.3f (+/-%0.03f) for %r"% (mean, std * 2, params))

        print("---- Detailed classification report:")
        y_true, y_pred = y_test, clf.predict(X_test)
        print(classification_report(y_true, y_pred))
        print()

        # `grid_scores_` may be absent on the fitted search; keep the slot so the
        # shape of each result row stays the same for callers.
        train_res.append([score, clf.cv_results_, getattr(clf, 'grid_scores_', None),
                          clf.best_params_, clf.best_score_])

    return train_res

In [28]:
# Hyper-parameter grid; keys follow the step names of the pipeline
# (`features__tf__...` for the keyword transformer, `classifier__...` for XGBoost).
param_grid = {
    # 'features__tf_idf__chi__k': [2000, 20000, 40000],
    'features__tf__topk': [50, 100, 500],
    'classifier__max_depth': [3, 5, 6, 7],
    # Minimum sum of second-order derivatives in a leaf; the smaller the value,
    # the easier it is to overfit.
    'classifier__min_child_weight': [4, 5, 6],
    # Controls post-pruning; larger values are more conservative, typically
    # around 0.1 or 0.2.
    'classifier__gamma': [step / 10.0 for step in range(5)],
    'classifier__n_estimators': [50, 100, 200, 500],
    # A good learning rate often lies somewhere between 0.05 and 0.3.
    'classifier__learning_rate': [0.01, 0.1, 0.3],
}
param_grid

{'classifier__gamma': [0.0, 0.1, 0.2, 0.3, 0.4],
 'classifier__learning_rate': [0.01, 0.1, 0.3],
 'classifier__max_depth': [3, 5, 6, 7],
 'classifier__min_child_weight': [4, 5, 6],
 'classifier__n_estimators': [50, 100, 200, 500],
 'features__tf__topk': [50, 100, 500]}

In [31]:
# Feature extraction (TF-IDF + chi2 selection, keyword frequencies, length
# statistics) followed by an XGBoost classifier.
pipeline = Pipeline([
    ('features', FeatureUnion([
        ('tf_idf', Pipeline([
            ('counts', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('chi', SelectKBest(chi2, k=200))
        ])),
        ('tf', Statskeywords()),
        ('len_stats', StatsFeatures())
    ])),
    ('classifier', XGBClassifier(# learning_rate =0.1, max_depth=7,
                                 nthread = 4,# number of CPU threads
                                 # The labels are binary (0/1). 'multi:softmax' makes
                                 # the booster emit hard class ids, which is unsuitable
                                 # for probability-based scoring such as roc_auc;
                                 # 'binary:logistic' outputs probabilities.
                                 objective='binary:logistic'))
])

# clf_xgb = GridSearchCV(pipeline, param_grid=param_grid, verbose=10, cv = 10)
# clf_xgb.fit(X_train, y_train)

In [32]:
# Run the grid search for every scoring metric and keep the collected results.
train_res = train_print(pipeline=pipeline, param_grid=param_grid)

### Tuning hyper-parameters for roc_auc


KeyboardInterrupt: 

In [16]:
# NOTE(review): `clf_xgb` is only created by the commented-out GridSearchCV
# lines elsewhere in this notebook, so this cell fails on a fresh kernel
# unless those lines are re-enabled and run first.
# `grid_scores_` may be absent on the fitted search; fall back to None so the
# displayed tuple keeps its four slots.
clf_xgb.cv_results_, getattr(clf_xgb, 'grid_scores_', None), clf_xgb.best_params_, clf_xgb.best_score_

({'mean_fit_time': array([1.13397741, 1.24849192, 1.27466178]),
  'mean_score_time': array([0.57790669, 0.55056993, 0.51939933]),
  'mean_test_score': array([0.73333333, 0.73333333, 0.73333333]),
  'mean_train_score': array([1., 1., 1.]),
  'param_features__tf__topk': masked_array(data=[50, 100, 500],
               mask=[False, False, False],
         fill_value='?',
              dtype=object),
  'params': [{'features__tf__topk': 50},
   {'features__tf__topk': 100},
   {'features__tf__topk': 500}],
  'rank_test_score': array([1, 1, 1]),
  'split0_test_score': array([0.81666667, 0.81666667, 0.81666667]),
  'split0_train_score': array([1., 1., 1.]),
  'split1_test_score': array([0.65, 0.65, 0.65]),
  'split1_train_score': array([1., 1., 1.]),
  'split2_test_score': array([0.73333333, 0.73333333, 0.73333333]),
  'split2_train_score': array([1., 1., 1.]),
  'std_fit_time': array([0.08603851, 0.146937  , 0.29764177]),
  'std_score_time': array([0.06933958, 0.11017828, 0.0750188 ]),
  'std

In [19]:
# clf_xgb = GridSearchCV(pipeline, param_grid=param_grid, verbose=10, cv = 3)
# clf_xgb.fit(X_train, y_train)

In [18]:
# pipeline.fit(X_train, y_train)
# print(pipeline.score(X_train, y_train))

1.0


In [19]:
# y_pred_class = pipeline.predict(X_test)
# print('accuracy_score: ', metrics.accuracy_score(y_test, y_pred_class)) # fraction of all samples classified correctly
# print(metrics.classification_report(y_test, y_pred_class))
# print('confusion_matrix: ')
# print( metrics.confusion_matrix(y_test, y_pred_class))

accuracy_score:  0.95
             precision    recall  f1-score   support

          0       1.00      0.90      0.95        10
          1       0.91      1.00      0.95        10

avg / total       0.95      0.95      0.95        20

confusion_matrix: 
[[ 9  1]
 [ 0 10]]
