In [1]:
import re

regex_url = re.compile(
        r'^https?://'  # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?|'  # domain...
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)

def strQ2B(ustring):
    rstring = ""
    for uchar in ustring:
        inside_code=ord(uchar)
        if inside_code == 12288:                              #全角空格直接转换            
            inside_code = 32 
        elif (inside_code >= 65281 and inside_code <= 65374): #全角字符（除空格）根据关系转化
            inside_code -= 65248
        rstring += chr(inside_code)
    return rstring

# 使用正则和简单匹配检查句子
def sentence_ok(sentence):
    if not isinstance(sentence, str):
        return False
    if regex_url.search(sentence) is not None:
        return False
    if '【系统温馨提示】' in sentence or \
        '5星' in sentence or \
        '亲，欢迎来到联想服务' in sentence or\
        '我是联想在线工程师' in sentence:
        return False
    return True

# 载入停词
def load_stopword_set():
    stop_words = []
    with codecs.open("dataset/stopwords.txt", "r", "utf-8") as stop_file:
        stop_words = set([w[:-1] for w in stop_file.readlines()])
    return stop_words

# 全角转换成半角
def strQ2B(ustring):
    rstring = ""
    for uchar in ustring:
        inside_code=ord(uchar)
        if inside_code == 12288:                              #全角空格直接转换            
            inside_code = 32 
        elif (inside_code >= 65281 and inside_code <= 65374): #全角字符（除空格）根据关系转化
            inside_code -= 65248
        rstring += chr(inside_code)
    return rstring

In [2]:
# 将END标记之前的条目整合
import codecs
import mysql.connector as c

def load_corpus():
    texts = []
    ids = []
    labels = [] # 标签
    term_freq = {} # {index: count}
    
    conn = c.connect(user='root', password='YsuKeg@20160705', host='139.129.208.70', database='qa')
    cursor = conn.cursor()
    cursor.execute("SELECT `qaid`, `role`, `label`, `end`, `sendtime`, `words` FROM qas");
    stopwords = set(load_stopword_set()) # 停止词

    session = []
    last_case_id = -1
    after_end = False
    result_cnt = 0;
    
    result = cursor.fetchmany(size=10000)
    while result:
        result_cnt += 1
        print('Processing: ' + str(result_cnt))
        for case_id, role, label, end_mark, sendtime, raw_sentence in result:
            if case_id != last_case_id:
                after_end = False
            if after_end:
                continue
            if sentence_ok(raw_sentence):                
                session.append(raw_sentence)
            if end_mark == 1:
                after_end = True
                ids.append(last_case_id)
                labels.append(label)
                texts.append(' '.join(session).rstrip())
                session = []
            last_case_id = case_id
        result = cursor.fetchmany(size=10000)

    cursor.close()
    conn.close()
    return texts, ids, labels, term_freq

raw_texts, raw_ids, raw_labels, term_freq = load_corpus()
print('Done!')

with codecs.open('tmp/raw_text.txt', 'w') as f:
    for text in raw_texts:
        f.write(text.strip().replace('\n', ' ').replace('\r', ' ') + '\n')

with codecs.open('tmp/raw_ids.txt', 'w') as f:
    for idx in raw_ids:
        f.write(str(idx) + '\n')

with codecs.open('tmp/raw_labels.txt', 'w') as f:
    for label in raw_labels:
        f.write(label + '\n')

Processing: 1
Done!


In [13]:
import jieba
import codecs

import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn import metrics
from scipy.sparse import hstack, vstack
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

In [21]:
#################
# 特征提取方法
#################
class FeatureBuilder:
    
    def __init__(self):
        self.tf_vectorizer = CountVectorizer(min_df=1)
        self.bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), token_pattern=r'\b\w+\b', min_df=1)
        self.tfidf_vectorizer = TfidfVectorizer(min_df=1)
#         self.lda = LatentDirichletAllocation(
#             n_topics=16, max_iter=10, learning_method='online', learning_offset=50., random_state=0)
        self.ch2 = SelectKBest(chi2, k=6000)

    """
    bi-gram, tf-idf属性既能达到最高精度
    其他属性加入之后精度变化不显著
    """
    def fit_transform(self, corpus, labels):
        X_1 = self.tf_vectorizer.fit_transform(corpus) # 词频属性
        X_2 = self.bigram_vectorizer.fit_transform(corpus) # bi-gram属性
        X_3 = self.tfidf_vectorizer.fit_transform(corpus) # tf-idf属性
#         X_4 = self.lda.fit_transform(X_1) # 主题属性
#         X_hub = hstack([X_1, X_2, X_3, X_4])
        X_hub = hstack([X_1, X_2, X_3])
        return self.ch2.fit_transform(X_hub, labels)
    
    """
    将transform分离，防止数据污染
    """ 
    def transform(self, corpus):
        X_1 = self.tf_vectorizer.transform(corpus) # 词频属性
        X_2 = self.bigram_vectorizer.transform(corpus) # bi-gram属性
        X_3 = self.tfidf_vectorizer.transform(corpus) # tf-idf属性
#         X_4 = self.lda.transform(X_1) # 主题属性
#         X_hub = hstack([X_1, X_2, X_3, X_4])
        X_hub = hstack([X_1, X_2, X_3])        
        return self.ch2.transform(X_hub)



In [119]:
# # create fake file

# raw_ids = []
# with codecs.open('tmp/raw_ids.txt', 'r') as f:
#     for raw_id in f.read().splitlines():
#         raw_ids.append(raw_id)
# with codecs.open('tmp/rule_to_feature.txt', 'w') as f:
#     for idx in raw_ids:
#         f.write(idx + ' 1 0\n')

In [15]:
feature_dict = {}
with codecs.open('tmp/rule_to_feature.txt', 'r') as f:
    for line in f.read().splitlines():
        data = line[:-1].split(' ')
        feature_dict[data[0]] = data[1:]

corpus, labels, idx_list = [], [], []
rule_feature = []

with codecs.open('tmp/raw_text.txt', 'r') as f:
    for txt in f.readlines():
        corpus.append(' '.join(jieba.cut(strQ2B(str(txt[:-1])))))

with codecs.open('tmp/raw_labels.txt', 'r') as f:
    for label in f.read().splitlines():
        labels.append(label)

with codecs.open('tmp/raw_ids.txt', 'r') as f:
    for raw_id in f.read().splitlines():
        idx_list.append(raw_id)
        rule_feature.append([int(item) for item in feature_dict[raw_id]])
        

In [22]:
#################
# 交叉验证性能
#################
X_zipped = list(zip(corpus, rule_feature, idx_list))
skf = StratifiedKFold(n_splits=10)

overall_train_acc = []
overall_test_acc = []

for train, test in skf.split(X_zipped, labels):
    
    corpus_train, rule_train, idx_train = zip(*[X_zipped[i] for i in train])
    y_train = [labels[i] for i in train]
    corpus_test, rule_test, idx_test = zip(*[X_zipped[i] for i in test])
    y_test = [labels[i] for i in test]
    
    print('Building features ...')
    fb = FeatureBuilder()
    X_train = fb.fit_transform(corpus_train, y_train)
    rule_train = np.array(rule_train)
    rule_test = np.array(rule_test)
    
    X_train_mixed = hstack([X_train, rule_train])
    X_test = fb.transform(corpus_test)
    X_test_mixed = hstack([X_test, rule_test])
    print('Features done!')
    
    clf = linear_model.LogisticRegression()
    clf.fit(X_train_mixed, y_train)
    
    train_acc = clf.score(X_train_mixed, y_train)
    overall_train_acc.append(train_acc)
    print('Train accuracy:  %0.3f' % train_acc)

    pred = clf.predict(X_test_mixed)
    test_acc = metrics.accuracy_score(y_test, pred)
    overall_test_acc.append(test_acc)
    print("Test accuracy:  %0.3f" % test_acc)

print('Overall train acc = %0.3f' % np.mean(overall_train_acc))
print('Overall test acc = %0.3f' % np.mean(overall_test_acc))

with codecs.open('tmp/result.txt', 'w') as f:
    f.write(str(np.mean(overall_test_acc) * 100))
    

Building features ...




ValueError: k should be >=0, <= n_features; got 6000.Use k='all' to return all features.