In [14]:
import jieba
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [2]:
def read_data(data_path):
    """
    Load a UTF-8 encoded text file as a list of lines.

    :param data_path: path of the file to read
    :return: list with one string per line; line breaks are kept
    """
    with open(data_path, mode='r', encoding='utf-8') as handle:
        lines = list(handle)
    return lines

In [3]:
def cut_words(data, stopwords, test_size=0.2):
    """
    Tokenize labelled text with jieba, drop stop words and split into train/test sets.

    :param data: iterable of lines, each an integer label and a text separated by a tab
    :param stopwords: iterable of stop words, one per item; a trailing line break
                      on an item is ignored
    :param test_size: fraction of the samples that goes into the test set
    :return: (X_train, X_test, y_train, y_test) as returned by train_test_split;
             every sample is a string of space-joined tokens
    :raises ValueError: if a non-blank line has no tab separator or its label
                        is not an integer
    """
    # A set gives constant-time membership tests. Only the line break is stripped,
    # so a final stop word without a trailing newline is not truncated.
    stop_words = {word.rstrip('\r\n') for word in stopwords}

    y = []
    text_list = []
    for line in data:
        if not line.strip():
            # Blank lines (e.g. at the end of the file) carry no sample.
            continue
        label, text = line.split('\t', 1)
        # Whitespace-only tokens (spaces, the trailing newline) are dropped so
        # that a sample with no real content ends up as an empty token list.
        cut_text = [
            word for word in jieba.cut(text)
            if word not in stop_words and word.strip()
        ]
        if not cut_text:
            # Nothing left after filtering: skip the sample and its label.
            continue
        text_list.append(' '.join(cut_text))
        y.append(int(label))
    return train_test_split(text_list, y, test_size=test_size, random_state=1028)


In [4]:

def calculate_tfidf(X_train, X_test):
    """
    Convert tokenized documents into TF-IDF feature matrices.

    The vectorizer is fitted on the training documents only and then applied
    to the test documents.

    :param X_train: training documents
    :param X_test: test documents
    :return: (training TF-IDF matrix, test TF-IDF matrix, fitted vectorizer)
    """
    # NOTE(review): TfidfVectorizer's default token_pattern keeps only tokens of
    # two or more word characters, so single-character tokens are ignored --
    # confirm this is intended for the jieba output fed in here.
    vectorizer = TfidfVectorizer()
    # fit_transform fits the vectorizer and returns the training matrix in one
    # pass, so the training set does not have to be transformed a second time.
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)
    return X_train_tfidf, X_test_tfidf, vectorizer

In [5]:
def evaluate(model, X, y):
    """
    Measure accuracy and ROC AUC of a fitted classifier on one data set.

    :param model: fitted classifier providing score() and predict_proba()
    :param X: feature matrix to evaluate on
    :param y: true labels belonging to X
    :return: (accuracy, AUC); the AUC is derived from the ROC curve built on
             column 1 of predict_proba with label 1 treated as positive
    """
    accuracy = model.score(X, y)
    positive_scores = model.predict_proba(X)[:, 1]
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(y, positive_scores, pos_label=1)
    roc_auc = metrics.auc(false_pos_rate, true_pos_rate)
    return accuracy, roc_auc

In [11]:
    # Step 1: read the raw lines of the labelled text data and of the stop-word list
    data_path = "./train.txt"
    stopwords_path = "./stopwords.txt"
    data = read_data(data_path)
    stopwords = read_data(stopwords_path)

    # Step 2: tokenize, remove stop words and split into training and test sets
    X_train, X_test, y_train, y_test = cut_words(data, stopwords, test_size=0.2)

    # Step 3: extract TF-IDF features (tfidf_model is the fitted vectorizer)
    X_train_tfidf, X_test_tfidf, tfidf_model = calculate_tfidf(X_train, X_test)
   

In [19]:
    print("TfidfVectorizer & Multinomial Naive Bayes - MNB\n")
    # Step 4: train the Multinomial Naive Bayes model on the TF-IDF features
    nb = MultinomialNB(alpha=0.01)
    nb.fit(X_train_tfidf, y_train)
    
    # Step 5: evaluate the model, first on the training set ...
    accuracy, auc = evaluate(nb, X_train_tfidf, y_train)
    print("Train Accuarcy：%.4f%%\n" % (accuracy * 100))
    print("Train AUC：%.6f\n" % auc)

    # ... then on the held-out test set (accuracy and auc are overwritten)
    accuracy, auc = evaluate(nb, X_test_tfidf, y_test)
    print("Test Accuarcy：%.4f%%\n" % (accuracy * 100))
    print("Test AUC：%.6f\n" % auc)

TfidfVectorizer & Multinomial Naive Bayes - MNB

Train Accuarcy：98.4051%

Train AUC：0.999471

Test Accuarcy：94.8064%

Test AUC：0.990541



In [20]:
    print("TfidfVectorizer & LogisticRegression \n")
    # Step 4: train the logistic regression model on the TF-IDF features
    lr = LogisticRegression(C=1.0)
    lr.fit(X_train_tfidf, y_train)

    # Step 5: evaluate the model, first on the training set ...
    accuracy, auc = evaluate(lr, X_train_tfidf, y_train)
    print("Train Accuarcy：%.4f%%\n" % (accuracy * 100))
    print("Train AUC：%.6f\n" % auc)

    # ... then on the held-out test set (accuracy and auc are overwritten)
    accuracy, auc = evaluate(lr, X_test_tfidf, y_test)
    print("Test Accuarcy：%.4f%%\n" % (accuracy * 100))
    print("Test AUC：%.6f\n" % auc)

TfidfVectorizer & LogisticRegression 

Train Accuarcy：95.1470%

Train AUC：0.996376

Test Accuarcy：93.6674%

Test AUC：0.988965

