In [3]:
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn import metrics  #计算准确率 精度 召回率 F值
import jieba
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
# Root directory of the lesson assets; every input file below lives under it.
path = "E:/pyPor/nlp_assignment-master/Lesson7-8/"
# Directory holding the two news corpora read by get_data().
_DATA_DIR = path + "news_classification/data/"
true_news_data_file = _DATA_DIR + "true_news_data.txt"
fake_news_data_file = _DATA_DIR + "fake_news_data.txt"
# Stop-word list, read line by line when the module is loaded.
stop_txt_file = path + "stop/stopword.txt"

def bow_extractor(corpus, ngram_range=(1, 1)):
    '''
    Fit a bag-of-words (term count) model on ``corpus`` and vectorize it.

    :param corpus: documents handed to ``CountVectorizer.fit_transform``
    :param ngram_range: ``(min_n, max_n)`` forwarded to ``CountVectorizer``
    :return: the fitted vectorizer and the count matrix for ``corpus``
    '''
    bow_model = CountVectorizer(ngram_range=ngram_range, min_df=1)
    count_matrix = bow_model.fit_transform(corpus)
    return bow_model, count_matrix


from sklearn.feature_extraction.text import TfidfTransformer
def tfidf_transformer(bow_matrix):
    '''
    Re-weight an existing term-count matrix with TF-IDF.

    :param bow_matrix: count matrix handed to ``TfidfTransformer.fit_transform``
    :return: the fitted transformer and the TF-IDF weighted matrix
    '''
    # L2-normalised rows, smoothed IDF weights.
    settings = {'norm': 'l2', 'smooth_idf': True, 'use_idf': True}
    weighter = TfidfTransformer(**settings)
    weighted_matrix = weighter.fit_transform(bow_matrix)
    return weighter, weighted_matrix




def tfidf_extractor(corpus, ngram_range=(1, 1)):
    '''
    Fit a TF-IDF model directly on raw documents and vectorize them.

    :param corpus: documents handed to ``TfidfVectorizer.fit_transform``
    :param ngram_range: ``(min_n, max_n)`` forwarded to ``TfidfVectorizer``
    :return: the fitted vectorizer and the TF-IDF matrix for ``corpus``
    '''
    # L2-normalised rows, smoothed IDF weights, no document-frequency cut-off.
    settings = {
        'min_df': 1,
        'norm': 'l2',
        'smooth_idf': True,
        'use_idf': True,
        'ngram_range': ngram_range,
    }
    tfidf_model = TfidfVectorizer(**settings)
    tfidf_matrix = tfidf_model.fit_transform(corpus)
    return tfidf_model, tfidf_matrix

# Silence every warning for the rest of the run (this hides library
# deprecation notices as well).
warnings.filterwarnings('ignore')

# Load the stop-word list, one stripped entry per line. The context manager
# closes the file handle deterministically instead of leaving it to the
# garbage collector; 'utf-8-sig' drops a leading BOM if the file has one.
with open(stop_txt_file, 'r', encoding='utf-8-sig') as stop_words_f:
    stop_words = [line.strip() for line in stop_words_f]

def get_data():
    '''
    Read both news files and build the label list that goes with them.

    Every line of ``true_news_data_file`` becomes a document labelled 1.0
    (Xinhua News Agency) and every line of ``fake_news_data_file`` a document
    labelled 0.0 (any other outlet). Lines keep their trailing newline.

    :return: text data (true news first, then fake news) and the matching labels
    '''
    with open(true_news_data_file, encoding="utf-8-sig") as true_news_f, \
            open(fake_news_data_file, encoding="utf-8-sig") as fake_news_f:
        true_news_data = true_news_f.readlines()
        fake_news_data = fake_news_f.readlines()

    # One float label per document: 1.0 for Xinhua, 0.0 for everything else.
    true_news_label = [1.0] * len(true_news_data)
    fake_news_label = [0.0] * len(fake_news_data)
    print("true_news_label len =", len(true_news_label))
    print("fake_news_label len =", len(fake_news_label))

    return true_news_data + fake_news_data, true_news_label + fake_news_label

# Drop blank documents together with their labels.
def remove_empty_docs(corpus, labels):
    '''
    Keep only the documents that contain something other than whitespace.

    :param corpus: text documents
    :param labels: labels aligned with ``corpus``
    :return: the surviving documents (unmodified) and their labels, as two lists
    '''
    surviving = [(doc, label) for doc, label in zip(corpus, labels) if doc.strip()]
    filtered_corpus = [doc for doc, _ in surviving]
    filtered_labels = [label for _, label in surviving]
    return filtered_corpus, filtered_labels

# Split the data set into a training part and a test part.
def prepare_datasets(corpus, labels, test_data_proportion=0.3):
    '''
    Split documents and labels with ``train_test_split`` using a fixed seed.

    :param corpus: text data
    :param labels: label data
    :param test_data_proportion: share of the data handed to ``test_size``
    :return: training data, test data, training labels, test labels
    '''
    # random_state is pinned to 42 so the split is the same on every run.
    split_options = {'test_size': test_data_proportion, 'random_state': 42}
    train_docs, test_docs, train_targets, test_targets = train_test_split(
        corpus, labels, **split_options)
    return train_docs, test_docs, train_targets, test_targets

# Pre-processing: strip digits and segment into words (stop words are removed
# later, by the vectorizer).
def deal_corpus(text):
    '''
    Replace digit runs with a space and segment the text with jieba.

    :param text: one raw document
    :return: the segmented tokens, each followed by a single space (so a
             non-empty result ends with a trailing space and an input that
             yields no tokens gives ``''``)
    '''
    digits_removed = re.sub(r'\d+', ' ', text)
    # Build the result in one pass with join instead of repeated ``+=``,
    # which copies the growing string on every token.
    return ''.join(word + ' ' for word in jieba.cut(digits_removed))



# Score a set of predictions.
def get_metrics(true_labels, predicted_labels):
    '''
    Print accuracy, precision, recall and F1, each rounded to two decimals.

    Precision, recall and F1 are computed with ``average='weighted'``.

    :param true_labels: ground-truth labels
    :param predicted_labels: labels produced by a classifier
    '''
    weighted = {'average': 'weighted'}
    report = (
        ('准确率:', metrics.accuracy_score, {}),
        ('精度:', metrics.precision_score, weighted),
        ('召回率:', metrics.recall_score, weighted),
        ('F1得分:', metrics.f1_score, weighted),
    )
    # Each score is computed right before it is printed, in the listed order.
    for caption, scorer, options in report:
        print(caption, np.round(scorer(true_labels, predicted_labels, **options), 2))


def train_predict_evaluate_model(classifier, train_features, train_labels, test_features, test_labels):
    '''
    Fit ``classifier``, predict the test set and print its scores.

    :param classifier: object exposing ``fit(X, y)`` and ``predict(X)``
    :param train_features: training feature matrix
    :param train_labels: training labels
    :param test_features: test feature matrix
    :param test_labels: ground-truth test labels, used only for scoring
    :return: the predictions for ``test_features``
    '''
    # Train in place, then score the held-out set through get_metrics.
    classifier.fit(train_features, train_labels)
    test_predictions = classifier.predict(test_features)
    get_metrics(test_labels, test_predictions)
    return test_predictions



def _run_experiment(title, classifier, train_features, train_labels, test_features, test_labels):
    '''Print ``title``, then train and score ``classifier``; return its test predictions.'''
    print(title)
    return train_predict_evaluate_model(classifier=classifier,
                                        train_features=train_features,
                                        train_labels=train_labels,
                                        test_features=test_features,
                                        test_labels=test_labels)


def _print_examples(documents, true_labels, predicted_labels, label_name_map,
                    wanted_label, wanted_prediction, limit=4):
    '''Print up to ``limit`` documents whose (label, prediction) pair matches the wanted pair.'''
    shown = 0
    for document, label, predicted_label in zip(documents, true_labels, predicted_labels):
        if shown >= limit:
            break
        if label != wanted_label or predicted_label != wanted_prediction:
            continue
        print('新闻报道单位:', label_name_map[int(label)])
        print('预测的新闻报道单位:', label_name_map[int(predicted_label)])
        print('新闻文本:')
        # Collapse the document onto one output line.
        print(document.replace('\n', ' '))
        shown += 1


def main():
    '''
    Run the whole news-source classification experiment and print the results.

    Steps: load and clean the corpus, split it 70/30, segment every document,
    score a tuned multinomial naive Bayes on stop-word-filtered TF-IDF
    features, then compare KNN, naive Bayes, logistic regression and a linear
    SVM on both bag-of-words and TF-IDF features. Finally print a few test
    documents using the TF-IDF SVM predictions, including those labelled as
    another outlet but predicted as Xinhua (reported as suspected plagiarism).
    '''
    from sklearn.linear_model import LogisticRegression
    from sklearn.linear_model import SGDClassifier
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.neighbors import KNeighborsClassifier

    corpus, labels = get_data()  # load the data set
    print("总的数据量:", len(labels))

    corpus, labels = remove_empty_docs(corpus, labels)

    # Index 0 -> any other outlet, index 1 -> Xinhua News Agency.
    label_name_map = ["其他新闻报道单位", "新华社"]

    # The sample print-outs use fixed positions; skip them instead of raising
    # IndexError when the corpus is smaller than the one they were written for.
    if len(corpus) > 10:
        print('样本之一:', corpus[10])
        print('样本的label:', labels[10])
    if len(labels) > 10000:
        print('实际类型:labels[4999]=', label_name_map[int(labels[4999])],
              "labels[10000]=", label_name_map[int(labels[10000])])

    # Hold out 30% of the data as the test set.
    train_corpus, test_corpus, train_labels, test_labels = prepare_datasets(
        corpus, labels, test_data_proportion=0.3)

    # Pre-process (digit removal + word segmentation) both splits.
    norm_train_corpus = [deal_corpus(text) for text in train_corpus]
    norm_test_corpus = [deal_corpus(text) for text in test_corpus]
    print(len(norm_train_corpus), len(train_labels))
    print(len(norm_test_corpus), len(test_labels))

    # --- Naive Bayes on stop-word-filtered TF-IDF features ---
    # max_df=0.5 passes a document-frequency cut-off of one half to the vectorizer.
    tf = TfidfVectorizer(stop_words=stop_words, max_df=0.5)
    train_features = tf.fit_transform(norm_train_corpus)
    # The vocabulary was fitted on the training split; only transform here.
    test_features = tf.transform(norm_test_corpus)

    clf = MultinomialNB(alpha=0.001).fit(train_features, train_labels)
    predicted_labels = clf.predict(test_features)
    print('准确率为：', metrics.accuracy_score(test_labels, predicted_labels))

    # Bag-of-words features.
    bow_vectorizer, bow_train_features = bow_extractor(norm_train_corpus)
    bow_test_features = bow_vectorizer.transform(norm_test_corpus)

    # TF-IDF features.
    tfidf_vectorizer, tfidf_train_features = tfidf_extractor(norm_train_corpus)
    tfidf_test_features = tfidf_vectorizer.transform(norm_test_corpus)

    # Classifiers under comparison. Each instance is refitted for every
    # feature set it is evaluated on.
    mnb = MultinomialNB()  # naive Bayes
    # Linear SVM trained with SGD. ``n_iter`` was deprecated in scikit-learn
    # 0.19 and removed in 0.21; ``max_iter=100, tol=None`` is the equivalent
    # spelling (exactly 100 epochs, no early stopping).
    svm = SGDClassifier(loss='hinge', max_iter=100, tol=None)
    lr = LogisticRegression()  # logistic regression
    knn = KNeighborsClassifier()  # k-nearest neighbours

    bow = (bow_train_features, train_labels, bow_test_features, test_labels)
    tfidf = (tfidf_train_features, train_labels, tfidf_test_features, test_labels)

    # Bag-of-words experiments.
    knn_bow_predictions = _run_experiment("基于词袋模型特征的KNN模型", knn, *bow)
    mnb_bow_predictions = _run_experiment("基于词袋模型特征的贝叶斯分类器", mnb, *bow)
    print(mnb_bow_predictions)  # raw prediction array, e.g. [0. 0. 1. ... 0. 1. 0.]
    lr_bow_predictions = _run_experiment("基于词袋模型特征的逻辑回归", lr, *bow)
    svm_bow_predictions = _run_experiment("基于词袋模型的支持向量机", svm, *bow)

    # TF-IDF experiments.
    knn_tfidf_predictions = _run_experiment("基于tfidf的KNN模型", knn, *tfidf)
    mnb_tfidf_predictions = _run_experiment("基于tfidf的朴素贝叶斯模型", mnb, *tfidf)
    lr_tfidf_predictions = _run_experiment("基于tfidf的逻辑回归模型", lr, *tfidf)
    svm_tfidf_predictions = _run_experiment("基于tfidf的支持向量机模型", svm, *tfidf)

    # Show a few non-Xinhua articles that the TF-IDF SVM classified correctly.
    _print_examples(test_corpus, test_labels, svm_tfidf_predictions, label_name_map,
                    wanted_label=0, wanted_prediction=0)

    # An article predicted as Xinhua whose label says otherwise is reported as
    # suspected of copying Xinhua.
    print("涉嫌抄袭新闻:")
    _print_examples(test_corpus, test_labels, svm_tfidf_predictions, label_name_map,
                    wanted_label=0, wanted_prediction=1)

# Run the experiment only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()

true_news_label len = 5000
fake_news_label len = 5001
总的数据量: 10001
样本之一: 北京售票员可厉害，嘿嘿，有专座的，会直接拉着脖子指着鼻子让上面的人站起来让 座的，呵呵，比较赞。。。 杭州就是很少有人给让座，除非司机要求乘客那样做。 五一去杭州一个景点玩，车上有两个不到一岁的小孩，就是没有人给让座，没办法家长只能在车上把小孩的推车打开让孩子坐进去，但是孩子还是闹，只能抱着，景点离市区很远，车上很颠，最后家长坐在地上抱孩子，就是没有一个人给让座，要是在北京，一上车就有人让座了

样本的label: 1.0
实际类型:labels[4999]= 新华社 labels[10000]= 其他新闻报道单位
7000 7000
3001 3001
准确率为： 0.9886704431856048
基于词袋模型特征的KNN模型
准确率: 0.75
精度: 0.84
召回率: 0.75
F1得分: 0.74
基于词袋模型特征的贝叶斯分类器
准确率: 0.99
精度: 0.99
召回率: 0.99
F1得分: 0.99
[0. 1. 1. ... 0. 1. 0.]
基于词袋模型特征的逻辑回归
准确率: 0.99
精度: 0.99
召回率: 0.99
F1得分: 0.99
基于词袋模型的支持向量机
准确率: 0.99
精度: 0.99
召回率: 0.99
F1得分: 0.99
基于tfidf的KNN模型
准确率: 0.65
精度: 0.8
召回率: 0.65
F1得分: 0.61
基于tfidf的朴素贝叶斯模型
准确率: 0.99
精度: 0.99
召回率: 0.99
F1得分: 0.99
基于tfidf的逻辑回归模型
准确率: 0.99
精度: 0.99
召回率: 0.99
F1得分: 0.99
基于tfidf的支持向量机模型
准确率: 0.99
精度: 0.99
召回率: 0.99
F1得分: 0.99
新闻报道单位: 其他新闻报道单位
预测的新闻报道单位: 其他新闻报道单位
新闻文本:
中信（国际）电子科技有限公司推出新产品： 升职步步高、做生意发大财、连找情人都用的上，详情进入 网  址:  http://www.usa5588.com/ccc 电话：020-33770208   服务热线：01365085299