## 算法

In [None]:
# Copy the dataset archive (presumably from a mounted Google Drive - confirm) into the
# working directory under a shorter name, then unpack it there.
!cp /content/drive/MyDrive/Data/text_classification-master.zip text_classification.zip
!unzip text_classification.zip

In [None]:
import os
import shutil
import zipfile
import jieba
import time
import warnings
import xgboost
import lightgbm
import numpy as np
import pandas as pd
from keras import models
from keras import layers
from keras.utils.np_utils import to_categorical
from keras.preprocessing.text import Tokenizer
from sklearn import svm
from sklearn import metrics
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
warnings.filterwarnings('ignore')

In [None]:
def read_text(path, text_list):
    """Read every ``.txt`` file of one category folder.

    Args:
        path: Folder holding the documents of a single category. The folder
            name itself is used as the label; a trailing separator is
            optional.
        text_list: File names located inside ``path``. Entries that do not
            end in ``.txt`` are ignored.

    Returns:
        A ``(features, labels)`` pair of equally long lists: the decoded text
        of each readable file and, for each of them, the category label.
    """
    # normpath drops a trailing separator, so basename is always the folder name.
    label = os.path.basename(os.path.normpath(path))

    features, labels = [], []
    for file_name in text_list:
        if not file_name.endswith('.txt'):
            continue
        try:
            with open(os.path.join(path, file_name), encoding='gbk') as fp:
                content = fp.read()
        except (OSError, UnicodeDecodeError) as error:
            # Best effort: report the unreadable file and keep going.
            print('\n>>>发现错误, 正在输出错误信息:', error)
        else:
            features.append(content)
            labels.append(label)

    return features, labels
                    
def merge_text(train_or_test, label_name,
               base_dir='/content/text_classification-master/text classification'):
    """Load and concatenate the documents of several categories.

    Args:
        train_or_test: Name of the split sub-folder, e.g. ``'train'`` or
            ``'test'``.
        label_name: Iterable of category folder names to load.
        base_dir: Dataset root that contains the split folders. Defaults to
            the path this notebook has always used.

    Returns:
        ``(merge_features, merge_labels)``: two equally long lists with the
        text of every document and its category label, in the order the
        categories were given.
    """
    merge_features, merge_labels = [], []
    for name in label_name:
        # read_text expects a '/'-terminated folder path.
        path = base_dir.rstrip('/') + '/' + train_or_test + '/' + name + '/'
        features, labels = read_text(path=path, text_list=os.listdir(path))
        merge_features.extend(features)
        merge_labels.extend(labels)

    return merge_features, merge_labels

# The same four categories are loaded for both splits.
label_name = ['女性', '体育', '校园', '文学']

train_or_test = 'train'
X_train, y_train = merge_text(train_or_test=train_or_test, label_name=label_name)

train_or_test = 'test'
X_test, y_test = merge_text(train_or_test=train_or_test, label_name=label_name)

# Show the first test document together with its label as the cell output.
(X_test[0], y_test[0])

In [None]:
# Load the stop-word list (one word per line). The context manager makes sure
# the file handle is closed once the list has been built.
with open('/content/text_classification-master/text classification/stop/stopword.txt',
          encoding='utf-8') as stop_file:
    stoplist = [word.strip() for word in stop_file]
stoplist[:10]

In [None]:
# Encode the string labels as integers.
le = LabelEncoder()

# Learn the label -> integer mapping from the training labels only and reuse
# it for the test labels. Refitting on the test split could assign different
# integers whenever the two splits do not contain the same set of classes.
y_train_le = le.fit_transform(y_train)
y_test_le = le.transform(y_test)

y_train_le, y_test_le

In [None]:
# Convert the raw documents into a bag-of-words count matrix.
count = CountVectorizer(stop_words=stoplist)

# Parallel word segmentation; without an argument jieba starts one worker per
# CPU core instead of a fixed 64 processes.
jieba.enable_parallel()
X_train_cut = [' '.join(jieba.cut(doc)) for doc in X_train]
X_test_cut = [' '.join(jieba.cut(doc)) for doc in X_test]

# Learn the vocabulary from the training split only. transform() maps any text
# onto that fixed vocabulary (unseen words are ignored), so both matrices get
# the same number of columns without the test split leaking into the features.
X_train_count = count.fit_transform(X_train_cut).toarray()
X_test_count = count.transform(X_test_cut).toarray()

print(X_train_count.shape, X_test_count.shape)
X_train_count, X_test_count

In [None]:
# Running record across all experiments: classifier name, accuracy and
# elapsed time, one entry per model.
estimator_list = []
score_list = []
time_list = []

def get_text_classification(estimator, X, y, X_test, y_test):
    """Fit a classifier, evaluate it on the test split and time the whole run.

    Args:
        estimator: Unfitted classifier exposing ``fit`` and ``predict``.
        X: Training features.
        y: Training labels.
        X_test: Test features.
        y_test: Test labels.

    Returns:
        A 6-tuple ``(y_pred_model, classifier, score, t, matrix, report)``:
        the predictions for ``X_test``, the estimator's class name, the
        accuracy, the elapsed time in seconds rounded to two decimals, the
        confusion matrix and the text classification report.
    """
    # perf_counter is monotonic, so the measured duration cannot be distorted
    # by system clock adjustments.
    start = time.perf_counter()

    model = estimator
    model.fit(X, y)
    print(model)

    y_pred_model = model.predict(X_test)

    score = metrics.accuracy_score(y_test, y_pred_model)
    matrix = metrics.confusion_matrix(y_test, y_pred_model)
    report = metrics.classification_report(y_test, y_pred_model)

    print('>>>准确率', score)
    print('\n>>>召回率', report)
    print('\n>>>混淆矩阵', matrix)

    # The timed span covers fitting, prediction, scoring and the prints above.
    t = time.perf_counter() - start
    print('\n>>>算法消耗时间为：', t, '秒\n')

    # Take the name from the type rather than parsing repr(), which only works
    # for estimators whose repr happens to start with "ClassName(".
    classifier = type(model).__name__

    return y_pred_model, classifier, score, round(t, 2), matrix, report


#### TF-IDF

In [None]:
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi-class version of the logarithmic loss metric.

    Args:
        actual: True classes, either a 1-D sequence of integer class indices
            or a 2-D one-hot matrix with one row per sample.
        predicted: 2-D matrix with one row per sample and one probability per
            class.
        eps: Probabilities are clipped to ``[eps, 1 - eps]`` so that
            ``log(0)`` never occurs.

    Returns:
        The negative log-likelihood averaged over all samples.
    """
    # Accept plain lists as well as arrays.
    actual = np.asarray(actual)
    predicted = np.asarray(predicted, dtype=float)

    # Expand class indices into a one-hot matrix if that has not been done yet.
    if actual.ndim == 1:
        one_hot = np.zeros((actual.shape[0], predicted.shape[1]))
        one_hot[np.arange(actual.shape[0]), actual.astype(int)] = 1
        actual = one_hot

    clipped = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    total = np.sum(actual * np.log(clipped))
    return -1.0 / rows * total

def number_normalizer(tokens):
    """Map every token that starts with a digit to one shared placeholder.

    Tokens beginning with a digit are rarely useful individually, yet their
    presence can still carry signal. Representing all of them by the same
    symbol keeps that signal while reducing the number of distinct features.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A generator yielding ``"#NUMBER"`` for tokens whose first character
        is a digit and the unchanged token otherwise.
    """
    # token[:1] is '' for an empty token, so empty strings pass through
    # unchanged instead of raising IndexError.
    return ("#NUMBER" if token[:1].isdigit() else token for token in tokens)

class NumberNormalizingVectorizer(TfidfVectorizer):
    """TF-IDF vectorizer whose tokenizer runs tokens through ``number_normalizer``."""

    def build_tokenizer(self):
        """Return the inherited tokenizer wrapped with number normalisation."""
        base_tokenize = super().build_tokenizer()

        def tokenize_and_normalize(doc):
            return list(number_normalizer(base_tokenize(doc)))

        return tokenize_and_normalize


tfv = NumberNormalizingVectorizer(min_df=3,
                                  max_df=0.5,
                                  max_features=None,
                                  ngram_range=(1, 2),
                                  use_idf=True,
                                  smooth_idf=True,
                                  stop_words=stoplist)

# Fit TF-IDF on the training and the test text together (the notebook author
# describes this as a semi-supervised choice).
tfv.fit(list(X_train_cut) + list(X_test_cut))
xtrain_tfv = tfv.transform(X_train_cut)
xvalid_tfv = tfv.transform(X_test_cut)

# Fit a simple logistic regression on the TF-IDF features. The targets are the
# encoded class labels, not the documents themselves.
clf = LogisticRegression(C=1.0, solver='lbfgs', multi_class='multinomial')
clf.fit(xtrain_tfv, y_train_le)
predictions = clf.predict_proba(xvalid_tfv)

# Score the predicted probabilities against the true test labels.
print("logloss: %0.3f " % multiclass_logloss(y_test_le, predictions))

#### k 近邻算法

In [None]:
knc = KNeighborsClassifier()

# Train and evaluate on the bag-of-words features, then record the classifier
# name, accuracy and elapsed time for the final comparison table.
result = get_text_classification(knc, X_train_count, y_train_le, X_test_count, y_test_le)
estimator_list.append(result[1])
score_list.append(result[2])
time_list.append(result[3])

#### 决策树

In [None]:
dtc = DecisionTreeClassifier()

# Train and evaluate on the bag-of-words features, then record the classifier
# name, accuracy and elapsed time for the final comparison table.
result = get_text_classification(dtc, X_train_count, y_train_le, X_test_count, y_test_le)
estimator_list.append(result[1])
score_list.append(result[2])
time_list.append(result[3])

#### 多层感知器

In [None]:
mlpc = MLPClassifier()

# Train and evaluate on the bag-of-words features, then record the classifier
# name, accuracy and elapsed time for the final comparison table.
result = get_text_classification(mlpc, X_train_count, y_train_le, X_test_count, y_test_le)
estimator_list.append(result[1])
score_list.append(result[2])
time_list.append(result[3])

#### 伯努利贝叶斯算法

In [None]:
bnb = BernoulliNB()

# Train and evaluate on the bag-of-words features, then record the classifier
# name, accuracy and elapsed time for the final comparison table.
result = get_text_classification(bnb, X_train_count, y_train_le, X_test_count, y_test_le)
estimator_list.append(result[1])
score_list.append(result[2])
time_list.append(result[3])

#### 高斯贝叶斯

In [None]:
gnb = GaussianNB()

# Train and evaluate on the bag-of-words features, then record the classifier
# name, accuracy and elapsed time for the final comparison table.
result = get_text_classification(gnb, X_train_count, y_train_le, X_test_count, y_test_le)
estimator_list.append(result[1])
score_list.append(result[2])
time_list.append(result[3])

#### 多项式朴素贝叶斯

In [None]:
mnb = MultinomialNB()

# Train and evaluate on the bag-of-words features, then record the classifier
# name, accuracy and elapsed time for the final comparison table.
result = get_text_classification(mnb, X_train_count, y_train_le, X_test_count, y_test_le)
estimator_list.append(result[1])
score_list.append(result[2])
time_list.append(result[3])

#### 逻辑回归算法

In [None]:
lgr = LogisticRegression()

# Train and evaluate on the bag-of-words features, then record the classifier
# name, accuracy and elapsed time for the final comparison table.
result = get_text_classification(lgr, X_train_count, y_train_le, X_test_count, y_test_le)
estimator_list.append(result[1])
score_list.append(result[2])
time_list.append(result[3])

#### 支持向量机算法

In [None]:
svc = svm.SVC()

# Train and evaluate on the bag-of-words features, then record the classifier
# name, accuracy and elapsed time for the final comparison table.
result = get_text_classification(svc, X_train_count, y_train_le, X_test_count, y_test_le)
estimator_list.append(result[1])
score_list.append(result[2])
time_list.append(result[3])

#### 随机森林算法

In [None]:
rfc = RandomForestClassifier()

# Train and evaluate on the bag-of-words features, then record the classifier
# name, accuracy and elapsed time for the final comparison table.
result = get_text_classification(rfc, X_train_count, y_train_le, X_test_count, y_test_le)
estimator_list.append(result[1])
score_list.append(result[2])
time_list.append(result[3])

#### 自适应增强算法（AdaBoost）

In [None]:
abc = AdaBoostClassifier()

# Train and evaluate on the bag-of-words features, then record the classifier
# name, accuracy and elapsed time for the final comparison table.
result = get_text_classification(abc, X_train_count, y_train_le, X_test_count, y_test_le)
estimator_list.append(result[1])
score_list.append(result[2])
time_list.append(result[3])

#### lightgbm算法

In [None]:
gbm = lightgbm.LGBMClassifier()

# Train and evaluate on the bag-of-words features, then record the classifier
# name, accuracy and elapsed time for the final comparison table.
result = get_text_classification(gbm, X_train_count, y_train_le, X_test_count, y_test_le)
estimator_list.append(result[1])
score_list.append(result[2])
time_list.append(result[3])

#### xgboost算法

In [None]:
# Default hyper-parameters are used. Candidate settings left by the author:
# max_depth=7, n_estimators=200, colsample_bytree=0.8, subsample=0.8,
# nthread=10, learning_rate=0.1
xgb = xgboost.XGBClassifier()

# Train and evaluate on the bag-of-words features, then record the classifier
# name, accuracy and elapsed time for the final comparison table.
result = get_text_classification(xgb, X_train_count, y_train_le, X_test_count, y_test_le)
estimator_list.append(result[1])
score_list.append(result[2])
time_list.append(result[3])

#### 多分类前馈神经网络
1 算法流程：  
创建神经网络——添加神经层——编译神经网络——训练神经网络——预测——性能评估——保存模型  

2 添加神经层  
至少要有两层神经层，第一层必须是输入神经层，最后一层必须是输出层；  
输入神经层主要设置输入的维度，而最后一层主要是设置激活函数的类型来指明是分类还是回归问题  

3 编译神经网络  
分类问题的 metrics，一般以 accuracy 准确率来衡量  
回归问题的 metrics, 一般以 mae 平均绝对误差来衡量

In [None]:
# Start of the timed span; the matching end time is taken after evaluation.
start = time.time()
# np.random.seed(0)     # uncomment to fix the random seed
feature_num = X_train_count.shape[1]     # input width = number of count features

# One-hot encode the integer labels into target matrices.
y_train_cate = to_categorical(y_train_le)
num_classes = y_train_cate.shape[1]
# Pin the width so the test matrix matches the training one even if the test
# split happens to lack the highest class index.
y_test_cate = to_categorical(y_test_le, num_classes=num_classes)
print(y_train_cate)

# 1 Create the network
network = models.Sequential()

# 2 Add the layers. The first layer defines the input shape.
network.add(layers.Dense(
    units=128,
    activation='relu',
    input_shape=(feature_num,),
))
# Hidden layer
network.add(layers.Dense(
    units=128,
    activation='relu',
))
# NOTE(review): Keras' Dropout argument is the fraction of units that are
# DROPPED, so 0.8 discards 80% of the activations - confirm this is intended.
network.add(layers.Dropout(0.8))
# Output layer: softmax produces a probability distribution over the mutually
# exclusive classes, which is what categorical_crossentropy expects.
network.add(layers.Dense(
    units=num_classes,
    activation='softmax',
))

# 3 Compile the network
network.compile(loss='categorical_crossentropy',  # multi-class cross-entropy
                optimizer='rmsprop',
                metrics=['accuracy'])

# 4 Train the network
history = network.fit(X_train_count,      # training features
                      y_train_cate,       # training targets
                      epochs=20,          # number of passes over the data
                      batch_size=300,     # samples per gradient step (tunable)
                      validation_data=(X_test_count, y_test_cate))
network.summary()

In [None]:
# 5 Run the trained network on the test features.
y_pred_keras = network.predict(x=X_test_count)


In [None]:
# 6 Evaluate on the test split and record the result.
print('>>>多分类前馈神经网络性能评估如下...\n')
score = network.evaluate(x=X_test_count, y=y_test_cate, batch_size=32)
print('\n>>>评分\n', score)
print()
end = time.time()

# score[1] is appended as the accuracy; the elapsed time spans from the start
# of the training cell to the end of this evaluation.
elapsed = round(end - start, 2)
estimator_list.append('前馈网络')
score_list.append(score[1])
time_list.append(elapsed)

In [None]:
# 损失函数情况
import matplotlib.pyplot as plt
%matplotlib inline

train_loss = history.history["loss"]
valid_loss = history.history["val_loss"]
epochs = [i for i in range(len(train_loss))]
plt.plot(epochs, train_loss,linewidth=3.0)
plt.plot(epochs, valid_loss,linewidth=3.0)

In [None]:
# Accuracy curves (kept in their own variables so the loss histories from the
# previous cell are not overwritten with accuracy values).
train_acc = history.history["accuracy"]
valid_acc = history.history["val_accuracy"]
epochs = list(range(len(train_acc)))
plt.plot(epochs, train_acc, linewidth=3.0, label='train accuracy')
plt.plot(epochs, valid_acc, linewidth=3.0, label='validation accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend()
plt.show()

In [None]:
# Single definition of the model file location, used for saving and loading.
model_path = '/home/kesci/work/xiaozhi/my_network_model.h5'

# Save
print('\n>>>你正在进行保存模型操作, 请稍候...\n')

# Create the target folder first; saving fails when it does not exist.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
network.save(model_path)

print('>>>保存工作已完成...\n')


# Load and use
print('>>>你正在加载已经训练好的模型, 请稍候...\n')

my_load_model = models.load_model(model_path)

print('>>>你正在使用加载的现成模型进行预测, 请稍候...\n')
print('>>>预测部分结果如下...')

my_load_model.predict(X_test_count)[:20]

#### LSTM 神经网络

In [None]:
# pad_sequences belongs to the same Keras preprocessing package as the
# Tokenizer imported at the top of the notebook.
from keras.preprocessing.sequence import pad_sequences

# An Embedding layer looks up token ids, so it needs sequences of word indices.
# Feeding it the bag-of-words count matrix would interpret every count as a
# token id and unroll the LSTM over one time step per vocabulary entry.
max_len = 200  # tokens kept per document (padded/truncated to this); tunable

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train_cut)
vocab_size = len(tokenizer.word_index) + 1  # +1 because index 0 is reserved for padding
X_train_seq = pad_sequences(tokenizer.texts_to_sequences(X_train_cut), maxlen=max_len)
X_test_seq = pad_sequences(tokenizer.texts_to_sequences(X_test_cut), maxlen=max_len)

# One-hot encode the integer labels; pin the test width to the training width.
y_train_cate = to_categorical(y_train_le)
num_classes = y_train_cate.shape[1]
y_test_cate = to_categorical(y_test_le, num_classes=num_classes)

print(y_train_cate)

# 1 Create the network
lstm_network = models.Sequential()

# 2 Add the layers
lstm_network.add(layers.Embedding(input_dim=vocab_size, output_dim=4))  # embedding layer
lstm_network.add(layers.LSTM(units=128))                                # LSTM layer with 128 units
# Each document has exactly one class, so use softmax with categorical
# cross-entropy instead of independent sigmoids with binary cross-entropy.
lstm_network.add(layers.Dense(units=num_classes, activation='softmax'))

# 3 Compile the network
lstm_network.compile(loss='categorical_crossentropy',
                     optimizer='Adam',
                     metrics=['accuracy'])

# 4 Train the model
lstm_network.fit(X_train_seq,
                 y_train_cate,
                 epochs=5,
                 batch_size=128,
                 validation_data=(X_test_seq, y_test_cate))

#### 算法之间性能比较

In [None]:
# Comparison table: one row per model with its name, accuracy and elapsed time.
df = pd.DataFrame({
    '分类器': estimator_list,
    '准确率': score_list,
    '消耗时间/s': time_list,
})
df