In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
import jieba.analyse
%matplotlib inline

In [2]:
# Load the THUCNews train / dev / test splits as keyword DataFrames
def _load_split(path, top_k, keyword_extractor):
    """Parse one ``_!_``-delimited split file into a keyword DataFrame.

    Each usable line holds four fields: ``label_index``, ``label``, an ID
    and the article text.  The ID is dropped and the text is replaced by
    its extracted keywords joined with single spaces.

    Args:
        path: Location of the split file (read as UTF-8).
        top_k: Maximum number of keywords requested per article.
        keyword_extractor: Callable invoked as ``extractor(text, topK=top_k)``
            that returns an iterable of keyword strings.

    Returns:
        DataFrame with columns ``label_index``, ``label``, ``cutword`` and
        ``cutwordnum``; it has zero rows when the file has no usable line.
    """
    columns = ["label_index", "label", "cutword", "cutwordnum"]
    rows = []
    # The context manager closes the handle even if keyword extraction raises.
    with open(path, encoding='utf-8') as split_file:
        for line in split_file:
            # maxsplit=3 keeps any "_!_" occurring inside the article text in
            # the last field, so every row has exactly four fields.
            fields = line.split("_!_", 3)
            if len(fields) < 4:
                continue  # malformed or blank line
            label_index, label, _article_id, text = fields
            cutword = " ".join(keyword_extractor(text, topK=top_k))
            rows.append([label_index, label, cutword, len(cutword.split())])
    # Passing the columns to the constructor also works for an empty row list.
    return pd.DataFrame(rows, columns=columns)


def load_data(data_dir=".", top_k=30, keyword_extractor=None):
    """Load the training, validation and test splits.

    Args:
        data_dir: Directory containing ``thucnews_train.txt``,
            ``thucnews_dev.txt`` and ``thucnews_test.txt``.
        top_k: Number of keywords to keep per article.
        keyword_extractor: Optional replacement for
            ``jieba.analyse.textrank``; called as ``extractor(text, topK=top_k)``.

    Returns:
        Tuple ``(train_df, val_df, test_df)`` of DataFrames with columns
        ``label_index``, ``label``, ``cutword`` and ``cutwordnum``.
    """
    if keyword_extractor is None:
        keyword_extractor = jieba.analyse.textrank
    file_names = ("thucnews_train.txt", "thucnews_dev.txt", "thucnews_test.txt")
    train_df, val_df, test_df = (
        _load_split("{}/{}".format(data_dir, file_name), top_k, keyword_extractor)
        for file_name in file_names
    )
    return train_df, val_df, test_df

In [None]:
# Parse all three splits; load_data extracts keywords from every article it keeps
train_df, val_df, test_df = load_data()

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\yinzh\AppData\Local\Temp\jieba.cache
Loading model cost 0.815 seconds.
Prefix dict has been built successfully.


In [None]:
# Preview the first rows of the parsed training split
train_df.head()

数据探索

In [None]:
## Show which labels occur in the training set and how often
# SimHei is selected presumably so the Chinese label names render in the plot
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.figure()
# Pass the column as `x=`: recent seaborn accepts only `data` positionally
sns.countplot(x=train_df.label)
plt.xlabel('label')
plt.xticks(rotation=90)
plt.show()

共14种标签，标签分布比较均匀。

In [None]:
## Distribution of the per-article keyword count in the training set
print(train_df.cutwordnum.describe())
fig, ax = plt.subplots()
ax.hist(train_df.cutwordnum, bins=100)
ax.set_xlabel("phrase length")
ax.set_ylabel("frequency")
ax.set_title("train data")
plt.show()

In [None]:
import os
# The CUDA variables are set before keras is imported - presumably so the
# backend reads them when it initialises; NOTE(review): confirm for the backend in use.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   
os.environ["CUDA_VISIBLE_DEVICES"]="0"
# Model-building, optimizer, text-preprocessing and callback utilities used below
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.callbacks import EarlyStopping

In [None]:
## Map label strings to integer codes (the encoder is fitted on the training labels only)
le = LabelEncoder()
train_y = le.fit_transform(train_df.label).reshape(-1, 1)
val_y, test_y = (le.transform(df.label).reshape(-1, 1) for df in (val_df, test_df))

## One-hot encode the integer codes into dense arrays
ohe = OneHotEncoder()
train_y = ohe.fit_transform(train_y).toarray()
val_y, test_y = (ohe.transform(codes).toarray() for codes in (val_y, test_y))

使用Tokenizer对词组进行编码。

In [None]:
max_words = 5000
# NOTE(review): load_data keeps at most 30 keywords per article (topK=30), so
# 600 is far longer than any sequence produced here - confirm this is intended.
max_len = 600
# Cap the vocabulary used by the tokenizer at max_words
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(train_df.cutword)

## word_index maps each word to its integer code;
## word_counts maps each word to its frequency.
## Print the first ten entries of each as a sanity check.
for entry in list(tok.word_index.items())[:10]:
    print(entry)
print("===================")
for entry in list(tok.word_counts.items())[:10]:
    print(entry)

使用tok.texts_to_sequences()将数据转化为序列，并使用sequence.pad_sequences()将每个序列调整为相同的长度。

In [None]:
## With every word encoded, each article's keyword string becomes a list of integer codes
train_seq, val_seq, test_seq = (
    tok.texts_to_sequences(df.cutword) for df in (train_df, val_df, test_df)
)
## Pad / truncate every sequence to the same length (max_len)
train_seq_mat, val_seq_mat, test_seq_mat = (
    sequence.pad_sequences(seq, maxlen=max_len)
    for seq in (train_seq, val_seq, test_seq)
)

for seq_mat in (train_seq_mat, val_seq_mat, test_seq_mat):
    print(seq_mat.shape)

In [None]:
# Show each distinct training label once
train_df.label.drop_duplicates()

In [None]:
# Number of distinct labels in the training split
len(train_df.label.drop_duplicates())

建立LSTM模型并训练。

In [None]:
## Define the LSTM classifier
# Size the output layer from the one-hot label matrix instead of hard-coding
# the class count, so the model always matches the encoded targets.
num_classes = train_y.shape[1]
inputs = Input(name='inputs',shape=[max_len])
## Embedding(vocabulary size, embedding dimension, input_length=tokens per article)
layer = Embedding(max_words+1,128,input_length=max_len)(inputs)
layer = LSTM(64)(layer)
layer = Dense(32,activation="relu",name="FC1")(layer)
# layer = Dropout(0.5)(layer)
layer = Dense(num_classes,activation="softmax",name="FC2")(layer)
model = Model(inputs=inputs,outputs=layer)
model.summary()
model.compile(loss="categorical_crossentropy",optimizer=RMSprop(),metrics=["accuracy"])

In [None]:
## Train the model
# Stop training when val_loss no longer improves
early_stop = EarlyStopping(monitor='val_loss', min_delta=0.0001)
model_fit = model.fit(
    train_seq_mat,
    train_y,
    batch_size=512,
    epochs=10,
    validation_data=(val_seq_mat, val_y),
    callbacks=[early_stop],
)

In [None]:
## Predict class probabilities for the test set
test_pre = model.predict(test_seq_mat)

## Evaluate the predictions with a confusion matrix.
## scikit-learn's signature is confusion_matrix(y_true, y_pred), so the true
## class indices go first: rows are true classes, columns are predicted classes.
confm = metrics.confusion_matrix(np.argmax(test_y,axis=1),np.argmax(test_pre,axis=1))

In [None]:
# Class index k of the one-hot targets corresponds to le.classes_[k] (the
# encoder's sorted label order), not to the order in which labels first
# appear in train_df, so tick labels must come from the encoder.
Labname = list(le.classes_)

In [None]:
## Visualize the confusion matrix as an annotated heatmap
plt.figure(figsize=(8,8))
sns.heatmap(confm.T, square=True, annot=True,
            fmt='d', cbar=False,linewidths=.8,
            cmap="YlGnBu")
plt.xlabel('True label',size = 14)
plt.ylabel('Predicted label',size = 14)
# Ticks are offset from the integer positions and labelled with the class names
plt.xticks(np.arange(len(Labname))+0.5,Labname,rotation=90)
plt.yticks(np.arange(len(Labname))+0.3,Labname,rotation=0)
plt.show()


# classification_report expects (y_true, y_pred); with the arguments reversed
# the reported precision and recall are swapped.
print(metrics.classification_report(np.argmax(test_y,axis=1),np.argmax(test_pre,axis=1)))

用时1小时，测试集准确率86%，相当不错的准确率。

In [None]:
# Save the fitted Tokenizer (the matching load code is kept, commented out, in the next cell)
import pickle
# saving
with open('tok_thucnews.pickle', 'wb') as handle:
    pickle.dump(tok, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# import pickle
# # loading
# with open('tok_thucnews.pickle', 'rb') as handle:
#     tok = pickle.load(handle)
# ## 使用word_index属性可以看到每次词对应的编码
# ## 使用word_counts属性可以看到每个词对应的频数
# for ii,iterm in enumerate(tok.word_index.items()):
#     if ii < 10:
#         print(iterm)
#     else:
#         break
# print("===================")  
# for ii,iterm in enumerate(tok.word_counts.items()):
#     if ii < 10:
#         print(iterm)
#     else:
#         break

In [None]:
## Saving and loading the model
from keras.models import load_model
# Save the trained model to an HDF5 file
model.save('LSTM_thucnews_model.h5')  

In [None]:
# # 导入已经训练好的模型
# model = load_model('LSTM_thucnews_model.h5')
# ## 使用tok对验证数据集重新预处理
# val_seq = tok.texts_to_sequences(val_df.cutword)
# ## 将每个序列调整为相同的长度
# val_seq_mat = sequence.pad_sequences(val_seq,maxlen=max_len)
# ## 对验证集进行预测
# val_pre = model.predict(val_seq_mat)
# print(metrics.classification_report(np.argmax(val_y,axis=1),np.argmax(val_pre,axis=1)))