In [1]:
import numpy as np
import re
import pandas as pd

#读取第一列、第二列、第四列

traindata_path = './data_processed/train_data.csv'
df_all = pd.read_csv(traindata_path, header=None, engine="python", encoding="utf-8")  #8597
df_all.columns = ["index","words","types","caijing","fangchan","jiaoyu","junshi","keji","qiche","tiyu","youxi","yule"]
df_all.drop(0,inplace=True)
df_all.drop(['index'],inplace=True,axis=1)
df_all = df_all.sample(frac = 1).reset_index(drop=True)

df_all.reset_index(drop=True)
print(' ===============================  data loading success! ===================================')



In [2]:
df_types = df_all[['types']]
df_types = df_types.replace('caijing', 0)
df_types = df_types.replace('fangchan', 1)
df_types =df_types.replace('jiaoyu', 2)
df_types =df_types.replace('junshi', 3)
df_types =df_types.replace('keji', 4)
df_types =df_types.replace('qiche', 5)
df_types =df_types.replace('tiyu', 6)
df_types =df_types.replace('youxi', 7)
df_types =df_types.replace('yule', 8)

In [3]:
import sklearn
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import *

x_series = df_all['words']
labels = to_categorical(df_types['types'].values)

#将数据集按9:1分为训练集和测试集
x_train, x_test, y_train, y_test = train_test_split(x_series, labels, test_size=0.2)

In [4]:
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

maxlen = 500

#创建一个Tokenizer对象，fit_on_texts函数可以将输入的文本中的每个词编号，编号是根据词频的，词频越大，编号越小
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_series)
word_index = tokenizer.word_index

# 将每个样本中的每个词转换为数字列表，使用每个词的编号进行编号
x_train_word_ids = tokenizer.texts_to_sequences(x_train)
x_test_word_ids = tokenizer.texts_to_sequences(x_test)

# 每条样本长度不唯一，将每条样本的长度设置一个固定值
x_train_padded_seqs=pad_sequences(x_train_word_ids,maxlen) #将超过固定值的部分截掉，不足的在最前面用0填充
x_test_padded_seqs=pad_sequences(x_test_word_ids, maxlen)

print('Shape of data tensor:', len(x_train_word_ids))


Shape of data tensor: 1137


In [5]:
import gensim
from gensim.models import word2vec
# 加载bin格式的模型
w2v_model = gensim.models.KeyedVectors.load_word2vec_format("./model/content_w2v.bin", binary=True)

# 预训练的词向量中没有出现的词用0向量表示，出现的词在w2v模型中找到其对应的向量
embedding_matrix = np.zeros((len(word_index) + 1 , 100))

for word, i in word_index.items():
    try:
        embedding_vector = w2v_model[str(word)]
        embedding_matrix[i] = embedding_vector
    except KeyError:
        continue




In [6]:
from tensorflow.keras.layers import Input,Embedding,Bidirectional,LSTM,BatchNormalization,Dense,Dropout,Lambda
from tensorflow.keras import Sequential
import numpy as np

model = Sequential()
model.add(Embedding(len(word_index) + 1, 100, input_length=maxlen,weights=[embedding_matrix],trainable=False))
model.add(Bidirectional(LSTM(128)))
model.add(BatchNormalization())
model.add(Dense(128,activation="relu"))
model.add(Dropout(0.2))
model.add(BatchNormalization())
model.add(Dense(labels.shape[1],activation="softmax")) 

model.compile(
    loss='binary_crossentropy',
    optimizer="adam",
    metrics=['accuracy']
)

history = model.fit(x_train_padded_seqs, y_train, epochs=50, batch_size=20,validation_split=0.2)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [9]:
def return_f1(y_predict,y_test):
    precision_list = []
    recall_list = []
    f1=[]
    for label in range(y_predict.shape[1]):
        tp=0
        fp=0
        tn=0
        fn=0
        for i in range(len(y_test)):
            if np.argmax(y_test[i])==label:
                if (np.argmax(y_test[i]))==(np.argmax(y_predict[i])):
                    tp+=1
                else:
                    fn+=1
            else:
                if (np.argmax(y_predict[i]))!=label:
                    tn+=1
                else:
                    fp+=1
        precision=tp/(tp+fp)
        recall=tp/(tp+fn)
        
        precision_list.append(precision)
        recall_list.append(recall)
        
        f1.append(2*precision*recall/(precision+recall))
    return precision_list,recall_list,f1
        
def get_avg(list1):
    avg = 0
    for i in range(len(list1)):
        avg += list1[i]
    return avg/len(list1)

In [10]:
y_predict =  model.predict(x_test_padded_seqs)
precision_list,recall_list,f1_list = return_f1(y_predict,y_test)
print('Average_precision:',get_avg(precision_list))
print('Average_recall:',get_avg(recall_list))
print('Average_f1:',get_avg(f1_list))

Average_precision: 0.9326277114816668
Average_recall: 0.925288473588282
Average_f1: 0.9267596009763016


In [14]:
print('       model name: BiLSTM')
print('     precision   recall   f1-score')
print('财经:   %.3f'%precision_list[0],'    %.3f'%recall_list[0],'    %.3f'%f1_list[0])
print('房产:   %.3f'%precision_list[1],'    %.3f'%recall_list[1],'    %.3f'%f1_list[1])
print('教育:   %.3f'%precision_list[2],'    %.3f'%recall_list[2],'    %.3f'%f1_list[2])
print('军事:   %.3f'%precision_list[3],'    %.3f'%recall_list[3],'    %.3f'%f1_list[3])
print('科技:   %.3f'%precision_list[4],'    %.3f'%recall_list[4],'    %.3f'%f1_list[4])
print('汽车:   %.3f'%precision_list[5],'    %.3f'%recall_list[5],'    %.3f'%f1_list[5])
print('体育:   %.3f'%precision_list[6],'    %.3f'%recall_list[6],'    %.3f'%f1_list[6])
print('游戏:   %.3f'%precision_list[7],'    %.3f'%recall_list[7],'    %.3f'%f1_list[7])
print('娱乐:   %.3f'%precision_list[8],'    %.3f'%recall_list[8],'    %.3f'%f1_list[8])
print('平均值: %.3f'%get_avg(precision_list),'    %.3f'%get_avg(recall_list),'    %.3f'%get_avg(f1_list))

       model name: BiLSTM
     precision   recall   f1-score
财经:   0.864     0.704     0.776
房产:   0.857     0.909     0.882
教育:   0.966     1.000     0.982
军事:   0.912     0.969     0.939
科技:   1.000     0.862     0.926
汽车:   0.825     0.943     0.880
体育:   1.000     0.971     0.986
游戏:   1.000     0.970     0.985
娱乐:   0.971     1.000     0.985
平均值: 0.933     0.925     0.927
