In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author   : huhu
# @Time     : 2023/3/20 9:14
# @File     : train.py.py
# @Project  : blog_04
# @objective: 
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

#### 加载数据集

In [2]:
# 得到评论，normal_file为存放正常评论的文件，spam_file为存放垃圾评论的文件
train_data = pd.read_csv('data/train/train.csv')
test_data  = pd.read_csv('data/test/test.csv')

# print (train_data.head(2))

# 将特征划分到 X 中，标签划分到 Y 中
x = train_data.iloc[:, 1:]
y = train_data.iloc[:, 0]

y.head()

0      news_world
1    news_finance
2        news_edu
3     news_sports
4     news_sports
Name: 分类名称, dtype: object

In [3]:
type(x)

pandas.core.frame.DataFrame

In [4]:
x['新闻字符串'].to_list()[:5]

['科学家警告：“害虫杂交群”威胁全球农作物',
 '企业“走出去”有了更多公共服务',
 '大专毕业多年，2018年入学成人本科，本科毕业后，能否参加司法考试？',
 '历史第一！C罗将迎来欧冠金靴六连霸！',
 '今日二串：世界杯之战，就是本彩店决定胜负，因为他看的出结果']

In [3]:
y_dict_temp = list(set(y.to_list()))
y_dict_temp.sort()
# y_dict = [[y_dict_temp[index], index] for index in range(len(y_dict_temp))]

In [4]:
y_dict_temp

['news_agriculture',
 'news_car',
 'news_culture',
 'news_edu',
 'news_entertainment',
 'news_finance',
 'news_game',
 'news_house',
 'news_military',
 'news_sports',
 'news_story',
 'news_tech',
 'news_travel',
 'news_world',
 'stock']

In [7]:
range(len(y_dict_temp))

range(0, 15)

In [8]:
# 生成字典
y_dict = dict([y_dict_temp[index], index] for index in range(len(y_dict_temp)))

y_dict

{'news_agriculture': 0,
 'news_car': 1,
 'news_culture': 2,
 'news_edu': 3,
 'news_entertainment': 4,
 'news_finance': 5,
 'news_game': 6,
 'news_house': 7,
 'news_military': 8,
 'news_sports': 9,
 'news_story': 10,
 'news_tech': 11,
 'news_travel': 12,
 'news_world': 13,
 'stock': 14}

In [11]:
# 字典的键值反转
y_dict_temp = dict(zip(y_dict.values(), y_dict.keys()))

In [12]:
y_dict_temp

{0: 'news_agriculture',
 1: 'news_car',
 2: 'news_culture',
 3: 'news_edu',
 4: 'news_entertainment',
 5: 'news_finance',
 6: 'news_game',
 7: 'news_house',
 8: 'news_military',
 9: 'news_sports',
 10: 'news_story',
 11: 'news_tech',
 12: 'news_travel',
 13: 'news_world',
 14: 'stock'}

In [14]:
# 使用数值标签替换文字标签
y = [y_dict[index] for index in y]

In [15]:
y[0]

13

### 构建、训练模型

In [16]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Embedding, Flatten, Dropout, Conv1D, GlobalMaxPooling1D
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
import jieba

In [17]:
# 加载数据集，data 和 labels 分别为训练样本和标签
data = x['新闻字符串'].to_list()
labels = y

# 对文本数据进行分词处理
MAX_NB_WORDS = 10000
MAX_SEQUENCE_LENGTH = 1000
# 文本标记实用类。该类允许使用两种方法向量化一个文本语料库： 将每个文本转化为一个整数序列（每个整数都是词典中标记的索引）； 
# 或者将其转化为一个向量，其中每个标记的系数可以是二进制值、词频、TF-IDF权重等。
# num_words: 需要保留的最大词数，基于词频
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(data)
sequences = tokenizer.texts_to_sequences(data)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# 这个函数将num_samples个文本序列列表 (每个序列为整数列表) 转换成一个 2D Numpy数组，数组形状为 (num_samples, num_timesteps)
# 如果指定了参数 maxlen 的值，则num_timesteps的值取maxlen的值，否则num_timesteps的值等于最长序列的长度。
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# 将标签进行 one-hot 编码
labels = tf.keras.utils.to_categorical(labels)

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)

Found 2403 unique tokens.


In [18]:
y_train[0]

array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32)

In [19]:
# 构建模型
model = Sequential()

embedding_dim = 1000

model.add(Embedding(input_dim=MAX_NB_WORDS,
                    output_dim=embedding_dim,
                    input_length=MAX_SEQUENCE_LENGTH))

model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(labels.shape[1], activation='softmax'))

model.summary()  # 打印模型的结构和参数列表

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1000, 1000)        10000000  
                                                                 
 conv1d (Conv1D)             (None, 1000, 32)          96032     
                                                                 
 global_max_pooling1d (Globa  (None, 32)               0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 256)               8448      
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 128)               32896     
                                                        

In [20]:
# 编译模型
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [21]:
# 训练模型
model.fit(X_train, y_train, batch_size=256, epochs=5, validation_data=(X_test, y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1a9f52bd450>

In [22]:
# 模型存储
save_path = 'model/test.h5'

model.save(save_path)

In [23]:
# 加载模型
from keras.models import load_model
save_path = 'model/test.h5'
model_test = load_model(save_path)

#### 预测数据

In [24]:
pred_data = [x['新闻字符串'].to_list()[0]]
pred_data

['科学家警告：“害虫杂交群”威胁全球农作物']

In [25]:
# 对文本数据进行分词处理
MAX_NB_WORDS = 10000
MAX_SEQUENCE_LENGTH = 1000
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
sequences = tokenizer.texts_to_sequences(pred_data)
pred_data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [26]:
ynew = model_test.predict(pred_data)



In [29]:
ynew[0]

array([0.04992791, 0.11276272, 0.0689282 , 0.06253488, 0.10451338,
       0.07689668, 0.07057061, 0.04254714, 0.0592688 , 0.08783774,
       0.02688343, 0.11169963, 0.05471561, 0.05030889, 0.02060437],
      dtype=float32)

In [30]:
# numpy.argmax(array, axis) 用于返回一个numpy数组中最大值的索引值。
# 当一组中同时出现几个最大值时，返回第一个最大值的索引值。
y_test_pred = np.argmax(ynew, axis=1)
y_test_pred[0]

1

In [31]:
y_dict_temp[y_test_pred[0]]

'news_car'