In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author   : huhu
# @Time     : 2023/3/20 9:14
# @File     : train.py.py
# @Project  : blog_04
# @objective: 
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# 得到评论，normal_file为存放正常评论的文件，spam_file为存放垃圾评论的文件
train_data = pd.read_csv('data/train/train.csv')
test_data  = pd.read_csv('data/test/test.csv')

print (train_data.head(2))

# 将特征划分到 X 中，标签划分到 Y 中
x = train_data.iloc[:, 1:]
y = train_data.iloc[:, 0]

print(y.head(2))

               分类名称                            新闻字符串
0  news_agriculture  农村集市“野味”，它卖50元一斤，好多人没见过还嫌贵，不识货？
1         news_tech      微信小程序带给传统企业哪些颠覆性的优势？你还在观望吗？
0    news_agriculture
1           news_tech
Name: 分类名称, dtype: object


In [3]:
# 对数据集进行随机划分，训练过程暂时没有使用测试数据
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

In [4]:
x_train.head()

Unnamed: 0,新闻字符串
189853,哪个懂车的现代iX35和哈佛H2S自动挡的哪个好我想买谢谢？
149874,军演时来敌人了怎么办？
19942,为什么美国这么有钱？
65951,山东能不能种植甘蔗？
190155,通道：田间育种忙


#### 2，中文文本处理

In [5]:
#-*-coding:utf-8-*-

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.metrics import classification_report

In [6]:
# stopword.txt 是停用词存储所在的文件
stopword_file = open("data/stopword/stopword.txt", encoding='utf-8', errors='ignore')
# stopword_file = open("data/stopword/stopwords_zh.txt")
stopword_content = stopword_file.read()
stopword_list = stopword_content.splitlines()
stopword_file.close()

In [7]:
print(stopword_list[0])

,


In [8]:
# 常见的特征数值计算类，是一个文本特征提取方法。对于每一个训练文本，它只考虑每种词汇在该训练文本中出现的频率。
# count_vect = CountVectorizer(stop_words=stopword_list, token_pattern=r"(?u)\b\w+\b")
count_vect  = CountVectorizer(stop_words=stopword_list)
train_count = count_vect.fit_transform(list(x_train['新闻字符串']))



In [9]:
print(train_count[0])

  (0, 87555)	1


In [10]:
# tf-idf chi特征选择；类似将自然语言转成机器能识别的向量
tfidf_trainformer = TfidfTransformer()
train_tfidf = tfidf_trainformer.fit_transform(train_count)
select = SelectKBest(chi2, k=100)
train_tfidf_chi = select.fit_transform(train_tfidf, y_train)

In [11]:
print(train_tfidf_chi[0])




In [12]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB(alpha=0.001)

# 模型训练
clf.fit(train_tfidf, y_train)

# import ant 准确值
print("train accurancy:", clf.score(train_tfidf, y_train))

# 预测值（结果内容是识别的具体值）
train_pre = clf.predict(train_tfidf)

# 输出分类报告（大概就是准确率、召回率）
print('输出分类报告:', classification_report(train_pre, y_train))

train accurancy: 0.9963285714285715
输出分类报告:                     precision    recall  f1-score   support

  news_agriculture       1.00      1.00      1.00      7100
          news_car       1.00      1.00      1.00     13161
      news_culture       1.00      0.99      1.00     10367
          news_edu       1.00      1.00      1.00      9914
news_entertainment       1.00      1.00      1.00     14250
      news_finance       0.99      0.99      0.99      9995
         news_game       1.00      1.00      1.00     10726
        news_house       1.00      1.00      1.00      6411
     news_military       0.99      0.99      0.99      9168
       news_sports       1.00      1.00      1.00     13654
        news_story       1.00      1.00      1.00      2301
         news_tech       0.99      0.99      0.99     15134
       news_travel       1.00      1.00      1.00      7814
        news_world       0.99      0.99      0.99      9873
             stock       0.96      0.92      0.94      

In [13]:
import pickle
with open('model/clf.pickle', 'wb') as fw:
    pickle.dump(clf, fw)

with open('model/count_vect.pickle', 'wb') as fw:
    pickle.dump(count_vect, fw)

with open('model/tfidf_trainformer.pickle', 'wb') as fw:
    pickle.dump(tfidf_trainformer, fw)

#### 3，模型预测

In [14]:
# 将特征划分到 X 中，标签划分到 Y 中
test_x = test_data.iloc[:, 1:]
test_y = test_data.iloc[:, 0]

In [15]:
# 读取模型
with open('model/clf.pickle', 'rb') as clf:
    clf1 = pickle.load(clf)

with open('model/count_vect.pickle', 'rb') as count_vect:
    count_vect1 = pickle.load(count_vect)

with open('model/tfidf_trainformer.pickle', 'rb') as tfidf_trainformer:
    tfidf_trainformer1 = pickle.load(tfidf_trainformer)

In [16]:
# 停用词处理等
test_count = count_vect1.transform(list(test_x['新闻字符串']))



In [17]:
test_count = count_vect1.transform([test_x['新闻字符串'][0]])

In [18]:
[test_x['新闻字符串'][0]]

['全球唯一一辆没有对手的SUV, 一年只卖1000台！']

In [19]:
print (test_count)

  (0, 21077)	1
  (0, 61266)	1


In [20]:
# 特征选择
test_tfidf = tfidf_trainformer1.transform(test_count)
select = SelectKBest(chi2, k=100)
# test_tfidf_chi = select.transform(test_tfidf)

In [21]:
print (test_tfidf)

  (0, 61266)	0.7071067811865476
  (0, 21077)	0.7071067811865476


In [24]:
test_y = clf1.predict(test_tfidf)

In [25]:
# 使用模型识别数据
accurancy = clf1.score(test_tfidf, test_y)

In [26]:
# 识别准确率
print("accurancy", accurancy)

accurancy 1.0


In [27]:
test_tfidf[0]

<1x251947 sparse matrix of type '<class 'numpy.float64'>'
	with 2 stored elements in Compressed Sparse Row format>

In [28]:
# 识别结果，类型是numpy.int32（可以使用int()直接转换成int型），后面通过excel来存储
test_pre = clf1.predict(test_tfidf[0])

In [29]:
test_pre[0]

'news_car'