## 朴素贝叶斯练习--20类新闻

In [3]:
# 导入必要的库
import time
from types import SimpleNamespace

import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Load the complete 20 Newsgroups corpus (subset='all' combines the
# predefined train and test portions), caching it under ../data.
# NOTE: no `remove=` argument is passed here, so message headers, footers
# and quoted replies are kept in the text. The scikit-learn documentation
# warns that this metadata makes the task much easier; pass
# remove=('headers', 'footers', 'quotes') for a more realistic evaluation.
newsgroups = fetch_20newsgroups(data_home='../data', subset='all', shuffle=True, random_state=42)

# Hold out 20% of the documents as a test set; the fixed seed keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    newsgroups.data, newsgroups.target, test_size=0.2, random_state=42
)

# Wrap each split in a small attribute container so the rest of the code can
# keep using the .data / .target / .target_names access pattern.
newsgroups_train = SimpleNamespace(
    data=X_train,
    target=y_train,
    target_names=newsgroups.target_names,
)

newsgroups_test = SimpleNamespace(
    data=X_test,
    target=y_test,
    target_names=newsgroups.target_names,
)

# Report basic dataset statistics.
print(f"训练集大小: {len(newsgroups_train.data)}")
print(f"测试集大小: {len(newsgroups_test.data)}")
print(f"类别数量: {len(newsgroups_train.target_names)}")

# Fit the TF-IDF vocabulary and IDF weights on the training documents only,
# then reuse that fitted vectorizer on the test documents so no test-set
# statistics leak into the features.
tfidf = TfidfVectorizer(stop_words='english')
X_train_tfidf = tfidf.fit_transform(newsgroups_train.data)

X_test_tfidf = tfidf.transform(newsgroups_test.data)

# Create and train the multinomial naive Bayes classifier
# (alpha is the additive smoothing parameter).
print("正在训练模型...")
clf = MultinomialNB(alpha=0.1)
clf.fit(X_train_tfidf, newsgroups_train.target)

# Predict labels for the held-out documents.
y_pred = clf.predict(X_test_tfidf)

# Compute and report accuracy on the test set.
accuracy = accuracy_score(newsgroups_test.target, y_pred)
print(f"\n测试集准确率: {accuracy:.4f}")


训练集大小: 15076
测试集大小: 3770
类别数量: 20
正在训练模型...

测试集准确率: 0.9098
