In [18]:
# 基础数据处理库
import pandas as pd
import numpy as np

# 数据集加载
from sklearn.datasets import load_iris, fetch_20newsgroups, fetch_california_housing

# 模型选择与数据分割
from sklearn.model_selection import train_test_split, GridSearchCV

# 特征工程 (预处理与特征提取)
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer

# 各类算法模型
from sklearn.neighbors import KNeighborsClassifier  # KNN
from sklearn.naive_bayes import MultinomialNB       # 朴素贝叶斯
from sklearn.tree import DecisionTreeClassifier, export_graphviz # 决策树
from sklearn.ensemble import RandomForestClassifier # 随机森林

# 模型评估指标
from sklearn.metrics import classification_report, roc_auc_score

In [19]:
import os
from sklearn.datasets import fetch_20newsgroups  # 注意：这里换了导入的函数
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# ---------------------------------------------------------
# 1. Load the 20 Newsgroups corpus through fetch_20newsgroups
# ---------------------------------------------------------
# data_home must be the directory that CONTAINS "20news_home", not the
# "20news_home" folder itself. The cached archive is expected at
# ../day14/data/20news_home, so data_home points at ../day14/data.
data_home = "../day14/data"

print(f"正在从 {data_home} 读取缓存文件...")

# Only the loading step is guarded. The path hint printed below is relevant
# solely to load failures; wrapping the whole pipeline in the same handler
# would hide the traceback of a split/vectorize/train error and blame the
# data path for it.
try:
    # subset='all' returns the train and test portions combined.
    news = fetch_20newsgroups(data_home=data_home, subset='all')
except Exception as e:
    print(f"❌ 出错: {e}")
    print("提示：请确保 data_home 指向的是 '20news_home' 文件夹的父级目录（即 data 文件夹）。")
else:
    # Runs only when loading succeeded; exceptions raised here are NOT
    # caught above and surface with a full traceback.
    x = news.data
    y = news.target
    # The full corpus is expected to report 18846 articles.
    print(f"✅ 成功加载数据！共 {len(x)} 条文章。")
    print(f"包含的类别: {news.target_names}")

    # -----------------------------------------------------
    # 2. Split, vectorize, train, evaluate
    # -----------------------------------------------------
    if len(x) > 0:
        # Hold out 20% of the documents; the fixed seed keeps the split reproducible.
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

        # Feature engineering: TF-IDF weighted bag of words.
        print("正在进行特征工程...")
        tf = TfidfVectorizer()
        x_train = tf.fit_transform(x_train)
        # The test set is only transformed, so it reuses the vocabulary and
        # IDF weights fitted on the training set (no leakage from test data).
        x_test = tf.transform(x_test)

        # Train a multinomial naive Bayes classifier (alpha=1.0 smoothing).
        print("正在训练模型...")
        mlt = MultinomialNB(alpha=1.0)
        mlt.fit(x_train, y_train)

        # Evaluate: score() returns mean accuracy on the held-out set.
        score = mlt.score(x_test, y_test)
        print("-" * 30)
        print("模型准确率:", score)

    else:
        print("❌ 数据为空，请检查路径。")

正在训练模型...
------------------------------
模型准确率: 0.8474801061007957
