In [22]:
# --- 1. 基础数据处理与工具库 ---
import pandas as pd
import numpy as np
import time
import warnings
from pprint import pprint  # 用于美化打印输出

# --- 2. 可视化库 ---
from matplotlib import pyplot as plt

# --- 3. 中文分词库 ---
import jieba

# --- 4. Sklearn: 数据集加载 ---
# load_*: 小数据集 (本地)
# fetch_*: 大数据集 (在线下载)
from sklearn.datasets import load_iris, fetch_20newsgroups, fetch_california_housing

# --- 5. Sklearn: 数据预处理 & 缺失值处理 ---
from sklearn.preprocessing import MinMaxScaler, StandardScaler  # 归一化、标准化
from sklearn.impute import SimpleImputer                        # 缺失值填补

# --- 6. Sklearn: 特征工程 (提取与选择) ---
from sklearn.feature_extraction import DictVectorizer           # 字典特征提取
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer # 文本特征提取
from sklearn.feature_selection import VarianceThreshold         # 特征选择 (低方差过滤)
from sklearn.decomposition import PCA                           # 主成分分析 (降维)

# --- 7. Sklearn: 模型选择与数据集划分 ---
from sklearn.model_selection import train_test_split, GridSearchCV

# --- 8. Sklearn: 分类算法模型 ---
from sklearn.neighbors import KNeighborsClassifier              # KNN K-近邻
from sklearn.naive_bayes import MultinomialNB                   # 朴素贝叶斯
from sklearn.tree import DecisionTreeClassifier, export_graphviz # 决策树
from sklearn.ensemble import RandomForestClassifier             # 随机森林

# --- 9. Sklearn: 模型评估指标 ---
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

# --- Global settings ---
# Suppress every warning category from here on; the stated intent is to hide
# version-compatibility noise.
# NOTE(review): this filter is installed after the import statements of this
# cell, so warnings raised while those imports run are presumably not covered
# by it — confirm whether it should be set before the imports instead.
warnings.filterwarnings("ignore")

  import pkg_resources


In [23]:

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

def flower_data_process(test_size=0.25, random_state=1):
    """
    Load the iris dataset, print a short overview, and split it into train/test sets.

    Args:
        test_size: Passed straight to ``train_test_split`` as ``test_size``.
            Defaults to 0.25, the value that used to be hard-coded here.
        random_state: Passed straight to ``train_test_split`` as
            ``random_state`` so that repeated runs produce the same split.
            Defaults to 1, the value that used to be hard-coded here.

    Returns:
        tuple: ``(x_train, x_test, y_train, y_test)`` in exactly the order
        returned by ``train_test_split``.
    """
    # 1. Load the dataset. The returned object exposes the feature matrix,
    #    the labels and the metadata as attributes.
    li = load_iris()

    print("=" * 30 + " 数据集信息查看 " + "=" * 30)

    # 2. Feature matrix: one row per sample, one column per measurement
    #    (sepal length/width, petal length/width). Only the shape is printed;
    #    the full matrix is too long to be useful as output.
    print(f"特征值形状 (Shape): {li.data.shape}")  # (150, 4)

    # 3. Labels: integer class ids; only the first ten are shown.
    print(f"\n目标值 (前10个): {li.target[:10]}")

    # 4. Human-readable names for the feature columns and for the classes.
    print(f"\n特征名字 (Feature Names): {li.feature_names}")
    print(f"目标名字 (Target Names): {li.target_names}")

    # The long dataset description (li.DESCR) is intentionally not printed.

    print("\n" + "=" * 30 + " 数据集划分 " + "=" * 30)

    # 5. Train/test split. Mind the unpacking order of the four results:
    #    features first (train, test), then labels (train, test).
    x_train, x_test, y_train, y_test = train_test_split(
        li.data,
        li.target,
        test_size=test_size,
        random_state=random_state,
    )

    # 6. Report the resulting shapes. With the default arguments the split is
    #    112 training samples and 38 test samples.
    print("训练集特征形状 (x_train):", x_train.shape)
    print("测试集特征形状 (x_test): ", x_test.shape)
    print("训练集目标形状 (y_train):", y_train.shape)
    print("测试集目标形状 (y_test): ", y_test.shape)

    return x_train, x_test, y_train, y_test

# --- Script entry point ---
# Run the demo only when this code is executed directly rather than imported.
# NOTE(review): the previous comment said this guard was added to prevent a
# RecursionError; an `if __name__ == '__main__'` check does not do that by
# itself — confirm what the original problem was.
if __name__ == '__main__':
    flower_data_process()

特征值形状 (Shape): (150, 4)

目标值 (前10个): [0 0 0 0 0 0 0 0 0 0]

特征名字 (Feature Names): ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
目标名字 (Target Names): ['setosa' 'versicolor' 'virginica']

训练集特征形状 (x_train): (112, 4)
测试集特征形状 (x_test):  (38, 4)
训练集目标形状 (y_train): (112,)
测试集目标形状 (y_test):  (38,)


In [25]:
from sklearn.datasets import fetch_20newsgroups
from pprint import pprint
import warnings
import numpy as np

# --- 1. Warning filter ---
# Fix note from the author: np.VisibleDeprecationWarning is reported to have
# been removed in newer NumPy releases, so instead of filtering that specific
# category every warning is ignored (this line could also simply be dropped).
warnings.filterwarnings("ignore")

def news_data_process(subset='all', data_home='data'):
    """
    Load the 20 newsgroups dataset and print a summary of its contents.

    Args:
        subset: Passed straight to ``fetch_20newsgroups`` as ``subset``.
            Defaults to ``'all'`` (training plus test portion), the value that
            used to be hard-coded here.
        data_home: Passed straight to ``fetch_20newsgroups`` as ``data_home``;
            the directory used to download/read the data. Defaults to
            ``'data'``, a path relative to the current working directory.

    Returns:
        The object returned by ``fetch_20newsgroups`` on success, or ``None``
        if any exception is raised while loading or printing the summary
        (the error message is printed instead of being propagated).
    """
    print("正在尝试加载数据集，请稍候...")

    # Deliberate best-effort: any failure is reported on stdout and turned
    # into a ``None`` result so the caller does not have to handle a traceback.
    try:
        # --- Load the dataset ---
        news = fetch_20newsgroups(subset=subset, data_home=data_home)

        print("\n" + "=" * 30 + " 数据集加载成功 " + "=" * 30)

        # --- Show what a single sample looks like ---
        print("【第一个样本内容】 (news.data[0]):")
        print(news.data[0])

        print("-" * 50)
        print(f"【数据类型】: {type(news.data)}")

        # --- Basic statistics ---
        print("\n" + "=" * 30 + " 统计信息 " + "=" * 30)

        # Number of documents.
        print(f"新闻总篇数 (len): {len(news.data)}")

        # Number of categories.
        print(f"分类总数 (target_names): {len(news.target_names)}")

        # --- Numeric labels of the first 15 documents ---
        print("\n【前15个样本的标签值】 (0-19的数字):")
        print(news.target[0:15])

        # --- Names of all categories ---
        print("\n【所有文章分类名称】:")
        pprint(list(news.target_names))

        return news

    except Exception as e:
        print("\n❌ 数据集加载失败！")
        print(f"错误信息: {e}")
        return None

# --- Script entry point ---
# Run the demo only when this code is executed directly rather than imported;
# the value returned by the call is not kept.
if __name__ == '__main__':
    news_data_process()

正在尝试加载数据集，请稍候...

【第一个样本内容】 (news.data[0]):
From: Mamatha Devineni Ratnam <mr47+@andrew.cmu.edu>
Subject: Pens fans reactions
Organization: Post Office, Carnegie Mellon, Pittsburgh, PA
Lines: 12
NNTP-Posting-Host: po4.andrew.cmu.edu



I am sure some bashers of Pens fans are pretty confused about the lack
of any kind of posts about the recent Pens massacre of the Devils. Actually,
I am  bit puzzled too and a bit relieved. However, I am going to put an end
to non-PIttsburghers' relief with a bit of praise for the Pens. Man, they
are killing those Devils worse than I thought. Jagr just showed you why
he is much better than his regular season stats. He is also a lot
fo fun to watch in the playoffs. Bowman should let JAgr have a lot of
fun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final
regular season game.          PENS RULE!!!


--------------------------------------------------
【数