In [4]:
# Scikit-learn library-based implementation (practical usage)

# 1. Import the core libraries
import numpy as np  # numerical computing, multi-dimensional arrays
import pandas as pd  # data handling, CSV reading
import matplotlib.pyplot as plt  # visualization
from sklearn.model_selection import train_test_split  # train/test splitting
from sklearn.preprocessing import StandardScaler  # feature standardization
from sklearn.linear_model import LinearRegression  # linear regression model (library version)
from sklearn.metrics import mean_squared_error, r2_score  # evaluation metrics

print("所有库导入完成！")

所有库导入完成！


In [2]:
# Use a real dataset: California housing (the loader imported here is
# fetch_california_housing, not a Boston housing dataset).
# NOTE(review): this import lives outside the notebook's main import cell —
# consider moving it there so a fresh "Run All" has every dependency up front.
from sklearn.datasets import fetch_california_housing

In [None]:
# Load the California housing data and keep only the first two feature
# columns (two features keep later visualization simple).
housing = fetch_california_housing()
y_real = housing.target  # regression target y
X_real = housing.data[:, :2]

print(f"数据集形状: {X_real.shape}")
print(f"特征名: {housing.feature_names[:2]}")

# Hold out 20% of the rows as a test set (80% train); the fixed random_state
# makes the split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_real,
    y_real,
    test_size=0.2,
    random_state=42,
)

# Standardize the features: the scaler is fitted on the training split only
# and the same fitted transform is then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build the model and fit it on the standardized training data
# (fit() returns the fitted estimator itself).
sklearn_model = LinearRegression().fit(X_train_scaled, y_train)

# Predictions for both splits.
y_train_pred = sklearn_model.predict(X_train_scaled)
y_test_pred = sklearn_model.predict(X_test_scaled)

# Evaluation: MSE and R² for each split.
train_mse, train_r2 = (
    mean_squared_error(y_train, y_train_pred),
    r2_score(y_train, y_train_pred),
)
test_mse, test_r2 = (
    mean_squared_error(y_test, y_test_pred),
    r2_score(y_test, y_test_pred),
)

print(f"\nScikit-learn模型结果:")
print(f"权重 w = {sklearn_model.coef_}")
print(f"偏置 b = {sklearn_model.intercept_:.4f}")
print(f"\n训练集 MSE = {train_mse:.4f}, R² = {train_r2:.4f}")
print(f"测试集 MSE = {test_mse:.4f}, R² = {test_r2:.4f}")

# Feature "importance": pair each feature name with its learned coefficient
# (coef_ holds the fitted weights w), then order the rows by the absolute
# value of the coefficient, largest first.
weights_by_feature = pd.DataFrame(
    {'特征': housing.feature_names[:2], '权重': sklearn_model.coef_}
)
feature_importance = weights_by_feature.sort_values(by='权重', key=abs, ascending=False)

print(f"\n特征重要性排序:\n{feature_importance}")

数据集形状: (20640, 2)
特征名: ['MedInc', 'HouseAge']

Scikit-learn模型结果:
权重 w = [0.82533949 0.2203573 ]
偏置 b = 2.0719

训练集 MSE = 0.6513, R² = 0.5128
测试集 MSE = 0.6630, R² = 0.4941

特征重要性排序:
         特征        权重
0    MedInc  0.825339
1  HouseAge  0.220357
