线性回归综合案例：波士顿房价数据集

1. 数据集读取与划分

In [1]:
# Load the dataset with pandas and preview its first rows.
import pandas as pd

df = pd.read_csv("./course-5-boston.csv")
df.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [2]:
# Target values to be predicted.
target = df["medv"]

# Only three features (CRIM, RM, LSTAT) are used to train the
# linear regression model, i.e. a three-variable regression.
features = df[
    ["crim", "rm", "lstat"]
]

# describe() reports per-column statistics: count, mean, min, max, etc.
features.describe()

Unnamed: 0,crim,rm,lstat
count,506.0,506.0,506.0
mean,3.593761,6.284634,12.653063
std,8.596783,0.702617,7.141062
min,0.00632,3.561,1.73
25%,0.082045,5.8855,6.95
50%,0.25651,6.2085,11.36
75%,3.647423,6.6235,16.955
max,88.9762,8.78,37.97


In [3]:
# Split the dataset into a training part and a test part.

# A 70% / 30% split is the usual convention; this is the row position
# at which the first 70% ends.
# NOTE(review): rows are split in file order without shuffling - confirm
# this is intended.
split_num = int(len(features) * 0.7)

# The first 70% is the training set (used to fit the model); the remaining
# 30% is the test set (used to evaluate predictive performance).
X_train, X_test = features.iloc[:split_num], features.iloc[split_num:]
y_train, y_test = target.iloc[:split_num], target.iloc[split_num:]

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((354, 3), (354,), (152, 3), (152,))

2. 模型训练及预测

In [4]:
# Build and train the model.
from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so construction and training are chained.
model = LinearRegression().fit(X_train, y_train)

# Show the fitted coefficients and the intercept term.
model.coef_, model.intercept_

(array([ 0.69979497, 10.13564218, -0.20532653]), -38.00096988969018)

In [5]:
# Generate predictions on the test set, the first step in evaluating the model's predictive performance
preds = model.predict(X_test)  # feed the test-set features to the fitted model
preds  # display the predictions

array([17.77439141, 21.09512448, 27.63412265, 26.78577951, 25.38313368,
       24.3286313 , 28.4257879 , 25.12834727, 16.82806601, 20.76498858,
       52.3350748 , -0.18169806, 12.01475786,  7.87878077, 15.13155699,
       32.93748235, 37.07872049, 29.50613719, 25.50800832, 12.35867972,
        9.08901644, 47.08374238, 35.31759193, 33.3738765 , 38.34913316,
       33.10414639, 91.3556125 , 35.11735022, 19.69326952, 18.49805269,
       14.03767555, 20.9235166 , 20.41406182, 21.92218226, 15.20451678,
       18.05362998, 21.26289453, 23.18192502, 15.87149504, 27.70381826,
       27.65958772, 30.17151829, 27.04987446, 21.52730227, 37.82614512,
       22.09872387, 34.71166346, 32.07959454, 29.45253042, 29.51137956,
       41.49935191, 62.4121152 , 13.64508882, 24.71242033, 18.69151684,
       37.4909413 , 54.05864658, 34.94758034, 15.01355249, 30.17849355,
       32.22191275, 33.90252834, 33.02530285, 28.4416789 , 69.60201087,
       34.7617152 , 31.65353442, 24.5644437 , 24.78130285, 24.00

3. 模型评价

In [6]:
import numpy as np

# Mean absolute error (MAE): the average of the absolute errors.
# The smaller the MAE, the better the model fits the data.
def mae_solver(y_true: np.ndarray, y_pred: np.ndarray):
    """Compute the mean absolute error (MAE).

    Both inputs are converted with ``np.asarray``, so plain lists are
    accepted and values are compared by position.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values; must broadcast against ``y_true``.

    Returns:
        The sum of ``|y_true - y_pred|`` over the first axis divided by
        ``len(y_true)``: a scalar for 1-D input, one value per column for
        2-D input.

    Raises:
        ValueError: If ``y_true`` is empty (the mean is undefined).
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    n = len(y_true)
    if n == 0:
        raise ValueError("mae_solver requires at least one sample")
    # Vectorized reduction: avoids the element-by-element Python loop that
    # the built-in sum() performs on an ndarray.
    return np.abs(y_true - y_pred).sum(axis=0) / n

# Mean squared error (MSE): the expected value of the squared error.
# The smaller the MSE, the more precise the model's predictions.
def mse_solver(y_true: np.ndarray, y_pred: np.ndarray):
    """Compute the mean squared error (MSE).

    Both inputs are converted with ``np.asarray``, so plain lists are
    accepted and values are compared by position.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values; must broadcast against ``y_true``.

    Returns:
        The sum of ``(y_true - y_pred) ** 2`` over the first axis divided
        by ``len(y_true)``: a scalar for 1-D input, one value per column
        for 2-D input.

    Raises:
        ValueError: If ``y_true`` is empty (the mean is undefined).
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    n = len(y_true)
    if n == 0:
        raise ValueError("mse_solver requires at least one sample")
    # Vectorized reduction: avoids the element-by-element Python loop that
    # the built-in sum() performs on an ndarray.
    return np.square(y_true - y_pred).sum(axis=0) / n

In [7]:
# Score the test-set predictions with the hand-written metric functions.
mae, mse = (
    mae_solver(y_test.values, preds),
    mse_solver(y_test.values, preds),
)

print("MAE: ", mae)
print("MSE: ", mse)

MAE:  13.022063072780178
MSE:  303.833124722358


In [8]:
# Cross-check using the ready-made MAE and MSE implementations in scikit-learn.
from sklearn.metrics import mean_absolute_error, mean_squared_error

mae_, mse_ = (
    mean_absolute_error(y_test, preds),
    mean_squared_error(y_test, preds),
)

print("scikit-learn MAE: ", mae_)
print("scikit-learn MSE: ", mse_)

scikit-learn MAE:  13.02206307278018
scikit-learn MSE:  303.8331247223582
