经典的线性回归案例

In [45]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

In [46]:
# Load the data: one sample per line, whitespace-separated numeric fields;
# the last field of each line is the target, the rest are the features.
X = []
y = []
with open(file='./data/housing.csv', mode='r', encoding='utf-8') as f:
    for line in f:
        # Strip first so that blank / whitespace-only lines are really skipped.
        # (Previously the stripped text went into an unused, misspelled
        # variable, so the emptiness check was always true and a blank line
        # crashed in float().)
        line = line.strip()
        if not line:
            continue
        # split() without arguments collapses runs of whitespace, so no
        # empty fields need to be filtered out.
        values = [float(field) for field in line.split()]
        X.append(values[:-1])
        y.append(values[-1])

X = np.array(X)
y = np.array(y)


In [47]:
# Split the dataset: hold out 20% of the samples for testing, with a fixed
# seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Preprocessing: standardization. The scaler is fitted on the training split
# only, and that same fitted scaler is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [42]:
# Linear regression model: fit() returns the estimator itself, so the model
# can be constructed and trained in one expression.
model = LinearRegression().fit(X_train, y_train)

# Predict on the test split and evaluate with mean squared error
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"均方误差: {mse}")

# Report the learned parameters: weights (coefficients) and bias (intercept)
print(f"模型权重（也叫系数）(就是 y = wx + b 中的 w): {model.coef_}")
print(f"模型偏置（也叫截距）(就是 y = wx + b 中的 b): {model.intercept_}")


均方误差: 23.6950851420352
模型权重（也叫系数）(就是 y = wx + b 中的 w): [-1.03236367  1.12804285  0.58748158  0.37930536 -2.11881444  3.67856734
  0.18276288 -2.94061549  3.5149033  -2.63450929 -2.05386727 -0.29735515
 -3.62614886]
模型偏置（也叫截距）(就是 y = wx + b 中的 b): 24.495975232198155


In [43]:
# Least squares (closed-form normal equation)

# Append a column of ones to the end of the feature matrix so that the bias
# is solved for as one extra weight.
X_train1 = np.hstack([X_train, np.ones((X_train.shape[0], 1))])

# Solve w = (X^T X)^-1 X^T y
#   np.linalg.inv computes the matrix inverse
#   .T is the matrix transpose
# The result must be bound to `w`: the following cell displays `w`, and
# without this assignment it raises NameError on a fresh kernel.
w = np.linalg.inv(X_train1.T @ X_train1) @ X_train1.T @ y_train[:, np.newaxis]
# The result is a (14, 1) matrix: the first 13 entries are w, the last one is b.
# NOTE: the explicit inverse is kept to mirror the textbook formula; for
# ill-conditioned data np.linalg.lstsq is the numerically safer choice.


In [44]:
# Display w: per the output below, a (14, 1) column whose first 13 values equal
# the LinearRegression coefficients printed earlier and whose last equals the intercept.
w

array([[-1.03236367],
       [ 1.12804285],
       [ 0.58748158],
       [ 0.37930536],
       [-2.11881444],
       [ 3.67856734],
       [ 0.18276288],
       [-2.94061549],
       [ 3.5149033 ],
       [-2.63450929],
       [-2.05386727],
       [-0.29735515],
       [-3.62614886],
       [24.49597523]])