In [1]:
class MyLinearRegression:
    """
    Minimal ordinary-least-squares linear regression.

    Attributes
    ----------
    coef_ : numpy.ndarray or None
        1-D array with one weight per feature; ``None`` until ``fit`` is called.
    intercept_ : float or None
        Constant (bias) term of the fitted model; ``None`` until ``fit`` is called.
    """
    def __init__(self):
        self.coef_ = None
        self.intercept_ = None

    def fit(self, X, y):
        """
        Estimate the weights and intercept that minimise the squared error.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features.
        y : array-like with n_samples elements
            Training targets; flattened to 1-D before solving.

        Returns
        -------
        MyLinearRegression
            The fitted instance itself, so calls can be chained.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D or ``X`` and ``y`` disagree on the sample count.
        """
        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise ValueError(
                "X must be 2-D with shape (n_samples, n_features); got ndim=%d" % X.ndim
            )
        y = np.asarray(y, dtype=float).reshape(-1)
        if y.shape[0] != X.shape[0]:
            raise ValueError(
                "X and y have inconsistent sample counts: %d != %d"
                % (X.shape[0], y.shape[0])
            )

        # Prepend a column of ones so the first solved weight is the intercept.
        design = np.hstack((np.ones((X.shape[0], 1)), X))

        # Solve the least-squares problem directly instead of forming and
        # inverting X^T X: explicitly inverting the normal-equations matrix
        # breaks down when features are collinear, whereas lstsq returns the
        # minimum-norm solution in that case.
        w = np.linalg.lstsq(design, y, rcond=None)[0]
        self.coef_ = w[1:]
        self.intercept_ = w[0]
        return self

    def predict(self, X):
        """
        Predict targets for ``X`` with the fitted weights.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to predict for.

        Returns
        -------
        numpy.ndarray or None
            1-D array of predictions. If the model has not been fitted yet, a
            message is printed and ``None`` is returned.
        """
        if self.coef_ is None:
            print("请先训练后预测")
            return None
        y_hat = np.dot(X, self.coef_) + self.intercept_
        return np.array(y_hat).ravel()

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [3]:
# Load the advertising dataset, using the first CSV column as the row index.
data = pd.read_csv(
    './Advertising.csv',
    index_col=0,
    encoding='utf-8',
)
# Print column dtypes and non-null counts to check for missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200 entries, 1 to 200
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   TV         200 non-null    float64
 1   Radio      200 non-null    float64
 2   Newspaper  200 non-null    float64
 3   Sales      200 non-null    float64
dtypes: float64(4)
memory usage: 7.8 KB


In [4]:
# Features are the three advertising channels; the target is Sales.
X = data.loc[:, ['TV', 'Radio', 'Newspaper']].to_numpy()
y = data['Sales'].to_numpy()
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [5]:
# Fit the hand-written model on the training split and show the learned parameters.
mlr = MyLinearRegression()
mlr.fit(X_train, y_train)
# coef_ holds the per-feature weights (coefficients); intercept_ is the constant term.
print("系数：", mlr.coef_)
print("截距：", mlr.intercept_)

偏置： [0.04391531 0.20027962 0.00184368]
截距： 2.8802552863313267


In [6]:
# Baseline: fit scikit-learn's LinearRegression on the same split to cross-check the results.
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, y_train)
# coef_ holds the per-feature weights (coefficients); intercept_ is the constant term.
print("系数：", lr.coef_)
print("截距：", lr.intercept_)

偏置： [0.04391531 0.20027962 0.00184368]
截距： 2.880255286331325


In [7]:
# Predict the test-split targets with the hand-written model; later cells read y_hat.
y_hat = mlr.predict(X_test)
# Bare last expression so the notebook displays the prediction array.
y_hat

array([10.05866652,  7.43318827,  6.95305695, 24.16874598, 11.98172029,
        6.54464708, 13.19426793, 14.90240334, 11.07675294, 16.25061663,
       23.0081158 ,  9.12072148, 10.29342297, 15.3709661 , 11.62396513,
       12.1057432 , 18.55030805, 10.85810888, 16.13879856, 17.20281404,
       24.25469939,  9.43869821, 15.13419408, 12.41283844,  5.66973859,
       15.23876681, 12.1878762 , 20.95164149, 13.22671807,  9.20954724,
       13.39812727, 21.61944517, 18.05938674, 21.14798547,  6.73582996,
        6.16799717,  7.98042607, 13.09083488, 14.86214078,  6.23597127,
       12.35246911,  9.19452235, 15.04717679, 16.23039254, 17.17240889,
       13.3341216 ,  3.68035786, 12.53946851, 15.93674769,  8.67881357,
       10.62459997, 19.5646526 , 18.3472282 , 15.24662975, 10.04846378,
        8.17344188, 21.49061088, 14.137309  , 16.29760212,  8.90502274])

In [8]:
# Score the hand-written model: error metrics on the test split, R^2 on both splits.
test_mse = mean_squared_error(y_test, y_hat)
test_rmse = np.sqrt(test_mse)
test_mae = mean_absolute_error(y_test, y_hat)
train_r2 = r2_score(y_train, mlr.predict(X_train))
test_r2 = r2_score(y_test, y_hat)

print("均方误差(MSE)：", test_mse)
print("根均方误差(RMSE)：", test_rmse)
print("平均绝对值误差(MAE)：", test_mae)
print("训练集R^2：", train_r2)
print("测试集R^2：", test_r2)

均方误差(MSE)： 3.69139484569861
根均方误差(RMSE)： 1.9213003007595169
平均绝对值误差(MAE)： 1.2333759522850203
训练集R^2： 0.9065727532450596
测试集R^2： 0.8649018906637791
