# 实现标准简单线性回归

本文不展开公式推导，重点是用纯 Python 代码实现简单线性回归。

In [1]:
from math import sqrt
# Sample data: each row is an [x, y] pair. coefficients() reads column 0 as
# the input value and column 1 as the target value.
dataset = [[1.2, 1.1], [2.4, 3.5], [4.1, 3.2], [3.4, 2.8], [5, 5.4]]

In [3]:
# Arithmetic mean of a sample
def mean(values):
    """Return the arithmetic mean of ``values`` as a float.

    ``values`` must support ``sum()`` and ``len()``. An empty input raises
    ``ZeroDivisionError`` because the count is used as the divisor.
    """
    total = sum(values)
    count = float(len(values))
    return total / count

In [4]:
# Co-deviation (un-normalised covariance) of x and y
def covariance(x, mean_x, y, mean_y):
    """Return the sum of products of deviations of ``x`` and ``y``.

    Computes ``sum((x[i] - mean_x) * (y[i] - mean_y))``. The result is NOT
    divided by the number of samples, so it is the covariance scaled by the
    sample count; ``coefficients`` divides it by the equally un-normalised
    ``variance`` so the scale factor cancels.

    Args:
        x: Sequence of input values.
        mean_x: Mean of ``x``.
        y: Sequence of target values, the same length as ``x``.
        mean_y: Mean of ``y``.

    Returns:
        The accumulated sum as a float (``0.0`` for empty input).

    Raises:
        ValueError: If ``x`` and ``y`` differ in length. Pairing them by
            index would otherwise drop the extra items or fail part-way.
    """
    if len(x) != len(y):
        raise ValueError(
            "x and y must have the same length (got %d and %d)"
            % (len(x), len(y))
        )
    covar = 0.0
    for x_i, y_i in zip(x, y):
        covar += (x_i - mean_x) * (y_i - mean_y)
    return covar

In [5]:
# Sum of squared deviations (un-normalised variance)
def variance(values, mean):
    """Return the sum of squared deviations of ``values`` from ``mean``.

    The total is not divided by the number of samples. Note that the
    ``mean`` parameter is a number and shadows the module-level ``mean``
    function inside this body.
    """
    squared_deviations = []
    for value in values:
        squared_deviations.append((value - mean) ** 2)
    return sum(squared_deviations)

In [13]:
# Least-squares intercept and slope for a single-feature dataset
def coefficients(dataset):
    """Return ``(w0, w1)`` for the line ``y = w1 * x + w0`` fitted to ``dataset``.

    Each row supplies the input at index 0 and the target at index 1.
    The slope is the co-deviation of x and y divided by the sum of squared
    deviations of x; the intercept is ``mean(y) - slope * mean(x)``.

    Raises ``ZeroDivisionError`` when the dataset is empty or when every
    x value is identical (zero spread in x).
    """
    inputs = [sample[0] for sample in dataset]
    targets = [sample[1] for sample in dataset]
    input_mean = mean(inputs)
    target_mean = mean(targets)
    co_deviation = covariance(inputs, input_mean, targets, target_mean)
    input_spread = variance(inputs, input_mean)
    slope = co_deviation / input_spread
    intercept = target_mean - slope * input_mean
    return (intercept, slope)


In [7]:
# Root-mean-square error (RMSE) between actual and predicted values
def rmse_metric(actual, predicted):
    """Return the root-mean-square error between ``actual`` and ``predicted``.

    Args:
        actual: Sequence of observed values.
        predicted: Sequence of predicted values, the same length as
            ``actual``.

    Returns:
        ``sqrt(mean((predicted[i] - actual[i]) ** 2))`` as a float.

    Raises:
        ValueError: If the two sequences differ in length. Pairing them by
            index would otherwise ignore surplus predictions and report a
            misleading score.
        ZeroDivisionError: If ``actual`` is empty (the mean is undefined).
    """
    if len(actual) != len(predicted):
        raise ValueError(
            "actual and predicted must have the same length (got %d and %d)"
            % (len(actual), len(predicted))
        )
    sum_error = 0.0
    for observed, estimate in zip(actual, predicted):
        prediction_error = estimate - observed
        sum_error += prediction_error ** 2
    mean_error = sum_error / float(len(actual))
    return sqrt(mean_error)

In [9]:
# Fit simple linear regression on train and predict for test
def simple_linear_regression(train, test):
    """Return predictions for ``test`` from a line fitted to ``train``.

    The intercept and slope come from ``coefficients``, which evaluates the
    closed-form least-squares solution, so no iterative optimisation is
    needed. Only index 0 of each test row (the input value) is read.
    """
    intercept, slope = coefficients(train)
    return [slope * sample[0] + intercept for sample in test]


In [14]:
# Evaluate an algorithm by training and scoring on the same dataset
def evaluate_algorithm(dataset, algorithm):
    """Run ``algorithm`` on ``dataset`` and return the RMSE of its predictions.

    A copy of every row with its last element replaced by ``None`` is used
    as the test set, so the algorithm cannot read the targets from it. The
    algorithm is called as ``algorithm(dataset, test_set)``; each returned
    prediction is printed on its own line, then scored against the last
    element of the corresponding dataset row.
    """
    def _hide_target(sample):
        # Copy first so the caller's rows are left untouched.
        masked = list(sample)
        masked[-1] = None
        return masked

    masked_rows = [_hide_target(sample) for sample in dataset]
    predicted = algorithm(dataset, masked_rows)
    for estimate in predicted:
        print('%.3f\t' % estimate)

    targets = [sample[-1] for sample in dataset]
    return rmse_metric(targets, predicted)

In [15]:
# Entry point: evaluate simple linear regression on the sample dataset
# (the same rows are used for fitting and scoring) and report the RMSE.
rmse = evaluate_algorithm(dataset, simple_linear_regression)
print('RMSE: %.3f' % rmse)

1.386	
2.463	
3.990	
3.362	
4.799	
RMSE: 0.701
