# demo03_linearRegression.ipynb   使用sklearn提供的API实现线性回归

In [68]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [69]:
# Load the salary dataset (path is relative to the notebook's working directory).
data = pd.read_csv('../data/Salary_Data.csv')
# Select the feature by name with a one-element list so x stays 2-D.
# A label slice such as [:, :'YearsExperience'] would also pull in any
# column that happens to precede 'YearsExperience' in the CSV.
x = data[['YearsExperience']]
y = data['Salary']   # 1-D target column
x.shape, y.shape

((30, 1), (30,))

In [70]:
# Train a linear regression model through scikit-learn's API.
import sklearn.linear_model as lm

# fit() returns the estimator itself, so creation and training are chained;
# the bare name on the last line displays the fitted estimator.
model = lm.LinearRegression().fit(x, y)
model

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [71]:
# Predict salaries for a few hand-picked years-of-experience values.
test_x = np.array([7.5, 8.9, 10, 15])
# The model was fitted on a DataFrame, so the new inputs are wrapped in a
# frame carrying the same column labels: reshape(-1, 1) gives one row per
# sample, and reusing x.columns keeps feature names consistent between
# fit and predict (recent scikit-learn releases warn when they differ).
test_frame = pd.DataFrame(test_x.reshape(-1, 1), columns=x.columns)
pred_y = model.predict(test_frame)
for years, salary in zip(test_x, pred_y):
    print(years, '->', salary)

7.5 -> 96666.91760958177
8.9 -> 109896.86485961887
10.0 -> 120291.82341321945
15.0 -> 167541.6350204948


## 评估回归模型的误差

In [72]:
import sklearn.utils as su

# Shuffle the features and targets together before splitting.
# random_state is the random seed: repeated shuffles with the same seed
# produce the same ordering, which keeps the split reproducible.
data_x, data_y = su.shuffle(x, y, random_state=7)

# The first TRAIN_SIZE shuffled rows form the training set and the rest the
# test set. .iloc is used on both the frame and the series so the split is
# explicitly positional (the shuffled index labels are no longer in order).
TRAIN_SIZE = 25
train_x, test_x = data_x.iloc[:TRAIN_SIZE], data_x.iloc[TRAIN_SIZE:]
train_y, test_y = data_y.iloc[:TRAIN_SIZE], data_y.iloc[TRAIN_SIZE:]

# Retrain the existing model object on the training rows only.
model.fit(train_x, train_y)
pred_test_y = model.predict(test_x)   # predictions for the test rows
pred_test_y, test_y

(array([ 45515.62646244, 100975.72114817, 111315.73880144,  47395.62967213,
         72775.67300289]), 3      43525
 22    101302
 25    105582
 4      39891
 15     67938
 Name: Salary, dtype: int64)

In [73]:
# Evaluate the model's error with the scoring functions in sklearn.metrics.
import sklearn.metrics as sm

# Printed in this order: mean absolute error, mean squared error,
# median absolute error, R^2 score.
for metric in (sm.mean_absolute_error, sm.mean_squared_error,
               sm.median_absolute_error, sm.r2_score):
    print(metric(test_y, pred_test_y))

4078.5893581460346
23333471.76878661
4837.673002885393
0.9696498879448306


## 保存模型

In [75]:
import pickle

# Persist the trained model object to a pickle file; the file is opened in
# binary write mode because pickle produces bytes.
with open('SalaryPredictionModel.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)
print('dump success.')

dump success.
