In [1]:
'''上周因为备考linux补考几乎没有心情听课，后续跟上也变得很难了
    这部分内容来自我与gpt重新补习了一遍流水线与特征工程的内容，大概用了八个小时
    补全了一份与课案略有不同但我能理解的笔记，为了优先完成作业
    故此没有深入理解sklearn各函数与类的具体实现，这是在今日分类问题补习完成之后
    再去补习的内容

    因此这次作业仅为具体实现一个封装流水线部分, 出现的所有内容便是补习的内容'''
import sys
assert sys.version_info >= (3, 7)   # 检查python版本
from packaging import version
import sklearn
assert version.parse(sklearn.__version__) >= version.parse("1.0.1")  # 检查sklearn版本

In [65]:
from pathlib import Path
import pandas as pd
import tarfile
import urllib.request

def load_housing_data():
    tarball_path = Path("datasets/housing.tgz")
    if not tarball_path.is_file():
        Path("datasets").mkdir(parents=True, exist_ok=True)
        url = "https://github.com/ageron/data/raw/main/housing.tgz"
        urllib.request.urlretrieve(url, tarball_path)
    with tarfile.open(tarball_path) as housing_tarball:
            housing_tarball.extractall(path="datasets")
    return pd.read_csv(Path("datasets/housing/housing.csv"))

housing = load_housing_data()
# 由于预测差距接近百分之40到50，这部分是增加特征的特征工程
# 计算额外的交互特征
housing["rooms_per_household"] = housing["total_rooms"] / housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"] / housing["total_rooms"]
housing["population_per_household"] = housing["population"] / housing["households"]

In [66]:
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
import numpy as np
import pandas as pd
# 数据划分 -> 特征分离 -> 填充缺失值 -> 数据归一化 -> 独热编码 ->
# 数据划分部分
housing['income_cat'] = pd.cut(housing['median_income'],
                               bins= [0., 1.5, 3., 4.5, 6., np.inf],
                               labels = [1,2,3,4,5])
split = StratifiedShuffleSplit(n_splits = 3, test_size = 0.2, random_state = 35)
for train_index, test_index in split.split(housing, housing['income_cat']):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

# 特征分离部分
X_train = strat_train_set.drop('median_house_value', axis=1)
y_train = strat_train_set['median_house_value']
X_test = strat_test_set.drop('median_house_value', axis=1)
y_test = strat_test_set['median_house_value']

# 特征工程部分(缺失值，归一化，独热编码)
num_attribs = list(X_train.drop('ocean_proximity', axis=1))
cat_attribs = ['ocean_proximity']
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),          # 填充缺失值的封装管道
    ('std_scaler', StandardScaler())                        # 数据归一化的封装管道
])  # 数据预处理管道

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),                     # 调用数据预处理管道
    ('cat', OneHotEncoder(), cat_attribs)                   # 标签独热编码封装管道
])  # 完整预处理管道
tr_X_prepared = full_pipeline.fit_transform(X_train)
te_X_prepared = full_pipeline.transform(X_test)

# 模型训练部分
lin_reg = LinearRegression()
param_grid = [{
    'fit_intercept': [True, False]
}]
grid_search = GridSearchCV(lin_reg, param_grid, cv=3,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
grid_search.fit(tr_X_prepared, y_train)
best_model = grid_search.best_estimator_

# 效率验证部分
final_predictions = best_model.predict(te_X_prepared)
final_rmse =np.sqrt(mean_squared_error(y_test, final_predictions))
best_score = grid_search.best_score_
print("预测值:", final_predictions[:5])
print("实际值:", y_test.values[:5])
print("finalRMSE:", final_rmse, "|", "bestSCORE:",best_score)

预测值: [ 45863.32579656 213038.00034456 129479.14195667 138932.81560636
 163286.48554053]
实际值: [ 58600. 151800. 162500. 143700. 135600.]
finalRMSE: 67612.37507432092 | bestSCORE: -4933248487.861487
