In [1]:
import numpy as np
import pandas as pd
import sklearn

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Default font sizes for axis labels and tick labels on every figure in this notebook.
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

import pickle

# Ignore an uninformative warning: any warning whose message starts with "internal gelsd".
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

In [2]:
# Load the training set; the first CSV column is used as the DataFrame index.
strat_train_set = pd.read_csv('./datasets/strat_train_set.csv', index_col = 0)
# Preview the first rows.
strat_train_set.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
17606,-121.89,37.29,38.0,1568.0,351.0,710.0,339.0,2.7042,286600.0,<1H OCEAN
18632,-121.93,37.05,14.0,679.0,108.0,306.0,113.0,6.4214,340600.0,<1H OCEAN
14650,-117.2,32.77,31.0,1952.0,471.0,936.0,462.0,2.8621,196900.0,NEAR OCEAN
3230,-119.61,36.31,25.0,1847.0,371.0,1460.0,353.0,1.8839,46300.0,INLAND
3555,-118.59,34.23,17.0,6592.0,1525.0,4459.0,1463.0,3.0347,254500.0,<1H OCEAN


In [3]:
# Load the prepared feature matrix from disk.
# NOTE(review): pickle.load can execute arbitrary code -- only load files from a trusted source.
# The `with` block guarantees the file handle is closed, even if unpickling fails.
with open('./datasets/housing_prepared.pickle', 'rb') as list_file:
    housing_prepared = pickle.load(list_file)
housing_prepared.shape


(16512, 16)

In [4]:
# Load the training labels from disk.
# NOTE(review): pickle.load can execute arbitrary code -- only load files from a trusted source.
# The `with` block guarantees the file handle is closed, even if unpickling fails.
with open('./datasets/housing_labels.pickle', 'rb') as list_file:
    housing_labels = pickle.load(list_file)
housing_labels.shape


(16512,)

In [5]:
from sklearn.linear_model import LinearRegression
# Linear regression: create the model lin_reg, then call .fit(data, labels) to train it.
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared,housing_labels)

LinearRegression()

In [6]:
# Take the first 10 rows of features and labels to spot-check the fitted linear model.
some_data = housing_prepared[:10]

some_label_data = housing_labels[:10]
# Predictions for those 10 rows, to compare against some_label_data.
lin_reg.predict(some_data)

array([210644.60459286, 317768.80697211, 210956.43331178,  59218.98886849,
       189747.55849879, 154295.07624519, 426711.9315643 , 228002.94602374,
       139801.43738593,  30128.00152011])

In [7]:
# Show the true labels for the same 10 rows as a plain list.
list(some_label_data)

[286600.0,
 340600.0,
 196900.0,
 46300.0,
 254500.0,
 127900.0,
 500001.0,
 140200.0,
 95000.0,
 500001.0]

In [15]:
# Display the shape of the label array.
housing_labels.shape

(16512,)

In [14]:
# Display the shape of the prepared feature matrix.
housing_prepared.shape

(16512, 16)

In [18]:
from sklearn.metrics import mean_squared_error
# Use the fitted lin_reg model's predict() to get predictions for the training features.
housing_prediction = lin_reg.predict(housing_prepared)
lin_reg_mse = mean_squared_error(housing_labels,housing_prediction)  # mean squared error
lin_reg_rmse = np.sqrt(lin_reg_mse)  # root mean squared error
lin_reg_rmse

68628.19819848923

In [19]:
#决策树
# Decision tree regressor, fitted and then evaluated on the same training data.
from sklearn.tree import DecisionTreeRegressor
# Fixed seed so the fitted tree is reproducible across runs, consistent with the
# seeded RandomForestRegressor(random_state=42) used for the grid search.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(housing_prepared,housing_labels)
tree_reg_prediction = tree_reg.predict(housing_prepared)
# MSE measured on the training rows themselves, not on held-out data.
tree_mse = mean_squared_error(housing_labels,tree_reg_prediction)

In [20]:
# Display the decision tree's MSE, computed on the same rows the tree was fitted on.
tree_mse

0.0

In [23]:
#随机森林
# Random forest regressor, fitted and then evaluated on the same training data.
from sklearn.ensemble import RandomForestRegressor
# Fixed seed so the forest (and the MSE below) is reproducible across runs, consistent
# with the seeded RandomForestRegressor(random_state=42) used for the grid search.
forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(housing_prepared,housing_labels)
forest_prediction = forest_reg.predict(housing_prepared)
# MSE measured on the training rows themselves, not on held-out data.
forest_mse = mean_squared_error(housing_labels,forest_prediction)
forest_mse


356026856.31691736

In [25]:
# Root mean squared error of the random forest on the training data.
forest_rmse = np.sqrt(forest_mse)
foreat_rmse = forest_rmse  # alias kept for the original misspelled name
forest_rmse

18868.67394166631

In [None]:
# `sklearn.externals.joblib` was removed from scikit-learn (0.23+); use the standalone
# joblib package and fall back to the bundled copy only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
joblib.dump(forest_reg,"forest_reg")  # save the trained model to disk
forest_reg = joblib.load("forest_reg")  # load the previously saved model

In [26]:
from sklearn.model_selection import GridSearchCV

# Two sub-grids are searched:
#   1) 3 x 4 = 12 combinations that leave `bootstrap` untouched;
#   2) 2 x 3 = 6 combinations with `bootstrap` set to False.
param_grid = [
    dict(n_estimators=[3, 10, 30], max_features=[2, 4, 6, 8]),
    dict(bootstrap=[False], n_estimators=[3, 10], max_features=[2, 3, 4]),
]

forest_reg = RandomForestRegressor(random_state=42)
# 5 folds per candidate: (12 + 6) * 5 = 90 rounds of training.
grid_search = GridSearchCV(
    estimator=forest_reg,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    return_train_score=True,
)
grid_search.fit(housing_prepared, housing_labels)

GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [27]:
# Display the hyperparameter combination selected by the grid search.
grid_search.best_params_

{'max_features': 8, 'n_estimators': 30}

In [28]:
# Accessed but not displayed (only a cell's last expression is rendered).
grid_search.best_estimator_
cvres = grid_search.cv_results_
# Scores are negated MSEs (scoring='neg_mean_squared_error'); flip the sign and take
# the square root to report an RMSE for every candidate.
rmse_per_candidate = np.sqrt(-cvres["mean_test_score"])
for rmse, params in zip(rmse_per_candidate, cvres["params"]):
    print(rmse, params)

63669.11631261028 {'max_features': 2, 'n_estimators': 3}
55627.099719926795 {'max_features': 2, 'n_estimators': 10}
53384.57275149205 {'max_features': 2, 'n_estimators': 30}
60965.950449450494 {'max_features': 4, 'n_estimators': 3}
52741.04704299915 {'max_features': 4, 'n_estimators': 10}
50377.40461678399 {'max_features': 4, 'n_estimators': 30}
58663.93866579625 {'max_features': 6, 'n_estimators': 3}
52006.19873526564 {'max_features': 6, 'n_estimators': 10}
50146.51167415009 {'max_features': 6, 'n_estimators': 30}
57869.25276169646 {'max_features': 8, 'n_estimators': 3}
51711.127883959234 {'max_features': 8, 'n_estimators': 10}
49682.273345071546 {'max_features': 8, 'n_estimators': 30}
62895.06951262424 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
54658.176157539405 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
59470.40652318466 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
52724.9822587892 {'bootstrap': False, 'max_features': 3, 'n_estimators'

In [33]:
# Keep the best estimator found by the grid search as the final model.
final_model = grid_search.best_estimator_

In [37]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
# 为了能放入pipeline

# Separate predictors from the target, and numeric predictors from the categorical one.
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()
housing_num = housing.drop("ocean_proximity", axis=1)
# Column indices used by CombinedAttributesAdder. They are looked up by name rather than
# hard-coded, so they stay correct if the column order of the CSV ever changes.
rooms_ix, bedrooms_ix, population_ix, households_ix = [
    housing_num.columns.get_loc(col)
    for col in ("total_rooms", "total_bedrooms", "population", "households")
]

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends ratio features as extra columns of a 2-D array.

    Rooms-per-household and population-per-household are always appended;
    bedrooms-per-room is appended as well when ``add_bedrooms_per_room`` is true.
    Column positions are read from the module-level ``rooms_ix``, ``bedrooms_ix``,
    ``population_ix`` and ``households_ix`` indices.
    """

    def __init__(self, add_bedrooms_per_room = True):
        # Flag controlling whether the bedrooms_per_room column is added.
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Do nothing and return ``self``; no state is learned from the data."""
        return self

    def transform(self, X):
        """Return ``X`` with two (or three) derived ratio columns appended on the right."""
        households = X[:, households_ix]
        rooms = X[:, rooms_ix]
        # The first two derived columns are always present.
        extra_columns = [rooms / households, X[:, population_ix] / households]
        # The third column is optional.
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / rooms)
        return np.column_stack([X, *extra_columns])

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Numeric preprocessing, applied in order: median imputation, derived ratio
# attributes (CombinedAttributesAdder with its default settings), then standardization.
num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Column names of the numeric frame, and the single categorical column.
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

# Route numeric columns through num_pipeline and one-hot encode the categorical column.
full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])
# Fit on the training predictors. The transformed output is discarded here; only the
# fitted pipeline is reused below to transform the test set.
full_pipeline.fit_transform(housing)

# Load the test set; the first CSV column is used as the DataFrame index.
strat_test_set = pd.read_csv('./datasets/strat_test_set.csv', index_col = 0)

# Split the test set into predictors and target.
X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()

# transform() only -- the pipeline is not refitted on the test data.
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)

# Test-set MSE and RMSE of the final model.
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

In [38]:
# Display the final model's RMSE on the test set.
final_rmse

47730.22690385927