In [1]:
import pandas as pd


In [2]:
path = r'..\..\..\数据集\handson-ml2\datasets\housing\housing.csv'
origin_data = pd.read_csv(path)
origin_data.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

In [3]:
df = origin_data.drop('median_house_value', axis=1)
df_labels = origin_data['median_house_value']

In [8]:
#data clean
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   ocean_proximity     20640 non-null  object 
dtypes: float64(8), object(1)
memory usage: 1.4+ MB


In [5]:
# so we need fill na in total_bedrooms, now we use median
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')
# only fill na in int or float by imputer
# fit just calculate the median of all columns and store the result in statistics
df_num = df.drop('ocean_proximity', axis=1)
imputer.fit(df_num)
imputer.statistics_
# auto fill na in all num column
X = imputer.transform(df_num)
df_tr = pd.DataFrame(X, columns=df_num.columns, index=df_num.index)
df_tr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20640 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
dtypes: float64(8)
memory usage: 1.3 MB


In [6]:
# onehot cat feature
from sklearn.preprocessing import OneHotEncoder
cat_encoder = OneHotEncoder()
df_cat = df[['ocean_proximity']]
df_cat_onehot = cat_encoder.fit_transform(df_cat)
df_cat_onehot.toarray()
                                    

array([[0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       ...,
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.]])

In [7]:
# use pipeline to complete data transform
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombineAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = True
    def fit(self, X):
        return self
    def transform(self, X):
        rooms_per_household = X[:, rooms_ix]/X[:, households_ix]
        population_per_household = X[:, population_ix]/X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix]/X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

#except for last one, other function should have fit_transform()
num_pipeline = Pipeline([
    ('impyter', SimpleImputer(strategy='median')),
    ('attribs_adder', CombineAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

from sklearn.compose import ColumnTransformer

num_att = list(df_num)
cat_att = ['ocean_proximity']
full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_att),
    ('cat', OneHotEncoder(), cat_att),
])

df_prepared = full_pipeline.fit_transform(df)

In [40]:
# use LR to train
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(df_prepared, df_labels)

# use mean_squard_error() to evaluate model
from sklearn.metrics import mean_squared_error
lr_predictions = lr.predict(df_prepared)
lr_rmse = np.sqrt(mean_squared_error(df_labels, lr_predictions))
lr_rmse

68286.12607251323

In [41]:
# use DecisionTree

from sklearn.tree import DecisionTreeRegressor

tr = DecisionTreeRegressor()
tr.fit(df_prepared, df_labels)
tr_predictions = tr.predict(df_prepared)
tr_rmse = np.sqrt(mean_squared_error(df_labels, tr_predictions))
tr_rmse

0.0

In [42]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(tr, df_prepared, df_labels, scoring='neg_mean_squared_error', cv=10)
tr_mse_scores = np.sqrt(-scores)
def display_scores(scores):
    print('Scores:', scores)
    print('Mean:', scores.mean())
    print('Standard deviation:', scores.std())
    
display_scores(tr_mse_scores)

Scores: [119244.11923308  72670.26091891  83677.92775679  74475.10327736
  89827.21027774  77655.70647366  68985.55446203  99509.57244364
  95480.19636218  72557.02651923]
Mean: 85408.2677724617
Standard deviation: 14981.42028594353


In [43]:
# compare with lr
lr_scores = cross_val_score(lr, df_prepared, df_labels, scoring='neg_mean_squared_error', cv=10)
lr_mse_scores = np.sqrt(-lr_scores)
display_scores(lr_mse_scores)

Scores: [8.41836630e+04 6.11915285e+04 8.67436096e+04 6.22867345e+04
 6.55810502e+15 6.89185866e+04 5.25048641e+04 9.09042279e+04
 7.76750890e+04 5.39409537e+04]
Mean: 655810502198756.0
Standard deviation: 1967431506383484.8


In [44]:
# user RandomForest
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor()
rf.fit(df_prepared, df_labels)
rf_scores = cross_val_score(rf, df_prepared, df_labels, scoring='neg_mean_squared_error', cv=10)
rf_mse_scores = np.sqrt(-rf_scores)
display_scores(rf_mse_scores)

Scores: [97615.38436898 47257.43109551 65519.70300793 56448.15282712
 61143.31121477 59737.75942911 47317.60103882 79591.49611091
 74258.72253331 49262.471588  ]
Mean: 63815.20332144628
Standard deviation: 15295.315485184134


In [None]:
# save mode result
import joblib
joblib.dumps(rf, 'rf.pkl')
# load
# model_loaded = joblib.load('rf.pkl')

In [45]:
# and use SearchCV to adjust parameter
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators':[3, 10, 30], 'max_features':[2, 4, 6, 8]},
    {'bootstrap':[False], 'n_estimators':[3, 10], 'max_features':[2, 3, 4]}
]

rf_grid = RandomForestRegressor()
grid_search = GridSearchCV(rf_grid, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(df_prepared, df_labels)
grid_search.best_params_

{'max_features': 6, 'n_estimators': 30}

In [46]:
#score
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres['mean_test_score'], cvres['params']):
    print(np.sqrt(-mean_score), params)

82453.76559972462 {'max_features': 2, 'n_estimators': 3}
71499.5988553603 {'max_features': 2, 'n_estimators': 10}
69697.47482116584 {'max_features': 2, 'n_estimators': 30}
76674.14545524534 {'max_features': 4, 'n_estimators': 3}
71382.9640741928 {'max_features': 4, 'n_estimators': 10}
68308.43508283746 {'max_features': 4, 'n_estimators': 30}
75455.45310532284 {'max_features': 6, 'n_estimators': 3}
71282.21843505515 {'max_features': 6, 'n_estimators': 10}
68048.81374128729 {'max_features': 6, 'n_estimators': 30}
76487.10339007161 {'max_features': 8, 'n_estimators': 3}
71344.07267106822 {'max_features': 8, 'n_estimators': 10}
68399.23608320835 {'max_features': 8, 'n_estimators': 30}
78896.41367236938 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
71593.54857189515 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
76809.27048981508 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
71143.01941529459 {'bootstrap': False, 'max_features': 3, 'n_estimators': 10}


In [48]:
#show feature importance
feature_importances = grid_search.best_estimator_.feature_importances_
cat_encoder = full_pipeline.named_transformers_['cat']
cat_one_hot_att = list(cat_encoder.categories_[0])
extra_att = ['rooms_per_hhold', 'pop_per_hhold', 'bedromm_per_room']

atts = num_att+extra_att+cat_one_hot_att

sorted(zip(feature_importances, atts), reverse=True)

[(0.3023489818914827, 'median_income'),
 (0.16172338991642146, 'INLAND'),
 (0.10791115752561373, 'pop_per_hhold'),
 (0.08910919337840666, 'bedromm_per_room'),
 (0.07315586848726109, 'longitude'),
 (0.06917737003841141, 'rooms_per_hhold'),
 (0.06696331974952748, 'latitude'),
 (0.04356847031401048, 'housing_median_age'),
 (0.01705448418120849, 'total_rooms'),
 (0.017022601746209797, 'total_bedrooms'),
 (0.017012407665773912, 'population'),
 (0.0153451576828215, 'households'),
 (0.01224089642541003, '<1H OCEAN'),
 (0.004533375501151367, 'NEAR OCEAN'),
 (0.002679422052134257, 'NEAR BAY'),
 (0.00015390344415570358, 'ISLAND')]

In [60]:
# test the final model
final_model = grid_search.best_estimator_

X_test = origin_data.drop('median_house_value', axis=1)
y_test = origin_data[['median_house_value']]

X_test_prepared = full_pipeline.fit_transform(X_test)
final_predictions = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse

19096.08712099313

In [61]:
final_predictions.shape

(20640,)

In [64]:
from scipy import stats
confidence = 0.95
final_predictions = final_predictions.reshape(final_predictions.shape[0],1)
squard_errors = (final_predictions - y_test) ** 2
np.sqrt(stats.t.interval(confidence, len(squard_errors)-1, loc=squard_errors.mean(), scale=stats.sem(squard_errors)))

array([[18674.1938046 ],
       [19508.85881884]])

In [16]:
#exerciese
# 1.use svm model to train

from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score
svr_model = SVR(kernel='linear')
svr_model.fit(df_prepared, df_labels)
svr_scores =  cross_val_score(svr_model, df_prepared, df_labels, scoring='neg_mean_squared_error', cv=10)
svr_scores = np.sqrt(-svr_scores)
svr_scores.mean()

109374.66591940311

In [18]:
# 2. use randomizesearch
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor

param_grid = [
    {'n_estimators':[3, 10, 30], 'max_features':[2, 4, 6, 8]},
    {'bootstrap':[False], 'n_estimators':[3, 10], 'max_features':[2, 3, 4]}
]

rf_grid = RandomForestRegressor()
grid_search = RandomizedSearchCV(rf_grid, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(df_prepared, df_labels)
grid_search.best_params_

{'n_estimators': 10, 'max_features': 6}

In [40]:
# 3. add a feature selection to pipeline
# use pipeline to complete data transform
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombineAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = True
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        rooms_per_household = X[:, rooms_ix]/X[:, households_ix]
        population_per_household = X[:, population_ix]/X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix]/X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

#except for last one, other function should have fit_transform()
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('attribs_adder', CombineAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

from sklearn.compose import ColumnTransformer

num_att = list(df_num)
cat_att = ['ocean_proximity']
full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_att),
    ('cat', OneHotEncoder(), cat_att),
])


In [22]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(df_prepared, df_labels)
model.coef_

array([-55320.72002442, -56255.14933279,  13364.73984258,  -1882.43424613,
         7465.2513185 , -46331.96946603,  45752.37330988,  74791.3185514 ,
         6372.10036191,    863.34022157,   9613.21958474, -23233.9309243 ,
       -60499.22193162, 129660.72058374, -27120.67833241, -18806.88939541])

In [44]:
# 4.create a totoal pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LinearRegression

class FeatureSelection(BaseEstimator, TransformerMixin):
    def __init__(self, model, k):
        self.model = model
        self.k = k
        
    def fit(self, X, y=None):
        self.model.fit(X, y)
        self.feature_importance = np.abs(self.model.coef_)
        return self
    
    def transform(self, X):
        indices = np.argsort(self.feature_importance)[::-1][:self.k]
        features = X[:, indices]
        return features

k = 5
selector = FeatureSelection(LinearRegression(), k)
selected = selector.fit_transform(df_prepared, df_labels)
selected

array([[ 0.        ,  2.34476576,  0.        ,  1.05254828, -1.32783522],
       [ 0.        ,  2.33223796,  0.        ,  1.04318455, -1.32284391],
       [ 0.        ,  1.7826994 ,  0.        ,  1.03850269, -1.33282653],
       ...,
       [ 0.        , -1.14259331,  1.        ,  1.77823747, -0.8237132 ],
       [ 0.        , -1.05458292,  1.        ,  1.77823747, -0.87362627],
       [ 0.        , -0.78012947,  1.        ,  1.75014627, -0.83369581]])

In [45]:
# add it to the pipeline
total_pipeline = Pipeline([
    ('data_clean', full_pipeline),
    ('feature_selection', FeatureSelection(model=LinearRegression(), k=5)),
    ('regressor', LinearRegression())
])

total_pipeline.fit(df, df_labels)


Pipeline(steps=[('data_clean',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('attribs_adder',
                                                                   CombineAttributesAdder()),
                                                                  ('std_scaler',
                                                                   StandardScaler())]),
                                                  ['longitude', 'latitude',
                                                   'housing_median_age',
                                                   'total_rooms',
                                                   'total_bedrooms',
                                                   'population', 'households',
                      

In [None]:
# so, concat the code

rom sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

# data clean
class CombineAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = True
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        rooms_per_household = X[:, rooms_ix]/X[:, households_ix]
        population_per_household = X[:, population_ix]/X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix]/X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImpute

#except for last one, other function should have fit_transform()
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('attribs_adder', CombineAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

from sklearn.compose import ColumnTransformer

df_num = df.drop('ocean_proximity', axis=1)
num_att = list(df_num)
cat_att = ['ocean_proximity']
full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_att),
    ('cat', OneHotEncoder(), cat_att),
])


from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LinearRegression

class FeatureSelection(BaseEstimator, TransformerMixin):
    def __init__(self, model, k):
        self.model = model
        self.k = k
        
    def fit(self, X, y=None):
        self.model.fit(X, y)
        self.feature_importance = np.abs(self.model.coef_)
        return self
    
    def transform(self, X):
        indices = np.argsort(self.feature_importance)[::-1][:self.k]
        features = X[:, indices]
        return features
    
total_pipeline = Pipeline([
    ('data_clean', full_pipeline),
    ('feature_selection', FeatureSelection(model=LinearRegression(), k=5)),
    ('regressor', LinearRegression())
])

total_pipeline.fit(df, df_labels)