In [3]:
from sklearn.model_selection import StratifiedShuffleSplit
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from config import HOUSING_PATH


In [4]:
def load_housing_data(housing_path = HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

In [5]:
housing = load_housing_data()

In [6]:
housing["median_income"]

0        8.3252
1        8.3014
2        7.2574
3        5.6431
4        3.8462
          ...  
20635    1.5603
20636    2.5568
20637    1.7000
20638    1.8672
20639    2.3886
Name: median_income, Length: 20640, dtype: float64

In [4]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
longitude             20640 non-null float64
latitude              20640 non-null float64
housing_median_age    20640 non-null float64
total_rooms           20640 non-null float64
total_bedrooms        20433 non-null float64
population            20640 non-null float64
households            20640 non-null float64
median_income         20640 non-null float64
median_house_value    20640 non-null float64
ocean_proximity       20640 non-null object
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [29]:
t=list(housing.columns)

In [32]:
t

['longitude',
 'latitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 'median_house_value',
 'income_cat']

In [34]:
housing['income_cat'] = pd.cut(housing['median_income'],
                              bins=[0., 1.5, 3.0, 4.5, 6.0, np.inf],
                              labels = [1,2,3,4,5])

for train_index, test_index in split.split(housing, housing['income_cat']):
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]
    
for set_ in (strat_train_set, strat_test_set):
    set_.drop('income_cat', axis=1, inplace=True)
    
train_set = strat_train_set.drop("median_house_value", axis=1)
train_labels = strat_train_set["median_house_value"].copy()

test_set = strat_test_set.drop("median_house_value", axis=1)
test_labels = strat_test_set["median_house_value"].copy()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [35]:
train_set

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
8297,-118.14,33.76,50.0,2960.0,761.0,1179.0,718.0,3.5214,NEAR OCEAN
17602,-121.88,37.30,42.0,1867.0,398.0,927.0,389.0,4.3250,<1H OCEAN
5912,-118.44,34.29,32.0,1260.0,382.0,1434.0,342.0,2.0286,<1H OCEAN
5050,-118.33,34.02,11.0,1249.0,313.0,625.0,336.0,0.8702,<1H OCEAN
7161,-118.17,34.03,31.0,1014.0,252.0,1064.0,247.0,2.4167,<1H OCEAN
7941,-118.12,33.87,43.0,1633.0,355.0,837.0,350.0,3.0405,<1H OCEAN
3721,-118.43,34.20,28.0,3386.0,,2240.0,737.0,3.0221,<1H OCEAN
17055,-122.25,37.47,35.0,3183.0,515.0,1313.0,487.0,5.9062,NEAR OCEAN
3205,-119.62,36.35,10.0,3674.0,734.0,1864.0,718.0,2.6145,INLAND
4315,-118.34,34.09,5.0,2665.0,954.0,1733.0,766.0,2.3568,<1H OCEAN


In [36]:
class CombinedAtrributesAdder(TransformerMixin):
    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
        self.households_idx = 6
        self.total_rooms_idx = 3
        self.total_bedrooms_idx = 4
        self.population_idx = 5
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        rooms_per_household = X[:, self.total_rooms_idx]/ \
                            X[:, self.households_idx]
        bedrooms_per_household = X[:, self.total_bedrooms_idx]/ \
                            X[:, self.households_idx]
        population_per_household = X[:, self.population_idx]/ \
                            X[:, self.households_idx]
        
        if self.add_bedrooms_per_room:
            return np.column_stack((X, rooms_per_household,
                                  bedrooms_per_household,
                                  population_per_household))
        else:
            return np.vstack[X, rooms_per_household,
                        bedrooms_per_household, population_per_household]

In [37]:
numerical_columns = train_set.drop('ocean_proximity', axis=1).columns.values
categorical_columns = ['ocean_proximity']

In [38]:
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('attribs_adder', CombinedAtrributesAdder()),
    ('std_scaler', StandardScaler())
])

categorical_pipeline = Pipeline([
    ('one_hot_encoder' , OneHotEncoder())
])

full_pipeline = ColumnTransformer([
    ("num", numerical_pipeline, numerical_columns),
    ("cat", categorical_pipeline, categorical_columns)
])


In [39]:
train_set_clean = full_pipeline.fit_transform(train_set)

In [40]:
train_set_clean

array([[ 0.71152955, -0.87328421,  1.69364505, ...,  0.        ,
         0.        ,  1.        ],
       [-1.15617031,  0.78401285,  1.06080258, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.56171406, -0.62515782,  0.26974949, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.14722451,  0.3252131 , -1.54967262, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.29704001,  0.94318828, -1.15414608, ...,  0.        ,
         0.        ,  0.        ],
       [-1.6955061 ,  1.30835543,  1.69364505, ...,  0.        ,
         0.        ,  0.        ]])

In [41]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(train_set_clean, train_labels)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [42]:
some_data = test_set.iloc[:5]
some_labels = test_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)

In [43]:
print("Predictions: ", lin_reg.predict(some_data_prepared))

Predictions:  [289420.46274989 111473.8933198  149867.89156667 143731.43264357
 222571.20379479]


In [44]:
print("Labels: ", some_labels)

Labels:  828      206500.0
13078    118600.0
19867    126600.0
4643     322200.0
11233    197400.0
Name: median_house_value, dtype: float64


In [45]:
def find_rsme(predictor, data, labels):
    from sklearn.metrics import mean_squared_error
    housing_predictions = predictor.predict(data)
    lin_mse = mean_squared_error(housing_predictions, labels)
    lin_rmse = np.sqrt(lin_mse)
    return lin_rmse

In [46]:
lin_rmse = find_rsme(lin_reg, train_set_clean, train_labels)
print(lin_rmse)

68704.9625054588


In [47]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(train_set_clean, train_labels)
lin_rmse = find_rsme(tree_reg, train_set_clean, train_labels)
print(lin_rmse)

0.0


In [48]:
print("Predictions: ", tree_reg.predict(some_data_prepared))
print("Labels: ", list(some_labels))
lin_rmse = find_rsme(tree_reg, some_data_prepared, some_labels)
print(lin_rmse)

Predictions:  [231200. 153600.  83700. 250000. 143500.]
Labels:  [206500.0, 118600.0, 126600.0, 322200.0, 197400.0]
48566.55227623225


In [49]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(lin_reg, train_set_clean, train_labels,
                        scoring="neg_mean_squared_error", cv=10)

In [50]:
print(np.sqrt(-scores))
print("Linear regression mean scores: ", np.mean(np.sqrt(-scores)))

[68050.63913915 70567.86776266 67906.44042729 69299.31025499
 67472.24668807 72415.821424   70060.93052214 69446.00373467
 70193.50941046 66481.36234746]
Linear regression mean scores:  69189.41317108776


In [59]:
from sklearn.metrics import make_scorer
def my_custom_loss_function(y_true, y_pred):
    import numpy as np
    return np.mean(np.square(y_true - y_pred))

scorer = make_scorer(my_custom_loss_function, greater_is_better=False)
scores = cross_val_score(tree_reg, train_set_clean, train_labels,
                        scoring = scorer, cv=10)
print(scores)
#print(np.sqrt(-scores))
print("Tree regression mean scores: ", np.mean(np.sqrt(-scores)))

[4.99900262e+09 4.63144014e+09 4.92578541e+09 5.01974711e+09
 4.98625053e+09 4.86154239e+09 5.64118786e+09 5.22914635e+09
 5.26283729e+09 5.02654535e+09]
Tree regression mean scores:  nan


  # This is added back by InteractiveShellApp.init_path()


In [33]:
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(train_set_clean, train_labels)
forest_rmse = find_rsme(forest_reg, train_set_clean, train_labels)
print(forest_rmse)

scores = cross_val_score(forest_reg, train_set_clean, train_labels,
                         cv=10)
print(np.sqrt(-scores))
print("Forest mean scores: ", np.mean(np.sqrt(-scores)))

NameError: name 'train_set_clean' is not defined

In [38]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2,4,6,8]},
    {'bootstrap' : [False], 'n_estimators': [3, 10], 'max_features': [2,3,4]}
]

forest_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, param_grid, cv=10,
                          scoring="neg_mean_squared_error",
                          return_train_score=True)
grid_search.fit(train_set_clean, train_labels)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]}, {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=0)

In [43]:
for mean_score, params in zip(grid_search.cv_results_["mean_test_score"],
                              grid_search.cv_results_["params"]):
    print(np.sqrt(-mean_score), params)

63790.31379493411 {'max_features': 2, 'n_estimators': 3}
55232.63246343651 {'max_features': 2, 'n_estimators': 10}
52769.8045596997 {'max_features': 2, 'n_estimators': 30}
59688.81847336156 {'max_features': 4, 'n_estimators': 3}
52149.611670542836 {'max_features': 4, 'n_estimators': 10}
49846.49281123763 {'max_features': 4, 'n_estimators': 30}
58505.40311658753 {'max_features': 6, 'n_estimators': 3}
51347.9212480432 {'max_features': 6, 'n_estimators': 10}
49502.737073341734 {'max_features': 6, 'n_estimators': 30}
59297.911883988134 {'max_features': 8, 'n_estimators': 3}
51430.55057414578 {'max_features': 8, 'n_estimators': 10}
49614.9369571584 {'max_features': 8, 'n_estimators': 30}
62262.00778156415 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
54191.71444470365 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
59410.92272825865 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
52221.766865870195 {'bootstrap': False, 'max_features': 3, 'n_estimators': 1

In [50]:
feature_importance = grid_search.best_estimator_.feature_importances_

In [56]:
extra_attribs = ["rooms_per_hhold", "pop_per_hhoud", "bedrooms_per_room"]
cat_encoder = full_pipeline.named_transformers_["cat"]

In [67]:
cat_one_hot_attribs = full_pipeline.named_transformers_["cat"].\
named_steps["one_hot_encoder"].categories_[0]

In [73]:
attributes = list(numerical_columns) + extra_attribs + list(cat_one_hot_attribs)

In [76]:
sorted(zip(feature_importance, attributes), reverse=True)

[(0.3964761654463855, 'median_income'),
 (0.12988529955085695, 'INLAND'),
 (0.11152376637241045, 'bedrooms_per_room'),
 (0.07545017893054222, 'latitude'),
 (0.0711430967921782, 'longitude'),
 (0.05399074599897618, 'rooms_per_hhold'),
 (0.04573276826182368, 'housing_median_age'),
 (0.024935537778667374, 'pop_per_hhoud'),
 (0.01775556633091478, 'total_rooms'),
 (0.01770115960841769, 'population'),
 (0.016549947188901196, 'total_bedrooms'),
 (0.015588261042248482, 'households'),
 (0.011754379361705132, '<1H OCEAN'),
 (0.0073891717443193665, 'NEAR OCEAN'),
 (0.004069016410721156, 'NEAR BAY'),
 (5.4939180931668225e-05, 'ISLAND')]