In [16]:
from sklearn.model_selection import StratifiedShuffleSplit
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from config import HOUSING_PATH


In [17]:
def load_housing_data(housing_path = HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

In [18]:
housing = load_housing_data()

In [19]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [20]:
housing['income_cat'] = pd.cut(housing['median_income'],
                              bins=[0., 1.5, 3.0, 4.5, 6.0, np.inf],
                              labels = [1,2,3,4,5])

In [21]:
split = StratifiedShuffleSplit(n_splits=1, \
                test_size=0.2, random_state=42)

for train_index, test_index in split.split(housing, housing['income_cat']):
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]
    
for set_ in (strat_train_set, strat_test_set):
    set_.drop('income_cat', axis=1, inplace=True)
    
train_set = strat_train_set.drop("median_house_value", axis=1)
train_labels = strat_train_set["median_house_value"].copy()

test_set = strat_test_set.drop("median_house_value", axis=1)
test_labels = strat_test_set["median_house_value"].copy()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [22]:
train_set

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
17606,-121.89,37.29,38.0,1568.0,351.0,710.0,339.0,2.7042,<1H OCEAN
18632,-121.93,37.05,14.0,679.0,108.0,306.0,113.0,6.4214,<1H OCEAN
14650,-117.20,32.77,31.0,1952.0,471.0,936.0,462.0,2.8621,NEAR OCEAN
3230,-119.61,36.31,25.0,1847.0,371.0,1460.0,353.0,1.8839,INLAND
3555,-118.59,34.23,17.0,6592.0,1525.0,4459.0,1463.0,3.0347,<1H OCEAN
...,...,...,...,...,...,...,...,...,...
6563,-118.13,34.20,46.0,1271.0,236.0,573.0,210.0,4.9312,INLAND
12053,-117.56,33.88,40.0,1196.0,294.0,1052.0,258.0,2.0682,INLAND
13908,-116.40,34.09,9.0,4855.0,872.0,2098.0,765.0,3.2723,INLAND
11159,-118.01,33.82,31.0,1960.0,380.0,1356.0,356.0,4.0625,<1H OCEAN


In [23]:
class CombinedAtrributesAdder(TransformerMixin):
    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
        self.households_idx = 6
        self.total_rooms_idx = 3
        self.total_bedrooms_idx = 4
        self.population_idx = 5
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        rooms_per_household = X[:, self.total_rooms_idx]/ \
                            X[:, self.households_idx]
        bedrooms_per_household = X[:, self.total_bedrooms_idx]/ \
                            X[:, self.households_idx]
        population_per_household = X[:, self.population_idx]/ \
                            X[:, self.households_idx]
        
        if self.add_bedrooms_per_room:
            return np.column_stack((X, rooms_per_household,
                                  bedrooms_per_household,
                                  population_per_household))
        else:
            return np.vstack[X, rooms_per_household,
                        bedrooms_per_household, population_per_household]

In [24]:
numerical_columns = train_set.drop('ocean_proximity', axis=1).columns.values
categorical_columns = ['ocean_proximity']

In [25]:
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('attribs_adder', CombinedAtrributesAdder()),
    ('std_scaler', StandardScaler())
])

categorical_pipeline = Pipeline([
    ('one_hot_encoder' , OneHotEncoder())
])

full_pipeline = ColumnTransformer([
    ("num", numerical_pipeline, numerical_columns),
    ("cat", categorical_pipeline, categorical_columns)
])


In [26]:
train_set_clean = full_pipeline.fit_transform(train_set)

In [27]:
train_set_clean

array([[-1.15604281,  0.77194962,  0.74333089, ...,  0.        ,
         0.        ,  0.        ],
       [-1.17602483,  0.6596948 , -1.1653172 , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.18684903, -1.34218285,  0.18664186, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 1.58648943, -0.72478134, -1.56295222, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.78221312, -0.85106801,  0.18664186, ...,  0.        ,
         0.        ,  0.        ],
       [-1.43579109,  0.99645926,  1.85670895, ...,  0.        ,
         1.        ,  0.        ]])

In [28]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(train_set_clean, train_labels)

LinearRegression()

In [29]:
some_data = test_set.iloc[:5]
some_labels = test_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)

In [31]:
print("Predictions: ", lin_reg.predict(some_data_prepared))

Predictions:  [418774.46027457 266185.90517656 231087.58330965 205250.99643123
 166505.44125237]


In [32]:
print("Labels: ", some_labels)

Labels:  5241     500001.0
10970    240300.0
20351    218200.0
6568     182100.0
13285    121300.0
Name: median_house_value, dtype: float64


In [48]:
def find_rsme(predictor, data, labels):
    from sklearn.metrics import mean_squared_error
    housing_predictions = predictor.predict(data)
    lin_mse = mean_squared_error(housing_predictions, labels)
    lin_rmse = np.sqrt(lin_mse)
    return lin_rmse

In [50]:
lin_rmse = find_rsme(lin_reg, train_set_clean, train_labels)
print(lin_rmse)

68911.49637588045


In [52]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(train_set_clean, train_labels)
lin_rmse = find_rsme(tree_reg, train_set_clean, train_labels)
print(lin_rmse)

0.0


In [55]:
print("Predictions: ", tree_reg.predict(some_data_prepared))
print("Labels: ", list(some_labels))
lin_rmse = find_rsme(tree_reg, some_data_prepared, some_labels)
print(lin_rmse)

Predictions:  [500001. 244800. 262200. 184200. 111700.]
Labels:  [500001.0, 240300.0, 218200.0, 182100.0, 121300.0]
20262.37893239587
