In [1]:
import pandas as pd
import os
import numpy as np
import math

# Build the dataset directory from separate components: the previous literal
# ".\datasets" embedded a backslash ("\d" is an invalid escape sequence) and
# only resolved to the intended folder on Windows.
HOUSING_PATH = os.path.join(".", "datasets", "housing")

In [2]:
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame.

    Parameters
    ----------
    housing_path : str
        Directory that contains ``housing.csv``; defaults to ``HOUSING_PATH``.
    """
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

In [3]:
# Load the housing CSV from the default directory and preview the first rows.
housing = load_housing_data()
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [4]:
# Copy the target column so the labels are held separately from `housing`.
housing_labels = housing["median_house_value"].copy()

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state pins the shuffle.
# `housing` is passed whole, so X_train / X_test still contain the
# median_house_value column at this point.
X_train, X_test, y_train, y_test = train_test_split(housing, housing_labels, test_size=0.2, random_state=0)

In [6]:
# Dropping columns here would make the training set's dimensions mismatch the test set's.
# X_train = X_train.drop(["ocean_proximity", "total_bedrooms"], axis=1)

In [7]:
# Numeric-only view of the training features: remove the categorical column
# and the target column in a single drop.
X_train_num = X_train.drop(columns=['ocean_proximity', 'median_house_value'])

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin

In [9]:
# Names of the attributes added to the data (four names, i.e. the case where
# bedrooms_per_room is appended as well).
extra_attribs = [
    "rooms_per_hhold",
    "pop_per_hhold",
    "bedrooms_per_room",
    "longla",
]

# Column indices of the attributes used to derive the extra ones.
longtitude_ix = 0
latitude_ix = 1
rooms_ix = 3
bedrooms_ix = 4
population_ix = 5
households_ix = 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append derived ratio columns (and a longitude+latitude sum) to a matrix.

    Source columns are located by position through the module-level ``*_ix``
    constants, so the input must keep that column order.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default=True
        When True, ``bedrooms / rooms`` is appended as well.

    Notes
    -----
    ``longla`` is the plain sum of the longitude and latitude columns, i.e. an
    exact linear combination of two columns that remain in the output.
    """

    def __init__(self, add_bedrooms_per_room = True): # no *args or **kwargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the derived columns appended on the right.

        Appended order: rooms per household, population per household,
        bedrooms per room (only if enabled), longitude + latitude.
        """
        # Positional slicing (X[:, i]) is not valid on a DataFrame, so coerce
        # the input first; this is a no-op when X is already an ndarray.
        X = np.asarray(X)
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        longla = X[:, longtitude_ix] + X[:, latitude_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room, longla]
        else:
            return np.c_[X, rooms_per_household, population_per_household, longla]

# Apply the adder once to the raw frame's values; no fit is needed because
# fit() only returns self.
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=True)
housing_extra_attribs = attr_adder.transform(housing.values)

In [10]:
def get_categorical_names_from_ColumnTransformer(column_transformer):
    """Collect the output feature names produced by the categorical transformers.

    Parameters
    ----------
    column_transformer : fitted ColumnTransformer
        Only its ``transformers_`` attribute is read: a sequence of
        ``(name, transformer, columns)`` entries.

    Returns
    -------
    list of str
        Feature names in the order of the transformers. Transformers that
        expose no feature names are skipped. The exact name format is whatever
        the transformer reports, so it may differ between scikit-learn versions.
    """
    entries = list(column_transformer.transformers_)
    # The 'remainder' entry is only present when some columns were left over,
    # so strip it by name rather than blindly dropping the last entry (which
    # would discard a real transformer when there is no remainder).
    if entries and entries[-1][0] == "remainder":
        entries = entries[:-1]

    col_name = []
    for entry in entries:
        transformer = entry[1]
        # For a pipeline-like object, the last step determines the output names.
        steps = getattr(transformer, "steps", None)
        if steps:
            transformer = steps[-1][1]

        if hasattr(transformer, "get_feature_names"):
            # Legacy API (removed in scikit-learn 1.2); kept first so older
            # environments produce exactly the names they did before.
            names = transformer.get_feature_names()
        elif hasattr(transformer, "categories_") and hasattr(transformer, "get_feature_names_out"):
            # Current API. Scalers and similar transformers also implement
            # get_feature_names_out, so only fitted encoders (those exposing
            # `categories_`) are treated as categorical.
            names = transformer.get_feature_names_out()
        else:
            # No usable name source ('drop', 'passthrough', scalers, ...).
            continue

        if isinstance(names, np.ndarray):
            col_name += names.tolist()
        elif isinstance(names, list):
            col_name += names
        elif isinstance(names, str):
            col_name.append(names)
    return col_name

In [11]:
def print_coef_info(coef, attribs):
    """Print one ``<attribute>:  <coefficient>`` line per entry of ``coef``.

    ``attribs[i]`` supplies the label for ``coef[i]``.
    """
    for position, weight in enumerate(coef):
        label = attribs[position]
        print(label + ": ", weight)

In [12]:
# Numeric transformer: fill missing values with the median, append the
# derived attributes, then standardize.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

# Categorical transformer: one-hot encode.
cat_pipeline = Pipeline(steps=[('onehot', OneHotEncoder())])

In [13]:
# Numeric column names come from the numeric-only training frame; there is a
# single categorical column.
num_attribs = X_train_num.columns.tolist()
cat_attribs = ["ocean_proximity"]

# Full preprocessing: each sub-pipeline is applied to its own column subset.
full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", cat_pipeline, cat_attribs),
])

# Fit on the training split only, then reuse the fitted transformer on the
# test split.
X_train_prepared = full_pipeline.fit_transform(X_train)
X_test_prepared = full_pipeline.transform(X_test)
display(full_pipeline)

ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('attribs_adder',
                                                  CombinedAttributesAdder()),
                                                 ('std_scaler',
                                                  StandardScaler())]),
                                 ['longitude', 'latitude', 'housing_median_age',
                                  'total_rooms', 'total_bedrooms', 'population',
                                  'households', 'median_income']),
                                ('cat',
                                 Pipeline(steps=[('onehot', OneHotEncoder())]),
                                 ['ocean_proximity'])])

In [14]:
# Column labels for the prepared arrays: numeric inputs, then the derived
# attributes, then the one-hot category names.
# extra_attribs holds four names, which matches CombinedAttributesAdder only
# when add_bedrooms_per_room=True.
all_attribs = num_attribs + extra_attribs + get_categorical_names_from_ColumnTransformer(full_pipeline)
# Wrap the prepared arrays in labelled DataFrames.
X_train_prepared = pd.DataFrame(X_train_prepared, columns=all_attribs)
X_test_prepared = pd.DataFrame(X_test_prepared, columns=all_attribs)
display(X_train_prepared.head(5))
#display(X_test_prepared.info())

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_hhold,pop_per_hhold,bedrooms_per_room,longla,x0_<1H OCEAN,x0_INLAND,x0_ISLAND,x0_NEAR BAY,x0_NEAR OCEAN
0,1.003899,-0.840062,-1.795076,-0.977736,-1.095689,-1.047601,-1.13565,0.190012,0.907714,0.074082,-0.974585,0.268454,0.0,1.0,0.0,0.0,0.0
1,-1.434772,0.985364,1.855539,-0.118501,-0.106198,0.052109,-0.136882,0.269311,-0.042002,0.045276,-0.104142,-0.949412,0.0,0.0,0.0,1.0,0.0
2,0.779481,-0.840062,-0.207852,-0.42168,-0.360706,-0.352955,-0.343433,0.029895,-0.302824,-0.039332,0.171007,-0.285122,1.0,0.0,0.0,0.0,0.0
3,0.649818,-0.755812,0.744482,-0.888566,-0.781716,-0.591794,-0.766994,-1.26447,-0.812064,0.09399,1.342136,-0.383535,1.0,0.0,0.0,0.0,0.0
4,0.599947,-0.723048,1.855539,-0.122159,0.153068,-0.254095,0.200399,-0.367016,-0.520269,-0.164572,0.630611,-0.42044,1.0,0.0,0.0,0.0,0.0


In [15]:
# training
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge

#lin_reg = Ridge(alpha=10000.0) # Applying regularization => feature weights shrink => underfit
# NOTE(review): `longla` is longitude + latitude, and OneHotEncoder() is used
# with its defaults, so the prepared matrix appears to contain exactly
# collinear columns. That would leave the individual coefficients of those
# columns not uniquely determined — TODO confirm, and consider removing
# `longla`, dropping one category, or regularizing.
lin_reg = LinearRegression()
lin_reg.fit(X_train_prepared, y_train)
# Show floats in fixed-point notation instead of scientific notation.
pd.set_option('float_format', '{:f}'.format)
# One-row frame of the fitted coefficients, labelled by attribute name.
coef_matrix = pd.DataFrame([list(lin_reg.coef_.transpose())], columns=all_attribs)
# With a single row, max() just returns every coefficient as a Series.
coef_matrix.max()
#print_coef_info(list(lin_reg.coef_), all_attribs)

longitude            210945690592794112.000000
latitude             224758073042017472.000000
housing_median_age                13471.428946
total_rooms                        -516.154386
total_bedrooms                     6641.455600
population                       -46583.298070
households                        45228.631397
median_income                     75079.656864
rooms_per_hhold                    6368.796622
pop_per_hhold                      1893.230401
bedrooms_per_room                  9543.612639
longla               -85516622507146480.000000
x0_<1H OCEAN           1015054558507416.125000
x0_INLAND              1015054558471415.375000
x0_ISLAND              1015054558664809.125000
x0_NEAR BAY            1015054558502910.625000
x0_NEAR OCEAN          1015054558512444.000000
dtype: float64

In [16]:
# evaluate
from sklearn.metrics import mean_squared_error
# Predict on the held-out test split, then report the MSE and its square root (RMSE).
housing_predictions = lin_reg.predict(X_test_prepared)
lin_mse = mean_squared_error(y_test, housing_predictions)
display(lin_mse)
lin_rmse = np.sqrt(lin_mse)
display(lin_rmse)

4691014725.797534

68490.98280647997

In [17]:
# Compare the predictions for the first ten test rows with their true labels.
print("Predictions:", lin_reg.predict(X_test_prepared[:10]))
print("Labels:", list(y_test[:10]))

Predictions: [213057. 285025. 177849.  89913. 294345. 203345. 277833. 312885. 311033.
 238657.]
Labels: [136900.0, 241300.0, 200700.0, 72500.0, 460000.0, 120000.0, 247000.0, 336900.0, 339700.0, 265600.0]
