In [1]:
import pandas as pd
import os
import numpy as np
import math

# Relative location of the housing dataset directory.
# Built from separate components so that no backslash escape ("\d") appears inside a
# string literal and the platform's own separator is used; on Windows this yields the
# same ".\datasets\housing" string as before.
HOUSING_PATH = os.path.join(".", "datasets", "housing")

In [2]:
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` located in ``housing_path`` and return it as a DataFrame.

    Parameters
    ----------
    housing_path : str
        Directory that contains ``housing.csv``; defaults to ``HOUSING_PATH``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.
    """
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

In [3]:
# Load the housing CSV from the default HOUSING_PATH directory and preview it;
# the trailing .head() expression is what the cell displays.
housing = load_housing_data()
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [4]:
# Independent copy of the target column, used as the regression labels.
housing_labels = housing["median_house_value"].copy()

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state=0 fixes the shuffle.
# NOTE: `housing` still contains "median_house_value" at this point, so X_train/X_test
# carry the target column; it is left out of the feature lists built further down.
X_train, X_test, y_train, y_test = train_test_split(housing, housing_labels, test_size=0.2, random_state=0)

In [6]:
# Dropping columns from X_train alone would leave train and test with mismatched dimensions
# X_train = X_train.drop(["ocean_proximity", "total_bedrooms"], axis=1)

In [7]:
# Numeric-only view of the training features: leave out the categorical column
# and the target column in a single drop.
X_train_num = X_train.drop(columns=['ocean_proximity', 'median_house_value'])

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin

In [9]:
# Add attributes to the data.
# Names of the engineered columns, listed in the same order in which
# CombinedAttributesAdder.transform appends them when add_bedrooms_per_room is True.
extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room", "longla", 'total_income', 'pop_per_room', 'pop_per_bedroom', 'bedrooms_per_household']
longitude_ix, latitude_ix, rooms_ix, bedrooms_ix, population_ix, households_ix, income_ix = 0, 1, 3, 4, 5, 6, 7 # column indices of the attributes

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends ratio/product features to a 2-D array.

    Input columns are addressed positionally through the module-level ``*_ix``
    indices, so ``X`` must support ``X[:, i]`` indexing.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default True
        When True, a bedrooms/rooms ratio is inserted as the third appended column.
    """

    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the engineered columns appended on the right."""
        households = X[:, households_ix]
        rooms = X[:, rooms_ix]
        bedrooms = X[:, bedrooms_ix]
        population = X[:, population_ix]

        # Appended columns, in output order (bedrooms/rooms is slotted in below).
        derived = [
            rooms / households,                      # rooms per household
            population / households,                 # population per household
            X[:, longitude_ix] * X[:, latitude_ix],  # longitude * latitude
            households * X[:, income_ix],            # households * income
            population / rooms,                      # population per room
            population / bedrooms,                   # population per bedroom
            bedrooms / households,                   # bedrooms per household
        ]
        if self.add_bedrooms_per_room:
            # Keep the optional ratio in third position among the new columns.
            derived.insert(2, bedrooms / rooms)
        return np.c_[(X, *derived)]

In [10]:
def _categorical_feature_names(transformer):
    """Return the feature names exposed by ``transformer``, or None if it has none.

    The legacy ``get_feature_names()`` is tried first. When it is missing
    (it was removed from scikit-learn in 1.2), ``get_feature_names_out()`` is used,
    but only for fitted categorical encoders, recognised by their ``categories_``
    attribute, so that other transformers keep being skipped as before.
    """
    try:
        return transformer.get_feature_names()
    except AttributeError:
        pass
    if hasattr(transformer, "categories_"):
        try:
            return transformer.get_feature_names_out()
        except AttributeError:
            pass
    return None


def get_categorical_names_from_ColumnTransformer(column_transformer):
    """Collect the generated feature names of a fitted ColumnTransformer's encoders.

    Parameters
    ----------
    column_transformer : fitted ColumnTransformer-like object
        Must expose ``transformers_`` as a sequence of
        ``(name, transformer, columns)`` tuples.

    Returns
    -------
    list
        Feature names gathered, in order, from every transformer (or last pipeline
        step) that can report names. Transformers without names are skipped.

    Notes
    -----
    With ``get_feature_names_out()`` the names may be prefixed with the input
    column name rather than ``x0`` -- confirm against the installed scikit-learn.
    """
    col_name = []
    for entry in column_transformer.transformers_:
        # The 'remainder' entry is only present when some columns are left over, so it
        # is identified by name instead of assuming it is always the last element.
        if entry[0] == "remainder":
            continue
        transformer = entry[1]
        # For a pipeline, the final step decides the output names.
        if hasattr(transformer, "steps"):
            transformer = transformer.steps[-1][1]
        names = _categorical_feature_names(transformer)
        if names is None:
            continue
        if isinstance(names, np.ndarray):
            col_name += names.tolist()
        elif isinstance(names, list):
            col_name += names
        elif isinstance(names, str):
            col_name.append(names)
    return col_name

In [11]:
def print_coef_info(coef, attribs):
    """Print each coefficient next to its attribute name, one pair per line.

    ``attribs[i]`` labels ``coef[i]``; ``attribs`` must hold at least as many
    string entries as ``coef`` has values.
    """
    for position, value in enumerate(coef):
        print(attribs[position] + ": ", value)

In [12]:
def remove_attribs(X_train_prepared, X_test_prepared, all_attribs, attrib_to_drop):
    """Drop one column from both prepared frames and from the attribute-name list.

    Parameters
    ----------
    X_train_prepared, X_test_prepared : pandas.DataFrame
        Frames that both contain the column ``attrib_to_drop``; they are not modified.
    all_attribs : list
        Column-name list; ``attrib_to_drop`` is removed from it in place.
    attrib_to_drop : str
        Name of the column to remove.

    Returns
    -------
    tuple
        ``(train_without_column, test_without_column, all_attribs)`` where the last
        item is the same list object that was passed in.
    """
    trimmed_train, trimmed_test = (
        frame.drop(columns=[attrib_to_drop])
        for frame in (X_train_prepared, X_test_prepared)
    )
    all_attribs.remove(attrib_to_drop)
    return trimmed_train, trimmed_test, all_attribs

In [13]:
# Numeric transformer: impute missing values with the "most_frequent" strategy,
# append the engineered attributes, then standardize every column.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="most_frequent")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

# Categorical transformer: one-hot encode; handle_unknown='ignore' is set so that
# categories not seen during fit do not raise at transform time.
cat_pipeline = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [14]:
# Column names routed to each branch of the ColumnTransformer.
num_attribs = X_train_num.columns.tolist()
cat_attribs = ["ocean_proximity"]

# full_pipeline: a ColumnTransformer that sends the numeric columns through
# num_pipeline and the categorical column through cat_pipeline.
full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", cat_pipeline, cat_attribs),
])

# Fit on the training split only, then apply the fitted transformer to the test split.
X_train_prepared = full_pipeline.fit_transform(X_train)
X_test_prepared = full_pipeline.transform(X_test)

In [15]:
# Column labels for the prepared matrices: the original numeric columns, the engineered
# attributes appended by CombinedAttributesAdder, then the encoder's generated names.
all_attribs = num_attribs + extra_attribs + get_categorical_names_from_ColumnTransformer(full_pipeline)
X_train_prepared = pd.DataFrame(X_train_prepared, columns=all_attribs)
X_test_prepared = pd.DataFrame(X_test_prepared, columns=all_attribs)
# DataFrame.info() prints its summary itself and returns None, so it is called directly;
# wrapping it in display() only added a stray "None" to the cell output.
X_train_prepared.info()
#X_test_prepared.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16512 entries, 0 to 16511
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   longitude               16512 non-null  float64
 1   latitude                16512 non-null  float64
 2   housing_median_age      16512 non-null  float64
 3   total_rooms             16512 non-null  float64
 4   total_bedrooms          16512 non-null  float64
 5   population              16512 non-null  float64
 6   households              16512 non-null  float64
 7   median_income           16512 non-null  float64
 8   rooms_per_hhold         16512 non-null  float64
 9   pop_per_hhold           16512 non-null  float64
 10  bedrooms_per_room       16512 non-null  float64
 11  longla                  16512 non-null  float64
 12  total_income            16512 non-null  float64
 13  pop_per_room            16512 non-null  float64
 14  pop_per_bedroom         16512 non-null

None

In [16]:
# Rebuild the prepared matrices from scratch so this cell can be re-run on its own.
X_train_prepared = full_pipeline.fit_transform(X_train)
X_test_prepared = full_pipeline.transform(X_test)

# Read the generated feature names only after the pipeline has been fitted in this
# cell, instead of relying on a fit performed by an earlier cell.
all_attribs = num_attribs + extra_attribs + get_categorical_names_from_ColumnTransformer(full_pipeline)
X_train_prepared = pd.DataFrame(X_train_prepared, columns=all_attribs)
X_test_prepared = pd.DataFrame(X_test_prepared, columns=all_attribs)

# Remove features.
# Other candidates tried during experimentation (currently kept in the model):
#   'x0_ISLAND', 'total_income', 'bedrooms_per_household', 'population',
#   'longitude', 'latitude'
attribs_to_drop = ['pop_per_room', 'pop_per_bedroom']
for attrib in attribs_to_drop:
    X_train_prepared, X_test_prepared, all_attribs = remove_attribs(X_train_prepared, X_test_prepared, all_attribs, attrib)
X_train_prepared.head(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_hhold,pop_per_hhold,bedrooms_per_room,longla,total_income,bedrooms_per_household,x0_<1H OCEAN,x0_INLAND,x0_ISLAND,x0_NEAR BAY,x0_NEAR OCEAN
0,1.003899,-0.840062,-1.795076,-0.977736,-1.090559,-1.047601,-1.13565,0.190012,0.907714,0.074082,-1.012152,0.884606,-0.880736,0.146489,0.0,1.0,0.0,0.0,0.0
1,-1.434772,0.985364,1.855539,-0.118501,-0.10254,0.052109,-0.136882,0.269311,-0.042002,0.045276,-0.095076,-1.099517,0.008421,0.011935,0.0,0.0,0.0,1.0,0.0
2,0.779481,-0.840062,-0.207852,-0.42168,-0.35667,-0.352955,-0.343433,0.029895,-0.302824,-0.039332,0.194813,0.837626,-0.262947,-0.094978,1.0,0.0,0.0,0.0,0.0
3,0.649818,-0.755812,0.744482,-0.888566,-0.777053,-0.591794,-0.766994,-1.26447,-0.812064,0.09399,1.428684,0.744791,-0.86685,-0.166587,1.0,0.0,0.0,0.0,0.0
4,0.599947,-0.723048,1.855539,-0.122159,0.15634,-0.254095,0.200399,-0.367016,-0.520269,-0.164572,0.67904,0.708728,-0.061466,-0.100459,1.0,0.0,0.0,0.0,0.0


In [17]:
# training
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge

# Alternative that was tried: Ridge(alpha=10000.0). Applying regularization shrinks
# the feature weights => underfit.
#lin_reg = Ridge(alpha=10000.0)
lin_reg = LinearRegression().fit(X_train_prepared, y_train)

# Show floats in plain fixed-point notation rather than scientific notation.
pd.set_option('float_format', '{:f}'.format)

# One-row frame holding the fitted coefficients, labelled by attribute name;
# .max() over that single row collapses it to one value per attribute for display.
coef_matrix = pd.DataFrame(data=[list(lin_reg.coef_.T)], columns=all_attribs)
coef_matrix.max()
#print_coef_info(list(lin_reg.coef_), all_attribs)

longitude                -207917.172992
latitude                  527738.249713
housing_median_age         13220.354491
total_rooms               -13284.505770
total_bedrooms             14905.549602
population                -46144.126293
households                 36496.023302
median_income              68408.339776
rooms_per_hhold            29006.203207
pop_per_hhold               1631.534283
bedrooms_per_room          18860.953287
longla                    726439.753618
total_income               14828.914125
bedrooms_per_household    -21429.346663
x0_<1H OCEAN              -26666.000499
x0_INLAND                 -67856.956728
x0_ISLAND                 137537.156480
x0_NEAR BAY               -28121.250238
x0_NEAR OCEAN             -14892.949015
dtype: float64

In [18]:
# evaluate
from sklearn.metrics import mean_squared_error
# Score the fitted model on the held-out split: MSE first, then its square root (RMSE).
housing_predictions = lin_reg.predict(X_test_prepared)
lin_mse = mean_squared_error(y_test, housing_predictions)
lin_rmse = np.sqrt(lin_mse)

# Show both metrics, MSE followed by RMSE.
display(lin_mse)
display(lin_rmse)

4590289149.04579

67751.67266603674

In [19]:
# Spot check: compare the model's output for the first ten test rows
# with the corresponding true labels.
print("Predictions:", lin_reg.predict(X_test_prepared[:10]))
print("Labels:", list(y_test[:10]))

Predictions: [204144.63288173 285187.65615882 173706.34284378  75480.48639364
 290506.3486786  204512.79301126 278378.7789071  312919.80328645
 318064.01423129 236884.073459  ]
Labels: [136900.0, 241300.0, 200700.0, 72500.0, 460000.0, 120000.0, 247000.0, 336900.0, 339700.0, 265600.0]


In [20]:
#68466.55100466235 - income
#68441.14335551362 - no total income
#68107.22020111061 - long*la