### Checking out the data for obvious outliers, special features, feature scales etc.

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Feature Extraction and Preprocessing

In [340]:
# numerical features pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
import warnings

warnings.filterwarnings("ignore")

# helper function for preprocessing pipelines below
def numerical_value_for_rating(rating):
    if rating == "Ex":
        num_value = 5
    elif rating == "Gd":
        num_value = 4
    elif rating == "TA":
        num_value = 3
    elif rating == "FA":
        num_value = 2
    elif rating == "Po":
        num_value = 1
    else:
        num_value = 0
    return num_value

# define preprocessing pipeline steps
class DataFrameSelector(BaseEstimator, TransformerMixin):
    
    def __init__(self, selection):
        self.selection = selection
            
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return X[self.selection]
    
class AddNumAttr(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        # generate the special combi features
        X["GarageCarsPerSF"] = X["GarageCars"] / X["GarageArea"]
        
        X["TotRoomsPerTotSF"] = X["TotRmsAbvGrd"] / X["GrLivArea"]
        
        X["TotBsmtBath"] = X["BsmtFullBath"] + 0.5*X["BsmtHalfBath"]
        X["TotBath"] = X["FullBath"] + 0.5*X["HalfBath"]
        
        X["TotBsmtBathPerSF"] = X["TotBsmtBath"] / X["TotalBsmtSF"]
        X["TotBathPerSF"] = X["TotBath"] / X["GrLivArea"]
        
        # turn the originally cat classes into quality integer features
        quality_classes = [
            "ExterQual",
            "ExterCond",
            "BsmtCond",
            "HeatingQC",
            "GarageQual",
            "GarageCond"
        ]
        
        for col in quality_classes:
            X[col] = [numerical_value_for_rating(item) for item in X[col]]
            
        bath_attrs = [
            "FullBath",
            "HalfBath",
            "BsmtFullBath",
            "BsmtHalfBath"
        ]
        
        return X.drop(columns=bath_attrs)
    
class AddBinaryAttr(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        
        binary_feature_cols = [
            "Fireplaces",
            "WoodDeckSF",
            'OpenPorchSF', 
            'EnclosedPorch', 
            '3SsnPorch', 
            'ScreenPorch', 
            'PoolArea'
        ]
        
        for col in binary_feature_cols:
            X[col] = [bool(item) for item in X[col]]

        return X.astype("category")

In [341]:
# choose numerical attrs
chosen_num_attrs = ['LotFrontage', 
                    'LotArea', 
                    'OverallQual',
                    'OverallCond', 
                    'YearBuilt', 
                    'YearRemodAdd', 
                    'MasVnrArea', 
                    'BsmtUnfSF', 
                    'TotalBsmtSF', 
                    '1stFlrSF', 
                    '2ndFlrSF', 
                    'GrLivArea', 
                    'BsmtFullBath', 
                    'BsmtHalfBath', 
                    'FullBath',
                    'HalfBath', 
                    'BedroomAbvGr', 
                    'KitchenAbvGr', 
                    'TotRmsAbvGrd',
                    'GarageYrBlt', 
                    'GarageCars', 
                    'GarageArea', 
                    'WoodDeckSF',
                    'OpenPorchSF', 
                    'EnclosedPorch', 
                    '3SsnPorch', 
                    'ScreenPorch', 
                    'PoolArea',
                    "ExterQual",
                    "ExterCond",
                    "BsmtCond",
                    "HeatingQC",
                    "GarageQual",
                    "GarageCond"
               ]

# choose categorical attrs
chosen_cat_attrs = [
    "Neighborhood",
    "Heating",
    "Foundation",
    "CentralAir",
    "Fireplaces",
    "WoodDeckSF",
    'OpenPorchSF', 
    'EnclosedPorch', 
    '3SsnPorch', 
    'ScreenPorch', 
    'PoolArea'
]

categories = [
    list(set(data["Neighborhood"])),
    list(set(data["Heating"]))
]

num_pipeline = Pipeline([("Selector", DataFrameSelector(chosen_num_attrs)),
                         ("NumAttrsAdder", AddNumAttr()),
                         ("Imputer", SimpleImputer(strategy="most_frequent")),
                         ("Scaler", StandardScaler())])

cat_pipeline = Pipeline([("cat_Selector", DataFrameSelector(chosen_cat_attrs)),
                         ("BinaryAttrsAdder", AddBinaryAttr()),
                         ("One-Hot-Enc", OneHotEncoder())])

full_pipe = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

data = pd.read_csv("../data/train.csv")
data_test = pd.read_csv("../data/test.csv")

y_train = data["SalePrice"].copy()
X_train = full_pipe.fit_transform(data.copy())
X_test = full_pipe.transform(data_test.copy())

### First Ridge Regression Baseline

In [365]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.metrics import make_scorer

# define the cross val scoring function
def log_sqrd_err_loss(y_true, y_pred):
    return np.mean((np.log(1+y_true) - np.log(1+np.abs(y_pred)))**2)
score = make_scorer(log_sqrd_err_loss, greater_is_better=False)

param_grid = [
    {"alpha": [0, 0.001, 0.01, 0.1, 1, 10, 100, 1000]}
]

ridge = Ridge()
grid_search_ridge = GridSearchCV(ridge, 
                                 param_grid, 
                                 cv=5, 
                                 scoring=score,
                                 refit=True)

grid_search_ridge.fit(X_train, y_train)
cvres = grid_search_ridge.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

print("predicting test set data with best model: ", grid_search_ridge.best_params_)
y_test = grid_search_ridge.best_estimator_.predict(X_test)

0.1892567574360122 {'alpha': 0}
0.1902031844728722 {'alpha': 0.001}
0.191736504412709 {'alpha': 0.01}
0.19043924411735974 {'alpha': 0.1}
0.18830208963369802 {'alpha': 1}
0.18601944477351248 {'alpha': 10}
0.21369052381258719 {'alpha': 100}
0.1762787784169623 {'alpha': 1000}
predicting test set data with best model:  {'alpha': 1000}


In [369]:
submit = pd.DataFrame(y_test, columns=["SalePrice"]).set_index(np.arange(1461, 1461+len(y_test)))#.to_csv("data/submission.csv")
submit["Id"] = submit.index
submit = submit[["Id", "SalePrice"]]
print(submit.head())
submit.to_csv("../data/submission.csv", index=False)

        Id      SalePrice
1461  1461  125960.911526
1462  1462  155000.283947
1463  1463  180268.355019
1464  1464  184103.975172
1465  1465  188598.033864


### Boosted Regression Trees

In [368]:
from sklearn.ensemble import GradientBoostingRegressor

gbrt = GradientBoostingRegressor()

param_grid = [
    {"n_estimators": [50, 100, 150, 200], "max_leaf_nodes": [10, 20, 50, None]}
]

grid_search_gbrt = GridSearchCV(gbrt,
                                param_grid,
                                scoring=score,
                                refit=True)

grid_search_gbrt.fit(X_train, y_train)
cvres = grid_search_gbrt.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

print("predicting test set data with best model: ", grid_search_gbrt.best_params_)
y_test = grid_search_gbrt.best_estimator_.predict(X_test)

0.14361092052053093 {'max_leaf_nodes': 10, 'n_estimators': 50}
0.13571420974073564 {'max_leaf_nodes': 10, 'n_estimators': 100}
0.13415578551509144 {'max_leaf_nodes': 10, 'n_estimators': 150}
0.13410602844232644 {'max_leaf_nodes': 10, 'n_estimators': 200}
0.1432852037591588 {'max_leaf_nodes': 20, 'n_estimators': 50}
0.13531125620654919 {'max_leaf_nodes': 20, 'n_estimators': 100}
0.1345923398193988 {'max_leaf_nodes': 20, 'n_estimators': 150}
0.13373940495953324 {'max_leaf_nodes': 20, 'n_estimators': 200}
0.14362840910922114 {'max_leaf_nodes': 50, 'n_estimators': 50}
0.1352823320410758 {'max_leaf_nodes': 50, 'n_estimators': 100}
0.13466354403382225 {'max_leaf_nodes': 50, 'n_estimators': 150}
0.13366728691992827 {'max_leaf_nodes': 50, 'n_estimators': 200}
0.14336389253768728 {'max_leaf_nodes': None, 'n_estimators': 50}
0.13515732605965997 {'max_leaf_nodes': None, 'n_estimators': 100}
0.13411810221658574 {'max_leaf_nodes': None, 'n_estimators': 150}
0.13378494343939368 {'max_leaf_nodes': No