In [1]:
# Set up code checking
# Set up filepaths
import os
os.chdir(os.path.join(os.path.expanduser('~'), 'kaggle'))

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


## Load data into dataframes

In [3]:
train_path = os.path.join('data', 'house-price-data', 'train.csv')
train_data = pd.read_csv(train_path)

test_path = os.path.join('data', 'house-price-data', 'test.csv')
test_data = pd.read_csv(test_path)

## Select numerical and categorical variables, and separate target from features

In [4]:
y = train_data['SalePrice']
train_data.drop(columns=['SalePrice'], inplace=True)

numerical_columns = train_data.select_dtypes(include=np.number).columns.tolist()

numerical_columns.remove('MSSubClass')

categorical_columns = [c for c in train_data.columns if c not in numerical_columns]

print(numerical_columns)
print(categorical_columns)

['Id', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']
['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'Gar

## Add missing data col for every column with missing data

In [5]:
from sklearn.base import BaseEstimator, TransformerMixin

# adds additional column which indicates whether data was missing for given feature
class AddMissingIndicator(BaseEstimator, TransformerMixin):
    def __init__(self, numeric_columns):
        self.numeric_columns = numeric_columns
        self.num_cols_with_na = [] 

    def fit(self, X, y=None):
        self.num_cols_with_na = [c for c in self.numeric_columns if X[c].isna().any()]
        return self
    
    def transform(self, X, y=None):
        for c in self.num_cols_with_na:
            missing_col_name = f"{c}_is_missing"
            X[missing_col_name] = np.where(X[c].isna(), 1, 0)
        
        return X 


cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')), 
    ('OHE', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])

num_pipeline = Pipeline(steps=[
    ('missing_indicator', AddMissingIndicator(numerical_columns)),
    ('imputer', SimpleImputer(strategy='mean')), 
    ('scaler', StandardScaler())
])


preprocessing = ColumnTransformer(transformers=[
    ('numerical', num_pipeline, numerical_columns),
    ('categorical', cat_pipeline, categorical_columns)
])



In [6]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

X_train, X_test, y_train, y_test = train_test_split(train_data, y, test_size=0.2)

## Test different regression models

In [9]:
from sklearn.linear_model import Lasso, Ridge
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor

models = {
    'ridge': Ridge(),
    'lasso': Lasso(random_state=1),
    'xgboost': XGBRegressor(random_state=1),
    'GBR': GradientBoostingRegressor(loss='absolute_error', random_state=1),
}

pipelines = {key: Pipeline(steps=[
    ('preprocessing', preprocessing),
    (f'{key}', models[key])
]) for key in models}

def get_mae(predictor, X_train, y_train, X_test, y_test):
    predictor.fit(X_train, y_train)
    predictions = predictor.predict(X_test)
    return mean_absolute_error(predictions, y_test)


for p in pipelines:
    predictor = pipelines[p]
    mae = get_mae(predictor, X_train, y_train, X_test, y_test)
    print(f"Model: {p}")
    print(f'Mae: {mae}')
    print(f'Mae, relative: {mae/np.mean(y_test)}')
    print()


Model: ridge
Mae: 17672.094558294415
Mae, relative: 0.09647904932501368



  model = cd_fast.enet_coordinate_descent(


Model: lasso
Mae: 17010.99247351808
Mae, relative: 0.09286982799385748

Model: xgboost
Mae: 17272.957098137842
Mae, relative: 0.09429999790703489

Model: GBR
Mae: 16344.064748342736
Mae, relative: 0.08922880215613578



## GridSearchCV parameters for models with small number of params

In [29]:
param_grid_small_model = {}

param_grid_small_model['ridge'] = {
    'ridge__alpha': np.logspace(1.5, 1.8, num=11),
    'ridge__tol': np.logspace(-6, -5, num=11)
}

# param_grid_small_model['lasso'] = {
#     'lasso__alpha': np.logspace(-1, 1, num=11),
#     'lasso__max_iter': [1000],
#     'lasso__tol': np.logspace(-4, -3.5, num=2)
# }

In [30]:
from sklearn.model_selection import GridSearchCV

for model_name in param_grid_small_model:
    print(f"Model name: {model_name}")
    pipe = pipelines[model_name]
    search = GridSearchCV(estimator=pipe, param_grid=param_grid_small_model[model_name], scoring='neg_mean_absolute_error')
    search.fit(train_data, y)
    print(f'Best values: {search.best_params_}')
    print(f'Best score MAE: {search.best_score_}')
    print()

Model name: ridge
Best values: {'ridge__alpha': 47.86300923226385, 'ridge__tol': 1e-06}
Best score MAE: -17666.719219321873



## Grid search for models with many params

In [None]:
consecutive_param_grids = {}

consecutive_param_grids['xgboost'] = {
    [ 
        {
            'xgboost__max_depth':range(3,8,2),
            'xgboost__min_child_weight':range(1,6,2)
        }, 
        {
            'xgboost__gamma':[i/10.0 for i in range(0,5)]
        },
        {
            'xgboost__subsample':[i/10.0 for i in range(6,10)],
            'xgboost__colsample_bytree':[i/10.0 for i in range(6,10)]
        }, 
        {
            'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
        }
    ]
}