In [18]:
# Set up code checking
# Set up filepaths
import os
os.chdir(os.path.join(os.path.expanduser('~'), 'kaggle'))

In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


## Load data into dataframes

In [20]:
train_path = os.path.join('data', 'house-price-data', 'train.csv')
train_data = pd.read_csv(train_path)

test_path = os.path.join('data', 'house-price-data', 'test.csv')
test_data = pd.read_csv(test_path)

## Select numerical and categorical variables, and separate target from features

In [21]:
y = train_data['SalePrice']
train_data.drop(columns=['SalePrice'], inplace=True)

numerical_columns = train_data.select_dtypes(include=np.number).columns.tolist()

numerical_columns.remove('MSSubClass')

categorical_columns = [c for c in train_data.columns if c not in numerical_columns]

print(numerical_columns)
print(categorical_columns)

['Id', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']
['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'Gar

## Add missing data col for every column with missing data

In [25]:
from sklearn.base import BaseEstimator, TransformerMixin

# adds additional column which indicates whether data was missing for given feature
class AddMissingIndicator(BaseEstimator, TransformerMixin):
    def __init__(self, numeric_columns):
        self.numeric_columns = numeric_columns
        self.num_cols_with_na = [] 

    def fit(self, X, y=None):
        self.num_cols_with_na = [c for c in self.numeric_columns if X[c].isna().any()]
        return self
    
    def transform(self, X, y=None):
        for c in self.num_cols_with_na:
            missing_col_name = f"{c}_is_missing"
            X[missing_col_name] = np.where(X[c].isna(), 1, 0)
        
        return X 


cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')), 
    ('OHE', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])

num_pipeline = Pipeline(steps=[
    # ('missing_indicator', AddMissingIndicator(numerical_columns)),
    ('imputer', SimpleImputer(strategy='mean')), 
    ('scaler', StandardScaler())
])


preprocessing = ColumnTransformer(transformers=[
    ('numerical', num_pipeline, numerical_columns),
    ('categorical', cat_pipeline, categorical_columns)
])



In [26]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

X_train, X_test, y_train, y_test = train_test_split(train_data, y, test_size=0.2)

## Test different regression models

In [28]:
from sklearn.linear_model import LinearRegression, BayesianRidge 
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR

models = {
    # 'linear regression': LinearRegression(), 
    'lgbm regressor': LGBMRegressor(), 
    'xgboost': XGBRegressor(),
    'GBR': GradientBoostingRegressor(loss='absolute_error'),
    'SVR': SVR()
}

pipelines = {key: Pipeline([
    ('preprocessing', preprocessing),
    (f'model_{key}', models[key])
]) for key in models}


for p in pipelines:
    predictor = pipelines[p]
    predictor.fit(X_train, y_train)
    predictions = predictor.predict(X_test)

    print(f"Model: {p}")
    print(f'Mae: {mean_absolute_error(predictions, y_test)}')
    print(f'Mae, relative: {mean_absolute_error(predictions, y_test)/np.mean(y_test)}')
    print()

# predictor = Pipeline(steps=[
#     ("preporc", preprocessing), 
#     ('model', XGBRegressor())
# ]) 

# print(X_train.shape, y_train.shape)
# predictor.fit(X_train, y_train)
# predictions = predictor.predict(X_test)

# print(f"Model: {p}")
# print(f'Mae: {mean_absolute_error(predictions, y_test)}')

Model: linear regression
Mae: 20521956310548.793
Mae, relative: 120489116.10127693

Model: lgbm regressor
Mae: 15668.240984111864
Mae, relative: 0.09199183929979605

Model: xgboost
Mae: 16645.805476776542
Mae, relative: 0.09773134482601216

Model: GBR
Mae: 15376.291157412372
Mae, relative: 0.09027773485319117

Model: SVR
Mae: 50742.70448964156
Mae, relative: 0.29792206551977163

