## House Pricing Model

This notebook is primary for model development. 


In [1]:
# Package Imports
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer
from scipy.stats import uniform
import math
import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBRegressor

# from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV



In [2]:
import sys
import os
sys.path.insert(0, os.path.abspath('/Users/daniel/Documents/GitHub/WH002-AWS-Containerized-Training/'))

In [3]:
# Data Loading
df = pd.read_csv("../data/house-prices-advanced-regression-techniques/train.csv")
print("Data shape : {}".format(df.shape))

Data shape : (1460, 81)


In [4]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
features = [x for x in df.columns if x not in ['SalePrice']]
X = df[features]
y = df['SalePrice']


In [6]:
# Columns

numerical_cols = [cname for cname in X.columns if 
                X[cname].dtype in ['int64', 'float64']]
categorical_cols = [cname for cname in X.columns if
                    X[cname].nunique() < 13 and 
                    X[cname].dtype == "object"]


numerical_transformer = SimpleImputer(strategy='constant')

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
])

X = preprocessor.fit_transform(X)

In [14]:
def rmse(true,pred):
    res =  mean_squared_error(np.log(true), np.log(pred), squared=False)
    return res

def train(model, X, y, grid, metric, metric_module, greater):
    
    # Preprocess

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
    
    grid_search = GridSearchCV(model, 
                            param_grid=grid,
                            scoring=make_scorer(metric_module, greater_is_better=greater),
                            cv=5)
    grid_search.fit(X_train,y_train)
    
    valid_score = grid_search.best_score_
    test_score = metric_module(y_test, grid_search.best_estimator_.predict(X_test))
    print(f'validation_{metric}:{valid_score }' )
    print(f'test_{metric}:{test_score}' )
    
    return grid_search, valid_score, test_score

In [17]:
# Decision Tree

tree_model = DecisionTreeRegressor()

grid = dict(max_features = ['auto','sqrt','log2'],
            min_samples_split = [float(x) for x in np.linspace(0.1, 1.0, 10, endpoint=True)],
            min_samples_leaf = [float(x) for x in np.linspace(0.1, 0.5, 5, endpoint=True)])
decision_tree_clf = train(tree_model, X, y, grid, "rmse", rmse, greater=False)


validation_rmse:-0.22927548820894567
test_rmse:0.228471647644574


In [19]:
# Random Forest Regressor
random_model = RandomForestRegressor(random_state=0)
grid2 = dict(n_estimators = [100],
             max_features = ['auto','sqrt','log2'],
             min_samples_split = [float(x) for x in np.linspace(0.1, 1.0, 10, endpoint=True)],
             min_samples_leaf = [float(x) for x in np.linspace(0.1, 0.5, 5, endpoint=True)])
random_tree_clf = train(random_model, X, y, grid2, "rmse", rmse, greater=False)

In [15]:
# XGBoost Model

xgb_model = XGBRegressor()
params = {
        'n_estimators': [500, 1000]
        }
xgb_model_clf, valid_score, test_score= train(xgb_model, X, y, params, "rmse", rmse, greater=False) 
    

validation_rmse:-0.1525117740687361
test_rmse:0.14655285390965905


In [17]:
report= xgb_model_clf.best_params_

In [18]:
metric  = {
    'valid_score': valid_score,
    'test_score':test_score
}
report.update(metric)
report

{'n_estimators': 500,
 'valid_score': -0.1525117740687361,
 'test_score': 0.14655285390965905}