In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import re

from ames_preprocessing import restrict_col_list, get_compressed_ames, transformed_df
from ames_model_helper import * 

In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV

# Import and clean

In [3]:
# Load the raw sales file (first CSV column becomes the index), drop exact
# duplicate rows, then renumber the remaining rows from 0.
data = (
    pd.read_csv('data/Ames_Housing_Price_Data.csv', index_col=0)
    .drop_duplicates()
    .reset_index(drop=True)
)

## Manual adjustments

In [4]:
# Work on a copy so the frame loaded above stays untouched.
housing = data.copy()

# Constant-valued corrections as (row label, column, new value). Row labels
# refer to the index of `data` as loaded (duplicates dropped, index reset).
manual_fixes = [
    # These do not appear to be legitimate garages; remove them.
    (531, 'GarageType', np.nan),
    (531, 'GarageCars', np.nan),
    (531, 'GarageArea', 0),
    (433, 'GarageType', np.nan),
    # Fill missing exposure & electrical with the most frequently occurring value.
    (813, 'BsmtExposure', 'No'),
    (1201, 'BsmtExposure', 'No'),
    (2441, 'Electrical', 'SBrkr'),
]
for fix_row, fix_col, fix_val in manual_fixes:
    housing.at[fix_row, fix_col] = fix_val

# Fill the second basement finish type using the known first finish type.
housing.at[2433, 'BsmtFinType2'] = housing.at[2433, 'BsmtFinType1']

# The recorded remodel year makes no sense; use the build year instead.
housing.at[2033, 'YearRemodAdd'] = housing.at[2033, 'YearBuilt']

## Create cleaned/compressed dataset & feature lists

In [5]:
# Run the project's preprocessing helper on the manually corrected frame.
# The result is indexed by string keys in the following cell; presumably it
# holds a cleaned frame plus groups of column names by feature type
# (TODO confirm against ames_preprocessing.get_compressed_ames).
data_dict = get_compressed_ames(housing)

In [6]:
# Unpack the processed frame and each feature group from data_dict in one
# statement. Note the last entry: the 'categoricals' key is bound to the name
# other_cats.
(housing, areas, frontage, miscval, conditions,
 inspect10pt, inspect5pt, inspections, dates, counts, other_cats) = [
    data_dict[key]
    for key in (
        'housing', 'areas', 'frontage', 'miscval', 'conditions',
        'inspect10pt', 'inspect5pt', 'inspections', 'dates', 'counts',
        'categoricals',
    )
]

In [7]:
# Replace the remodel year with the number of years between the remodel and
# the sale. assign/drop build a new frame instead of mutating in place, so the
# frame still referenced by data_dict['housing'] keeps its YearRemodAdd column
# and re-running the unpacking cell followed by this one no longer raises
# KeyError. The new column is appended last, exactly as before.
housing = (
    housing
    .assign(remod_age=housing['YrSold'] - housing['YearRemodAdd'])
    .drop(columns='YearRemodAdd')
)

# restrict_col_list is a project helper; presumably it keeps only the entries
# of `dates` that are still columns of `housing` (TODO confirm).
dates = restrict_col_list(dates, housing)

# Register the derived column once, even if this cell is executed again.
if 'remod_age' not in dates:
    dates.append('remod_age')

In [8]:
# Columns to be ordinal-encoded: the general categoricals plus the
# `conditions` and `inspections` groups.
categoricals = other_cats + conditions + inspections

# Every remaining column except PID and SalePrice is treated as numeric.
non_numeric = categoricals + ['PID', 'SalePrice']
numeric = [col for col in housing.columns if col not in non_numeric]

# Sanity check: each column classified as numeric really has a numeric dtype.
numeric_dtype_cols = housing[numeric].select_dtypes(include=np.number).columns
assert set(numeric_dtype_cols) == set(numeric)

# Create training and test sets

In [9]:
# Separate the target from the predictors; PID and SalePrice are excluded
# from the feature matrix.
target = housing['SalePrice']
features = housing.drop(columns=['PID', 'SalePrice'])

# Hold out 30% of the rows; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.3, random_state=11
)

# Keep the row labels of each partition so the encoded frame can be sliced
# into the same train/test rows later.
train_idx, test_idx = X_train.index, X_test.index

# Encoding

In [10]:
# Pass the numeric columns through unchanged and ordinal-encode the
# categorical ones. Output columns follow transformer order: numeric first,
# then categoricals.
oe = ColumnTransformer([
    ('pass', 'passthrough', numeric),

    # Columns listed here are ordinal-encoded.
    ('oe', OrdinalEncoder(), categoricals),
])

# NOTE(review): the encoder is fit on every row of `features`, test rows
# included, so category codes are derived from the full dataset.
#
# The transformer returns a bare array, so the original row labels are
# re-attached explicitly. Without index=..., the frame would get a fresh
# 0..n-1 index and the label-based .loc slicing below would only be correct
# if `features` happened to carry exactly that index.
features_encoded = pd.DataFrame(
    oe.fit_transform(features),
    columns=numeric + categoricals,
    index=features.index,
)

# The passthrough columns must come out unchanged.
assert (features_encoded[numeric] == features[numeric]).all().all()

# Slice the encoded frame into the same partitions as the raw split.
X_enc_train = features_encoded.loc[train_idx, :]
X_enc_test = features_encoded.loc[test_idx, :]

# Gradient Boosting Regressor

In [11]:
# Baseline: gradient boosting with default hyperparameters. The estimator is
# seeded (same seed as the train/test split and the grid search) so that the
# reported scores can be reproduced on a fresh run.
gbr = GradientBoostingRegressor(random_state=11)
gbr.fit(X_enc_train, y_train)

# Compare fit on the training rows against the held-out rows.
print('Train score: %s' %gbr.score(X_enc_train,y_train))
print('Test score: %s' %gbr.score(X_enc_test,y_test))

Train score: 0.9620137895593565
Test score: 0.9171757869654605


Quite good out of the box, but the model overfits: the train score (0.962) is well above the test score (0.917). Tune the hyperparameters next.

## Tuning

In [12]:
# Search round 1 (disabled, kept for reference): coarse grid over
# n_estimators in [125, 625] (5**3, 5**4) and max_depth in [4, 8, 16]
# (2**2 .. 2**4), with random_state and max_features held fixed.
# param_grid = {'n_estimators': 5 ** np.arange(3,5), 'max_depth': 2 ** np.arange(2,5), 
#               'random_state':[11], 'max_features':['sqrt']}
# gsgbr = GridSearchCV(GradientBoostingRegressor(), param_grid = param_grid, n_jobs = -1)
# gsgbr.fit(X_enc_train,y_train)
# best_gbr = gsgbr.best_estimator_

In [13]:
# print(f'The best CV score was {gsgbr.best_score_} using parameters {gsgbr.best_params_}')

In [14]:
# Search round 2 (disabled, kept for reference): n_estimators from 200 to 900
# in steps of 100 and max_depth from 4 to 7, with random_state and
# max_features held fixed.
# param_grid = {'n_estimators': np.arange(200,1000,100), 'max_depth': np.arange(4,8), 
#               'random_state':[11], 'max_features':['sqrt']}
# gsgbr = GridSearchCV(GradientBoostingRegressor(), param_grid = param_grid, n_jobs = -1)
# gsgbr.fit(X_enc_train,y_train)
# best_gbr = gsgbr.best_estimator_

In [15]:
# print(f'The best CV score was {gsgbr.best_score_} using parameters {gsgbr.best_params_}')

In [16]:
# Final, narrow search: n_estimators from 900 to 990 in steps of 10 at a fixed
# max_depth of 4. random_state and max_features are single-value entries, so
# they apply to every candidate and show up in best_params_.
param_grid = dict(
    n_estimators=np.arange(900, 1000, 10),
    max_depth=[4],
    random_state=[11],
    max_features=['sqrt'],
)

# Search the grid using all available cores.
gsgbr = GridSearchCV(
    GradientBoostingRegressor(),
    param_grid=param_grid,
    n_jobs=-1,
)
gsgbr.fit(X_enc_train, y_train)

# Keep the winning estimator for evaluation.
best_gbr = gsgbr.best_estimator_

In [17]:
# Report the best cross-validation score found by the search and the
# parameter combination that produced it.
print(f'The best CV score was {gsgbr.best_score_} using parameters {gsgbr.best_params_}')

The best CV score was 0.9004312000934064 using parameters {'max_depth': 4, 'max_features': 'sqrt', 'n_estimators': 900, 'random_state': 11}


The best CV score was 0.9004312000934064 using parameters {'max_depth': 4, 'max_features': 'sqrt', 'n_estimators': 900, 'random_state': 11}

# Evaluate Model

In [18]:
# best_gbr is already fitted: GridSearchCV refits the best parameter
# combination on the full training data it was given (refit=True is the
# default), and the grid pins random_state, so fitting it again on the same
# data would only repeat that work and rebuild the same model.
print('Train score: %s' %best_gbr.score(X_enc_train,y_train))
print('Test score: %s' %best_gbr.score(X_enc_test,y_test))

Train score: 0.9985375100671453
Test score: 0.9161343955934337


Tuning exacerbated the overfitting problem: the train score rose to 0.999 while the test score stayed at about 0.916. Tuning min_samples_split did not fix it.