# Introduction

In this exercise, you will create and submit predictions for a Kaggle competition. You can then improve your model (e.g. by adding features) to apply what you've learned and move up the leaderboard.

Begin by running the code cell below to set up code checking and the filepaths for the dataset.

In [9]:
# Set up code checking
# Set up filepaths
import os
os.chdir(os.path.join(os.path.expanduser('~'), 'kaggle'))

In [10]:
# Import helpful libraries
import numpy as np
import pandas as pd
from pandas.api.types import is_string_dtype, is_numeric_dtype

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder

## Load data to pd.Df

In [11]:
iowa_file_path = 'data/house-price-data/train.csv'
home_data: pd.DataFrame = pd.read_csv(iowa_file_path)

test_data_path = 'data/house-price-data/test.csv'
test_data: pd.DataFrame = pd.read_csv(test_data_path)

In [12]:
# sort data into numeric and categoric variables
y = home_data['SalePrice']
home_data.drop(columns=['SalePrice'], inplace=True)

numeric_features = [col for col in home_data.columns if is_numeric_dtype(home_data[col].dtype)]
categorical_features = [col for col in home_data.columns if is_string_dtype(home_data[col].dtype)]
# categorical values which appear as numbers
hidden_categorical = ['MSSubClass']
categorical_features.extend(hidden_categorical)
numeric_features = [n for n in numeric_features if (n not in hidden_categorical)]
print("numerical features")
print(numeric_features)
print('categorical features')
print(categorical_features)

numerical features
['Id', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']
categorical features
['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageF

## Fill mising data, encode categorical features

In [13]:
# get cols with missing data
cols_with_na = [c for c in home_data.columns if home_data[c].isna().any()]

# print([1 if c else 0 for c in home_data[cols_with_na[1]].isna()])
# add indicator data was missing
for cna in cols_with_na:
    home_data[f'{cna}_was_missing'] = [1 if c else 0 for c in home_data[cols_with_na[1]].isna()]
    test_data[f'{cna}_was_missing'] = [1 if c else 0 for c in test_data[cols_with_na[1]].isna()]
    numeric_features.append(f'{cna}_was_missing')

for nf in numeric_features:
    home_data[nf].fillna(home_data[nf].mean(), inplace=True)
    test_data[nf].fillna(test_data[nf].mean(), inplace=True)

for cf in categorical_features:
    home_data[cf].fillna(home_data[cf].mode()[0], inplace=True)
    test_data[cf].fillna(test_data[cf].mode()[0], inplace=True)

# onehotencoding for categorical features
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')

cat_features_df = pd.DataFrame(ohe.fit_transform(home_data[categorical_features]), columns=ohe.get_feature_names_out())

test_cat_features_df = pd.DataFrame(ohe.transform(test_data[categorical_features]), columns=ohe.get_feature_names_out())

home_data = pd.concat([home_data[numeric_features], cat_features_df], axis=1)
test_data = pd.concat([test_data[numeric_features], test_cat_features_df], axis=1)
print(test_data.shape, home_data.shape)

(1459, 322) (1460, 322)


### Feature selection

In [14]:
features = home_data.columns
print(features)
# Select columns corresponding to features, and preview the data
X = home_data[features]

# Split into validation and training data
train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.5, random_state=1)

Index(['Id', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       ...
       'MSSubClass_60', 'MSSubClass_70', 'MSSubClass_75', 'MSSubClass_80',
       'MSSubClass_85', 'MSSubClass_90', 'MSSubClass_120', 'MSSubClass_160',
       'MSSubClass_180', 'MSSubClass_190'],
      dtype='object', length=322)


## Select model

In [30]:
from sklearn.linear_model import LinearRegression, BayesianRidge 
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR

models = {# 'random forest': RandomForestRegressor(), 
        # 'linear regression': LinearRegression(copy_X=True), 
        # 'lgbm': LGBMRegressor(), 
        'xgregressor': XGBRegressor(objective='reg:squarederror', eval_metric='mae'), 
        # 'Gradient boost regressor': GradientBoostingRegressor(),
        # 'SVR': SVR(),
        # 'bayes ridge': BayesianRidge(),
        }

# def get_mae(model):
#     model.fit(train_X, train_y)
#     model_predictions = model.predict(val_X)
#     return mean_absolute_error(model_predictions, val_y)

# for model_name in models:
#     model = models[model_name]
#     mae = get_mae(model)
#     print(f"Model: {model_name}; MAE: {mae: .0f}")
#     print()

## Hyperparam tuning

In [33]:
xg_param_grid = {'learning_rate': np.logspace(start=np.log10(0.1), stop=np.log10(0.7), num=10),
                'min_split_loss': [0, 1, 2],
                'max_depth': list(map(int, np.linspace(start=5, stop=7, num=3))),
                'min_child_weight': [1, 2],
                'subsample':[i/10.0 for i in range(6,10)],
                'colsample_bytree':[i/10.0 for i in range(6,10)]
                }

In [35]:

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, make_scorer

for model_name in models:
    model = models[model_name]
    search = RandomizedSearchCV(model, 
                                param_distributions=xg_param_grid, 
                                n_iter=100, 
                                scoring='neg_mean_absolute_error', 
                                n_jobs=-1) 
    search.fit(X, y)
    print(f"Model: {model_name}")
    print(f"best params: {search.best_params_}")
    print(f"Score: {search.best_score_}")

Model: xgregressor
best params: {'subsample': 0.9, 'min_split_loss': 0, 'min_child_weight': 1, 'max_depth': 5, 'learning_rate': 0.1, 'colsample_bytree': 0.7}
Score: -15542.27716181507


In [38]:
# tune for the second time
best_results = {'subsample': 0.9, 
                'min_split_loss': 0, 
                'min_child_weight': 1, 
                'max_depth': 5, 
                'learning_rate': 0.1, 
                'colsample_bytree': 0.7, 
                'gamma': 0, 
                'colsample_bytree': 0.7, 
                'subsample': 0.6}
xg_param_grid = {'subsample':[i/10.0 for i in range(6,10)],
                'colsample_bytree':[i/10.0 for i in range(6,10)]}
model = models['xgregressor']
grid_search = GridSearchCV(XGBRegressor(**best_results), 
                            param_grid=xg_param_grid, 
                            scoring='neg_mean_absolute_error')

grid_search.fit(X, y)

print(f"Model: {model_name}")
print(f"best params: {grid_search.best_params_}")
print(f"Score: {grid_search.best_score_}")

Model: xgregressor
best params: {'colsample_bytree': 0.7, 'subsample': 0.6}
Score: -15482.744073737156


In [39]:
full_model = model.set_params(**best_results) 

full_model.fit(X, y)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.7, enable_categorical=False,
             eval_metric='mae', gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.1, max_delta_step=0,
             max_depth=5, min_child_weight=1, min_split_loss=0, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=4,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=0.9,
             tree_method='exact', validate_parameters=1, verbosity=None)

Now, read the file of "test" data, and apply your model to make predictions.

In [40]:
# path to file you will use for predictions
# test_data_path = '../input/test.csv'

# # read test data file using pandas
# test_data = pd.read_csv(test_data_path)

# # create test_X which comes from test_data but includes only the columns you used for prediction.
# # The list of columns is stored in a variable called features
test_X = test_data[features]

# # make predictions which we will submit. 
test_preds = full_model.predict(test_X)
print(test_preds)
print(type(test_preds), len(test_preds))

[124955.26  155590.52  179521.19  ... 164801.73  111943.664 230853.47 ]
<class 'numpy.ndarray'> 1459


Before submitting, run a check to make sure your `test_preds` have the right format.

# Generate a submission

Run the code cell below to generate a CSV file with your predictions that you can use to submit to the competition.

In [None]:
# Run the code to save predictions in the format used for competition scoring

output = pd.DataFrame({'Id': test_data.Id,
                       'SalePrice': test_preds})
output.to_csv('code/house-prices-competition/submission_xgboost_0.csv', index=False)