In [15]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv
/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt


In [16]:
!pip install pycaret







# Read Data

In [17]:
import pandas as pd
import numpy as np

def read_data(data_dir='../input/house-prices-advanced-regression-techniques'):
    """Load the train and test CSVs and stack them into one combined frame.

    Parameters
    ----------
    data_dir : str, optional
        Directory that contains ``train.csv`` and ``test.csv``. Defaults to
        the competition input directory this notebook was written against.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test, full)``. ``train`` and ``test`` each gain a ``split``
        column holding ``'train'`` / ``'test'``. ``full`` is ``train``
        followed by ``test`` with a fresh ``0..n-1`` index.
    """
    train = pd.read_csv(f'{data_dir}/train.csv')
    train['split'] = 'train'

    test = pd.read_csv(f'{data_dir}/test.csv')
    test['split'] = 'test'

    # ignore_index=True gives the combined frame unique row labels; without it
    # the test rows repeat the labels 0..len(test)-1 already used by train.
    full = pd.concat([train, test], ignore_index=True)

    return train, test, full

# Load the competition data, then preview the first rows of the combined frame.
train, test, full = read_data()
full.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,split
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,,,,0,2,2008,WD,Normal,208500.0,train
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,,,,0,5,2007,WD,Normal,181500.0,train
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,,,,0,9,2008,WD,Normal,223500.0,train
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,,,,0,2,2006,WD,Abnorml,140000.0,train
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,,,,0,12,2008,WD,Normal,250000.0,train


# Feature Generation

In [18]:
def generate_features(df, reference_year=2010):
    """Return a copy of ``df`` with the engineered ``Age`` and ``TotalSF`` columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``YearBuilt``, ``YearRemodAdd``, ``1stFlrSF`` and
        ``2ndFlrSF``.
    reference_year : int, optional
        Year the age is measured from. Defaults to 2010, the value this
        notebook originally hard-coded.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified.

        * ``Age``: ``reference_year`` minus the later of ``YearBuilt`` and
          ``YearRemodAdd``.
        * ``TotalSF``: ``1stFlrSF`` plus ``2ndFlrSF``.
    """
    # Row-wise maximum picks whichever of build / remodel happened last.
    last_update = df[['YearBuilt', 'YearRemodAdd']].max(axis=1)

    # assign() builds a new frame, so re-running the cell cannot compound
    # changes on the caller's DataFrame.
    return df.assign(
        Age=reference_year - last_update,
        TotalSF=df['1stFlrSF'] + df['2ndFlrSF'],
    )

# Add the engineered columns to the combined train+test frame.
full = generate_features(full)

# Missing Value

In [19]:
# show up to 100 rows before pandas truncates a displayed DataFrame
pd.set_option('display.max_rows', 100)

def tabulate_missing_val(df):
    """Summarise every column's dtype and share of missing values.

    Returns a DataFrame with one row per column of ``df`` and three columns:
    ``cols`` (column name), ``types`` (dtype) and ``missing_pct`` (percentage
    of rows that are NaN in that column).
    """
    n_rows = df.shape[0]
    pct_missing = df.isna().sum() / n_rows * 100

    return pd.DataFrame({
        'cols': list(df.columns),
        'types': list(df.dtypes),
        'missing_pct': list(pct_missing),
    })

# Display the per-column missing-value summary for the combined frame.
tabulate_missing_val(full)

Unnamed: 0,cols,types,missing_pct
0,Id,int64,0.0
1,MSSubClass,int64,0.0
2,MSZoning,object,0.137033
3,LotFrontage,float64,16.649538
4,LotArea,int64,0.0
5,Street,object,0.0
6,Alley,object,93.216855
7,LotShape,object,0.0
8,LandContour,object,0.0
9,Utilities,object,0.068517


The following numerical values are missing because the house has no garage. I therefore fill `GarageCars` and `GarageArea` with 0, and fill `GarageYrBlt` with the year the house was built.
* GarageYrBlt
* GarageCars
* GarageArea

The following ordinal values are missing because the house has no such feature (no fence, garage, etc.). I therefore fill them with the value 'not_available' so that they can be label-encoded during the modelling setup stage.
* PoolQC (not filled by `handle_missing` below; the column is dropped later via `ignore_feat`)
* GarageCond, GarageQual, GarageFinish
* FireplaceQu
* BsmtFinType2, BsmtFinType1, BsmtExposure, BsmtCond, BsmtQual

The rest of missing values will be imputed during modelling setup stage.

In [20]:
def handle_missing(df):
    """Return a copy of ``df`` with the garage/basement/fireplace gaps filled.

    * ``GarageCars`` and ``GarageArea``: NaN -> 0.
    * The ordinal quality/finish columns listed in ``ord_cols``:
      NaN -> ``'not_available'``.
    * ``GarageYrBlt``: NaN -> that row's ``YearBuilt``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column named above plus ``YearBuilt``; a missing
        column raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns in the same order; the input is
        not modified.
    """
    num_cols = ['GarageCars', 'GarageArea']
    ord_cols = ['GarageCond', 'GarageQual', 'GarageFinish', 'FireplaceQu',
                'BsmtFinType2', 'BsmtFinType1', 'BsmtExposure', 'BsmtCond', 'BsmtQual']

    filled = {col: df[col].fillna(0) for col in num_cols}
    filled.update({col: df[col].fillna('not_available') for col in ord_cols})
    filled['GarageYrBlt'] = df['GarageYrBlt'].fillna(df['YearBuilt'])

    # assign() returns a new frame, leaving the caller's DataFrame untouched,
    # and replaces existing columns in place so the column order is preserved.
    return df.assign(**filled)

# Fill the garage/basement/fireplace gaps on the combined train+test frame.
full = handle_missing(full)

# Split data to train and test

In [21]:
# Recover the original partitions from the 'split' marker column.
# .copy() makes each partition an independent frame, so later column writes
# on train/test cannot trigger SettingWithCopyWarning or touch `full`.
train = full[full['split'] == 'train'].copy()
test = full[full['split'] == 'test'].copy()

# Categorical Features
'MSSubClass' has a numeric (non-object) data type but should be treated as a categorical variable in the models.

In [22]:
# Columns passed to setup(categorical_features=...) to force categorical handling.
cat_feat = ['MSSubClass']

# Set Up Ordinal Feature Dictionary

In [23]:
# Level order for each feature handed to setup(ordinal_features=...).
# 'not_available' is the placeholder that handle_missing() writes into
# missing cells of the basement / fireplace / garage columns.
ordinal_dict = dict(
    # lot / land descriptors
    LotShape=['Reg', 'IR1', 'IR2', 'IR3'],
    LandContour=['Lvl', 'Bnk', 'HLS', 'Low'],
    LandSlope=['Gtl', 'Mod', 'Sev'],
    # numeric scores, expressed as strings
    OverallQual=[str(score) for score in range(1, 11)],
    OverallCond=[str(score) for score in range(1, 10)],
    # exterior
    ExterQual=['Fa', 'TA', 'Gd', 'Ex'],
    ExterCond=['Po', 'Fa', 'TA', 'Gd', 'Ex'],
    # basement
    BsmtQual=['not_available', 'Fa', 'TA', 'Gd', 'Ex'],
    BsmtCond=['not_available', 'Po', 'Fa', 'TA', 'Gd'],
    BsmtExposure=['not_available', 'No', 'Mn', 'Av', 'Gd'],
    BsmtFinType1=['not_available', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],
    BsmtFinType2=['not_available', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],
    # interior
    HeatingQC=['Po', 'Fa', 'TA', 'Gd', 'Ex'],
    KitchenQual=['Fa', 'TA', 'Gd', 'Ex'],
    Functional=['Sev', 'Maj2', 'Maj1', 'Mod', 'Min2', 'Min1', 'Typ'],
    FireplaceQu=['not_available', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    # garage / driveway
    GarageFinish=['not_available', 'Unf', 'RFn', 'Fin'],
    GarageQual=['not_available', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    GarageCond=['not_available', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    PavedDrive=['N', 'P', 'Y'],
)

# High Cardinal Features
Features with high cardinality will be encoded by frequency by default.

In [24]:
# Columns passed to setup(high_cardinality_features=...).
high_cardinal_feat = [
    'MSSubClass',
    'Neighborhood',
    'Condition1',
    'Condition2',
    'HouseStyle',
    'RoofMatl',
    'Exterior1st',
    'Exterior2nd',
    'SaleType',
]

# Columns to Drop
Besides `Id`, only 1 house in the training data lacks full utilities, and only 7 houses in the training data have a pool. Because of this low variance, these columns are dropped.

In [25]:
# Columns passed to setup(ignore_features=...), i.e. excluded from modelling.
ignore_feat = [
    'Id',
    'split',
    'Utilities',
    'PoolQC',
    'MiscFeature',
    'Fence',
    'Alley',
]

# Setup


In [26]:
# Import only the PyCaret functions this notebook calls, instead of a wildcard
# import that hides where names come from.
from pycaret.regression import (
    compare_models,
    create_model,
    evaluate_model,
    finalize_model,
    predict_model,
    setup,
    tune_model,
)

# Configure the PyCaret experiment on the training rows with SalePrice as the
# target. The feature lists and the ordinal level mapping are the ones defined
# in the cells above; session_id fixes the random seed for reproducibility.
pipeline = setup(
    data=train,
    target='SalePrice',
    categorical_features=cat_feat,
    ordinal_features=ordinal_dict,
    high_cardinality_features=high_cardinal_feat,
    numeric_imputation='median',
    ignore_features=ignore_feat,
    normalize=True,
    combine_rare_levels=True,
    remove_outliers=True,
    session_id=1,
)

Unnamed: 0,Description,Value
0,session_id,1
1,Target,SalePrice
2,Original Data,"(1460, 84)"
3,Missing Values,True
4,Numeric Features,23
5,Categorical Features,53
6,Ordinal Features,True
7,High Cardinality Features,True
8,High Cardinality Method,frequency
9,Transformed Train Set,"(970, 154)"


# Compare Models

In [27]:
# Cross-validate the candidate regressors with 5 folds and rank them by RMSLE.
# errors='raise' makes a failing model raise instead of being skipped.
compare_models(
    include=['lr', 'ridge', 'lasso', 'rf', 'xgboost', 'lightgbm'],
    fold=5,
    sort='RMSLE',
    errors='raise',
)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,16804.9403,789879221.9766,27454.5345,0.8724,0.1226,0.0903,0.146
rf,Random Forest Regressor,17510.4295,818471959.5735,28113.9404,0.8661,0.1301,0.0958,0.846
xgboost,Extreme Gradient Boosting,17687.2158,749677798.4,26584.4664,0.8786,0.1311,0.0979,0.844
ridge,Ridge Regression,19166.7312,757181574.5767,27339.0874,0.8731,0.1522,0.1114,0.016
lasso,Lasso Regression,19480.8513,790990425.9293,27996.0863,0.8668,0.155,0.1134,0.062
lr,Linear Regression,428887414168391.1,3.879568964266423e+31,4333428906410636.0,-7.129666738384612e+21,2.6219,3526713775.7383,0.652


LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
              random_state=1, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

# Tune Model

In [28]:
# create and tune model
# create_model builds the 'lightgbm' estimator (the top-ranked model in the
# comparison above); tune_model then searches its hyperparameters, using RMSLE
# as the metric to optimize.
lightgbm = create_model('lightgbm')
tuned_lightgbm = tune_model(lightgbm, optimize = 'RMSLE')

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,15098.6904,451853535.9933,21256.8468,0.9062,0.1103,0.0865
1,20386.189,898031818.1419,29967.179,0.8444,0.1441,0.1103
2,14692.3098,393044437.7935,19825.3484,0.9161,0.1199,0.0901
3,16614.2745,618259010.8373,24864.8147,0.9015,0.116,0.0863
4,17480.9166,790925835.4249,28123.4037,0.9072,0.1226,0.0887
5,20366.555,1454439581.4604,38137.1155,0.823,0.1412,0.103
6,14441.5287,522509517.695,22858.4671,0.9003,0.0982,0.0748
7,17935.4326,713174054.8498,26705.3188,0.8632,0.1352,0.1004
8,11599.2043,327184945.404,18088.2543,0.9303,0.0838,0.0632
9,14687.6467,528094486.0193,22980.3065,0.9107,0.1101,0.0823


# Evaluate Model

In [None]:
# evaluate model
# Shows PyCaret's diagnostic plots for the tuned model (the residual and
# feature-importance plots are included below as images).
evaluate_model(tuned_lightgbm)

<img src="pycaret_residual.png">

<img src="pycaret_feature_importance.png">

# Deploy Model

In [31]:
# deploy model
# Finalize the tuned model, then score the held-out competition test rows.
final_lightgbm = finalize_model(tuned_lightgbm)
predictions = predict_model(final_lightgbm, data=test)

# Guard against rows being dropped or added during prediction, which would
# silently misalign Ids and prices in the submission.
assert len(predictions) == len(test), "prediction count does not match test rows"

# Take the Ids from the test frame itself rather than hard-coding 1461..2919,
# and pair them with the predictions positionally.
submission = pd.DataFrame({
    "Id": test['Id'].to_numpy(),
    "SalePrice": predictions['Label'].to_numpy(),
})

submission.to_csv('lightgbm_pycaret.csv', index=False)