In [1]:
import numpy as np
import pandas as pd

### Reading data

In [2]:
train = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/train.csv",index_col="Id")

# drop rows with missing targets in train data
n = np.sum(pd.isnull(train["SalePrice"]))
if n!=0:
    train.dropna(axis=0, subset=["SalePrice"], inplace=True)
    print("%d rows with missing targets are dropped." % n)

X_train_val_org = train.drop(columns="SalePrice")
y_train_val = train["SalePrice"]
X_test_org = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv",index_col="Id")

### Exploring data for preprocessing:
missing values, catagrical columns

In [3]:
X_train_val = X_train_val_org.copy()
X_test = X_test_org.copy()

# change fake missing values
#   alley
X_train_val.replace({"Alley":{np.nan:"wo"}},inplace=True)
X_test.replace({"Alley":{np.nan:"wo"}},inplace=True)
#   basement
bsmt_cols = ["BsmtQual","BsmtCond","BsmtExposure","BsmtFinType1","BsmtFinType2"]
X_train_val.loc[X_train_val["TotalBsmtSF"]==0,bsmt_cols] = "wo"
X_test.loc[X_test["TotalBsmtSF"]==0,bsmt_cols] = "wo"
#   fireplace
X_train_val.replace({"FireplaceQu":{np.nan:"wo"}},inplace=True)
X_test.replace({"FireplaceQu":{np.nan:"wo"}},inplace=True)
#   garage
grg_cols=["GarageType","GarageFinish","GarageQual","GarageCond"]
X_train_val.loc[X_train_val["GarageArea"]==0,grg_cols] = "wo"
X_test.loc[X_test["GarageArea"]==0,grg_cols] = "wo"
#   fence
X_train_val.replace({"Fence":{np.nan:"wo"}},inplace=True)
X_test.replace({"Fence":{np.nan:"wo"}},inplace=True)
#   MiscFeature
X_train_val.replace({"MiscFeature":{np.nan:"wo"}},inplace=True)
X_test.replace({"MiscFeature":{np.nan:"wo"}},inplace=True)

# drop columns with too little information
n_row = np.shape(X_train_val)[0]
for col in X_train_val.columns:
    if np.sum(pd.isnull(X_train_val[col])) > n_row*0.7:
        X_train_val.drop(col, axis=1, inplace=True)
        X_test.drop(col, axis=1, inplace=True)
        print("Column "+col+": >70% missing values, dropped")
        
# change the non-ordinal numerical columns into string categories
cols_num2str = ["MSSubClass","MoSold"]
X_train_val[cols_num2str] = X_train_val[cols_num2str].apply(str)
X_test[cols_num2str] = X_test[cols_num2str].apply(str)

# numerical columns, categorical columns
cols_num = X_train_val.columns[X_train_val.dtypes!="object"]
cols_cat = X_train_val.columns[X_train_val.dtypes=="object"]

# ordinal categorical columns
cols_cat_ord = ["ExterQual","ExterCond","BsmtQual","BsmtCond",
                "BsmtExposure","BsmtFinType1","BsmtFinType2",
                "HeatingQC","CentralAir","KitchenQual",
                "FireplaceQu","GarageFinish","GarageCond","Fence"]
ords1 = ["Po","Fa","TA","Gd","Ex"]
ords2 = ["wo","Po","Fa","TA","Gd","Ex"]
ords3 = ["wo","No","Mn","Av","Gd"]
ords4 = ["wo","Unf","LwQ","Rec","BLQ","ALQ","GLQ"]
ords5 = ["wo","Unf","RFn","Fin"]
ords6 = ["wo","MnWw","GdWo","MnPrv","GdPrv"]        
ords = [ords1,ords1,ords2,ords2,
        ords3,ords4,ords4,
        ords1,["N","Y"],ords1,
        ords2,ords5,ords2,ords6]

# non-ordinal categorical columns
cols_cat_oh = list(set(cols_cat)-set(cols_cat_ord))

Column PoolQC: >70% missing values, dropped


### Model 1: RandomForestRegressor

In [4]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error as mse
from sklearn.compose import ColumnTransformer

# split X_train_val into a train set and a validation set
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val,
                                                 test_size=0.3)

# process columns with imputers
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])
cat_ord_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ord', OrdinalEncoder(categories=ords))])
cat_oh_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, cols_num),
        ('cat_ord', cat_ord_transformer, cols_cat_ord),
        ('cat_oh', cat_oh_transformer, cols_cat_oh)])

# model 1: RandomForestRegressor
rf = RandomForestRegressor()

# building pipeline
pipeline_rf = Pipeline(steps=[("preprocessor", preprocessor),
                           ("model_rf", rf)])

# tuning parameters
param_grid = {"model_rf__n_estimators": [100,200,500,1000]}
grid_cv_rf = GridSearchCV(pipeline_rf, param_grid, cv=5, n_jobs=4)
grid_cv_rf.fit(X_train, y_train)
print("Best parameter for RandomForestRegressor:", grid_cv_rf.best_params_)

Best parameter for RandomForestRegressor: {'model_rf__n_estimators': 1000}


### Model 2: XGBoost

In [5]:
from xgboost import XGBRegressor

xgb = XGBRegressor()

# building pipeline
pipeline_xgb = Pipeline(steps=[("preprocessor", preprocessor),
                               ("model_xgb", xgb)])

# tuning parameters
param_grid = {"model_xgb__n_estimators": [100,200,500],
             "model_xgb__learning_rate": [0.1, 0.5]}
grid_cv_xgb = GridSearchCV(pipeline_xgb, param_grid, cv=5, n_jobs=4)
grid_cv_xgb.fit(X_train, y_train)
print("Best parameter for XGBoost:", grid_cv_xgb.best_params_)

Best parameter for XGBoost: {'model_xgb__learning_rate': 0.1, 'model_xgb__n_estimators': 200}


### compare two models' performance with X_val, y_val

In [6]:
from sklearn.metrics import mean_squared_error as mse

# prediction of model1
y_pred1 = grid_cv_rf.predict(X_val)
score1 = mse(y_val, y_pred1)

# prediction of model2
y_pred2 = grid_cv_xgb.predict(X_val)
score2 = mse(y_val, y_pred2)

# compare scores
if score1<score2:
    print("The random forest model (mse=%.4f) performs better than the XGB regressor (mse=%.4f)." %(score1,score2))
else:
    print("The XGB regressor (mse=%.4f) performs better than the random forest model (mse=%.4f)." %(score2,score1))

The XGB regressor (mse=1163772405.8828) performs better than the random forest model (mse=1356902082.2437).


In [7]:
y_pred = grid_cv_rf.predict(X_test)
otp_fn = "/kaggle/working/submission.csv"
df = pd.DataFrame(y_pred,index=X_test.index,columns=["SalePrice"])
df.to_csv(otp_fn,header=["SalePrice"])