## Imports and Configuration ##

In [None]:
import os
import warnings
from pathlib import Path

import optuna
import matplotlib as mpt
import category_encoders as ce
import xgboost as xgb
import sklearn as skl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display
from pandas.api.types import CategoricalDtype

from pandas.core.frame import DataFrame
from category_encoders import MEstimateEncoder
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import KFold, cross_val_score
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer

# Report the version of each key library: the library name on one line,
# followed by its version string on the next.
for lib_name, lib_module in (
    ('pandas', pd),
    ('sklearn', skl),
    ('xgboost', xgb),
    ('category_encoders', ce),
    ('seaborn', sns),
    ('matplotlib', mpt),
):
    print(lib_name, lib_module.__version__, sep='\n')


# Set Matplotlib defaults.
# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and
# newer releases no longer ship the old names, so use the renamed style when
# this Matplotlib provides it and fall back to the legacy name otherwise.
_whitegrid_style = next(
    (
        style_name
        for style_name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid")
        if style_name in plt.style.available
    ),
    "seaborn-whitegrid",
)
plt.style.use(_whitegrid_style)
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)

# Mute warnings (this silences every warning category for the whole session)
warnings.filterwarnings('ignore')


## Load data ##

In [None]:
# Directory holding the competition CSV files.
data_dir = Path("../input/house-prices-advanced-regression-techniques/")
# Read both splits the same way, using the "Id" column as the row index.
df_train, df_test = (
    pd.read_csv(data_dir / csv_name, index_col="Id")
    for csv_name in ("train.csv", "test.csv")
)
# Stack train and test into one frame so they can be preprocessed together.
df_in = pd.concat([df_train, df_test])


## Score dataset ##

In [None]:
def score_dataset(X, y, model=None):
    """Return a cross-validated RMSLE-style score for ``model`` on ``(X, y)``.

    The estimator is evaluated against ``log(y)`` with 5-fold
    ``cross_val_score`` using negative mean squared error; the result is the
    square root of the mean fold error (lower is better).

    Parameters
    ----------
    X : features passed straight to ``cross_val_score``.
    y : target values; expected to be positive because ``np.log`` is applied.
    model : estimator to evaluate. When omitted, a new ``XGBRegressor()`` is
        created for this call instead of sharing one instance built at
        function-definition time.

    Returns
    -------
    The square root of the mean cross-validated squared log error.
    """
    if model is None:
        model = XGBRegressor()
    # Metric for Housing competition is RMSLE (Root Mean Squared Log Error)
    log_y = np.log(y)
    neg_mse = cross_val_score(
        model, X, log_y, cv=5, scoring="neg_mean_squared_error",
    )
    # The scorer reports negated MSE, so flip the sign before the root.
    return np.sqrt(-1 * neg_mse.mean())


## Preprocess data ##

In [None]:
from sklearn.preprocessing import FunctionTransformer


def clean(df: DataFrame):
    """Return a cleaned copy of ``df``; the input frame is left untouched.

    Three fixes are applied:

    * the ``"Brk Cmn"`` spelling in ``Exterior2nd`` is replaced by
      ``"BrkComm"``;
    * ``GarageYrBlt`` is kept only where it is ``<= 2010``; everywhere else
      (including missing values, for which the comparison is False) it is
      replaced by ``YearBuilt``;
    * columns whose names start with a digit are renamed.

    Parameters
    ----------
    df : frame containing at least ``Exterior2nd``, ``GarageYrBlt`` and
        ``YearBuilt``.

    Returns
    -------
    A new DataFrame with the fixes applied.
    """
    # ``assign`` builds a new frame, so the caller's object is never mutated.
    cleaned = df.assign(
        Exterior2nd=df["Exterior2nd"].replace({"Brk Cmn": "BrkComm"}),
        # Some values of GarageYrBlt are corrupt, so we'll replace them
        # with the year the house was built
        GarageYrBlt=df["GarageYrBlt"].where(
            df["GarageYrBlt"] <= 2010, df["YearBuilt"]),
    )
    # Names beginning with numbers are awkward to work with
    return cleaned.rename(columns={
        "1stFlrSF": "FirstFlrSF",
        "2ndFlrSF": "SecondFlrSF",
        "3SsnPorch": "Threeseasonporch",
    })


# Apply the cleaning fixes to the combined train+test frame.
df_in = clean(df_in)


In [83]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator


def to_dataframe(index, columns):
    """Build a transformer step that wraps its input in a DataFrame.

    The returned ``FunctionTransformer`` turns whatever it receives into a
    ``pd.DataFrame`` labelled with the given ``index`` and ``columns``.
    """
    def _as_frame(X):
        return pd.DataFrame(X, index=index, columns=columns)

    return FunctionTransformer(_as_frame)


# Columns that are treated as categorical features.
features_nom = [
    "MSSubClass",
    "MSZoning",
    "Street",
    "Alley",
    "LandContour",
    "LotConfig",
    "Neighborhood",
    "Condition1",
    "Condition2",
    "BldgType",
    "HouseStyle",
    "RoofStyle",
    "RoofMatl",
    "Exterior1st",
    "Exterior2nd",
    "MasVnrType",
    "Foundation",
    "Heating",
    "CentralAir",
    "GarageType",
    "MiscFeature",
    "SaleType",
    "SaleCondition",
]

# Cast every listed column to the pandas "category" dtype.
df_in = df_in.astype({column: "category" for column in features_nom})

# Names of the columns that are still numeric after the cast above.
features_num = df_in.select_dtypes(include=np.number).columns.tolist()

# Categorical preprocessing in two steps: fill missing entries with the
# literal string 'None', then map each category to an ordinal code
# (categories not seen during fit are encoded as NaN).
_cat_imputer = SimpleImputer(
    missing_values=np.nan,
    strategy='constant',
    fill_value='None',
)
_cat_encoder = OrdinalEncoder(
    handle_unknown='use_encoded_value',
    unknown_value=np.nan,
)
categorical_transformer = Pipeline(steps=[
    ('imputer', _cat_imputer),
    ('encoder', _cat_encoder),
])


## Setup pipeline ##

In [92]:
y_column = ['SalePrice']

# Encode the categorical columns and carry the target through unchanged.
# NOTE(review): only the columns listed here are forwarded; with
# ColumnTransformer's default remainder every other column (e.g. the numeric
# features) is dropped -- confirm this is intended.
_transformers = [
    ('cat', categorical_transformer, features_nom),
    ('price', 'passthrough', y_column),
]
column_transformer = ColumnTransformer(
    transformers=_transformers,
    verbose_feature_names_out=False,
)

# The encoded output is re-labelled with the original row index and the
# column names in the same order the transformers emit them.
_output_columns = features_nom + y_column
transform_pipeline = Pipeline(
    steps=[
        ('encode', column_transformer),
        ('dataframe', to_dataframe(df_in.index, _output_columns)),
    ],
    verbose=False,
)

transformed_df = transform_pipeline.fit_transform(df_in)


# Recover the original train/test rows from the transformed frame by index.
_train_rows = transformed_df.loc[df_train.index, :]
_test_rows = transformed_df.loc[df_test.index, :]

# Target comes from the training rows; features are everything else.
y = _train_rows['SalePrice']
X = df_train = _train_rows.drop(columns='SalePrice')

# The test rows keep no target column.
X_test = df_test = _test_rows.drop(columns='SalePrice')


## XGBRegressor params ##

In [None]:

def objective(trial):
    """Optuna objective: score an XGBRegressor built from trial suggestions.

    Hyperparameters are drawn from ``trial`` and combined with the fixed
    settings ``predictor='cpu_predictor'``, ``num_parallel_tree=5`` and
    ``random_state=0``; the value returned is ``score_dataset(X, y, ...)``
    for the resulting model.
    """
    # Suggestions are requested in a fixed order so sampling is unchanged.
    max_depth = trial.suggest_int("max_depth", 2, 20)
    learning_rate = trial.suggest_float("learning_rate", 1e-4, 1e-1, log=True)
    n_estimators = trial.suggest_int("n_estimators", 1000, 12000)
    min_child_weight = trial.suggest_int("min_child_weight", 1, 15)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.2, 1.0)
    subsample = trial.suggest_float("subsample", 0.2, 1.0)
    reg_alpha = trial.suggest_float("reg_alpha", 1e-4, 1e2, log=True)
    reg_lambda = trial.suggest_float("reg_lambda", 1e-4, 1e2, log=True)

    regressor = XGBRegressor(
        random_state=0,
        predictor='cpu_predictor',
        num_parallel_tree=5,
        max_depth=max_depth,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        min_child_weight=min_child_weight,
        colsample_bytree=colsample_bytree,
        subsample=subsample,
        reg_alpha=reg_alpha,
        reg_lambda=reg_lambda,
    )
    return score_dataset(X, y, regressor)


# study = optuna.create_study(direction="minimize")
# study.optimize(objective, n_trials=320)
# xgb_params = study.best_params
# print('Best params:')
# print(xgb_params)


In [95]:
# Fixed XGBRegressor hyperparameters used for the final model.
xgb_params = dict(
    num_parallel_tree=5,
    predictor='cpu_predictor',
    max_depth=3,
    learning_rate=0.021417776537357703,
    n_estimators=3197,
    min_child_weight=1,
    colsample_bytree=0.2019378980891107,
    subsample=0.7824234012865849,
    reg_alpha=0.002657877890767917,
    reg_lambda=0.0008794544069627564,
)


## Setup model ##

In [96]:
# Build the regressor from the fixed hyperparameters and wrap it in a
# single-step pipeline that logs the duration of each fit.
model = XGBRegressor(**xgb_params, random_state=0)
_pipeline_steps = [('model', model)]
pipeline = Pipeline(steps=_pipeline_steps, verbose=True)

# Cross-validated score of the wrapped model on the training data.
scores = score_dataset(X, y, model=pipeline)

print("Scores:\n", scores)


[Pipeline] ............. (step 1 of 1) Processing model, total=   3.4s
[Pipeline] ............. (step 1 of 1) Processing model, total=   3.4s
[Pipeline] ............. (step 1 of 1) Processing model, total=   3.5s
[Pipeline] ............. (step 1 of 1) Processing model, total=   3.4s
[Pipeline] ............. (step 1 of 1) Processing model, total=   3.4s
Scores:
 0.20151813675487693


prev: 0.11926628354817692

new:  0.11882136204052456

# Train Model and Create Submissions #

In [None]:
# Fit on the full training set against log prices.
# NOTE: this rebinds the name `xgb`, which the imports cell uses as the alias
# for the xgboost module.
_log_prices = np.log(y)
xgb = XGBRegressor(random_state=0, **xgb_params)
xgb.fit(X, _log_prices)

# Predictions are made in log space, so exponentiate to get prices back.
_predicted_log_prices = xgb.predict(X_test)
predictions = np.exp(_predicted_log_prices)

# Write the submission file: one row per test Id with its predicted price.
output = pd.DataFrame({'Id': X_test.index, 'SalePrice': predictions})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")
