## Vars ##

In [None]:
from pathlib import Path

# Optional local configuration: a `project_vars.py` file in the working
# directory can switch on result logging and saving of intermediate data.
# Without that file both switches stay off.
path = Path('./project_vars.py')
is_file_exist = path.is_file()

if is_file_exist:
    import project_vars

    is_write_to_results = project_vars.is_write_to_results
    save_itermidiate_results = project_vars.save_itermidiate_results
else:
    is_write_to_results = False
    save_itermidiate_results = False

# The `results` helper module is only imported when scores are being recorded.
if is_write_to_results:
    import results as r

def write_to_results(scores: float):
    """Report a score, optionally through the ``results`` helper.

    When the module-level ``is_write_to_results`` flag is set, ``scores`` is
    passed to ``r.result`` together with ``'data/results.json'`` and whatever
    that call returns is printed. Otherwise the raw score is printed.
    """
    if not is_write_to_results:
        print(scores)
        return
    print(r.result(scores, 'data/results.json'))

## Imports and Configuration ##

In [None]:
import os
import warnings
import optuna

import matplotlib as mpt
import category_encoders as ce
import xgboost as xgb
import sklearn as skl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display
from pandas.api.types import CategoricalDtype

from pandas.core.frame import DataFrame
from category_encoders import MEstimateEncoder
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer,OrdinalEncoder,OneHotEncoder
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer

import sys
# Log the interpreter and library versions used for this run.
# Each call prints the label and the value on separate lines.
print("Python version", sys.version, sep="\n")
print("Version info.", sys.version_info, sep="\n")

print('pandas', pd.__version__, sep="\n")
print('sklearn', skl.__version__, sep="\n")
print('xgboost', xgb.__version__, sep="\n")
print('category_encoders', ce.__version__, sep="\n")
print('seaborn', sns.__version__, sep="\n")
print('matplotlib', mpt.__version__, sep="\n")


# Set Matplotlib defaults.
# Matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8-*"
# and later releases dropped the old names. Prefer the new name when it is
# available and fall back to the original one on older installations.
plt.style.use(next(
    (
        style_name
        for style_name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid")
        if style_name in plt.style.available
    ),
    "seaborn-whitegrid",
))
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)

# Mute warnings
warnings.filterwarnings('ignore')

# Name of the target column used throughout the notebook.
y_column_name = 'SalePrice'

def to_dataframe(index, columns):
    """Build a transformer step that wraps its input in a ``pd.DataFrame``.

    Args:
        index: Row labels given to the resulting frame.
        columns: Column labels given to the resulting frame.

    Returns:
        A ``FunctionTransformer`` whose function calls
        ``pd.DataFrame(X, index=index, columns=columns)``.
    """
    def _as_frame(X):
        return pd.DataFrame(X, index=index, columns=columns)

    return FunctionTransformer(_as_frame)

# Report whether a `vars.py` file is present in the working directory.
path = Path('vars.py')

print(path.is_file())


## Load data ##

In [None]:
# Read both competition splits, indexed by the "Id" column.
data_dir = Path("../input/house-prices-advanced-regression-techniques/")
df_train, df_test = (
    pd.read_csv(data_dir / file_name, index_col="Id")
    for file_name in ("train.csv", "test.csv")
)
# Stack train on top of test so both splits go through the same preprocessing.
df_in_raw = pd.concat([df_train, df_test])


## Score dataset ##

In [None]:
def score_dataset(X, y, model=None, n_jobs=None):
    """Return the cross-validated RMSLE of ``model`` on ``(X, y)``.

    The target is log-transformed, 5-fold cross-validation is run with the
    ``neg_mean_squared_error`` scorer, and the square root of the sign-flipped
    mean fold score is returned. The Housing competition metric is RMSLE
    (Root Mean Squared Log Error).

    Args:
        X: Feature matrix, passed straight to ``cross_val_score``.
        y: Target values. ``np.log`` is applied to them, so non-positive
            values produce ``-inf``/``NaN``.
        model: Estimator to evaluate. ``None`` (the default) evaluates a
            freshly constructed ``XGBRegressor()``.
        n_jobs: Forwarded to ``cross_val_score``.

    Returns:
        The root of the mean squared error measured on the log target.
    """
    # Build the default estimator per call rather than sharing one instance
    # that was created when the function was defined.
    if model is None:
        model = XGBRegressor()
    log_y = np.log(y)
    fold_scores = cross_val_score(
        model, X, log_y, cv=5, scoring="neg_mean_squared_error", n_jobs=n_jobs,
    )
    # The scorer returns negated MSE; flip the sign before taking the root.
    return np.sqrt(-fold_scores.mean())


## Preprocess data ##

### Clean Data ###

In [None]:
# Columns dropped at the end of `clean`. 'Threeseasonporch' is the renamed
# form of '3SsnPorch' (see the rename inside `clean`).
removed_columns = [
    'YrSold',
    'MoSold',
    'Utilities',
    'RoofMatl',
    'Threeseasonporch',
    'MiscFeature',
    'BsmtFinSF2',
    'Condition2',
    'PoolArea',
    'PoolQC',
    'Alley',
]


def clean(df: DataFrame) -> DataFrame:
    """Return a cleaned copy of the raw housing frame.

    Steps, in order:
      * normalise the ``"Brk Cmn"`` spelling in ``Exterior2nd`` to ``"BrkComm"``;
      * replace ``GarageYrBlt`` values that are not ``<= 2010`` with ``YearBuilt``;
      * rename the columns whose names start with a digit;
      * drop every column listed in ``removed_columns``.

    Args:
        df: Raw frame containing at least ``Exterior2nd``, ``GarageYrBlt``,
            ``YearBuilt``, the three digit-prefixed columns and the columns
            named in ``removed_columns``.

    Returns:
        A new frame; ``df`` itself is left untouched.

    Raises:
        KeyError: If a required column is missing.
    """
    # Work on a copy: assigning columns and renaming in place would leave the
    # caller's frame half-modified (renamed but not dropped) and can trigger
    # chained-assignment warnings when a filtered slice is passed in.
    df = df.copy()
    df["Exterior2nd"] = df["Exterior2nd"].replace({"Brk Cmn": "BrkComm"})
    # Some values of GarageYrBlt are corrupt, so we'll replace them with the
    # year the house was built. Missing garage years fail the `<=` comparison
    # as well, so they also receive YearBuilt.
    df["GarageYrBlt"] = df["GarageYrBlt"].where(
        df["GarageYrBlt"] <= 2010, df["YearBuilt"])
    # Names beginning with numbers are awkward to work with
    df = df.rename(columns={
        "1stFlrSF": "FirstFlrSF",
        "2ndFlrSF": "SecondFlrSF",
        "3SsnPorch": "Threeseasonporch",
    })
    return df.drop(columns=removed_columns)


# Drop very large lots (LotArea not below 100000) from the training rows only.
# Applying the same mask to the merged train+test frame would also remove any
# test row at or above the threshold, and the later per-split lookup by
# `df_test.index` would then no longer find those rows.
# NOTE(review): assumes train and test Ids do not overlap — confirm.
train_keep_mask = df_train.LotArea < 100000
outlier_ids = df_train.index[~train_keep_mask]
df_train = df_train[train_keep_mask]
df_in = clean(df_in_raw.drop(index=outlier_ids))


### Encode nominal (unordered) categorical features ###

In [None]:
# Candidate unordered categorical columns.
features_nominative = [
    "MSSubClass", "MSZoning",
    "Street", "Alley",
    "LandContour", "LotConfig",
    "Neighborhood", "Condition1",
    "Condition2", "BldgType",
    "HouseStyle", "RoofStyle",
    "RoofMatl", "Exterior1st",
    "Exterior2nd", "MasVnrType",
    "Foundation", "Heating",
    "GarageType",
    "MiscFeature", "SaleType",
    "SaleCondition"
]

# Keep only the columns that are not listed in removed_columns.
features_nominative = [item for item in features_nominative if item not in removed_columns]

# Number of one-hot output columns: distinct values per feature counted
# *after* the same 'None' fill the imputer below applies. A column holding
# both missing values and the literal string 'None' collapses into a single
# category once imputed, so counting raw unique values and subtracting a fixed
# constant only works for one particular mix of data and parser defaults.
# `astype(object)` keeps this safe if the cell is re-run after the columns
# have already been converted to the category dtype.
one_hot_count = sum(
    df_in[column].astype(object).fillna('None').nunique()
    for column in features_nominative
)

# Placeholder labels (OH0, OH1, ...) for the one-hot output columns.
one_hote_names = [f'OH{position}' for position in range(one_hot_count)]

for name in features_nominative:
    df_in[name] = df_in[name].astype("category")

# Fill missing values with the string 'None', then one-hot encode.
categorical_nominative_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(
        strategy='constant',
        missing_values=np.nan,
        fill_value='None')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
    # ('encoder', OrdinalEncoder(
    #     handle_unknown='use_encoded_value',
    #     unknown_value=np.nan
    # )),
])


### Encode the ordinal (ordered) categorical features ###

In [None]:
# Pandas calls the categories "levels"
five_levels = ["Po", "Fa", "TA", "Gd", "Ex"]
ten_levels = list(range(1, 11))

# Integer-valued ordered features.
ordered_levels_int = {
    "OverallQual": ten_levels,
    "OverallCond": ten_levels,
}

ordered_levels_int = {
    key: levels
    for key, levels in ordered_levels_int.items()
    if key not in removed_columns
}

# String-valued ordered features; each list runs from lowest to highest level.
ordered_levels = {
    "ExterQual": five_levels,
    "ExterCond": five_levels,
    "BsmtQual": five_levels,
    "BsmtCond": five_levels,
    "HeatingQC": five_levels,
    "KitchenQual": five_levels,
    "FireplaceQu": five_levels,
    "GarageQual": five_levels,
    "GarageCond": five_levels,
    "PoolQC": five_levels,
    "LotShape": ["Reg", "IR1", "IR2", "IR3"],
    "LandSlope": ["Sev", "Mod", "Gtl"],
    "BsmtExposure": ["No", "Mn", "Av", "Gd"],
    "BsmtFinType1": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
    "BsmtFinType2": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
    "Functional": ["Sal", "Sev", "Maj1", "Maj2", "Mod", "Min2", "Min1", "Typ"],
    "GarageFinish": ["Unf", "RFn", "Fin"],
    "PavedDrive": ["N", "P", "Y"],
    "Utilities": ["NoSeWa", "NoSewr", "AllPub"],
    "CentralAir": ["N", "Y"],
    "Electrical": ["Mix", "FuseP", "FuseF", "FuseA", "SBrkr"],
    "Fence": ["MnWw", "GdWo", "MnPrv", "GdPrv"],
}

ordered_levels = {
    key: levels
    for key, levels in ordered_levels.items()
    if key not in removed_columns
}

features_ordered = list(ordered_levels)
features_ordered_int = list(ordered_levels_int)

# Values outside the declared levels become missing when cast to the
# categorical dtype; the imputers below then fill them.
for name, levels in ordered_levels.items():
    df_in[name] = df_in[name].astype(CategoricalDtype(levels, ordered=True))

for name, levels in ordered_levels_int.items():
    df_in[name] = df_in[name].astype(CategoricalDtype(levels, ordered=True))

# Integer levels: missing values are filled with 0 and the encoder derives its
# categories from the data.
categorical_ordinal_int_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(
            strategy='constant',
            missing_values=np.nan,
            fill_value=0)),
        ('encoder', OrdinalEncoder(
            handle_unknown='use_encoded_value', unknown_value=np.nan)),
    ]
)

# String levels: the imputer hands the encoder plain objects, so the ordered
# dtype set above is not visible to it. With the default categories the
# encoder would rank the strings alphabetically (e.g. "Ex" < "Fa" < "Gd"),
# discarding the declared order. Pass the levels explicitly, with the imputed
# 'None' as the lowest level, in the same column order as features_ordered.
categorical_ordinal_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(
            strategy='constant',
            missing_values=np.nan,
            fill_value='None')),
        ('encoder', OrdinalEncoder(
            categories=[
                ['None', *ordered_levels[feature]]
                for feature in features_ordered
            ],
            handle_unknown='use_encoded_value', unknown_value=np.nan)),
    ]
)


### Numerical data transform ###

In [None]:
# Numeric feature columns: every numeric-dtype column of df_in except the
# target column.
features_num = (
    df_in.select_dtypes(include=np.number)
    .columns
    .difference([y_column_name])
    .tolist()
)

print(features_num)

# Missing numeric values are filled with the constant 0.
numerical_constant_transformer = Pipeline(steps=[
    (
        'imputer_constant',
        SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0),
    ),
])

# Bare expression: shows the number of numeric features as the cell output.
len(features_num)


## Setup pipeline ##

In [None]:
# Route each feature group through its own transformer. The target column is
# passed through untouched so it stays aligned with the rows.
column_transformer = ColumnTransformer(
    transformers=[
        ('num_constant', numerical_constant_transformer, features_num),
        ('cat_nominative', categorical_nominative_transformer, features_nominative),
        ('cat_ordered', categorical_ordinal_transformer, features_ordered),
        ('cat_ordered_int', categorical_ordinal_int_transformer, features_ordered_int),
        ('price', 'passthrough', [y_column_name]),
    ],
    verbose_feature_names_out=False,
    verbose=True,
)

# The second step turns the encoded matrix back into a DataFrame. Its column
# labels are listed in the same order as the transformer groups above, with the
# placeholder one-hot names standing in for the nominal features.
transform_pipeline = Pipeline(
    steps=[
        ('encode', column_transformer),
        ('dataframe', to_dataframe(
            df_in.index,
            [
                *features_num,
                *one_hote_names,
                *features_ordered,
                *features_ordered_int,
                y_column_name,
            ],
        )),
    ],
    verbose=True,
)

transformed_df = transform_pipeline.fit_transform(df_in)

# Split the transformed rows back into the original train / test partitions.
df_train = transformed_df.loc[df_train.index]
df_test = transformed_df.loc[df_test.index]

# `pop` removes the target column from the training frame in place.
# NOTE(review): `y` is annotated as DataFrame, but `DataFrame.pop` returns a
# single column (a Series) — confirm and adjust the annotation.
X: DataFrame = df_train
y: DataFrame = X.pop('SalePrice')

if save_itermidiate_results:
    y.to_csv('data/y.csv')
    X.to_csv('data/X.csv')

# The test partition carries a SalePrice column too; discard it.
df_test.pop('SalePrice')
X_test = df_test


## Optimization ##

In [None]:
# XGBoost settings held fixed during the hyper-parameter search.
default_xgb_params_optimize = {
    'predictor': 'cpu_predictor',
    'num_parallel_tree': 6,
    'gamma': 0,
}

def optimize_params(optimize_X, optimize_y, n_trials=10):
    """Search XGBRegressor hyper-parameters with Optuna.

    Every trial samples one parameter set, combines it with the module-level
    ``default_xgb_params_optimize`` and scores an ``XGBRegressor`` built from
    the result through ``score_dataset``. The study minimises that score.

    Args:
        optimize_X: Features handed to ``score_dataset``.
        optimize_y: Target handed to ``score_dataset``.
        n_trials: Number of Optuna trials to run.

    Returns:
        ``study.best_params`` of the finished study.
    """
    def objective(trial):
        # NOTE(review): both `eta` and `learning_rate` are sampled; XGBoost
        # documents `eta` as an alias of `learning_rate` — confirm which one
        # takes effect.
        sampled_params = dict(
            max_delta_step=trial.suggest_float('max_delta_step', 0, 10),
            eta=trial.suggest_float('eta', 0.1, 1.0),
            colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0),
            colsample_bylevel=trial.suggest_float(
                'colsample_bylevel', 0.1, 1.0),
            colsample_bynode=trial.suggest_float('colsample_bynode', 0.1, 1.0),
            max_depth=trial.suggest_int("max_depth", 2, 8),
            learning_rate=trial.suggest_float(
                "learning_rate", 1e-4, 1e-1, log=True),
            n_estimators=trial.suggest_int("n_estimators", 1000, 6000),
            min_child_weight=trial.suggest_int("min_child_weight", 1, 5),
            subsample=trial.suggest_float("subsample", 0.2, 1.0),
            reg_alpha=trial.suggest_float("reg_alpha", 1e-4, 1e2, log=True),
            reg_lambda=trial.suggest_float("reg_lambda", 1e-4, 1e2, log=True),
        )
        regressor = XGBRegressor(
            random_state=0,
            nthread=9,
            **default_xgb_params_optimize,
            **sampled_params,
        )
        return score_dataset(optimize_X, optimize_y, model=regressor, n_jobs=10)

    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=n_trials, n_jobs=1)
    return study.best_params

# xgb_params_optimize = optimize_params(X, y, n_trials=300)
# xgb_params_optimize.update(default_xgb_params_optimize)
# print(xgb_params_optimize)

## XGBRegressor params ##

In [None]:
# Parameters used for the final model: the sampled values first, followed by
# the three fixed settings (predictor, num_parallel_tree, gamma).
# NOTE(review): both `eta` and `learning_rate` are set; XGBoost documents
# `eta` as an alias of `learning_rate` — confirm which one takes effect.
xgb_params = dict(
    max_delta_step=8.86618038915867,
    eta=0.9834024455355324,
    colsample_bytree=0.8675149982844935,
    colsample_bylevel=0.7210269022252196,
    colsample_bynode=0.22600933379430393,
    max_depth=4,
    learning_rate=0.016021794885963347,
    n_estimators=3620,
    min_child_weight=2,
    subsample=0.45740090097048586,
    reg_alpha=0.026940973044607994,
    reg_lambda=1.0886035281053992,
    predictor="cpu_predictor",
    num_parallel_tree=6,
    gamma=0,
)


## Setup model ##

In [None]:
# Wrap the configured regressor in a single-step pipeline and report its
# cross-validated score.
model = XGBRegressor(random_state=0, nthread=9, **xgb_params)
pipeline = Pipeline([('model', model)], verbose=True)

scores = score_dataset(X, y, model=pipeline)
write_to_results(scores)


## Train Model and Create Submissions ##

In [None]:
# Fit on the full training set against the log of the target.
# NOTE(review): this rebinds the global name `xgb`, which the import section
# uses as the alias of the xgboost module — confirm nothing later needs the
# module under that name.
xgb = XGBRegressor(random_state=0, **xgb_params).fit(X, np.log(y))
# The model predicts on the log scale; exponentiate to get prices back.
predictions = np.exp(xgb.predict(X_test))

output = pd.DataFrame({'Id': X_test.index, 'SalePrice': predictions})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")
