## Imports and Configuration ##

In [None]:
import results as r
import os
import warnings
from pathlib import Path

import optuna
import matplotlib as mpt
import category_encoders as ce
import xgboost as xgb
import sklearn as skl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display
from pandas.api.types import CategoricalDtype

from pandas.core.frame import DataFrame
from category_encoders import MEstimateEncoder
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer,OrdinalEncoder
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer

# Report the version of each main library used by this notebook: the
# library name on one line, its version string on the next.
for library_name, library in (
    ('pandas', pd),
    ('sklearn', skl),
    ('xgboost', xgb),
    ('category_encoders', ce),
    ('seaborn', sns),
    ('matplotlib', mpt),
):
    print(library_name)
    print(library.__version__)


# Set Matplotlib defaults
# Matplotlib 3.6 renamed its bundled seaborn style sheets to
# "seaborn-v0_8-*" and later releases dropped the old names, so use
# whichever spelling this installation actually ships.
for style_name in ("seaborn-whitegrid", "seaborn-v0_8-whitegrid"):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)

# Silence every warning for the rest of the session
warnings.filterwarnings(action="ignore")

# Name of the target column in the input data
y_column_name = "SalePrice"

def to_dataframe(index, columns):
    """Build a transformer step that wraps its input in a ``pd.DataFrame``.

    The returned ``FunctionTransformer`` hands whatever it receives to
    ``pd.DataFrame`` together with the given ``index`` and ``columns``.
    """
    def _as_frame(values):
        return pd.DataFrame(values, index=index, columns=columns)

    return FunctionTransformer(_as_frame)


## Load data ##

In [None]:
# Directory holding the competition CSV files
data_dir = Path("../input/house-prices-advanced-regression-techniques/")
# Read both splits, each indexed by its "Id" column
df_train, df_test = (
    pd.read_csv(data_dir / file_name, index_col="Id")
    for file_name in ("train.csv", "test.csv")
)
# Stack train on top of test so both can be preprocessed in one pass
df_in = pd.concat([df_train, df_test])


## Score dataset ##

In [None]:
def score_dataset(X, y, model=None):
    """Return the cross-validated error of ``model`` on the log of ``y``.

    The model is evaluated with 5-fold ``cross_val_score`` using the
    ``neg_mean_squared_error`` scorer against ``np.log(y)``; the returned
    value is the square root of the mean squared error across folds
    (the competition's RMSLE metric).

    Args:
        X: Feature matrix passed to ``cross_val_score``.
        y: Target values; must be positive because the log is taken.
        model: Estimator to evaluate. When omitted, a fresh default
            ``XGBRegressor`` is created for this call.

    Returns:
        The root of the mean cross-validated squared error on ``log(y)``.
    """
    # Build the default per call rather than once at definition time, so
    # no estimator instance is shared between unrelated calls.
    if model is None:
        model = XGBRegressor()
    log_y = np.log(y)
    fold_scores = cross_val_score(
        model, X, log_y, cv=5, scoring="neg_mean_squared_error",
    )
    # The scorer reports negated MSE; flip the sign before taking the root.
    return np.sqrt(-1 * fold_scores.mean())


In [None]:
def optimize_params(optimize_X, optimize_y, n_trials=10):
    """Search XGBRegressor hyper-parameters with Optuna.

    Each trial samples a parameter set, builds an ``XGBRegressor`` with
    ``random_state=0`` and scores it with ``score_dataset``; the study
    minimizes that score.

    Args:
        optimize_X: Feature matrix forwarded to ``score_dataset``.
        optimize_y: Target values forwarded to ``score_dataset``.
        n_trials: Number of Optuna trials to run.

    Returns:
        ``study.best_params`` of the finished study.
    """
    def objective(trial):
        # Stepped ranges use suggest_float(step=...), the replacement for
        # the deprecated suggest_discrete_uniform, with the same
        # low/high/step values.
        # NOTE(review): XGBoost documents `learning_rate` as an alias of
        # `eta`, yet both are sampled here — confirm which one is meant
        # to take effect.
        xgb_params = dict(
            predictor='cpu_predictor',
            num_parallel_tree=4,
            gamma=0,
            max_delta_step=trial.suggest_float(
                'max_delta_step', 0, 10, step=0.1),
            eta=trial.suggest_float(
                'eta', 0.1, 1.0, step=0.001),
            colsample_bytree=trial.suggest_float(
                'colsample_bytree', 0.1, 1.0, step=0.001),
            colsample_bylevel=trial.suggest_float(
                'colsample_bylevel', 0.1, 1.0, step=0.001),
            colsample_bynode=trial.suggest_float(
                'colsample_bynode', 0.1, 1.0, step=0.001),
            max_depth=trial.suggest_int("max_depth", 2, 8),
            learning_rate=trial.suggest_float(
                "learning_rate", 1e-4, 1e-1, log=True),
            n_estimators=trial.suggest_int("n_estimators", 1000, 6000),
            min_child_weight=trial.suggest_int("min_child_weight", 1, 5),
            subsample=trial.suggest_float("subsample", 0.2, 1.0),
            reg_alpha=trial.suggest_float("reg_alpha", 1e-4, 1e2, log=True),
            reg_lambda=trial.suggest_float("reg_lambda", 1e-4, 1e2, log=True),
        )
        # Named `candidate` so the `xgb` module alias is not shadowed.
        candidate = XGBRegressor(random_state=0, **xgb_params)
        return score_dataset(optimize_X, optimize_y, candidate)

    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=n_trials)
    return study.best_params


## Preprocess data ##

### Clean Data ###

In [None]:
def clean(df: DataFrame):
    """Return a cleaned copy of ``df``; the input frame is left untouched.

    Steps applied:
      * ``Exterior2nd``: the value "Brk Cmn" is replaced with "BrkComm".
      * ``GarageYrBlt``: entries that are not ``<= 2010`` (which includes
        missing values, since that comparison is false for NaN) are
        replaced with the row's ``YearBuilt``.
      * Columns whose names start with a digit are renamed.

    Args:
        df: Frame containing at least ``Exterior2nd``, ``GarageYrBlt`` and
            ``YearBuilt``.

    Returns:
        A new ``DataFrame`` with the fixes applied.
    """
    # Work on a copy: callers often pass a filtered slice, and writing
    # into it would either touch the caller's data or hit pandas'
    # chained-assignment handling.
    cleaned = df.copy()
    cleaned["Exterior2nd"] = cleaned["Exterior2nd"].replace(
        {"Brk Cmn": "BrkComm"})
    # Some values of GarageYrBlt are corrupt, so we'll replace them
    # with the year the house was built
    cleaned["GarageYrBlt"] = cleaned["GarageYrBlt"].where(
        cleaned["GarageYrBlt"] <= 2010, cleaned["YearBuilt"])
    # Names beginning with numbers are awkward to work with
    return cleaned.rename(columns={
        "1stFlrSF": "FirstFlrSF",
        "2ndFlrSF": "SecondFlrSF",
        "3SsnPorch": "Threeseasonporch",
    })

# Drop very large lots (LotArea >= 100000) from the training rows only.
# Test rows are always kept: every test Id is looked up again when the
# transformed frame is split back into train and test, so filtering a
# test row out here would make that lookup fail.
# NOTE(review): assumes train and test Ids do not overlap — confirm.
df_train = df_train[df_train.LotArea < 100000]
df_in = df_in[(df_in.LotArea < 100000) | df_in.index.isin(df_test.index)]
df_in = clean(df_in)


### Encode nominative (unordered) categorical features ###

In [None]:
# Unordered categorical columns, in the order they are fed to the encoder
features_nominative = [
    "MSSubClass",
    "MSZoning",
    "Street",
    "Alley",
    "LandContour",
    "LotConfig",
    "Neighborhood",
    "Condition1",
    "Condition2",
    "BldgType",
    "HouseStyle",
    "RoofStyle",
    "RoofMatl",
    "Exterior1st",
    "Exterior2nd",
    "MasVnrType",
    "Foundation",
    "Heating",
    "GarageType",
    "MiscFeature",
    "SaleType",
    "SaleCondition",
]

# Cast all of them to pandas' "category" dtype in a single call
df_in = df_in.astype(dict.fromkeys(features_nominative, "category"))

# Unordered categoricals: fill missing entries with the string 'None',
# then encode every category as a number (unknown values become NaN).
nominative_imputer = SimpleImputer(
    missing_values=np.nan,
    strategy='constant',
    fill_value='None',
)
nominative_encoder = OrdinalEncoder(
    handle_unknown='use_encoded_value',
    unknown_value=np.nan,
)
categorical_nominative_transformer = Pipeline(steps=[
    ('imputer', nominative_imputer),
    ('encoder', nominative_encoder),
])


### Encode the ordinal (ordered) categorical features ###

In [None]:
# Level lists for the ordered categorical columns. Each list is later
# used as the category order of an ordered categorical dtype.
five_levels = ["Po", "Fa", "TA", "Gd", "Ex"]
ten_levels = [*range(10)]

# Integer-rated columns share the ten-level scale
ordered_levels_int = dict.fromkeys(("OverallQual", "OverallCond"), ten_levels)

ordered_levels = {
    # Columns rated on the shared five-level scale
    **dict.fromkeys(
        (
            "ExterQual",
            "ExterCond",
            "BsmtQual",
            "BsmtCond",
            "HeatingQC",
            "KitchenQual",
            "FireplaceQu",
            "GarageQual",
            "GarageCond",
            "PoolQC",
        ),
        five_levels,
    ),
    # Columns with their own scale
    "LotShape": ["Reg", "IR1", "IR2", "IR3"],
    "LandSlope": ["Sev", "Mod", "Gtl"],
    "BsmtExposure": ["No", "Mn", "Av", "Gd"],
    "BsmtFinType1": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
    "BsmtFinType2": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
    "Functional": ["Sal", "Sev", "Maj1", "Maj2", "Mod", "Min2", "Min1", "Typ"],
    "GarageFinish": ["Unf", "RFn", "Fin"],
    "PavedDrive": ["N", "P", "Y"],
    "Utilities": ["NoSeWa", "NoSewr", "AllPub"],
    "CentralAir": ["N", "Y"],
    "Electrical": ["Mix", "FuseP", "FuseF", "FuseA", "SBrkr"],
    "Fence": ["MnWw", "GdWo", "MnPrv", "GdPrv"],
}

# Ordered categorical columns, in the order they were declared
features_ordered = [*ordered_levels]

# Give each of those columns an ordered categorical dtype built from its
# declared level list
for column in features_ordered:
    ordered_dtype = CategoricalDtype(ordered_levels[column], ordered=True)
    df_in[column] = df_in[column].astype(ordered_dtype)

# Ordered categoricals: fill missing entries with the string 'None', then
# encode each column so that the integer codes follow its declared level
# order. Without explicit `categories`, OrdinalEncoder sorts the observed
# values (alphabetically for strings), which scrambles the ranking
# (e.g. "Ex" < "Fa" < "Gd" < "Po" < "TA").
# 'None' is the imputer's fill value, so it is placed first as the lowest
# level of every column. The category lists are built in `features_ordered`
# order, the same order in which the columns are selected for this
# transformer.
categorical_ordinal_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(
            strategy='constant',
            missing_values=np.nan,
            fill_value='None')),
        ('encoder', OrdinalEncoder(
            categories=[
                ['None', *ordered_levels[name]] for name in features_ordered
            ],
            handle_unknown='use_encoded_value', unknown_value=np.nan)),
    ]
)


### Numerical data transform ###

In [None]:
# Candidate columns for exclusion experiments; not referenced anywhere in
# the visible part of this notebook.
test_exclude = [
    'MasVnrArea',
    'BsmtFullBath',
    'BsmtHalfBath',
    'BsmtFinSF1',
    'BsmtFinSF2',
    'BsmtUnfSF',
    'GarageArea',
    'GarageCars',
    'TotalBsmtSF',
]

# Every numeric column of the merged frame except the target
numeric_columns = df_in.select_dtypes(include=np.number).columns
features_num = numeric_columns.difference([y_column_name]).tolist()

# features_num = ['LotArea', 'LotFrontage']

print(features_num)

# Missing numeric values are replaced with the constant 0
numerical_constant_transformer = Pipeline(steps=[
    ('imputer_constant', SimpleImputer(
        missing_values=np.nan,
        strategy='constant',
        fill_value=0,
    )),
])


## Numerical data sandbox ##

In [None]:
# Share of missing values for each numeric feature
df_features_num = df_in[features_num]
df_features_num_nan = df_features_num.isna()
l = df_features_num_nan.shape[0]
df_nan_ratio = df_features_num_nan.sum().div(l)
# Cell output: only the features that have missing values, largest first
df_nan_ratio[df_nan_ratio > 0].sort_values(ascending=False)

# df_area_vs_frontage = df_features_num[['LotArea','LotFrontage']]

# from sklearn.tree import DecisionTreeRegressor

# melbourne_model = DecisionTreeRegressor(random_state=1)

# # melbourne_model.fit(X, y)

# # df_area_vs_frontage = df_area_vs_frontage.loc[pd.notna(df_area_vs_frontage.LotFrontage)]
# df_area_vs_frontage = df_area_vs_frontage.loc[df_train.index, :]
# sns.scatterplot(x=df_area_vs_frontage['LotArea'], y=df_area_vs_frontage['LotFrontage'])



In [None]:
# Sandbox: impute the numeric features and pass the target through as-is
sandbox_columns = [*features_num, y_column_name]

numerical_sandbox_transformer = ColumnTransformer(
    transformers=[
        ('num_default', numerical_constant_transformer, features_num),
        ('price', 'passthrough', [y_column_name]),
    ],
    verbose_feature_names_out=False,
)

# The last step turns the transformer output back into a labelled frame
numerical_sandbox_pipeline = Pipeline(
    steps=[
        ('transformer', numerical_sandbox_transformer),
        ('dataframe', to_dataframe(df_in.index, sandbox_columns)),
    ],
)

sandbox_num_df = numerical_sandbox_pipeline.fit_transform(df_in)

# Keep the training rows, then split the target off the feature frame
sandbox_num_df_train = sandbox_num_df.loc[df_train.index, :]
sandbox_num_X = sandbox_num_df_train
sandbox_num_y = sandbox_num_X.pop('SalePrice')


In [None]:
# xgb_params_optimize = optimize_params(sandbox_num_X, sandbox_num_y, n_trials=40)
# print(xgb_params_optimize)

In [None]:
# XGBRegressor settings used for the numeric-only sandbox model
sandbox_num_params = dict(
    num_parallel_tree=5,
    predictor='cpu_predictor',
    max_depth=3,
    learning_rate=0.01310022730134661,
    n_estimators=3393,
    min_child_weight=1,
    colsample_bytree=0.3093338706926109,
    subsample=0.34665898773899856,
    reg_alpha=0.00025979785900718985,
    reg_lambda=0.6047124456612869,
)

In [None]:
# Sandbox regressor wrapped in a single-step pipeline
sandbox_num_model = XGBRegressor(**sandbox_num_params, random_state=0)
sandbox_num_pipeline = Pipeline(
    steps=[('model', sandbox_num_model)],
    verbose=True,
)

# sandbox_num_scores = score_dataset(
#     sandbox_num_X, sandbox_num_y, model=sandbox_num_pipeline)

# r.result(sandbox_num_scores, 'num_results.json')


## Setup pipeline ##

In [None]:
# Output column labels, in the same order as the transformer entries below
transformed_columns = [
    *features_num,
    *features_nominative,
    *features_ordered,
    y_column_name,
]

# Numeric imputation, both categorical encoders, and the untouched target
column_transformer = ColumnTransformer(
    transformers=[
        ('num_constant', numerical_constant_transformer, features_num),
        ('cat_nominative', categorical_nominative_transformer, features_nominative),
        ('cat_ordered', categorical_ordinal_transformer, features_ordered),
        ('price', 'passthrough', [y_column_name]),
    ],
    verbose_feature_names_out=False,
)

# The last step turns the transformer output back into a labelled frame
transform_pipeline = Pipeline(
    steps=[
        ('encode', column_transformer),
        ('dataframe', to_dataframe(df_in.index, transformed_columns)),
    ],
    verbose=True,
)

transformed_df = transform_pipeline.fit_transform(df_in)


# Split the transformed frame back into its train and test rows
train_ids, test_ids = df_train.index, df_test.index
df_train = transformed_df.loc[train_ids, :]
df_test = transformed_df.loc[test_ids, :]

# Separate the target from the training features and discard the target
# column from the test frame
y = df_train.pop('SalePrice')
df_test.pop('SalePrice')
X, X_test = df_train, df_test


## Optimize XGBRegressor params ##

In [None]:
# xgb_params_optimize = optimize_params(X, y, n_trials=120)
# print(xgb_params_optimize)


In [None]:
# XGBRegressor settings for the final model.
# NOTE(review): XGBoost documents `learning_rate` as an alias of `eta`;
# both are set here — confirm which one is meant to take effect.
xgb_params = dict(
    num_parallel_tree=2,
    predictor='cpu_predictor',
    gamma=0.0,
    max_delta_step=3.4000000000000004,
    eta=0.123,
    colsample_bytree=0.96,
    colsample_bylevel=0.221,
    colsample_bynode=0.959,
    max_depth=4,
    learning_rate=0.029669310558200025,
    n_estimators=1105,
    min_child_weight=1,
    subsample=0.3263923139834056,
    reg_alpha=0.005254189426687715,
    reg_lambda=1.5529469959822073,
)


## Setup model ##

In [None]:
# Tuned regressor wrapped in a single-step pipeline
model = XGBRegressor(**xgb_params, random_state=0)
pipeline = Pipeline(
    steps=[('model', model)],
    verbose=True,
)

# Cross-validate the pipeline on the full training set
scores = score_dataset(X, y, model=pipeline)

# Hand the score to the project's `results` helper and print what it returns
print(r.result(scores, 'results.json'))


# Train Model and Create Submissions #

In [None]:
# Fit the final model on the log of the target and undo the log on the
# predictions. The model is deliberately not named `xgb`: that name is the
# alias of the imported xgboost module and must not be rebound.
final_model = XGBRegressor(random_state=0, **xgb_params)
final_model.fit(X, np.log(y))
predictions = np.exp(final_model.predict(X_test))

# Write the submission file: one row per test Id with its predicted price
output = pd.DataFrame({'Id': X_test.index, 'SalePrice': predictions})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")
