## Vars ##

In [1]:
from pathlib import Path

path = Path('./project_vars.py')
is_file_exist = path.is_file()

if is_file_exist:
    import project_vars

is_write_to_results = False if not is_file_exist else project_vars.is_write_to_results
save_itermidiate_results = False if not is_file_exist else project_vars.save_itermidiate_results

if is_write_to_results:
    import results as r

def write_to_results(scores: float):
    if is_write_to_results:
        print(r.result(scores, 'data/results.json'))
    else:
        print(scores)

## Imports and Configuration ##

In [2]:
import os
import warnings

import matplotlib as mpt
import category_encoders as ce
import xgboost as xgb
import sklearn as skl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display
from pandas.api.types import CategoricalDtype

from pandas.core.frame import DataFrame
from category_encoders import MEstimateEncoder
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer,OrdinalEncoder
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer

import sys
print("Python version")
print (sys.version)
print("Version info.")
print (sys.version_info)

print('pandas')
print(pd.__version__)
print('sklearn')
print(skl.__version__)
print('xgboost')
print(xgb.__version__)
print('category_encoders')
print(ce.__version__)
print('seaborn')
print(sns.__version__)
print('matplotlib')
print(mpt.__version__)


# Set Matplotlib defaults
plt.style.use("seaborn-whitegrid")
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)

# Mute warnings
warnings.filterwarnings('ignore')

y_column_name = 'SalePrice'

def to_dataframe(index, columns):
    return FunctionTransformer(lambda X: pd.DataFrame(X, index=index, columns=columns))

path = Path('./vars.py')

print(path.is_file())


Python version
3.11.0 (main, Oct 24 2022, 18:26:48) [MSC v.1933 64 bit (AMD64)]
Version info.
sys.version_info(major=3, minor=11, micro=0, releaselevel='final', serial=0)
pandas
1.5.3
sklearn
1.2.1
xgboost
1.7.3
category_encoders
2.6.0
seaborn
0.12.2
matplotlib
3.7.0
False


  plt.style.use("seaborn-whitegrid")


## Load data ##

In [3]:
data_dir = Path("../input/house-prices-advanced-regression-techniques/")
df_train = pd.read_csv(data_dir / "train.csv", index_col="Id")
df_test = pd.read_csv(data_dir / "test.csv", index_col="Id")
# Merge the splits so we can process them together
df_in = pd.concat([df_train, df_test])


## Score dataset ##

In [4]:
def score_dataset(X, y, model=XGBRegressor(), n_jobs=None):
    # Metric for Housing competition is RMSLE (Root Mean Squared Log Error)
    log_y = np.log(y)
    score = cross_val_score(
        model, X, log_y, cv=5, scoring="neg_mean_squared_error", n_jobs=n_jobs,
    )
    score = -1 * score.mean()
    score = np.sqrt(score)
    return score


## Preprocess data ##

### Clean Data ###

In [5]:
def clean(df: DataFrame):
    df["Exterior2nd"] = df["Exterior2nd"].replace({"Brk Cmn": "BrkComm"})
    # Some values of GarageYrBlt are corrupt, so we'll replace them
    # with the year the house was built
    df["GarageYrBlt"] = df["GarageYrBlt"].where(
        df.GarageYrBlt <= 2010, df.YearBuilt)
    # Names beginning with numbers are awkward to work with
    df.rename(columns={
        "1stFlrSF": "FirstFlrSF",
        "2ndFlrSF": "SecondFlrSF",
        "3SsnPorch": "Threeseasonporch",
    }, inplace=True,
    )
    return df

df_in = df_in[df_in.LotArea < 100000]
df_train = df_train[df_train.LotArea < 100000]
df_in = clean(df_in)


### Encode nominative (unordered) categorical features ###

In [6]:
features_nominative = [
    "MSSubClass", "MSZoning",
    "Street", "Alley",
    "LandContour", "LotConfig",
    "Neighborhood", "Condition1",
    "Condition2", "BldgType",
    "HouseStyle", "RoofStyle",
    "RoofMatl", "Exterior1st",
    "Exterior2nd", "MasVnrType",
    "Foundation", "Heating",
     "GarageType",
    "MiscFeature", "SaleType",
    "SaleCondition"
]

for name in features_nominative:
    df_in[name] = df_in[name].astype("category")

categorical_nominative_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(
        strategy='constant',
        missing_values=np.nan,
        fill_value='None')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan)),
])


### Encode the ordinal (ordered) categorical features ###

In [7]:
# Pandas calls the categories "levels"
five_levels = ["Po", "Fa", "TA", "Gd", "Ex"]
ten_levels = list(range(1,11))

ordered_levels_int = {
    "OverallQual": ten_levels,
    "OverallCond": ten_levels,
}

ordered_levels = {
    "ExterQual": five_levels,
    "ExterCond": five_levels,
    "BsmtQual": five_levels,
    "BsmtCond": five_levels,
    "HeatingQC": five_levels,
    "KitchenQual": five_levels,
    "FireplaceQu": five_levels,
    "GarageQual": five_levels,
    "GarageCond": five_levels,
    "PoolQC": five_levels,
    "LotShape": ["Reg", "IR1", "IR2", "IR3"],
    "LandSlope": ["Sev", "Mod", "Gtl"],
    "BsmtExposure": ["No", "Mn", "Av", "Gd"],
    "BsmtFinType1": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
    "BsmtFinType2": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
    "Functional": ["Sal", "Sev", "Maj1", "Maj2", "Mod", "Min2", "Min1", "Typ"],
    "GarageFinish": ["Unf", "RFn", "Fin"],
    "PavedDrive": ["N", "P", "Y"],
    "Utilities": ["NoSeWa", "NoSewr", "AllPub"],
    "CentralAir": ["N", "Y"],
    "Electrical": ["Mix", "FuseP", "FuseF", "FuseA", "SBrkr"],
    "Fence": ["MnWw", "GdWo", "MnPrv", "GdPrv"],
}

features_ordered = list(ordered_levels.keys())
features_ordered_int = list(ordered_levels_int.keys())

for name, levels in ordered_levels.items():
    df_in[name] = df_in[name].astype(CategoricalDtype(levels, ordered=True))

for name, levels in ordered_levels_int.items():
    df_in[name] = df_in[name].astype(CategoricalDtype(levels, ordered=True))

categorical_ordinal_int_transformer = Pipeline(
    steps=[
         ('imputer', SimpleImputer(
            strategy='constant',
            missing_values=np.nan,
            fill_value=0)),
        ('encoder', OrdinalEncoder(
            handle_unknown='use_encoded_value', unknown_value=np.nan)),
    ]
)

categorical_ordinal_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(
            strategy='constant',
            missing_values=np.nan,
            fill_value='None')),
        ('encoder', OrdinalEncoder(
            handle_unknown='use_encoded_value', unknown_value=np.nan)),
    ]
)


### Numerical data transform ###

In [8]:
test_exclude = ['MasVnrArea', 'BsmtFullBath', 'BsmtHalfBath',
                'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'GarageArea', 'GarageCars', 'TotalBsmtSF']

features_num = list(df_in.select_dtypes(
    include=np.number).columns.difference([y_column_name]))

# features_num = ['LotArea', 'LotFrontage']

print(list(features_num))

numerical_constant_transformer = Pipeline(steps=[
    ('imputer_constant', SimpleImputer(strategy='constant', fill_value=0,
     missing_values=np.nan)),
])


['BedroomAbvGr', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtFullBath', 'BsmtHalfBath', 'BsmtUnfSF', 'EnclosedPorch', 'Fireplaces', 'FirstFlrSF', 'FullBath', 'GarageArea', 'GarageCars', 'GarageYrBlt', 'GrLivArea', 'HalfBath', 'KitchenAbvGr', 'LotArea', 'LotFrontage', 'LowQualFinSF', 'MasVnrArea', 'MiscVal', 'MoSold', 'OpenPorchSF', 'PoolArea', 'ScreenPorch', 'SecondFlrSF', 'Threeseasonporch', 'TotRmsAbvGrd', 'TotalBsmtSF', 'WoodDeckSF', 'YearBuilt', 'YearRemodAdd', 'YrSold']


## Setup pipeline ##

In [9]:
column_transformer = ColumnTransformer(
    verbose_feature_names_out=False,
    transformers=[
        ('num_constant', numerical_constant_transformer, features_num),
        ('cat_nominative', categorical_nominative_transformer, features_nominative),
        ('cat_ordered', categorical_ordinal_transformer, features_ordered),
        ('cat_ordered_int', categorical_ordinal_int_transformer, features_ordered_int),
        ('price', 'passthrough', [y_column_name]),
    ], verbose=True)


transform_pipeline = Pipeline(steps=[
    ('encode', column_transformer),
    ('dataframe', to_dataframe(df_in.index,
                               features_num +
                               features_nominative +
                               features_ordered +
                               features_ordered_int +
                               [y_column_name]
                               )),
], verbose=True)

transformed_df = transform_pipeline.fit_transform(df_in)


df_train = transformed_df.loc[df_train.index, :]
df_test = transformed_df.loc[df_test.index, :]

y: DataFrame = df_train.pop('SalePrice')
X: DataFrame = df_train

if save_itermidiate_results:
    y.to_csv('data/y.csv')
    X.to_csv('data/X.csv')

df_test.pop('SalePrice')
X_test = df_test


[ColumnTransformer] .. (1 of 5) Processing num_constant, total=   0.0s
[ColumnTransformer]  (2 of 5) Processing cat_nominative, total=   0.0s
[ColumnTransformer] ... (3 of 5) Processing cat_ordered, total=   0.0s
[ColumnTransformer]  (4 of 5) Processing cat_ordered_int, total=   0.0s
[ColumnTransformer] ......... (5 of 5) Processing price, total=   0.0s
[Pipeline] ............ (step 1 of 2) Processing encode, total=   0.0s
[Pipeline] ......... (step 2 of 2) Processing dataframe, total=   0.0s


## XGBRegressor params ##

In [10]:
xgb_params = {
    "max_delta_step": 6.0,
    "eta": 0.248,
    "colsample_bytree": 0.234,
    "colsample_bylevel": 0.855,
    "colsample_bynode": 0.67,
    "max_depth": 3,
    "learning_rate": 0.01926536672403492,
    "n_estimators": 2974,
    "min_child_weight": 1,
    "subsample": 0.6020477193081436,
    "reg_alpha": 0.00522653797861164,
    "reg_lambda": 0.04226667783580202,
    "predictor": "cpu_predictor",
    "num_parallel_tree": 6,
    "gamma": 0
}


## Setup model ##

In [11]:
model = XGBRegressor(random_state=0, nthread=9, **xgb_params)
pipeline = Pipeline(steps=[
    ('model', model)
], verbose=True)

scores = score_dataset(X, y, model=pipeline)

write_to_results(scores)


[Pipeline] ............. (step 1 of 1) Processing model, total=   3.6s
[Pipeline] ............. (step 1 of 1) Processing model, total=   3.6s
[Pipeline] ............. (step 1 of 1) Processing model, total=   3.6s
[Pipeline] ............. (step 1 of 1) Processing model, total=   3.6s
[Pipeline] ............. (step 1 of 1) Processing model, total=   3.6s
{'prev': 0.11720723019509208, 'curr': 0.11696939948027624, 'best': 0.11696939948027624}


# Train Model and Create Submissions #

In [None]:
xgb = XGBRegressor(random_state=0, **xgb_params)
xgb.fit(X, np.log(y))
predictions = np.exp(xgb.predict(X_test))

output = pd.DataFrame({'Id': X_test.index, 'SalePrice': predictions})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")
