## Imports and Configuration ##

In [50]:
import os
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display
from pandas.api.types import CategoricalDtype

from pandas.core.frame import DataFrame
from category_encoders import MEstimateEncoder
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import KFold, cross_val_score
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer


# Set Matplotlib defaults
# Matplotlib 3.6 renamed the bundled seaborn style sheets with a
# "seaborn-v0_8-" prefix and the old names were removed in later releases.
# Prefer the new name when this installation knows it; otherwise fall back to
# the legacy name (which still raises if it is missing as well).
whitegrid_style = (
    "seaborn-v0_8-whitegrid"
    if "seaborn-v0_8-whitegrid" in plt.style.available
    else "seaborn-whitegrid"
)
plt.style.use(whitegrid_style)
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)

# Mute warnings
# NOTE: this silences every warning category for the rest of the session,
# including deprecation notices from the libraries used below.
warnings.filterwarnings('ignore')

## Load data ##

In [51]:
# Folder holding the competition CSV files, relative to the working directory
data_dir = Path("input/house-prices-advanced-regression-techniques/")
# Read both splits the same way, using the "Id" column as the row index
df_train, df_test = (
    pd.read_csv(data_dir.joinpath(csv_name), index_col="Id")
    for csv_name in ("train.csv", "test.csv")
)
# Stack the train and test rows into one frame so they can be processed together
df_in = pd.concat([df_train, df_test])

## Score dataset ##

In [52]:
def score_dataset(X, y, model=None):
    """Return the 5-fold cross-validated RMSLE of ``model`` on ``(X, y)``.

    The metric for the Housing competition is RMSLE (Root Mean Squared Log
    Error). It is computed here as the RMSE of predictions against
    ``np.log(y)``, so ``y`` should be strictly positive.

    Parameters
    ----------
    X : feature matrix passed unchanged to ``cross_val_score``.
    y : target values; the natural log is taken before scoring.
    model : estimator to evaluate. When omitted, a fresh ``XGBRegressor()``
        with default settings is created for this call.

    Returns
    -------
    The square root of the mean (over the 5 folds) mean squared error on the
    log-transformed target.
    """
    if model is None:
        # Build the default inside the function: a default written in the
        # signature is created once at definition time and then shared by
        # every call that omits ``model``.
        model = XGBRegressor()
    log_y = np.log(y)
    # scikit-learn reports MSE negated (higher is better), hence the sign flip
    neg_mse_per_fold = cross_val_score(
        model, X, log_y, cv=5, scoring="neg_mean_squared_error",
    )
    mean_mse = -1 * neg_mse_per_fold.mean()
    return np.sqrt(mean_mse)

## Preprocess data ##

In [53]:
from sklearn.preprocessing import FunctionTransformer

def clean(df: DataFrame):
    """Return a cleaned copy of the housing data; ``df`` itself is not modified.

    Steps applied:
    * columns whose names begin with a digit are renamed
      (``1stFlrSF`` -> ``FirstFlrSF``, ``2ndFlrSF`` -> ``SecondFlrSF``,
      ``3SsnPorch`` -> ``Threeseasonporch``);
    * the ``Exterior2nd`` value ``"Brk Cmn"`` is replaced with ``"BrkComm"``;
    * ``GarageYrBlt`` values that are not ``<= 2010`` are replaced with the
      row's ``YearBuilt``.

    Parameters
    ----------
    df : DataFrame containing at least the ``Exterior2nd``, ``GarageYrBlt``
        and ``YearBuilt`` columns.

    Returns
    -------
    A new DataFrame with the fixes above applied.
    """
    # Names beginning with numbers are awkward to work with.
    # rename() without inplace=True returns a new frame, so every assignment
    # below lands on that copy and the caller's DataFrame stays untouched.
    cleaned = df.rename(columns={
        "1stFlrSF": "FirstFlrSF",
        "2ndFlrSF": "SecondFlrSF",
        "3SsnPorch": "Threeseasonporch",
    })
    cleaned["Exterior2nd"] = cleaned["Exterior2nd"].replace({"Brk Cmn": "BrkComm"})
    # Some values of GarageYrBlt are corrupt, so we'll replace them with the
    # year the house was built. A missing (NaN) GarageYrBlt also fails the
    # `<= 2010` comparison, so it is filled with YearBuilt as well.
    cleaned["GarageYrBlt"] = cleaned["GarageYrBlt"].where(
        cleaned["GarageYrBlt"] <= 2010, cleaned["YearBuilt"])
    return cleaned


def filter_categorical(df: DataFrame):
    """Return the columns of ``df`` whose dtype is not ``object``."""
    non_object_columns = df.select_dtypes(exclude=['object'])
    return non_object_columns


# Building blocks for the pipelines below: a median imputer, plus the two
# helper functions wrapped as transformers so they can be used as Pipeline steps.
imputer = SimpleImputer(strategy='median')
filter_categorical_transformer = FunctionTransformer(func=filter_categorical)
cleanup_transformer = FunctionTransformer(func=clean)

## Setup pipeline ##

In [54]:

# Stage 1: preprocessing applied to the merged train+test frame.
# It gets its own name so it is not confused with the modelling pipeline
# that is bound to `pipeline` further down.
preprocessing = Pipeline(steps=[
    ('clean', cleanup_transformer),
    ('filter', filter_categorical_transformer)
])

# Use fit_transform rather than transform: newer scikit-learn releases warn
# when transform is called on a Pipeline that has never been fitted.
transformed_df = preprocessing.fit_transform(df_in)

# Split the processed frame back into the original train / test rows
df_train = transformed_df.loc[df_train.index, :]
df_test = transformed_df.loc[df_test.index, :]

# pop() removes the target column from the frame and returns it
y = df_train.pop('SalePrice')
X = df_train

# Drop the SalePrice column that the concat introduced on the test rows
df_test.pop('SalePrice')
X_test = df_test


# Stage 2: median imputation followed by the regressor, scored with
# cross-validation so the imputer is fitted on each training fold only.
model = XGBRegressor(random_state=0)
pipeline = Pipeline(steps=[
    ('imputer', imputer),
    ('model', model)
])

scores = score_dataset(X, y, model=pipeline)

print("Scores:\n", scores)


Scores:
 0.14555807805491183


Previous cross-validated RMSLE: 0.14699598651540163

Current cross-validated RMSLE: 0.14555807805491183

## Train Model and Create Submission ##

In [None]:
# Refit on the full training set using the same steps that were
# cross-validated earlier (median imputation followed by XGBoost), so the
# reported score describes the model that is actually submitted.
xgb = XGBRegressor(random_state=0)
final_model = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('model', xgb),  # Pipeline fits this instance in place, so `xgb` ends up fitted
])
# Train on log(SalePrice), matching how the model was scored, then undo the
# log so the predictions are back on the original price scale.
final_model.fit(X, np.log(y))
predictions = np.exp(final_model.predict(X_test))

output = pd.DataFrame({'Id': X_test.index, 'SalePrice': predictions})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
