In [3]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot as plt
from plotly import express as px, graph_objects as go
from sklearn.metrics import mean_squared_log_error
from sklearn.preprocessing import StandardScaler

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

%matplotlib inline

In [4]:
# Load the training set with ``Id`` as the index, exactly like the test set is
# loaded further down. Keeping ``Id`` as an ordinary column would turn the row
# identifier into a model feature (it is never dropped from X), while the test
# frame has no such column and would get it zero-filled. A named ``Id`` index
# is also what the actual-vs-predicted plot uses for its x axis.
train_data = pd.read_csv(
    '../input/house-prices-advanced-regression-techniques/train.csv',
    index_col='Id',
)
train_data.head()

In [5]:
# Summary statistics of the training frame (pandas describes the numeric columns by default).
train_data.describe()

In [6]:
# Column dtypes and non-null counts, to spot columns with missing values.
train_data.info()

In [15]:
# Fraction of missing values per column, shown only for columns that are
# more than 40% empty.
na_share = train_data.isna().mean()
na_share[na_share > 0.4]


In [17]:
# Columns removed from the training frame before feature preparation.
sparse_columns = ['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']
train_data = train_data.drop(columns=sparse_columns)

In [20]:
# Target: natural log of the sale price. Features: every remaining column.
y = np.log(train_data.SalePrice)
X = train_data.drop(columns=['SalePrice'])

In [21]:
# Column groups used by the preprocessing cells below. The order inside each
# list is kept stable because it drives the order of the generated columns.

# Unordered categories: these are one-hot encoded.
nominal_features = [
    'MSSubClass', 'MSZoning', 'Street', 'LandContour',
    'LotConfig', 'Neighborhood', 'Condition1', 'Condition2',
    'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl',
    'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation',
    'Heating', 'SaleType', 'SaleCondition', 'GarageType',
]

# Ordered categories: these are mapped to integer ranks.
ordinal_features = [
    'LotShape', 'Utilities', 'LandSlope', 'OverallQual',
    'OverallCond', 'ExterQual', 'ExterCond', 'HeatingQC',
    'CentralAir', 'KitchenQual', 'Functional', 'PavedDrive',
    'Electrical', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
    'BsmtFinType1', 'BsmtFinType2', 'GarageFinish', 'GarageQual',
    'GarageCond',
]

# Continuous measurements (areas and monetary values).
continuous_features = [
    'LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1',
    'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF',
    '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'GarageArea',
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
    'ScreenPorch', 'PoolArea', 'MiscVal',
]

# Counts and calendar values.
discrete_features = [
    'YearBuilt', 'YearRemodAdd', 'BsmtFullBath', 'BsmtHalfBath',
    'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr',
    'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars',
    'MoSold', 'YrSold',
]

In [22]:
# Missing categorical entries become the explicit level 'None'.
X = X.assign(**{
    name: X[name].fillna('None')
    for name in nominal_features + ordinal_features
})

In [23]:
# Missing numeric entries are replaced by 0.
X = X.assign(**{
    name: X[name].fillna(0)
    for name in continuous_features + discrete_features
})

In [24]:
print('\nOne-hot encoding...\n')

# Indicator columns for the nominal features.
# NOTE(review): get_dummies passes through columns it does not treat as
# categorical; if any nominal column is numeric in the CSV (MSSubClass looks
# like a candidate - confirm), it is not encoded and the drop below removes
# it entirely.
dummies = pd.get_dummies(X[nominal_features]).sort_index()

# Append the indicators and remove the raw nominal columns in one chain.
X = pd.concat([X, dummies], axis=1).drop(nominal_features, axis=1)

X.info()
X.head()

In [25]:
def _ranked(*levels):
    """Map 'None' to 0 and the given levels, in order, to 1, 2, 3, ..."""
    return {level: rank for rank, level in enumerate(('None',) + levels)}


# Shared quality scale, from poor to excellent.
rating = _ranked('Po', 'Fa', 'TA', 'Gd', 'Ex')

# Per-column integer codes for the ordinal features that are stored as text.
ordinal_encoding = {
    'LotShape': _ranked('Reg', 'IR1', 'IR2', 'IR3'),
    'Utilities': _ranked('ElO', 'NoSeWa', 'NoSeWr', 'AllPub'),
    'LandSlope': _ranked('Gtl', 'Mod', 'Sev'),
    'ExterQual': rating,
    'ExterCond': rating,
    'BsmtQual': rating,
    'BsmtCond': rating,
    'BsmtExposure': _ranked('No', 'Mn', 'Av', 'Gd'),
    'BsmtFinType1': _ranked('Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'),
    'BsmtFinType2': _ranked('Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'),
    'HeatingQC': rating,
    'CentralAir': _ranked('N', 'Y'),
    'Electrical': _ranked('Mix', 'FuseP', 'FuseF', 'FuseA', 'SBrkr'),
    'KitchenQual': rating,
    'Functional': _ranked('Sal', 'Sev', 'Maj2', 'Maj1', 'Mod', 'Min2', 'Min1', 'Typ'),
    'GarageFinish': _ranked('Unf', 'RFn', 'Fin'),
    'GarageQual': rating,
    'GarageCond': rating,
    'PavedDrive': _ranked('N', 'P', 'Y'),
}

In [26]:
print('\nOrdinal encoding...\n')

# Nested {column: {old: new}} mapping: only the listed columns are touched,
# and values without an entry in a column's mapping are left as they are.
X = X.replace(ordinal_encoding)

X.info()
X.head()

In [27]:
print('\nScaling features...\n')

# Fit the scaler on the fully encoded training features; the fitted object is
# re-used later to transform the test set.
scaler = StandardScaler().fit(X)

# Build a new float frame from the scaled array instead of writing it back
# with ``X[:] = ...``. In-place assignment pushes floats into integer/boolean
# columns and depends on pandas silently upcasting them, which recent pandas
# versions warn about and plan to reject.
X = pd.DataFrame(scaler.transform(X), index=X.index, columns=X.columns)

X.info()
X.head()

In [28]:
# Persist the prepared features together with the log-price target (row index not written).
prepared_train = X.join(y)
prepared_train.to_csv('HousePrices.csv', index=False)

In [29]:
def prepare_features(df: pd.DataFrame, scaler, feature_names: list = None):
    """Encode, align and scale a raw house-prices frame.

    Steps: fill missing values, one-hot encode the nominal columns, map the
    ordinal text columns to integer ranks, optionally align the columns to
    ``feature_names``, fill any remaining gaps with 0 and apply ``scaler``.

    Parameters
    ----------
    df : pd.DataFrame
        Raw data. Must contain every nominal column, plus every ordinal and
        numeric column that is not supplied through ``feature_names``.
        The frame is not modified.
    scaler
        Already fitted object exposing ``transform(X)``; it receives the
        prepared frame and must return one value per cell.
    feature_names : list-like of str, optional
        Exact output columns, in order. Columns that the data cannot provide
        (e.g. a category level absent from ``df``) are filled with 0; columns
        not listed are discarded. When omitted, every prepared column is kept.

    Returns
    -------
    pd.DataFrame
        New frame with the index of ``df`` holding the scaler's output.

    Raises
    ------
    KeyError
        If a required raw column is missing, or if ``feature_names`` requests
        one of the raw nominal columns (those are always dropped).
    """

    # Defining numerical and categorical features

    nominal_features = [
        'MSSubClass', 'MSZoning', 'Street', 'LandContour', 'LotConfig', 'Neighborhood',
        'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl',
        'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation', 'Heating', 'SaleType',
        'SaleCondition', 'GarageType'
    ]

    ordinal_features = [
        'LotShape', 'Utilities', 'LandSlope', 'OverallQual', 'OverallCond', 'ExterQual',
        'ExterCond', 'HeatingQC', 'CentralAir', 'KitchenQual', 'Functional', 'PavedDrive',
        'Electrical', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
        'GarageFinish', 'GarageQual', 'GarageCond'
    ]

    continuous_features = [
        'LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
        'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'GarageArea',
        'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
        'MiscVal'
    ]

    discrete_features = [
        'YearBuilt', 'YearRemodAdd', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
        'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars',
        'MoSold', 'YrSold'
    ]

    rating = {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}

    ordinal_encoding = {
        'LotShape': {'None': 0, 'Reg': 1, 'IR1': 2, 'IR2': 3, 'IR3': 4},
        'Utilities': {'None': 0, 'ElO': 1, 'NoSeWa': 2, 'NoSeWr': 3, 'AllPub': 4},
        'LandSlope': {'None': 0, 'Gtl': 1, 'Mod': 2, 'Sev': 3},
        'ExterQual': rating,
        'ExterCond': rating,
        'BsmtQual': rating,
        'BsmtCond': rating,
        'BsmtExposure': {'None': 0, 'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4},
        'BsmtFinType1': {'None': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6},
        'BsmtFinType2': {'None': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6},
        'HeatingQC': rating,
        'CentralAir': {'None': 0, 'N': 1, 'Y': 2},
        'Electrical': {'None': 0, 'Mix': 1, 'FuseP': 2, 'FuseF': 3, 'FuseA': 4, 'SBrkr': 5},
        'KitchenQual': rating,
        'Functional': {'None': 0, 'Sal': 1, 'Sev': 2, 'Maj2': 3, 'Maj1': 4, 'Mod': 5, 'Min2': 6, 'Min1': 7, 'Typ': 8},
        'GarageFinish': {'None': 0, 'Unf': 1, 'RFn': 2, 'Fin': 3},
        'GarageQual': rating,
        'GarageCond': rating,
        'PavedDrive': {'None': 0, 'N': 1, 'P': 2, 'Y': 3}
    }

    # Creating DF: always a new frame, so the caller's ``df`` stays untouched.
    # Requested columns the raw data lacks are added as all-missing columns.

    if feature_names is not None:
        feature_names = list(feature_names)
        absent = [name for name in feature_names if name not in df.columns]
        X = df.reindex(columns=list(df.columns) + absent)
    else:
        X = df.copy()

    # Filling missing categorical values with None

    for col in (nominal_features + ordinal_features):
        X[col] = X[col].fillna('None')

    # Filling numerical missing values with 0

    for col in (continuous_features + discrete_features):
        X[col] = X[col].fillna(0)

    # One Hot Encoding
    # get_dummies returns columns it does not treat as categorical unchanged
    # under their original name; those are removed together with the other raw
    # nominal columns so only indicator columns remain.

    dummies = pd.get_dummies(X[nominal_features])
    dummies = dummies.drop(columns=nominal_features, errors='ignore')
    X = X.drop(columns=nominal_features)

    if feature_names is None:
        # No target layout: keep every indicator column.
        X = pd.concat([X, dummies], axis=1)
    else:
        # Only fill indicator columns that were requested; levels that were
        # not requested are ignored, requested-but-absent levels become 0 below.
        known = [name for name in dummies.columns if name in X.columns]
        if known:
            X[known] = dummies[known]

    # Ordinal Encoding

    X = X.replace(ordinal_encoding)

    # Feature selection

    if feature_names is not None:
        X = X[feature_names]

    # Filling NAs

    X = X.fillna(0)

    # Scaling features: build a fresh float frame rather than assigning floats
    # in place into integer/boolean columns.

    return pd.DataFrame(scaler.transform(X), index=X.index, columns=X.columns)

In [30]:
def plot_actual_vs_pred(model, X, y):
    """Plot actual and predicted sale prices, ordered by actual price.

    Parameters
    ----------
    model
        Fitted estimator; ``model.predict`` is called on ``X.values`` and is
        expected to return log prices (they are passed through ``np.exp``).
    X : pd.DataFrame
        Prepared features, row-aligned with ``y``.
    y : pd.Series
        Log of the actual sale price. Its index values are used as x positions.

    Notes
    -----
    Rows are sorted by actual price and then given the index values of ``y``
    in their original order, so the x axis behaves like a rank rather than a
    real row identifier. The columns are built explicitly so the plot no longer
    depends on ``y`` being named 'SalePrice' or its index being named 'Id'.
    """

    y_pred = np.exp(model.predict(X.values))

    plot_data = pd.DataFrame({
        'SalePrice': np.exp(np.asarray(y, dtype=float)),
        'PredictedPrice': np.asarray(y_pred, dtype=float),
    }).sort_values('SalePrice')

    # Index values in their original order, assigned to the price-sorted rows.
    plot_data.insert(0, 'Id', np.asarray(y.index))
    plot_data = plot_data.reset_index(drop=True)

    fig = px.scatter(plot_data, x='Id', y='SalePrice')
    fig.add_trace(go.Scatter(x=plot_data['Id'], y=plot_data['PredictedPrice'], name='Prediction'))
    fig.show()

In [31]:
from sklearn.ensemble import (
    BaggingRegressor,
    ExtraTreesRegressor,
    HistGradientBoostingRegressor,
    RandomForestRegressor,
    StackingRegressor
)
from xgboost import XGBRegressor

In [32]:
# Every base learner below is randomised (bootstrap samples, random splits,
# subsampling). A fixed seed makes Restart & Run All give the same model,
# scores and submission each time.
RANDOM_STATE = 42

bagging = BaggingRegressor(n_jobs=-1, random_state=RANDOM_STATE)
extraTrees = ExtraTreesRegressor(max_depth=10, n_jobs=-1, random_state=RANDOM_STATE)
randomForest = RandomForestRegressor(n_jobs=-1, random_state=RANDOM_STATE)
histGradientBoosting = HistGradientBoostingRegressor(random_state=RANDOM_STATE)
XGB = XGBRegressor(n_jobs=-1, random_state=RANDOM_STATE)

# Stacked ensemble: the final estimator (scikit-learn's default) is trained on
# cross-validated predictions of the base learners. Estimator names are kept
# unchanged because they are part of the model's parameter keys.
model = StackingRegressor([
    ('bagging', bagging),
    ('extraTress', extraTrees),
    ('randomforest', randomForest),
    ('histGradientBoosting', histGradientBoosting),
    ('XGB', XGB)
], n_jobs=-1)

In [33]:
# Fit on plain arrays (X.values carries no column names); y is the log sale price.
model = model.fit(X.values, y)
# Score on the very rows used for fitting, so this is an in-sample (optimistic) figure.
model.score(X.values, y)

In [34]:
# y and the model output are both already log(SalePrice). Passing them to
# mean_squared_log_error would take a second logarithm and report the error
# of log(1 + log(price)). The root mean squared error of the log prices is
# computed directly instead (in-sample, on the training rows).
train_pred_log = model.predict(X.values)
print('\nRMSE: ', np.sqrt(np.mean(np.square(y - train_pred_log))))

In [36]:
# Visual check of the fit on the training rows: actual vs predicted sale price.
plot_actual_vs_pred(model, X, y)

In [37]:
print('\nLoading test data...\n')

# The Id column becomes the index; it is written back out as 'Id' in the submission.
test_df = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv', index_col="Id")

test_df.head()

In [38]:
print('\nPreparing data...\n')

# Re-use the scaler fitted on the training features and align the test frame
# to the training columns (X.columns) so the model sees the same layout.
X_test = prepare_features(test_df, scaler, X.columns)

X_test.head()

In [39]:
print('\nPredicting target...\n')

# The model predicts log prices; exponentiate to get back to the price scale.
predicted_prices = np.exp(model.predict(X_test.values))
test_preds = pd.DataFrame({'Id': test_df.index, 'SalePrice': predicted_prices})

test_preds.head()

In [40]:
print('\nSaving output...')

# Two-column file (Id, SalePrice); the positional row index is not written.
test_preds.to_csv('submission.csv', index=False)

print('\nSaved successfully!\n')