In [None]:
# Download the LbEncoder helper module (imported below as `lbencoder`) into the working directory.
!wget https://raw.githubusercontent.com/xslyr/LbEncoder/main/lbencoder.py
# Install PyCaret, used below for model comparison and tuning.
!pip install pycaret

In [None]:
import numpy as np 
import pandas as pd 
import lbencoder, warnings
from enum import Enum

from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_squared_log_error
from sklearn.model_selection import train_test_split

from pycaret.regression import setup, compare_models, tune_model

import tensorflow as tf
from tensorflow.keras import layers as ly
from tensorflow.keras import Model
from tensorflow.keras.utils import plot_model

# Silence every library warning for the rest of the notebook.
warnings.filterwarnings('ignore')
# Fix TensorFlow's global random seed.
tf.random.set_seed(10)

# 'main' fits on the full training set (x_train / y_train); 'test' fits on the hold-out split (xtrain / ytrain).
EXEC_MODE = 'test'
# Passed to PyCaret's setup() as n_features_to_select.
FEAT_SELECT = 0.5
# Number of models requested from compare_models(); also reused as the width and count of the dense "balance" layers.
QTY_REGRESSORS = 10
# When True, each selected model goes through tune_model() before being stacked.
TUNE_REGRESSORS = False

# **Loading input data**

In [None]:
# Loading the training dataframe
train = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
train_id = train['Id']
# Cast to string so MSSubClass is handled as a categorical column downstream
train['MSSubClass'] = train['MSSubClass'].astype(str)

# Loading the test dataframe
test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')
test_id = test['Id']
test['MSSubClass'] = test['MSSubClass'].astype(str)

# Merging train and test.
# This is convenient because the data is prepared only once instead of separately for the test set.
# The right way is to build a pipeline of methods that applies these transformations for us.
dataframe = pd.concat([train,test], axis=0)
dataframe.index = list(dataframe['Id'])

# Target values of the training rows only. The training rows come first in the concat,
# so slice by the number of training rows and pick the column by name instead of
# relying on the last Id value and on SalePrice being the last column.
y_train = dataframe['SalePrice'].iloc[:len(train)]
dataframe = dataframe.drop(['Id', 'SalePrice'], axis=1)


## **Missing values treatment**
Some columns use the string 'None' or 'NA' as a meaningful category rather than as a missing value. Right below we fill those columns accordingly.
For the remaining object columns, NaN values are filled using a K-Neighbors classifier; in the same way, a K-Neighbors regressor replaces NaN values in numeric columns.

In [None]:
# Columns whose missing entries are replaced by the literal category 'None'
fill_None = ['MasVnrType']
# Columns whose missing entries are replaced by the literal category 'NA'
fill_NA = ['Alley','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2','FireplaceQu',
           'GarageType','GarageFinish','GarageQual','GarageCond','PoolQC','Fence','MiscFeature']

# Fill each group of columns in one assignment
dataframe[fill_NA] = dataframe[fill_NA].fillna('NA')
dataframe[fill_None] = dataframe[fill_None].fillna('None')
    

In [None]:
def get_nulls(strtype):
    """Count missing values per column of the global `dataframe`.

    Parameters
    ----------
    strtype : str
        'all'    -> every column, unfiltered and unsorted.
        'object' -> only object-dtype columns that contain at least one NaN.
        other    -> only numeric columns that contain at least one NaN.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name with a single 'sum' column. For the filtered
        variants the rows are sorted by 'sum' in descending order.
    """
    if strtype == 'all':
        return pd.DataFrame(dataframe.isna().sum(), columns=['sum'])

    dtype_filter = np.number
    if strtype == 'object':
        dtype_filter = object

    counts = dataframe.select_dtypes(dtype_filter).isna().sum()
    missing = pd.DataFrame(counts, columns=['sum'])
    has_missing = missing['sum'] > 0
    return missing.loc[has_missing].sort_values('sum', ascending=False)
    
def knc_imputer(df, target):
    """Predict the missing values of a categorical column with K-Neighbors classification.

    The predictors are the numeric columns of `df` that contain no missing
    values. Rows where `target` is present train the classifier; rows where it
    is missing are predicted.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the predictors and the `target` column.
    target : str
        Name of the column to impute.

    Returns
    -------
    numpy.ndarray
        One predicted label per row where `target` is NaN, in row order.
        Empty when the column has no missing value.
    """
    # Use the frame that was passed in, not the notebook-level global.
    numdf = df.select_dtypes(np.number)
    columns = numdf.loc[:, numdf.notna().all()].columns

    known = df[target].notna()
    x_test = df.loc[~known, columns]
    if len(x_test) == 0:
        # Nothing to impute; avoid fitting and predicting on an empty set.
        return np.array([], dtype=object)

    knn = KNeighborsClassifier()
    knn.fit(df.loc[known, columns], df.loc[known, target])
    return knn.predict(x_test)

# Impute every object column that still contains NaN, most affected columns first.
for item in get_nulls('object').index:
    missing_rows = dataframe[item].isna()
    dataframe.loc[missing_rows, item] = knc_imputer(dataframe, item)
  

In [None]:
def knr_imputer(df, target):
    """Predict the missing values of a numeric column with K-Neighbors regression.

    The predictors are the numeric columns of `df` without missing values.
    The number of neighbours is chosen with a K-Means sweep: k runs from 2 to
    14 and the last k whose inertia is still at least 3% of the k=1 inertia is
    kept (1 if even k=2 already falls below that ratio).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the predictors and the `target` column.
    target : str
        Name of the numeric column to impute.

    Returns
    -------
    numpy.ndarray
        One predicted value per row where `target` is NaN, in row order.
    """
    numeric = df.select_dtypes(np.number)
    columns = numeric.loc[:, numeric.notna().all()].columns

    baseline_inertia = 0
    n_clusters = 1
    for k in range(1, 15):
        inertia = KMeans(n_clusters=k, init='k-means++', n_init='auto', random_state=0).fit(df[columns]).inertia_
        if k == 1:
            baseline_inertia = inertia
            continue
        if inertia / baseline_inertia < 0.03:
            break
        n_clusters = k

    known = df[target].notna()
    unknown = df[target].isna()
    knr = KNeighborsRegressor(n_neighbors=n_clusters, weights='distance', algorithm='auto')
    knr.fit(df.loc[known, columns], df.loc[known, target])
    return knr.predict(df.loc[unknown, columns])

# Impute every numeric column that still contains NaN, most affected columns first.
for item in get_nulls('number').index:
    missing_rows = dataframe[item].isna()
    dataframe.loc[missing_rows, item] = knr_imputer(dataframe, item)


# **Feature Engineering**
ML models read numbers, so we need to find ways to transform categorical features into something these models can read.
Below I use a useful tactic for some columns: each categorical value is replaced by statistics (the mean and median of SalePrice) computed over the training rows that share that value.


In [None]:
def get_dictionary(target, addcols=['mean','median'], log_target=False):
    """Build a lookup table of SalePrice statistics for each value of a training column.

    Reads the global `train` frame. The first row, labelled 'AllData', holds
    the statistics over every training row; one further row per distinct value
    of `target` follows, in order of first appearance.

    Parameters
    ----------
    target : str
        Column of `train` whose values become the row labels.
    addcols : list of str
        Statistics to include; 'mean' and/or 'median'. The default list is
        only read, never mutated.
    log_target : bool
        When True, statistics are computed on log2(SalePrice).

    Returns
    -------
    pandas.DataFrame
        Columns '<target>.mean' and/or '<target>.median'; NaN statistics are
        replaced by 0.
    """
    # 'range' and 'sum' were removed from the default columns because they are
    # influenced by the number of records.
    aux_df = train[[target, 'SalePrice']].copy()
    if log_target:
        aux_df['SalePrice'] = np.log2(aux_df['SalePrice'])

    # (suffix, aggregator) pairs, in output column order.
    stats = [(name, func)
             for name, func in (('mean', pd.Series.mean), ('median', pd.Series.median))
             if name in addcols]
    dictionary_cols = ['{}.{}'.format(target, name) for name, _ in stats]

    # Collect all rows first and build the frame once, instead of growing it
    # with one pd.concat per category.
    labels = ['AllData']
    rows = [[func(aux_df['SalePrice']) for _, func in stats]]
    for item in aux_df[target].unique():
        prices = aux_df.loc[aux_df[target] == item, 'SalePrice']
        labels.append(item)
        rows.append([func(prices) for _, func in stats])

    dictionary_df = pd.DataFrame(rows, index=labels, columns=dictionary_cols)
    return dictionary_df.fillna(0)

def change_columns_to_details(dataframe, column, dictionary):
    """Replace a categorical column by the statistic columns of a lookup table.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input frame; it is copied, not modified.
    column : str
        Name of the column to replace.
    dictionary : pandas.DataFrame
        Lookup table indexed by category value. It must contain an 'AllData'
        row, which is used for values missing from the index.

    Returns
    -------
    pandas.DataFrame
        A copy of `dataframe` where `column` is replaced, at the same
        position, by the columns of `dictionary`.
    """
    data = dataframe.copy()
    values = data[column]

    # Map every value to its row label in one pass; unknown values fall back
    # to the 'AllData' row (previously done with a bare except per row).
    known = values.isin(dictionary.index).to_numpy()
    keys = np.where(known, values.to_numpy(dtype=object), 'AllData')

    aux_df = dictionary.loc[list(keys)]
    aux_df.index = data.index

    index_2b_change = data.columns.get_loc(column)
    return pd.concat([data.iloc[:, :index_2b_change], aux_df, data.iloc[:, index_2b_change+1:]], axis=1)

  
# Replace each listed categorical column by the mean/median SalePrice of its categories.
for col in [
    'Neighborhood', 'MSSubClass', 'MSZoning', 'HouseStyle', 'BldgType',
    'LotShape', 'Condition1', 'Condition2', 'Electrical', 'Exterior1st',
    'Exterior2nd', 'Foundation', 'Functional', 'LotConfig', 'RoofStyle',
]:
    price_stats = get_dictionary(col)
    dataframe = change_columns_to_details(dataframe, col, price_stats)



In [None]:
# Living area divided by the count of rooms and bathrooms (above grade and basement).
room_count = (dataframe['TotRmsAbvGrd'] + dataframe['FullBath'] + dataframe['HalfBath']
              + dataframe['BsmtFullBath'] + dataframe['BsmtHalfBath'])
dataframe['AverageRoomSize'] = dataframe['GrLivArea'] / room_count
dataframe['QualityOverview'] = dataframe['OverallCond'] + dataframe['OverallQual']

# Cyclic encoding of the sale month.
# NOTE(review): the "years between" features below read MoSold after this
# overwrite, so they add the encoded value / 12 rather than the raw month / 12 — confirm this is intended.
dataframe['MoSold'] = dataframe['MoSold'].apply(lambda month: 1 - np.cos(0.5236 * month))

sold_offset = dataframe['MoSold'] / 12

# Elapsed years, floored at zero.
dataframe['YrBetweenBuiltSold'] = (dataframe['YrSold'] - dataframe['YearBuilt'] + sold_offset).clip(lower=0)
dataframe['YrBetweenRemodSold'] = (dataframe['YrSold'] - dataframe['YearRemodAdd'] + sold_offset).clip(lower=0)


# Ordered category levels per column; the list is handed to LbEncoder as `sort_by`.
# NOTE(review): the KitchenQual and PoolQC lists contain no 'Po' level, unlike the
# other quality scales here — confirm against the data.
ScalerColumns = { 
    'ExterQual': ['Po','Fa','TA','Gd','Ex'],
    'ExterCond': ['Po','Fa','TA','Gd','Ex'],
    'BsmtQual': ['NA','Po','Fa','TA','Gd','Ex'],
    'BsmtCond': ['NA','Po','Fa','TA','Gd','Ex'],
    'BsmtExposure': ['NA','No','Mn','Av','Gd'],
    'BsmtFinType1': ['NA','Unf','LwQ','Rec','BLQ','ALQ','GLQ'],
    'BsmtFinType2': ['NA','Unf','LwQ','Rec','BLQ','ALQ','GLQ'],
    'HeatingQC' : ['Po','Fa','TA','Gd','Ex'],
    'KitchenQual' : ['NA','Fa','TA','Gd','Ex'],
    'FireplaceQu' : ['NA','Po','Fa','TA','Gd','Ex'],
    'GarageFinish': ['NA','Unf','RFn','Fin'],
    'GarageQual' : ['NA','Po','Fa','TA','Gd','Ex'],
    'GarageCond' : ['NA','Po','Fa','TA','Gd','Ex'],
    'PoolQC': ['NA','Fa','TA','Gd','Ex'],
    'Alley': ['NA','Grvl','Pave']
}
# One encoder per column, kept so each can be reused later.
sc_scalers = {}

for col, ordered_levels in ScalerColumns.items():
    sc_scalers[col] = lbencoder.LbEncoder()
    encoder = sc_scalers[col]
    dataframe[col] = encoder.fit_transform(dataframe[col], sort_by=ordered_levels)


# Combine related raw columns into aggregate features. New columns are appended
# in the order listed; LowQualFinSF is overwritten in place with its share of
# the combined living and basement area.
dataframe = dataframe.assign(
    OpenScreen3SsnPorchSF=lambda d: d['OpenPorchSF'] + d['3SsnPorch'] + d['ScreenPorch'],
    GeneralExternEvaluation=lambda d: d['ExterQual'] + d['ExterCond'],
    BsmtFinTypes=lambda d: d['BsmtFinType1'] + d['BsmtFinType2'],
    BsmtFinSFs=lambda d: d['BsmtFinSF1'] + d['BsmtFinSF2'],
    BsmtBaths=lambda d: d['BsmtFullBath'] + 0.5*d['BsmtHalfBath'],
    LowQualFinSF=lambda d: d['LowQualFinSF'] / (d['GrLivArea'] + d['TotalBsmtSF']),
    BathsAboveGrade=lambda d: d['FullBath'] + 0.5*d['HalfBath'],
)


# One-hot encode MiscFeature, then weight each indicator by MiscVal relative to its maximum.
dataframe = pd.get_dummies(dataframe, columns=['MiscFeature'])

misc_val = dataframe['MiscVal']
misc_val_max = misc_val.max()

# A second garage is folded into the GarageCond score instead of keeping its own column.
dataframe['GarageCond'] = dataframe['GarageCond'] + misc_val*dataframe['MiscFeature_Gar2']/misc_val_max

for dummy in ('MiscFeature_Othr', 'MiscFeature_Shed', 'MiscFeature_TenC'):
    dataframe[dummy] = misc_val*dataframe[dummy]/misc_val_max


# Columns already folded into the engineered features above.
consumed_columns = [
    'MiscVal', 'MiscFeature_Gar2',
    'OpenPorchSF', '3SsnPorch', 'ScreenPorch',
    'ExterQual', 'ExterCond',
    'FullBath', 'HalfBath', 'BsmtFullBath', 'BsmtHalfBath',
    'YrSold', 'MoSold',
    'BsmtFinType1', 'BsmtFinType2',
    'BsmtFinSF1', 'BsmtFinSF2',
]
dataframe = dataframe.drop(consumed_columns, axis=1)


# **Column Encoding and Feature Scaling**

In [None]:
stde = StandardScaler()

# One-hot encode the remaining categorical columns, then standardise every column.
data_dummie = pd.get_dummies(dataframe)
data_dummie = pd.DataFrame(stde.fit_transform(data_dummie), columns=data_dummie.columns, index=data_dummie.index)

# The training rows come first in the merged frame, so split at the number of
# training targets instead of a hard-coded row count.
n_train = len(y_train)
x_train = data_dummie.iloc[:n_train,:]
x_test = data_dummie.iloc[n_train:,:]

# Fixed random_state (same value as the TensorFlow seed) so the hold-out split
# is identical on every Restart & Run All.
xtrain, xval, ytrain, yval = train_test_split(x_train, y_train, test_size=0.2, random_state=10)

print(f'Shape of train dataframe: {xtrain.shape}\nShape of validation dataframe: {xval.shape}')

-----------

# **Model Selection**

In [None]:
# 'main' uses the full training set; any other mode uses the train part of the hold-out split.
if EXEC_MODE == 'main':
    setup_features, setup_target = x_train, y_train
else:
    setup_features, setup_target = xtrain, ytrain

_ = setup(data = pd.concat([setup_features, setup_target],axis=1), target='SalePrice', feature_selection=True, n_features_to_select=FEAT_SELECT, verbose=1)

# Keep the QTY_REGRESSORS best models found by PyCaret.
models_set = compare_models(n_select=QTY_REGRESSORS)

In [None]:
# Registry of selected regressors, keyed by estimator class name.
# Each entry holds at least {'instance': model}; tuned entries also carry 'tuner'.
models = {}

# compare_models returns a bare estimator instead of a list when a single model is requested.
selected_models = models_set if isinstance(models_set, list) else [models_set]

for model in selected_models:
    model_name = type(model).__name__
    if TUNE_REGRESSORS:
        tuned_model, tuner = tune_model( model, n_iter=10, optimize='MAE', choose_better=True, return_train_score=False, return_tuner=True, verbose=True)
        models[model_name] = { 'instance':tuned_model, 'tuner':tuner }
    else:
        models[model_name] = { 'instance':model }

In [None]:
# Record, for every selected regressor, the names of the features it was fitted on.
for model_name in models:
    model = models[model_name]['instance']
    features = None

    # Different estimator libraries expose the fitted feature names under different attributes.
    for method in ['feature_name_','feature_names_in_','feature_names_']:
        names = getattr(model, method, None)
        if names is None:
            continue
        try:
            features = list(names)
        except TypeError:
            # Attribute exists but is not iterable; try the next candidate.
            continue
        break

    if features is None:
        raise Exception(f'No feature_names_in found for regressor {model_name}')
    print(f"Features of {model_name} are loaded.")
    models[model_name].update({'features':features})


In [None]:
def get_mask( list_of_features:list ):
    """Return a 0/1 list with one entry per column of the global `x_train`.

    Positions of the columns named in `list_of_features` are set to 1, all
    others to 0. An unknown column name raises KeyError.
    """
    selected_positions = {x_train.columns.get_loc(name) for name in list_of_features}
    total_columns = x_train.shape[1]
    return [1 if position in selected_positions else 0 for position in range(total_columns)]


from tensorflow.keras.layers import Layer

class RegressorBlock(Layer):
    """Non-trainable Keras layer that delegates its forward pass to a wrapped regressor.

    The layer passes its rank-2 input to ``regressor.predict`` and returns the
    predictions as a float32 tensor reshaped to one column.
    The regressor is presumably already fitted — verify against the caller.
    """
    def __init__(self, regressor, **kwargs ):
        """Store ``regressor``; the base layer is created with trainable=False and dynamic=False."""
        super().__init__(trainable=False, dynamic=False, **kwargs)
        self.regressor = regressor
        
    def build(self, input_shape):
        """Defer to the parent build; this layer creates no weights of its own here."""
        super().build(input_shape)
    
    def custom_activation(self, x):
        """Run the wrapped regressor on tensor ``x`` and return the predictions as a float32 tensor."""
        # Turn the tensor into a NumPy array by going through a TensorProto.
        x_pred = tf.make_ndarray(tf.make_tensor_proto(x))
        return tf.convert_to_tensor(self.regressor.predict(x_pred), dtype=tf.float32)
    
    def call(self, inputs):
        """Apply the regressor to a rank-2 ``inputs`` tensor; the result is reshaped to (-1, 1)."""
        assert len(inputs.shape) == 2
        # py_function wraps the Python-side predict() call as a TensorFlow op.
        result = tf.py_function(self.custom_activation, [inputs], tf.float32)
        return tf.reshape(result, shape=(-1,1))



In [None]:
# Attach to every regressor a float32 0/1 tensor marking which input columns it uses.
for model_name in models:
    entry = models[model_name]
    column_mask = get_mask(entry['features'])
    entry['tensor'] = tf.constant(column_mask, dtype=tf.float32)

In [None]:
input_len = x_train.shape[1] 
# Keeps only the input columns flagged in the regressor's mask (param = [inputs, mask]).
masking_func = lambda param: tf.boolean_mask(param[0],param[1], axis=1)
concat_array =[]

l_input = ly.Input(shape=(input_len,),name='input') 

# One branch per regressor: select its feature columns, then run the wrapped regressor.
for model_name in models:
    n_features = len(models[model_name]['features'])
    # A Lambda output_shape tuple describes the dimensions after the batch axis,
    # so only the feature count goes here (no leading None).
    l_mask = ly.Lambda(masking_func, output_shape=(n_features,), name=f"masking_layer_{ model_name }")([l_input, models[model_name]['tensor']])
    
    l_reg = RegressorBlock( models[model_name]['instance'], name=f"block_{ model_name }" )(l_mask) 
    concat_array.append(l_reg)

# Side-by-side predictions of all regressors.
l_concat = ly.Concatenate(name='concat_outputs')(concat_array)

# Stack of dense layers that learns how to weight the individual predictions.
balance_layers = [ly.Dense(QTY_REGRESSORS, name='balance_layer_0')(l_concat)]
for i in range(QTY_REGRESSORS):
    balance_layers.append(ly.Dense(QTY_REGRESSORS, name=f'balance_layer_{i+1}')(balance_layers[-1]))

l_output = ly.Dense(1, name='output')(balance_layers[-1])

model_0 = Model(inputs=l_input, outputs=l_output)
plot_model(model_0, show_shapes=True)

In [None]:
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

def compile_and_fit_model(model, mode='main', batch=1, epochs=100, verbose=1):
    """Compile `model` with Adam and MAE loss, fit it, plot the history and return it.

    Parameters
    ----------
    model : tf.keras.Model
        Model to compile and train.
    mode : str
        'main' trains on the global x_train / y_train with 20% held out for
        validation; 'test' trains on the global xtrain / ytrain with 25% held out.
    batch : int
        Batch size.
    epochs : int
        Maximum number of epochs; early stopping may end training sooner.
    verbose : int
        Verbosity passed to `model.fit`.

    Returns
    -------
    tf.keras.callbacks.History
        The object returned by `model.fit`.

    Raises
    ------
    Exception
        If `mode` is neither 'main' nor 'test'.
    """
    # (features, targets, fraction used for training, fraction held out for validation)
    if mode == 'main':
        features, targets, train_fraction, val_fraction = x_train, y_train, .8, 0.2
    elif mode == 'test':
        features, targets, train_fraction, val_fraction = xtrain, ytrain, .75, 0.25
    else:
        raise Exception(f'Mode {mode} is not available to our fit process.')

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                      loss=tf.keras.losses.mae, metrics=['mae'])

    history = model.fit(
        x=tf.constant(features), y=tf.constant(targets),
        batch_size=batch, epochs=epochs,
        # One epoch covers exactly the training part of the split.
        steps_per_epoch=int(train_fraction*len(features)/batch),
        # A fraction must be passed as validation_split; validation_data expects actual data.
        validation_split=val_fraction, verbose=verbose,
        callbacks=[
            ReduceLROnPlateau(monitor='val_loss', patience=3, factor=0.5, verbose=1),
            EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
        ])

    pd.DataFrame(history.history).plot()
    return history

def evaluate_model(model):
    """Print R², MAE and RMSE of `model` on the global validation split (xval / yval).

    Parameters
    ----------
    model : tf.keras.Model
        Trained model whose `predict` is called on `xval`.
    """
    y_true = yval.values.reshape(-1,1)
    y_pred = model.predict(tf.constant(xval), verbose=0)
    result = {
        'r2': r2_score(y_true, y_pred),
        'mae': mean_absolute_error(y_true, y_pred),
        # Root of the MSE; the previous squared=True call reported the plain MSE under this key.
        'rmse': float(np.sqrt(mean_squared_error(y_true, y_pred))),
    }
    print(f'\nResultado = {result}\n')


In [None]:
# NOTE(review): this cell trains model_0 inline with the same settings that
# compile_and_fit_model(model_0, mode='test') applies in the next cell — consider keeping only one of them.
# The model must be compiled before fit() can run.
model_0.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                loss=tf.keras.losses.mae, metrics=['mae'])

model_0.fit(
    x=tf.constant(xtrain), y=tf.constant(ytrain),
    # 25% of the rows are held out below, so one epoch spans the remaining 75%.
    batch_size=1, epochs=100, steps_per_epoch=int(.75*len(xtrain)),
    # A fraction must be passed as validation_split; validation_data expects actual data.
    validation_split=0.25, verbose=1,
    callbacks=[
        ReduceLROnPlateau(monitor='val_loss', patience=3, factor=0.5, verbose=1),
        EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    ])

In [None]:
# Compile and train the stacked model in the mode selected by EXEC_MODE.
compile_and_fit_model(model_0, mode=EXEC_MODE, batch=1, epochs=100, verbose=1)
# Print the validation metrics computed on xval / yval.
evaluate_model(model_0)