# <div style="color:white;display:fill;border-radius:5px;background-color:#0E2031;letter-spacing:0.5px;overflow:hidden"><p style="padding:20px;color:white;overflow:hidden;text-align: center;margin:0;font-size:120%">Data Preparation</p></div> 

- **Data Preparation**
    - Libraries
    - Loading Data
    - Encoding
    - Feature Selection
    - Normalization
    - Standardization
    - Boruta

## <div style="color:white;display:fill;border-radius:5px;background-color:#2E6B8E;letter-spacing:0.5px;overflow:hidden"><p style="padding:15px;color:white;overflow:hidden;text-align: center;margin:0;font-size:120%">Libraries</p></div>

In [1]:
# Basic Tools
import pandas as pd
import numpy as np



from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_val_predict, KFold, cross_validate



from boruta                import BorutaPy

## <div style="color:white;display:fill;border-radius:5px;background-color:#2E6B8E;letter-spacing:0.5px;overflow:hidden"><p style="padding:15px;color:white;overflow:hidden;text-align: center;margin:0;font-size:120%">Loading Data</p></div>

In [2]:
# Location of the processed dataset; the relative path is resolved against
# the kernel's working directory.
DATA_PATH = '../data/processed/data_processed.csv'
data = pd.read_csv(DATA_PATH)

In [3]:
# Work on an independent (deep) copy so the frame loaded from disk stays untouched.
data_prep = data.copy(deep=True)
# Display (rows, columns) of the working frame.
data_prep.shape

(5880, 28)

In [4]:
# Total number of missing cells: per-column NaN counts summed over all columns.
data_prep.isna().sum().sum()

5876

## <div style="color:white;display:fill;border-radius:5px;background-color:#2E6B8E;letter-spacing:0.5px;overflow:hidden"><p style="padding:15px;color:white;overflow:hidden;text-align: center;margin:0;font-size:120%">Encoding</p></div>

In [5]:
# Frequency of each `re_types` category before it is encoded numerically.
data_prep['re_types'].value_counts()

Padrão               5720
Cobertura              97
Loft                   41
Duplex ou triplex      14
Kitnet                  8
Name: re_types, dtype: int64

In [6]:
# Integer code assigned to each `re_types` category.
# NOTE(review): the codes look arbitrary rather than ordinal -- confirm.
re_types_dict = {'Padrão': 1,  'Loft': 2, 'Cobertura': 3, 'Duplex ou triplex':4, 'Kitnet': 5}

# Fail loudly on categories missing from the mapping: `.map` would otherwise
# turn them into NaN without any warning.
unmapped_re_types = set(data['re_types'].dropna()) - set(re_types_dict)
assert not unmapped_re_types, f"re_types categories without a code: {unmapped_re_types}"

# Encode from the untouched `data` frame rather than from `data_prep`, so that
# re-running this cell does not map the already-encoded integers to NaN.
data_prep['re_types'] = data['re_types'].map(re_types_dict)


## <div style="color:white;display:fill;border-radius:5px;background-color:#2E6B8E;letter-spacing:0.5px;overflow:hidden"><p style="padding:15px;color:white;overflow:hidden;text-align: center;margin:0;font-size:120%">Feature Selection</p></div>

In [7]:
# List every column of the working frame (the candidates for feature selection).
data_prep.columns

Index(['bathrooms', 'condominio', 'garage_spaces', 'price', 'rooms', 'size',
       'zipcode', 're_types', 're_rent_full_price', 'condominio_fechado',
       'elevador', 'permitido_animais', 'piscina', 'portaria',
       'portao_eletronico', 'salao_de_festas', 'seguranca24_h', 'academia',
       'area_murada.1', 'ar_condicionado', 'armarios_na_cozinha',
       'armarios_no_quarto', 'churrasqueira', 'mobiliado', 'quarto_de_servico',
       'varanda', 'area_de_servico', 'size_comodo'],
      dtype='object')

In [8]:
# SELECTED_FEATURES = ['bathrooms', 
#                      'condominio',
#                      'garage_spaces',
#                      'rooms',
#                      'size',
#                      'zipcode', 
#                      #'size_comodo',
#                      #'re_types',
#                      'condominio_fechado', 
#                      'elevador',
#                      'permitido_animais',
#                      'piscina', 
#                      'portaria',
#                      'salao_de_festas',
#                      'seguranca24_h',
#                      'academia',
#                      'ar_condicionado', 
#                      'armarios_na_cozinha',
#                      'armarios_no_quarto',
#                      'churrasqueira', 
#                      'mobiliado', 
#                      'quarto_de_servico',
#                      'varanda', 
#                      'area_de_servico']


# SELECTED_FEATURES = ['bathrooms', 'condominio', 'garage_spaces', 'size', 'zipcode','elevador']



# Columns used as model inputs.
SELECTED_FEATURES = [
    'bathrooms', 'condominio', 'garage_spaces', 'rooms', 'size', 'zipcode',
    'condominio_fechado', 'elevador', 'piscina', 'portaria', 'salao_de_festas',
    'academia', 'area_de_servico',
]

# Column the model is trained to predict.
TARGET = 'price'

# Every selected feature must exist in the loaded frame; raise a KeyError
# (the same exception type pandas raises for a missing column) with an explicit message.
missing_features = [col for col in SELECTED_FEATURES if col not in data.columns]
if missing_features:
    raise KeyError(f"SELECTED_FEATURES not found in data: {missing_features}")

# Columns that are neither the target nor a selected feature.
# Membership is tested against the list itself instead of slicing the frame,
# and the target name comes from TARGET rather than a repeated literal.
DROP_FEATURES = [col for col in data.columns if col != TARGET and col not in SELECTED_FEATURES]

In [9]:
# Preview only the columns retained as model features.
data_prep.loc[:, SELECTED_FEATURES]

Unnamed: 0,bathrooms,condominio,garage_spaces,rooms,size,zipcode,condominio_fechado,elevador,piscina,portaria,salao_de_festas,academia,area_de_servico
0,1,270,1,2,58,41150115,0,0,0,0,0,0,0
1,4,1080,2,3,105,40296320,0,0,1,0,1,1,0
2,3,800,1,3,120,41940340,1,1,0,1,1,1,1
3,2,858,1,2,64,41750000,1,0,1,0,0,0,0
4,3,797,1,3,110,41720060,1,1,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5875,2,545,2,2,52,41601075,1,0,1,1,1,1,0
5876,1,633,1,1,41,40140090,1,1,1,1,1,1,1
5877,3,900,2,3,102,40155810,0,1,1,1,1,1,0
5878,3,990,1,2,136,41760035,0,1,0,0,1,0,0


## <div style="color:white;display:fill;border-radius:5px;background-color:#2E6B8E;letter-spacing:0.5px;overflow:hidden"><p style="padding:15px;color:white;overflow:hidden;text-align: center;margin:0;font-size:120%">Normalization</p></div>

In [10]:
# Skewness above this value marks a feature as strongly right-skewed.
SKEW_THRESHOLD = 0.75

# Per-feature skewness, largest first; keep only the features above the threshold.
skew_by_feature = data_prep[SELECTED_FEATURES].skew().sort_values(ascending=False)
log_columns = skew_by_feature[skew_by_feature > SKEW_THRESHOLD]
print('Columns/Skew\n',log_columns)

Columns/Skew
 area_de_servico       2.282548
size                  1.477421
condominio_fechado    0.860762
dtype: float64


In [11]:
# Log-transform the target with log1p (log(1 + x)).
# The transform reads from the untouched `data` frame, so re-running this cell
# never applies the logarithm twice. Invert predictions with np.expm1.
data_prep[TARGET] = np.log1p(data[TARGET])

In [12]:
# Preview the first rows; `price` now holds the log1p-transformed values.
data_prep.head()

Unnamed: 0,bathrooms,condominio,garage_spaces,price,rooms,size,zipcode,re_types,re_rent_full_price,condominio_fechado,...,area_murada.1,ar_condicionado,armarios_na_cozinha,armarios_no_quarto,churrasqueira,mobiliado,quarto_de_servico,varanda,area_de_servico,size_comodo
0,1,270,1,11.849405,2,58,41150115,1,,0,...,0,0,0,0,0,0,0,0,0,3
1,4,1080,2,13.28788,3,105,40296320,1,,0,...,0,1,1,0,1,0,0,0,0,7
2,3,800,1,13.444448,3,120,41940340,1,,1,...,0,1,1,1,0,0,1,0,1,6
3,2,858,1,12.948012,2,64,41750000,1,,1,...,1,0,1,1,1,0,1,1,0,4
4,3,797,1,13.038984,3,110,41720060,1,,1,...,0,0,1,0,0,0,0,0,0,6


In [13]:
# Separate the prepared frame into the feature matrix and the target vector.
X = data_prep.loc[:, SELECTED_FEATURES]
y = data_prep.loc[:, TARGET]

In [14]:
# Random state reused by the split below so the partition is reproducible.
seed = 7

# Hold out 20% of the shuffled rows for validation (no stratification).
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=seed,
)

# Display the shape of each partition.
tuple(part.shape for part in (X_train, y_train, X_val, y_val))

((4704, 13), (4704,), (1176, 13), (1176,))

In [15]:
# Cross-validation splitter: shuffled folds seeded with the same `seed`
# used for the train/validation split.
number_folds = 4
Kfold = KFold(
    n_splits=number_folds,
    shuffle=True,
    random_state=seed,
)

# Boruta

In [16]:
# NOTE(review): this Boruta run is disabled. `RandomForestRegressor` is not
# imported in the visible imports cell, so add
# `from sklearn.ensemble import RandomForestRegressor` before re-enabling it.
# # training and test dataset for Boruta
# X_train_n = X_train.values
# y_train_n = y_train.values.ravel()

# # define RandomForestRegressor
# rf = RandomForestRegressor( n_jobs=-1 )

# # define Boruta
# boruta = BorutaPy( rf, n_estimators='auto', verbose=2, random_state=42 ).fit( X_train_n, y_train_n )

In [17]:
# cols_selected = boruta.support_.tolist()

# # best features
# X_train_fs = X_train
# cols_selected_boruta = X_train_fs.iloc[:, cols_selected].columns.to_list()

# # not selected boruta
# cols_not_selected_boruta = list( np.setdiff1d( X_train_fs.columns, cols_selected_boruta ) )
# cols_not_selected_boruta

# # selected boruta
# cols_selected_boruta
