In [19]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from missingpy import MissForest
import time

In [52]:
# Read the four CSV inputs used by this notebook. The reads are independent
# of each other, so their order does not matter.
explicativo = pd.read_csv('../base_de_dados/HomeCredit_columns_description.csv')
main = pd.read_csv('../base_de_dados/application_train_transformed.csv')
bureau_balance = pd.read_csv('../base_de_dados/bureau_balance.csv')
bureau = pd.read_csv('../base_de_dados/bureau.csv')

In [53]:
# Show the glossary text of the first row describing MONTHS_BALANCE.
explicativo.loc[explicativo['Row'] == 'MONTHS_BALANCE', 'Description'].iloc[0]

'Month of balance relative to application date (-1 means the freshest balance date)'

In [55]:
# Count the missing values of every `bureau` column, then keep only the
# columns where at least 30% of the rows are null.
limite_nulos = bureau.shape[0] * 0.3
nulos = bureau.isnull().sum().to_frame('Nulos')
nulos = nulos.loc[nulos['Nulos'] >= limite_nulos]

In [56]:
# Remove from `bureau`, in place, the columns listed in the index of `nulos`.
bureau.drop(columns=nulos.index, inplace=True)

In [57]:
# Keep an independent copy of the glossary rows that belong to bureau.csv,
# then show the description of AMT_CREDIT_SUM (first matching row).
bureau_expl = explicativo.loc[explicativo['Table'] == 'bureau.csv'].copy()
bureau_expl.loc[bureau_expl['Row'] == 'AMT_CREDIT_SUM', 'Description'].iloc[0]

'Current credit amount for the Credit Bureau credit'

In [58]:
# Turn the active/closed credit column into a binary flag
# ('Active' -> 1, 'Closed' -> 0).
# Series.map yields NaN for any value that is not a key of the mapping.
# NOTE(review): if CREDIT_ACTIVE holds labels other than these two, they turn
# into missing values here and are filled by the imputer further down --
# confirm that this is intended.
status_credito = {'Active': 1, 'Closed': 0}
bureau['CREDIT_ACTIVE'] = bureau['CREDIT_ACTIVE'].map(status_credito)

In [59]:
# Keep only the balance rows whose MONTHS_BALANCE is -6 or greater.
meses_recentes = bureau_balance['MONTHS_BALANCE'] >= -6
bureau_balance = bureau_balance.loc[meses_recentes]

In [60]:
# Inner join on SK_ID_BUREAU: only keys present in both tables are kept, and
# a bureau row is repeated once for every matching bureau_balance row.
bureau = pd.merge(bureau, bureau_balance, how='inner', on='SK_ID_BUREAU')

In [61]:
# Replace every object-dtype column of `bureau` with the integer codes
# produced by LabelEncoder. `cols_obj` is reused later by the one-hot step.
# The same encoder instance is refit for each column, so after the loop `le`
# only holds the classes of the last column processed.
cols_obj = bureau.select_dtypes(include='O').columns

le = LabelEncoder()

for col in cols_obj:
    codigos = le.fit_transform(bureau[col])
    bureau[col] = codigos

In [62]:
# Configure and fit the MissForest imputer on the whole `bureau` frame.
# random_state is pinned because the underlying random forests are
# stochastic: without a seed the imputed values (and everything derived from
# them downstream) change on every run of the notebook.
mf = MissForest(max_depth=10, min_samples_split=16, max_features=0.7, max_iter=3,
                n_estimators=10, random_state=42)
mf.fit(bureau)

MissForest(max_depth=10, max_features=0.7, max_iter=3, min_samples_split=16,
           n_estimators=10)

In [63]:
# Impute the missing values of `bureau` and time the call.
# The start/end timestamps were previously captured but never used; the
# elapsed wall-clock time is now reported so the cost of this cell is visible.
inicio = time.time()
transform = mf.transform(bureau)
fim = time.time()
print('MissForest imputation took {:.1f} s'.format(fim - inicio))

Iteration: 0
Iteration: 1
Iteration: 2


In [64]:
# Overwrite every column of `bureau` with the imputed values returned by
# mf.transform, keeping the existing column labels.
bureau[bureau.columns] = transform

In [65]:
# Reload the raw bureau.csv (the in-memory `bureau` has been modified by now);
# the raw values are inspected below to find integer-valued columns.
bureau_original = pd.read_csv('../base_de_dados/bureau.csv')

In [66]:
# Collect the numeric columns of the raw file whose observed values are all
# whole numbers, so they can be rounded back to integers after imputation.
#
# Missing values are ignored when testing a column. Arithmetic on NaN yields
# NaN, so testing the raw column directly rejected every column that
# contained a NaN -- i.e. exactly the columns the imputer filled with
# fractional estimates -- and those columns were never rounded.
#
# Columns that are no longer present in `bureau` (dropped earlier for having
# too many nulls) are skipped so the result can safely index `bureau`.
inteiras = bureau_original.select_dtypes(['float64', 'int64'])
colunas_arredondadas = []

for col in inteiras.columns:
    if col not in bureau.columns:
        continue
    observados = inteiras[col].dropna()
    # A finite value is a whole number exactly when its remainder mod 1 is 0.
    if (observados % 1 == 0).all():
        colunas_arredondadas.append(col)

In [67]:
# Round the columns listed in `colunas_arredondadas` to zero decimals.
bureau[colunas_arredondadas] = bureau[colunas_arredondadas].round()

In [68]:
from sklearn.preprocessing import OneHotEncoder

In [69]:
# One-hot encode each column in `cols_obj`, dropping the first category.
# The new columns are named <col>_1 ... <col>_{k-1}, numbered by position in
# ohe.categories_ rather than by the category value itself; the source column
# is removed from `bureau` afterwards.
ohe = OneHotEncoder(sparse=False, drop='first')
for col in cols_obj:
    dummies = ohe.fit_transform(bureau[[col]])

    categorias = ohe.categories_[0]
    labels = ['{}_{}'.format(col, i) for i in range(1, len(categorias))]

    bureau[labels] = dummies
    bureau.drop(columns=col, inplace=True)

In [70]:
# Print the population variance (ddof=0) of every column whose variance is
# below 1500. Despite its name, `variancias` collects the column names, not
# the variance values; it is displayed as the cell result.
variancias = []

for var in bureau.columns:
    variancia = bureau[var].var(ddof=0)
    if not variancia < 1500:
        continue
    print(variancia)
    variancias.append(var)

variancias

0.2374398373547506
579.483660331871
0.006401972348410668
3.971025174356653
0.0005531265043759842
8.274036018144047e-05
3.407226337625316e-06
0.015978726344761784
6.887013760582595e-05
0.20228429770904052
0.18431306328274133
0.0009300394957863826
4.867480412040857e-07
1.1925190600572492e-05
0.00040918773014242014
0.008001086329839391
1.70361607113027e-06
0.011278016590868788
1.5088968938112077e-05
0.00024015257427981398
0.008623250611166968
0.0004167260946898025
0.0001416236859836252
0.00010439660473121037
0.0015583223871783092
0.2484946982411254
0.1532390167502113


['CREDIT_ACTIVE',
 'CREDIT_DAY_OVERDUE',
 'CNT_CREDIT_PROLONG',
 'MONTHS_BALANCE',
 'CREDIT_CURRENCY_1',
 'CREDIT_CURRENCY_2',
 'CREDIT_CURRENCY_3',
 'CREDIT_TYPE_1',
 'CREDIT_TYPE_2',
 'CREDIT_TYPE_3',
 'CREDIT_TYPE_4',
 'CREDIT_TYPE_5',
 'CREDIT_TYPE_6',
 'CREDIT_TYPE_7',
 'CREDIT_TYPE_8',
 'CREDIT_TYPE_9',
 'CREDIT_TYPE_10',
 'CREDIT_TYPE_11',
 'CREDIT_TYPE_12',
 'CREDIT_TYPE_13',
 'STATUS_1',
 'STATUS_2',
 'STATUS_3',
 'STATUS_4',
 'STATUS_5',
 'STATUS_6',
 'STATUS_7']

In [72]:
# New column: AMT_CREDIT_SUM minus AMT_CREDIT_SUM_DEBT
# (presumably the amount already paid off -- verify against the glossary).
bureau['valor_pago'] = bureau['AMT_CREDIT_SUM'].sub(bureau['AMT_CREDIT_SUM_DEBT'])

In [75]:
# Build the column -> aggregation mapping for the groupby on SK_ID_CURR.
# Every column except the grouping key is summed by default; the three
# overrides below use max / mean instead.
# Aggregations are given as pandas string names instead of NumPy callables
# (np.sum, np.max, np.mean): pandas resolves those callables to the same
# built-in groupby methods, but passing them is deprecated in recent pandas
# versions and emits a FutureWarning. The string names give the same result.
dic = {var: 'sum' for var in bureau.columns if var != 'SK_ID_CURR'}
dic['DAYS_CREDIT'] = 'max'
dic['CREDIT_DAY_OVERDUE'] = 'mean'
dic['CNT_CREDIT_PROLONG'] = 'mean'


In [76]:
# Collapse `bureau` to one row per SK_ID_CURR, aggregating each column as
# specified in `dic`.
# NOTE(review): `bureau` was inner-joined with bureau_balance on SK_ID_BUREAU
# earlier, so if a credit has several balance rows its credit-level columns
# are repeated and enter the aggregation once per row -- confirm intended.
bureau_unif = bureau.groupby('SK_ID_CURR').agg(dic)

In [77]:
# Move the SK_ID_CURR group key from the index back into a regular column
# (modifies bureau_unif in place).
bureau_unif.reset_index(inplace=True)

In [78]:
# Persist the aggregated table as CSV.
# NOTE(review): to_csv is called with its default index setting, which also
# writes the row index as an extra unnamed first column; consider
# index=False if downstream readers do not rely on that column.
bureau_unif.to_csv('../base_de_dados/bureau_filled.csv')