In [20]:
import pandas as pd
import datetime
import collections
import numpy as np
from matplotlib import pyplot as plt
from scipy.stats import pearsonr

from fklearn.training.imputation import imputer
from fklearn.training.transformation import ecdfer
from fklearn.training.pipeline import build_pipeline
from fklearn.training.transformation import standard_scaler
from fklearn.training.regression import xgb_regression_learner
from fklearn.training.classification import xgb_classification_learner
from fklearn.preprocessing.splitting import space_time_split_dataset
from fklearn.validation.evaluators import combined_evaluators, mean_prediction_evaluator, r2_evaluator, mse_evaluator

from sklearn import metrics

In [21]:
dados = pd.read_csv('abt_cobrancas.csv', sep = ';')

In [22]:
dados.head(10)

Unnamed: 0,liquidacao,parc_abertas,rating_cliente,rating_contrato,genero,idade,quantidade_parcelas_plano,renda_cliente
0,1,7.0,1,0,0,58.0,15.0,2850.0
1,1,6.0,1,0,0,58.0,15.0,2850.0
2,1,5.0,1,0,0,58.0,15.0,2850.0
3,1,4.0,1,0,0,58.0,15.0,2850.0
4,1,3.0,1,0,0,58.0,15.0,2850.0
5,1,2.0,1,0,0,58.0,15.0,2850.0
6,1,1.0,1,0,0,58.0,15.0,2850.0
7,1,2.0,1,0,0,60.0,10.0,2500.0
8,0,1.0,1,0,0,60.0,10.0,2500.0
9,0,1.0,2,2,0,60.0,10.0,2500.0


In [23]:
# Drop de linhas com dados desaparecidos
linhas_faltando = dados[dados.idade.isnull()].index
dados = dados.drop(linhas_faltando)
dados = dados.reset_index(drop=True)

In [24]:
# SEPARACAO ENTRE TREINO E TESTE
dados_teste = dados.sample(int(len(dados)/5), replace = False)
index_teste = dados_teste.index
dados_treino = dados.drop(index_teste)
dados_teste = dados_teste.reset_index(drop=True)
dados_treino = dados_treino.reset_index(drop=True)
print('dados:  ', len(dados), '  treino:  ', len(dados_treino), '  teste:  ', len(dados_teste))

dados:   576247   treino:   460998   teste:   115249


In [25]:
AUXILIARY = []
TARGET = ['liquidacao']
FEATURES = set(dados.columns) - set(AUXILIARY) - set(TARGET)

my_model = xgb_classification_learner(
    features=FEATURES,
    target=TARGET[0],
    prediction_column='prediction',
    extra_params={'seed': 139, 'nthread': 8},
)
my_ecdefer = ecdfer(prediction_column='prediction', ecdf_column='prediction_ecdf')
my_learner = build_pipeline(my_model, my_ecdefer)

# TREINO
(prediction_function, treinado, logs) = my_learner(dados_treino)
# TESTE
testados = prediction_function(dados_teste)

# USANDO O MODELO

In [26]:
dados_teste.head(3)

Unnamed: 0,liquidacao,parc_abertas,rating_cliente,rating_contrato,genero,idade,quantidade_parcelas_plano,renda_cliente
0,1,14.0,2,2,1,24.0,15.0,2390.0
1,1,9.0,1,0,1,80.0,10.0,1950.0
2,1,11.0,1,0,0,45.0,12.0,3000.0


In [8]:
teste_MESMO = dados_teste.drop('liquidacao', axis=1).copy()

In [27]:
teste_MESMO.head(3)

Unnamed: 0,parc_abertas,rating_cliente,rating_contrato,genero,idade,quantidade_parcelas_plano,renda_cliente
0,10.0,1,0,0,34.0,15.0,2000.0
1,5.0,1,1,0,56.0,8.0,2800.0
2,10.0,1,0,0,33.0,10.0,2300.0


In [28]:
prediction_function(teste_MESMO)

Unnamed: 0,parc_abertas,rating_cliente,rating_contrato,genero,idade,quantidade_parcelas_plano,renda_cliente,prediction,prediction_ecdf
0,10.0,1,0,0,34.0,15.0,2000.00,0.865813,672.061484
1,5.0,1,1,0,56.0,8.0,2800.00,0.846917,552.082655
2,10.0,1,0,0,33.0,10.0,2300.00,0.820121,444.750303
3,11.0,1,0,1,67.0,15.0,2000.00,0.889420,897.804329
4,10.0,1,0,1,40.0,15.0,2495.00,0.856302,604.905010
5,2.0,1,1,0,32.0,2.0,1850.00,0.819846,443.932512
6,1.0,2,2,1,50.0,10.0,4000.00,0.711246,313.862533
7,11.0,1,0,0,56.0,12.0,3450.00,0.868923,694.820802
8,7.0,1,0,1,37.0,10.0,2000.00,0.866506,676.608141
9,7.0,1,0,0,32.0,10.0,2000.00,0.857590,613.095935


# METRICAS

In [10]:
metrics.roc_auc_score(treinado.liquidacao, treinado.prediction)

0.8593336994029852

In [11]:
metrics.roc_auc_score(testados.liquidacao, testados.prediction)

0.8563085103114761

In [12]:
treinado['resultado'] = treinado.prediction.apply(lambda x: 1 if x > 0.5 else 0)

In [13]:
testados['resultado'] = testados.prediction.apply(lambda x: 1 if x > 0.5 else 0)

In [14]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

In [15]:
confusion_matrix(testados.liquidacao, testados.resultado) / (25961 + 14281 + 3241 + 71802)

array([[0.22531986, 0.1209611 ],
       [0.02820835, 0.62519842]])

In [16]:
testados.loc[:,['liquidacao','prediction']].to_csv('score_recuperacao_teste.csv', index=False, sep=';')

# VISUALIZACAO DOS DADOS

In [17]:
pearsonr(dados.liquidacao, dados.rating_cliente)

(-0.6362344220160357, 0.0)

In [18]:
dados.corr().liquidacao

liquidacao                   1.000000
parc_abertas                -0.170252
rating_cliente              -0.636234
rating_contrato             -0.649325
genero                      -0.059784
idade                        0.152357
quantidade_parcelas_plano   -0.087500
renda_cliente                0.001135
Name: liquidacao, dtype: float64

In [19]:
dados.head()

Unnamed: 0,liquidacao,parc_abertas,rating_cliente,rating_contrato,genero,idade,quantidade_parcelas_plano,renda_cliente
0,1,7.0,1,0,0,58.0,15.0,2850.0
1,1,6.0,1,0,0,58.0,15.0,2850.0
2,1,5.0,1,0,0,58.0,15.0,2850.0
3,1,4.0,1,0,0,58.0,15.0,2850.0
4,1,3.0,1,0,0,58.0,15.0,2850.0
