# Análise de Dados de Crédito
Para a análise, iniciaremos com uma estatística descritiva das variáveis de interesse presentes no dataset. Em seguida, elaboraremos modelos de previsão de bons e maus pagadores, a fim de compará-los e avaliar a acurácia de cada um deles na amostra disponível. Os dados estão disponíveis no Kaggle (<a href="https://www.kaggle.com/rikdifos/credit-card-approval-prediction">acesse aqui</a>).

In [1]:
import pandas as pd

In [2]:
# Load the applicants' records from disk and preview the first five rows.
app_record = pd.read_csv(filepath_or_buffer='Data/application_record.csv')
app_record.head(n=5)

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
1,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,-21474,-1134,1,0,0,0,Security staff,2.0
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0
4,5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0


In [3]:
# Load the credit history records from disk and preview the first five rows.
cred_record = pd.read_csv(filepath_or_buffer='Data/credit_record.csv')
cred_record.head(n=5)

Unnamed: 0,ID,MONTHS_BALANCE,STATUS
0,5001711,0,X
1,5001711,-1,0
2,5001711,-2,0
3,5001711,-3,0
4,5001712,0,C


In [4]:
# Categorização e Merge dos Dados: dados do Dataframe APP_RECORD não possuem a categoria de serem bons ou maus pagadores.
def categorize(row):
    """Flag a credit record as belonging to a bad payer.

    Args:
        row: Mapping-like record exposing a ``'STATUS'`` entry (for example
            a DataFrame row).

    Returns:
        ``False`` when the status is ``'C'`` or ``'X'``; ``True`` for every
        other value.
    """
    # NOTE(review): presumably 'C' means "paid off" and 'X' means "no loan
    # that month" in the Kaggle dataset -- confirm against its description.
    return row['STATUS'] not in ('C', 'X')

# Inner-join the applicant data with the credit records on ID; an applicant
# with several credit records yields several rows.
df_cred = pd.merge(app_record, cred_record, on="ID")
# Vectorised form of the rule implemented by categorize(): every STATUS other
# than 'C' or 'X' marks the record as a bad payer. This yields the same
# boolean column without a Python-level function call per row.
df_cred['BAD_PAYER'] = ~df_cred['STATUS'].isin(['C', 'X'])
df_cred = df_cred.drop(columns='MONTHS_BALANCE')
# Collapse to one row per applicant, indexed by ID. 'first' keeps, for each
# column, the first non-null value found in the group.
# NOTE(review): as a consequence BAD_PAYER reflects a single credit record
# per applicant (the first one in merge order), not the whole payment
# history -- confirm this is the intended labelling.
df_cred_ = df_cred.groupby('ID').aggregate('first')

In [5]:
# Report the size of the per-applicant table and the class balance of the
# target, then preview the first ten rows.
print(df_cred_.shape, df_cred_['BAD_PAYER'].value_counts(), sep='\n')
df_cred_.head(n=10)

(36457, 19)
False    27619
True      8838
Name: BAD_PAYER, dtype: int64


Unnamed: 0_level_0,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,STATUS,BAD_PAYER
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0,C,False
5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0,C,False
5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,-21474,-1134,1,0,0,0,Security staff,2.0,C,False
5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0,0,True
5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0,X,False
5008810,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0,C,False
5008811,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0,C,False
5008812,F,N,Y,0,283500.0,Pensioner,Higher education,Separated,House / apartment,-22464,365243,1,0,0,0,,1.0,0,True
5008813,F,N,Y,0,283500.0,Pensioner,Higher education,Separated,House / apartment,-22464,365243,1,0,0,0,,1.0,0,True
5008814,F,N,Y,0,283500.0,Pensioner,Higher education,Separated,House / apartment,-22464,365243,1,0,0,0,,1.0,0,True


In [9]:
from sklearn.model_selection import train_test_split

# Build the feature matrix. STATUS is dropped together with the target
# because BAD_PAYER is computed from STATUS alone; keeping it would leak the
# label into the features.
x = df_cred_.drop(columns=["BAD_PAYER", "STATUS"])
# One-hot encode the text columns so that scikit-learn estimators can consume
# them; dummy_na=True gives missing values (e.g. OCCUPATION_TYPE) their own
# indicator column instead of discarding that information.
# NOTE(review): assumes the numeric columns contain no missing values --
# confirm, or add an imputation step.
x = pd.get_dummies(x, dummy_na=True)
y = df_cred_["BAD_PAYER"]

# A fixed seed makes the split, and every metric computed from it,
# reproducible between runs.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, random_state=42)

ModuleNotFoundError: No module named 'sklearn'

In [10]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
import numpy as np

# Tune the number of neighbours of a k-NN classifier with 5-fold
# cross-validation over k = 1..24 (np.arange excludes the upper bound).
# NOTE(review): k-NN is distance based, so unscaled features with large
# ranges will dominate the metric -- consider standardising the inputs.
knn = KNeighborsClassifier()
hyperparams = {'n_neighbors': np.arange(1, 25)}
knn_gscv = GridSearchCV(knn, hyperparams, cv=5)
knn_gscv.fit(xtrain, ytrain)
# predict_proba needs a fitted estimator and the samples to score. The search
# object refits the best model by default and delegates to it, so call it on
# knn_gscv with the held-out features instead of on the bare class.
knn_prob = knn_gscv.predict_proba(xtest)

ModuleNotFoundError: No module named 'sklearn'