# Pré processamento Geração e validação dos modelos

### Tiago Tambonis - 02/11/19

#######################################################################################################################
# Predição
#######################################################################################################################

In [1]:
#Imports

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from xgboost import XGBClassifier
import dill as pickle
from sklearn import linear_model

In [2]:
def estimador(df):
    
    estimativas = {'mode_gender':df['Gender'].mode()[0], 'mode_married':df['Married'].mode()[0], 
               'mode_dependents':df['Dependents'].mode()[0], 'mode_self_employed':df['Self_Employed'].mode()[0], 
               'mode_credit_history':df['Credit_History'].mode()[0], 'mode_LoanAmount':df['LoanAmount'].median(), 
               'mode_LoanAmountTerm':df['Loan_Amount_Term'].median()}
    
    return(estimativas)

In [3]:
def pre_processamento(df, estimativas):
        
    #Resolver nulls
    df['Gender'].fillna(estimativas['mode_gender'], inplace=True)
    df['Married'].fillna(estimativas['mode_married'], inplace=True)
    df['Dependents'].fillna(estimativas['mode_dependents'], inplace=True)
    df['Self_Employed'].fillna(estimativas['mode_self_employed'], inplace=True)
    df['Credit_History'].fillna(estimativas['mode_credit_history'], inplace=True)
    df['LoanAmount'].fillna(estimativas['mode_LoanAmount'], inplace=True)
    df['Loan_Amount_Term'].fillna(estimativas['mode_LoanAmountTerm'], inplace=True)
    
    gender_values = {'Female' : 0, 'Male' : 1} 
    married_values = {'No' : 0, 'Yes' : 1}
    education_values = {'Graduate' : 0, 'Not Graduate' : 1}
    employed_values = {'No' : 0, 'Yes' : 1}
    property_values = {'Rural' : 0, 'Urban' : 1, 'Semiurban' : 2}
    dependent_values = {'3+': 3, '0': 0, '2': 2, '1': 1}
    df.replace({'Gender': gender_values, 'Married': married_values, 'Education': education_values, \
                'Self_Employed': employed_values, 'Property_Area': property_values, \
                'Dependents': dependent_values}, inplace=True)
        
    return(df)

In [4]:
#Carregar dados 

dados = pd.read_csv('../dataset.csv')

In [5]:
#Checagem

dados.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [6]:
#chacagem dados únicos na tabela de dados.
#Os IDS são únicos dado que o shape[0] é igual ao número de valores únicos Loan_ID: 614.

dados.apply(lambda x: len(x.unique()))

Loan_ID              614
Gender                 3
Married                3
Dependents             5
Education              2
Self_Employed          3
ApplicantIncome      505
CoapplicantIncome    287
LoanAmount           204
Loan_Amount_Term      11
Credit_History         3
Property_Area          3
Loan_Status            2
dtype: int64

In [7]:
#Divisão treino/teste - Na classificação o 'LoanAmount' não é o target e foi considerado como informação relevante.

pred_var = ['Gender','Married','Dependents','Education','Self_Employed','ApplicantIncome','CoapplicantIncome',\
            'LoanAmount','Loan_Amount_Term','Credit_History','Property_Area']

X_treino, X_teste, y_treino, y_teste = train_test_split(dados[pred_var], dados['Loan_Status'], \
                                                    test_size=0.1, random_state=42)

In [8]:
#Estimador do LoanAmount e LoanAmountTerm 

estimativas = estimador(X_treino)

In [9]:
#Pre-processamento

X_treino = pre_processamento(X_treino, estimativas)

In [10]:
#visualizar resultados do pré-processamento

X_treino.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
550,1,1,2,0,1,6633,0.0,127.5,360.0,0.0,0
439,0,1,0,1,1,7142,0.0,138.0,360.0,1.0,0
394,1,1,2,0,0,3100,1400.0,113.0,360.0,1.0,1
382,0,0,0,0,0,6000,0.0,156.0,360.0,1.0,1
29,0,0,2,0,0,3750,2083.0,120.0,360.0,1.0,2


In [11]:
#visualizar resultados do pré-processamento - Shape

X_treino.shape

(552, 11)

In [12]:
#Trocar Y e N por 01 e 0

y_treino = y_treino.replace({'Y':1, 'N':0}).as_matrix()
y_teste = y_teste.replace({'Y':1, 'N':0}).as_matrix()

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [13]:
#Parâmetros do xgboost

params_bst = {'objective': 'binary:logistic', 
             'max_depth': 5,
             'learning_rate': 0.01, 
             'n_estimator': 300}

In [14]:
#Modelo

xgb_model_bst = XGBClassifier(**params_bst)

In [15]:
#Treinamento do modelo

xgb_model_bst.fit(X_treino, y_treino)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.01, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimator=300,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1)

In [16]:
#Pré-processamento teste

X_teste = pre_processamento(X_teste, estimativas)

In [17]:
X_teste.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
350,1,1,0,0,0,9083,0.0,228.0,360.0,1.0,2
377,1,1,0,0,0,4310,0.0,130.0,360.0,1.0,2
163,1,1,2,0,0,4167,1447.0,158.0,360.0,1.0,0
609,0,0,0,0,0,2900,0.0,71.0,360.0,1.0,0
132,1,0,0,0,0,2718,0.0,70.0,360.0,1.0,2


In [18]:
X_teste.dtypes

Gender                 int64
Married                int64
Dependents             int64
Education              int64
Self_Employed          int64
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area          int64
dtype: object

In [19]:
#Empacotar objetos necessários para predição

modelo_classifacao = [xgb_model_bst, estimativas]

In [20]:
#Predição

y_pred = modelo_classifacao[0].predict(X_teste)

In [21]:
#Avaliação do modelo:

float(np.sum(y_pred==y_teste))/y_teste.shape[0]

0.7580645161290323

In [22]:
filename = '../API/Modelos/modelo_classificao.pk'

with open(filename, 'wb') as file:
    pickle.dump(modelo_classifacao, file)

#######################################################################################################################
# Regressão
#######################################################################################################################

In [23]:
#Como o empréstimo vai ser aprovado para quem tiver "aceite" vou estimar q quantidade de empréstimo somente por quem foi 
#tem histórico de aprovação.

X_treino_regressao = X_treino.iloc[np.where(y_treino==1)].head()

In [24]:
#Target

target = pd.DataFrame(X_treino_regressao, columns=['LoanAmount'])

In [25]:
#Remove target

X_treino_regressao = X_treino_regressao.drop(['LoanAmount'], 1)

In [26]:
#Checagem 

X_treino_regressao.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,Loan_Amount_Term,Credit_History,Property_Area
439,0,1,0,1,1,7142,0.0,360.0,1.0,0
394,1,1,2,0,0,3100,1400.0,360.0,1.0,1
382,0,0,0,0,0,6000,0.0,360.0,1.0,1
29,0,0,2,0,0,3750,2083.0,360.0,1.0,2
478,1,1,1,0,1,16667,2250.0,360.0,1.0,2


In [27]:
lm = linear_model.LinearRegression()
lm.fit(X_treino_regressao, target)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [28]:
#Teste - regressão

#Somente quem foi aprovado
X_teste_regressao = X_teste.iloc[np.where(y_teste==1)].head()

#Target - teste
target_teste = pd.DataFrame(X_teste_regressao, columns=['LoanAmount'])

#Remove target
X_teste_regressao = X_teste_regressao.drop(['LoanAmount'], 1)

In [29]:
#Predição do teste

predictions = lm.predict(X_teste_regressao)

In [30]:
#Futuramente deve-se analisar se o modelo está bom (provavelmente não)

print('Em milhares: ', predictions)

('Em milhares: ', array([[142.38097421],
       [148.98175037],
       [105.22823416],
       [154.84074961],
       [159.18775379]]))


In [31]:
filename = '../API/Modelos/modelo_regressao.pk'

with open(filename, 'wb') as file:
    pickle.dump(lm, file)

In [32]:
print('OK.')

OK.
