 ## Deploy Web API with Flask

### Tutorial
https://minerandodados.com.br/realizando-o-deploy-de-um-modelo-de-machine-learning-em-producao/

### YouTube Tutorial
https://www.youtube.com/watch?v=_dRfScGH7NA&t=704s

In [1]:
# NOTE(review): os and re are not referenced in the visible cells — confirm before removing.
import os
import re
import numpy as np
import pandas as pd
import warnings
# Silences every warning category from here on; this also hides the
# deprecation warnings emitted by pandas and scikit-learn.
warnings.filterwarnings("ignore")

### Carregando a Base de Dados

In [2]:
# Read the loan dataset from 'loan.csv', resolved relative to the current working directory.
data = pd.read_csv('loan.csv')

In [3]:
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [4]:
data.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [6]:
data.Loan_Status.value_counts()

Y    422
N    192
Name: Loan_Status, dtype: int64

In [7]:
data.Married.value_counts()

Yes    398
No     213
Name: Married, dtype: int64

In [8]:
data.Education.value_counts()

Graduate        480
Not Graduate    134
Name: Education, dtype: int64

In [9]:
# Undersample the approved class ('Y', 422 rows) down to 200 rows so that it is
# roughly balanced against the 192 rejected ('N') rows.
# random_state pins the draw so that rerunning the notebook selects the same rows.
data2 = data[data.Loan_Status=='Y'].sample(200, random_state=42)

In [10]:
# Stack the sampled 'Y' rows on top of the 'N' rows (sample(192) takes all 192
# of them, so it only shuffles). The original row index is preserved.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
data = pd.concat([data2, data[data.Loan_Status=='N'].sample(192, random_state=42)])

In [11]:
data.Loan_Status.value_counts()

Y    200
N    192
Name: Loan_Status, dtype: int64

### Checando Missing Values

In [12]:
data2

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
146,LP001516,Female,Yes,2,Graduate,No,14866,0.0,70.0,360.0,1.0,Urban,Y
362,LP002175,Male,Yes,0,Graduate,No,4750,2333.0,130.0,360.0,1.0,Urban,Y
111,LP001387,Female,Yes,0,Graduate,,2929,2333.0,139.0,360.0,1.0,Semiurban,Y
368,LP002190,Male,Yes,1,Graduate,No,6325,0.0,175.0,360.0,1.0,Semiurban,Y
217,LP001726,Male,Yes,0,Graduate,No,3727,1775.0,131.0,360.0,1.0,Semiurban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
41,LP001120,Male,No,0,Graduate,No,1800,1213.0,47.0,360.0,1.0,Urban,Y
306,LP001993,Female,No,0,Graduate,No,3762,1666.0,135.0,360.0,1.0,Rural,Y
271,LP001891,Male,Yes,0,Graduate,No,11146,0.0,136.0,360.0,1.0,Urban,Y
10,LP001024,Male,Yes,2,Graduate,No,3200,700.0,70.0,360.0,1.0,Urban,Y


In [13]:
data2.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
146,LP001516,Female,Yes,2,Graduate,No,14866,0.0,70.0,360.0,1.0,Urban,Y
362,LP002175,Male,Yes,0,Graduate,No,4750,2333.0,130.0,360.0,1.0,Urban,Y
111,LP001387,Female,Yes,0,Graduate,,2929,2333.0,139.0,360.0,1.0,Semiurban,Y
368,LP002190,Male,Yes,1,Graduate,No,6325,0.0,175.0,360.0,1.0,Semiurban,Y
217,LP001726,Male,Yes,0,Graduate,No,3727,1775.0,131.0,360.0,1.0,Semiurban,Y


In [14]:
data2.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,200.0,200.0,194.0,197.0,183.0
mean,5122.05,1473.525,148.628866,349.15736,0.983607
std,3774.373775,1603.403262,77.464325,54.838172,0.127331
min,210.0,0.0,25.0,60.0,0.0
25%,2933.5,0.0,110.0,360.0,1.0
50%,3855.5,1427.0,132.0,360.0,1.0
75%,5826.5,2330.75,167.5,360.0,1.0
max,23803.0,8333.0,650.0,480.0,1.0


In [15]:
data.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,392.0,392.0,375.0,383.0,362.0
mean,5280.757653,1671.540816,149.88,346.684073,0.765193
std,5476.74391,3276.91964,81.526276,62.21885,0.424465
min,150.0,0.0,9.0,36.0,0.0
25%,2912.75,0.0,105.0,360.0,1.0
50%,3855.5,1283.5,132.0,360.0,1.0
75%,5826.5,2330.75,172.5,360.0,1.0
max,81000.0,41667.0,650.0,480.0,1.0


In [16]:
data.isnull().sum()

Loan_ID               0
Gender               10
Married               2
Dependents            9
Education             0
Self_Employed        20
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           17
Loan_Amount_Term      9
Credit_History       30
Property_Area         0
Loan_Status           0
dtype: int64

In [17]:
data2.isnull().sum()

Loan_ID               0
Gender                5
Married               2
Dependents            3
Education             0
Self_Employed        11
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            6
Loan_Amount_Term      3
Credit_History       17
Property_Area         0
Loan_Status           0
dtype: int64

#### Preenchendo Missing Values
- Dependents: Assumindo o Valor Majoritário da Coluna
- Self_Employed: Assumindo o Valor Majoritário da Coluna
- LoanAmount: Preenchendo com o Valor Médio da Coluna
- Loan_Amount_Term: Preenchendo com o Valor Médio da Coluna
- Credit_History: Assumindo o Valor Majoritário da Coluna
- Married: Assumindo o Valor Majoritário da Coluna
- Gender: Assumindo o Valor Majoritário da Coluna

In [18]:
# Missing Gender entries are filled with the fixed value 'Male'.
data = data.fillna({'Gender': 'Male'})

In [19]:
# Fill missing Married entries with the most frequent value of the column.
# The previous hard-coded 'No' was the minority value (398 'Yes' vs 213 'No' in
# the raw data), which contradicted the majority-value rule stated for this column.
# mode() ignores NaN; iloc[0] takes the first mode if there is a tie.
data['Married'] = data['Married'].fillna(data['Married'].mode().iloc[0])

In [20]:
# Missing Dependents entries are filled with the string '0'
# (the column holds strings such as '0', '1', '2', '3+').
data = data.fillna({'Dependents': '0'})

In [21]:
# Missing Self_Employed entries are filled with the fixed value 'No'.
data = data.fillna({'Self_Employed': 'No'})

In [22]:
# Missing LoanAmount entries are filled with the column mean
# (the mean is computed over the non-missing rows before filling).
data = data.fillna({'LoanAmount': data['LoanAmount'].mean()})

In [23]:
# Missing Credit_History entries are filled with the fixed value 1.0.
data = data.fillna({'Credit_History': 1.0})

In [24]:
# Missing Loan_Amount_Term entries are filled with the column mean
# (the mean is computed over the non-missing rows before filling).
data = data.fillna({'Loan_Amount_Term': data['Loan_Amount_Term'].mean()})

In [25]:
data.Credit_History.value_counts()

1.0    307
0.0     85
Name: Credit_History, dtype: int64

### Checando Novamente Missing Values

In [26]:
data.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

### Transformando Dados Categóricos
- Várias colunas do DataFrame são categóricas e precisam ser transformadas em valores numéricos. São elas:
    - Gender
    - Married
    - Dependents
    - Education
    - Self_Employed
    - Property_Area
    - Loan_Status

In [None]:
#data.dtypes

In [27]:
from sklearn.preprocessing import LabelEncoder

In [28]:
# Integer codes for each categorical column, keyed by the raw string value.
gender_values = dict([('Female', 0), ('Male', 1)])
married_values = dict([('No', 0), ('Yes', 1)])
education_values = dict([('Graduate', 0), ('Not Graduate', 1)])
employed_values = dict([('No', 0), ('Yes', 1)])
dependent_values = dict([('3+', 3), ('0', 0), ('2', 2), ('1', 1)])
loan_values = dict([('Y', 1), ('N', 0)])

# Nested form of DataFrame.replace: {column: {old_value: new_value}}.
# Values that are not listed in a column's mapping are left as they are.
data.replace(
    to_replace={
        'Gender': gender_values,
        'Married': married_values,
        'Education': education_values,
        'Self_Employed': employed_values,
        'Dependents': dependent_values,
        'Loan_Status': loan_values,
    },
    inplace=True,
)
              

In [29]:
# Remove the identifier and four other columns in place, leaving seven feature
# columns plus the Loan_Status target.
data.drop(
    columns=['Loan_ID', 'CoapplicantIncome', 'Loan_Amount_Term', 'Credit_History', 'Property_Area'],
    inplace=True,
)

In [30]:
data.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,LoanAmount,Loan_Status
146,0,1,2,0,0,14866,70.0,1
362,1,1,0,0,0,4750,130.0,1
111,0,1,0,0,0,2929,139.0,1
368,1,1,1,0,0,6325,175.0,1
217,1,1,0,0,0,3727,131.0,1


#### Selecionando o Melhor Classificador através de Pipeline e GridSearchCV

In [31]:
from sklearn.ensemble import RandomForestClassifier

In [37]:
from sklearn.model_selection import train_test_split

In [41]:
# Random forest with 100 trees. random_state fixes the bootstrap samples and
# feature subsets, so the metrics and predicted probabilities are the same on
# every rerun (without it each fit produces a different model).
clf_rf = RandomForestClassifier(n_estimators=100, min_samples_split=2, random_state=42)

In [42]:
# Feature matrix: every column except the target. Despite the name, this holds
# ALL rows; the train/test split is made from it afterwards.
X_train = data.drop(columns='Loan_Status')

In [43]:
# Target vector: the encoded loan decision (1 = 'Y', 0 = 'N').
y = data.loc[:, 'Loan_Status']

In [44]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable across runs.
X_treino, X_teste, y_treino, y_teste = train_test_split(
    X_train,
    y,
    test_size=0.20,
    random_state=43,
)

In [45]:
# Train the forest on the training portion only; fit() returns the estimator,
# which the notebook displays as the cell output.
clf_rf.fit(X=X_treino, y=y_treino)

RandomForestClassifier()

In [48]:
# NOTE(review): this repeats the fit from the previous cell on the same data.
# It retrains the model from scratch and adds nothing; consider removing this cell.
clf_rf.fit(X_treino,y_treino)

RandomForestClassifier()

### Métricas de Validação

In [47]:
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [50]:
# Confusion matrix with totals: actual labels in rows ('Real'),
# predicted labels in columns ('Predito').
print(
    pd.crosstab(
        y_teste,
        clf_rf.predict(X_teste),
        rownames=['Real'],
        colnames=['Predito'],
        margins=True,
    )
)

Predito   0   1  All
Real                
0        19  21   40
1        23  16   39
All      42  37   79


In [51]:
# Per-class precision, recall and F1 on the held-out test rows.
print(
    metrics.classification_report(
        y_true=y_teste,
        y_pred=clf_rf.predict(X_teste),
    )
)

              precision    recall  f1-score   support

           0       0.45      0.47      0.46        40
           1       0.43      0.41      0.42        39

    accuracy                           0.44        79
   macro avg       0.44      0.44      0.44        79
weighted avg       0.44      0.44      0.44        79



### Persistindo o Modelo Machine Learning para o Disco

In [None]:
#from sklearn.externals import joblib

In [56]:
import joblib

In [70]:
# Serialize the fitted classifier to 'model.pkl' in the working directory.
# joblib.dump returns the list of files it wrote, shown as the cell output.
joblib.dump(value=clf_rf, filename='model.pkl')

['model.pkl']

### Listando os arquivos em Disco

In [59]:
!ls -la

total 1556
drwxr-xr-x 3 biolabs biolabs    4096 Oct 30 12:13 .
drwxr-xr-x 5 biolabs biolabs    4096 Oct 29 18:01 ..
drwxr-xr-x 2 biolabs biolabs    4096 Oct 30 06:45 .ipynb_checkpoints
-rw-r--r-- 1 biolabs biolabs      72 Oct 30 06:45 Untitled.ipynb
-rw-r--r-- 1 biolabs biolabs   38011 Oct 30 06:50 loan.csv
-rw-r--r-- 1 biolabs biolabs     909 Oct 30 06:50 loan.csv:Zone.Identifier
-rw-r--r-- 1 biolabs biolabs 1471650 Oct 30 12:12 model.plk
-rw-r--r-- 1 biolabs biolabs   55804 Oct 30 12:13 persistindo-modelo-machine-learning-disco.ipynb


### Carregando o Modelo a partir do Disco para a Memória

In [62]:
# Load the classifier back from the file written by joblib.dump ('model.pkl').
# The previous name 'model.plk' was a typo that did not match the dumped file,
# so a fresh run would fail or pick up a stale file left over on disk.
# joblib.load unpickles arbitrary objects: only load files from a trusted source.
model = joblib.load('model.pkl')

### Verificando os Atributos do Modelo

In [64]:
# Print the class labels, the number of trees and the base tree estimator.
# scikit-learn 1.2 renamed `base_estimator` to `estimator` and 1.4 removed the
# old name, so prefer `estimator` and fall back to `base_estimator` on older versions.
print("Atributos do Modelo:\n\nClasses:{}\n\nEstimators:{}\n\nParametros:{}".format(
    model.classes_,
    model.n_estimators,
    getattr(model, "estimator", getattr(model, "base_estimator", None)),
))

Atributos do Modelo:

Classes:[0 1]

Estimators:100

Parametros:DecisionTreeClassifier()


### Verificando o DataSet Final Gerado

In [65]:
X_train.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,LoanAmount
146,0,1,2,0,0,14866,70.0
362,1,1,0,0,0,4750,130.0
111,0,1,0,0,0,2929,139.0
368,1,1,1,0,0,6325,175.0
217,1,1,0,0,0,3727,131.0


### Teste de Classificação.

In [67]:
# A single applicant as a (1, 7) array, with values in the training column order:
# Gender, Married, Dependents, Education, Self_Employed, ApplicantIncome, LoanAmount
teste = np.array([
    [1, 1, 3, 0, 0, 9504, 275.0],
])

In [68]:
model.predict(teste)

array([1])

### Probabilidade de Classes.

In [69]:
model.predict_proba(teste)

array([[0.18, 0.82]])