# Árvores de Decisão - Diabetes
###  Disponível em https://www.kaggle.com/uciml/pima-indians-diabetes-database

Attributes:

Pregnancies: Number of times pregnant - Gravidez

Glucose: Plasma glucose concentration at 2 hours in an oral glucose tolerance test - Glicose

BloodPressure: Diastolic blood pressure (mm Hg) - Pressão Arterial

SkinThickness: Triceps skin fold thickness (mm)  - Espessura do tríceps

Insulin: 2-Hour serum insulin (mu U/ml) - Insulina

BMI: Body mass index (weight in kg/(height in m)^2) - IMC

DiabetesPedigreeFunction: Diabetes pedigree function - Função que leva em conta doenças na família

Age: Age (years) - Idade

Outcome: Class variable (0 or 1) - 0 : Não tem Diabetes, 1: Possui Diabetes

In [None]:
import pandas as pd

In [None]:
# Load the dataset and expose the target column ("Outcome") under the name "Class".
df = pd.read_csv("diabetes.csv").rename(columns={"Outcome": "Class"})
df.head()

In [None]:
df.describe().T

In [None]:
df.info()

## Preparação dos dados

### Limpeza dos dados ausentes (missing)

In [None]:
len(df)

In [None]:
# Drop every row that contains a missing (NaN) value, then show how many rows remain.
df2 = df.dropna()
len(df2)

In [None]:
df2.head()

## Aplicar o algoritmo de Classificação - Árvore de Decisão

In [None]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier

In [None]:
# particionar os conjuntos de treino e teste
from sklearn.model_selection import train_test_split

# Predictor columns, in the order they are handed to the classifier.
feature_columns = [
    "Pregnancies",
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "Insulin",
    "BMI",
    "DiabetesPedigreeFunction",
    "Age",
]
diabetes_data = df2[feature_columns]
# Target vector: the renamed "Class" column.
diabetes_target = df2["Class"]

In [None]:
diabetes_data[:3]

In [None]:
diabetes_target[:3]

In [None]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    diabetes_data,
    diabetes_target,
    test_size=0.33,
    random_state=42,
)

# Peek at the first three training rows.
X_train.head(3)

In [None]:
# Report how many rows ended up in each partition.
n_train, n_test = len(X_train), len(X_test)
print("# dados de treino = ", n_train)
print("# dados de teste = ", n_test)

### Aplicar o algoritmo de árvores de decisão

In [None]:
# Fit a decision tree (fixed seed) on the training partition; fit() returns the estimator.
tree = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)

# Mean accuracy on the data the tree saw and on the held-out data.
train_accuracy = tree.score(X_train, y_train)
test_accuracy = tree.score(X_test, y_test)
print("Accuracy on training set: {:.3f}".format(train_accuracy))
print("Accuracy on test set: {:.3f}".format(test_accuracy))

In [None]:
import sklearn.metrics as metrics

# Compare the held-out labels with the tree's predictions for the same rows.
y_pred = tree.predict(X_test)
metrics.confusion_matrix(y_test, y_pred)

### Previsão

In [None]:
import numpy as np

In [None]:
# Class encoding: 0 = no diabetes, 1 = has diabetes.

# Feature names in the same order used to train the tree.
colunas_previsao = ["Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin",
                    "BMI", "DiabetesPedigreeFunction", "Age"]

# Hand-written example patients, one row per patient.
ocorrencias = [
#Pregnancies Glucose BloodPressure    SkinThickness Insulin    BMI    DiabetesPedigreeFunction  Age
[   3,          150,         75,             36,        0,     36.1,   0.62,                      55  ],
[   0,          90,          90,             40,        90,    30,     0.7,                      32  ],
[   1,          120,         75,             28,        70,    29,     0.5,                      27  ]
]

# The tree was fitted on a DataFrame, so predict on a DataFrame with the same
# column names. Passing a bare NumPy row makes scikit-learn warn that the input
# has no valid feature names. One batched call also replaces a predict() call
# per row and avoids testing a one-element array for truthiness.
ocorrencias_df = pd.DataFrame(ocorrencias, columns=colunas_previsao)
previsoes = tree.predict(ocorrencias_df)

# Eight numeric fields followed by the predicted class label.
saida = '{:03.1f}\t\t{:03.1f}\t{:03.1f}\t\t{:03.1f}\t\t{:03.1f}\t{:03.1f}\t\t{:03.1f}\t{:03.1f}\t{:s}'

print("Pregnancies \tGlucose\tBloodPressure\tSkinThickness Insulin    BMI    DiabetesPFun    Age")
for valores, previsao in zip(ocorrencias, previsoes):
    classe = "Não tem diabetes" if previsao == 0 else "Possui Diabetes"
    print(saida.format(*valores, classe))

### Verificando os atributos mais relevantes

In [None]:
# Every column except the last one (the target) is a feature name.
feature_labels = df.columns[:-1].tolist()
print(feature_labels)
# Importance score per feature, as computed by the fitted tree.
tree.feature_importances_

In [None]:
# Table pairing each feature name with the importance the tree assigned to it.
dfi = pd.DataFrame({
    "atributo": list(df.columns[:-1]),
    "importancia": tree.feature_importances_,
})
dfi

In [None]:
dfi.sort_values(by="importancia", ascending=False)

In [None]:
# mostrar os atributos mais relevantes (features)
import matplotlib.pyplot as plt
%matplotlib inline

def plot_feature_importances_cancer(model, feature_names=None):
    """Draw a horizontal bar chart of a fitted model's feature importances.

    The function name is kept as-is so existing calls keep working, even though
    this notebook applies it to the diabetes data.

    Parameters
    ----------
    model : object
        Fitted estimator exposing a ``feature_importances_`` attribute.
    feature_names : sequence of str, optional
        Y-axis labels, in the same order as the model's features. Defaults to
        the eight diabetes attributes used in this notebook.
    """
    if feature_names is None:
        feature_names = ["Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin",
                         "BMI", "DiabetesPedigreeFunction", "Age"]
    n_features = len(feature_names)

    # Read the importances from the model that was passed in, not from a global.
    importances = model.feature_importances_

    # Create the sized figure BEFORE drawing. Calling plt.figure() after the
    # bars are drawn opens a second, empty figure and leaves the chart at the
    # default size.
    fig, ax = plt.subplots(figsize=(12, 10))
    ax.barh(range(n_features), importances, align='center')
    ax.set_yticks(np.arange(n_features))
    ax.set_yticklabels(feature_names)
    ax.set_xlabel("Feature importance")
    ax.set_ylabel("Feature")
    ax.set_ylim(-1, n_features)
    print ("Atributos mais relavantes")
    plt.show()


plot_feature_importances_cancer(tree)

In [None]:
columns = [
    "Pregnancies",
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "Insulin",
    "BMI",
    "DiabetesPedigreeFunction",
    "Age",
]
# Map each feature name to the importance at the same position.
dict_features = dict(zip(columns, tree.feature_importances_))
dict_features

In [None]:
# Sort the (name, importance) pairs from most to least important.
sorted(dict_features.items(), key=lambda item: item[1], reverse=True)

### Analisando a árvore de decisão

In [None]:
#!pip install graphviz

In [None]:
from sklearn.tree import export_graphviz

# Display labels for the two target classes: negative, positive.
Class = ["negative", "positive"]
features = [
    "Pregnancies",
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "Insulin",
    "BMI",
    "DiabetesPedigreeFunction",
    "Age",
]

# Write the fitted tree to "tree.dot" in Graphviz format, with filled nodes
# and without the impurity value in each node.
export_graphviz(
    tree,
    out_file="tree.dot",
    class_names=Class,
    feature_names=features,
    impurity=False,
    filled=True,
)

In [None]:
# instalar o graphviz: https://anaconda.org/anaconda/graphviz
# http://www.graphviz.org/Download_macos.php
# !pip install graphviz

import graphviz
from IPython.display import set_matplotlib_formats, display

# Read the exported Graphviz description back in and render it inline.
with open("tree.dot") as dot_file:
    dot_source = dot_file.read()

tree_graph = graphviz.Source(dot_source)
display(tree_graph)