# Gradient Boosting Classifier

## Bibliotecas

In [7]:
import pandas as pd 
import numpy as np 
import scipy 
import _pickle as cPickle
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn import model_selection, preprocessing, feature_selection, ensemble, linear_model, metrics, decomposition

## Importando dados 

In [11]:
# Load the pickled dataset bundle from the current working directory.
# NOTE(review): unpickling can execute arbitrary code; only load this file
# when it comes from a trusted source.
with open(r"poverty.pickle", "rb") as input_file:
    data = cPickle.load(input_file)

### Importando os conjuntos de treino e teste

In [17]:
# Pull the pre-split training and test partitions out of the loaded bundle.
X_train = data['X_train']
X_test = data['X_test']
y_train = data['y_train']
y_test = data['y_test']

## Gradient Boosting Classifier

### Modelo

# Gradient boosting classifier created with the library's default
# hyperparameters.
model = ensemble.GradientBoostingClassifier()

### Definindo Hiperparâmetros

In [22]:
# Candidate values for each hyperparameter explored during the search.
param_dic = {
    # Weighting factor applied to the corrections made by new trees as they
    # are added to the model.
    'learning_rate': [1, 0.1, 0.01, 0.001],
    # Number of trees added to the model.
    'n_estimators': [100, 250, 500, 750, 1000, 1250, 1500],
    # Maximum depth of each tree.
    'max_depth': [2, 3, 4, 5, 6, 7],
    # Minimum number of samples required to split a node.
    'min_samples_split': [2, 4, 6, 8, 10, 20, 40, 60, 100],
    # Minimum number of samples required to form a leaf.
    'min_samples_leaf': [1, 3, 5, 7, 9],
    # Number of features considered per split; the square root of the number
    # of features is usually a good starting point.
    'max_features': [2, 3, 4, 5, 6, 7],
    # Fraction of the training samples used to fit each tree.
    'subsample': [0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1],
}

### Aplicando Cross Validation

In [None]:
# Randomized hyperparameter search: 100 parameter combinations are drawn from
# param_dic and each one is scored by cross-validated accuracy.
random_search = model_selection.RandomizedSearchCV(
    model,
    param_distributions=param_dic,
    n_iter=100,
    scoring="accuracy",
)
random_search.fit(X_train, y_train)

# Report the winning configuration and keep its estimator for the next steps.
print("Best Model parameters:", random_search.best_params_)
print("Best Model mean accuracy:", random_search.best_score_)
model = random_search.best_estimator_

### Curva ROC

In [None]:
# Cross-validated ROC analysis on the training set: fit the model on each of
# 10 shuffled stratified folds, draw the per-fold ROC curve, and overlay the
# mean curve obtained by averaging the folds on a common FPR grid.
cv = model_selection.StratifiedKFold(n_splits=10, shuffle=True)
tprs, aucs = [], []
mean_fpr = np.linspace(0, 1, 100)  # shared FPR grid used to average the folds
fig = plt.figure(figsize=(15, 10))
for i, (train, test) in enumerate(cv.split(X_train, y_train), start=1):
    # NOTE(review): indexing with the fold index arrays assumes X_train and
    # y_train are NumPy arrays (positional indexing) -- confirm; pandas
    # objects would need .iloc here.
    prediction = model.fit(X_train[train], y_train[train]).predict_proba(X_train[test])
    # Column 1 holds the probability of the second class.
    fpr, tpr, _thresholds = metrics.roc_curve(y_train[test], prediction[:, 1])
    # np.interp replaces scipy.interp, a deprecated alias of the same function
    # that is no longer available in recent SciPy releases.
    interp_tpr = np.interp(mean_fpr, fpr, tpr)
    # Pin the curve to the origin so the averaged ROC starts at (0, 0) even
    # when several thresholds share FPR == 0.
    interp_tpr[0] = 0.0
    tprs.append(interp_tpr)
    roc_auc = metrics.auc(fpr, tpr)
    aucs.append(roc_auc)
    plt.plot(fpr, tpr, lw=2, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))

# Diagonal reference line corresponding to a random classifier.
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='black')
mean_tpr = np.mean(tprs, axis=0)
mean_auc = metrics.auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, color='blue', label=r'Mean ROC (AUC = %0.2f )' % (mean_auc), lw=2, alpha=1)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('K-Fold Validation')
plt.legend(loc="lower right")

### Ajuste do modelo

In [None]:
# Fit the model on the complete training set.
model.fit(X_train, y_train)

### Previsão

In [None]:
# Hard class predictions for the test set.
predicted = model.predict(X_test)
# Predicted probability of the second class (column 1 of predict_proba).
predicted_prob = model.predict_proba(X_test)[:, 1]

## Avaliação do Modelo

### Acurácia e AUC

In [None]:
# Area under the ROC curve, computed from the predicted probabilities.
auc = metrics.roc_auc_score(y_true=y_test, y_score=predicted_prob)
# Fraction of test samples whose predicted class matches the true class.
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=predicted)

### Precisão e Recall 

In [None]:
# Precision and recall of the hard predictions on the test set.
precision = metrics.precision_score(y_true=y_test, y_pred=predicted)
recall = metrics.recall_score(y_true=y_test, y_pred=predicted)

# Per-class report; the class labels found in y_test are used as display names.
print(
    metrics.classification_report(
        y_test,
        predicted,
        target_names=list(map(str, np.unique(y_test))),
    )
)

### Matriz de Confusão

In [None]:
# Distinct class labels present in the test targets; they fix the row/column
# order of the confusion matrix.
classes = np.unique(y_test)
# Figure and axes for the confusion-matrix plot.
fig, ax = plt.subplots()
# Confusion matrix of true vs. predicted labels, ordered by `classes`.
cm = metrics.confusion_matrix(y_test, predicted, labels=classes)