# Treino e Teste 

<img src='https://i.ytimg.com/vi/ClZYAou70Ug/maxresdefault.jpg'>

## Reporte da métrica de avaliação

<img src='amostragem.png'>

In [None]:

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.svm import SVC

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import LeavePOut
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_recall_curve, roc_auc_score, roc_curve, auc, accuracy_score
from matplotlib import pyplot


In [None]:
# Dataset 1 - balanced (100 samples per class), two overlapping classes drawn
# from normal distributions with means 25 and 22 and standard deviation 2.5.
# A dedicated seeded generator keeps the sample identical across runs without
# touching NumPy's global random state.
rng = np.random.default_rng(42)
classe_1 = pd.DataFrame(rng.normal(25, 2.5, size=(100, 2)), columns=['feature_1', 'feature_2'])
classe_1['classe'] = 0
classe_2 = pd.DataFrame(rng.normal(22, 2.5, size=(100, 2)), columns=['feature_1', 'feature_2'])
classe_2['classe'] = 1
# ignore_index avoids duplicated row labels (each class frame is indexed 0..99).
sample = pd.concat([classe_1, classe_2], ignore_index=True)
sample.head()

In [None]:
# Scatter plot of feature_1 against feature_2, with the point colour taken
# from the "classe" column.
fig, ax = plt.subplots()
scatter_options = dict(x="feature_1", y="feature_2", s=50, c="classe", cmap="viridis", ax=ax)
sample.plot(kind="scatter", **scatter_options);


In [None]:
# Separate the two feature columns (X) from the class label column (y).
feature_columns = ['feature_1', 'feature_2']
X = sample.loc[:, feature_columns]
y = sample.loc[:, 'classe']

## Resubstituição

In [None]:
# Resubstitution: the model is fitted and evaluated on exactly the same samples.
model = SVC(kernel='rbf', C=1, random_state=42)
model.fit(X, y)
# score() yields one accuracy value for the whole set, so there is no spread to
# report; the earlier .mean()/.std() calls on it could only ever show a
# standard deviation of 0.00.
score = model.score(X, y)
print("Accuracy: %.2f%%" % (score * 100.0))

* Toda a amostra de treino está sendo usada também para teste
* Por isso, não é possível identificar se está acontecendo overfitting

## Holdout 


In [None]:
# Holdout: 30% of the samples are set aside for testing; the model is trained
# on the remaining 70%.
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.30, random_state=100, shuffle=True)
model = SVC(kernel='rbf', C=1, random_state=42)
model.fit(X_train, Y_train)
# A single split produces a single accuracy value, so no standard deviation is
# reported (it would always be 0.00 for one number).
result = model.score(X_test, Y_test)
print("Accuracy: %.2f%%" % (result * 100.0))


* o dataset é dividido em 2 subconjuntos (treino e teste), normalmente com 2/3 dos dados para treino e 1/3 para teste
* uma crítica a essa abordagem é que ela não permite avaliar o quanto o desempenho de uma técnica varia quando diferentes combinações de objetos são apresentadas


## K-fold

In [None]:
# 10-fold cross-validation. The shuffle is seeded so the folds (and therefore
# the reported mean and standard deviation) are the same on every run.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
model_kfold = SVC(kernel='rbf', C=1, random_state=42)
# One accuracy value per fold.
results_kfold = cross_val_score(model_kfold, X, y, cv=kfold)
print("%0.2f%% accuracy with a standard deviation of %0.2f%%" % (results_kfold.mean()*100, results_kfold.std()*100))


* o conjunto de exemplos é dividido em k subconjuntos de tamanho aproximadamente igual
* k-1 subconjuntos são usados no treinamento e o que sobra é usado no teste
* não existe uma regra para o valor de k, é comum utilizar k=10
* a principal crítica a esse método é que uma parte dos dados é compartilhada entre os subconjuntos de treinamento 

## Stratified

In [None]:
# Stratified 3-fold cross-validation. The shuffle is seeded so the folds (and
# therefore the reported mean and standard deviation) are the same on every run.
skfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
model_skfold = SVC(kernel='rbf', C=1, random_state=42)
# One accuracy value per fold.
results_skfold = cross_val_score(model_skfold, X, y, cv=skfold)
print("%0.2f%% accuracy with a standard deviation of %0.2f%%" % (results_skfold.mean()*100, results_skfold.std()*100))


* mantém em cada partição a proporção de exemplos de cada classe semelhante à proporção contida no conjunto de dados total


## Leave one out

In [None]:
# Leave-one-out cross-validation: cross_val_score returns one score per split
# produced by the LeaveOneOut splitter; their mean and standard deviation are
# printed as percentages.
model_loocv = SVC(kernel='rbf', C=1, random_state=42)
loocv = LeaveOneOut()
results_loocv = cross_val_score(estimator=model_loocv, X=X, y=y, cv=loocv)
loocv_mean_pct = results_loocv.mean() * 100
loocv_std_pct = results_loocv.std() * 100
print("%0.2f%% accuracy with a standard deviation of %0.2f%%" % (loocv_mean_pct, loocv_std_pct))


* caso extremo do k-fold em que k = n (número de exemplos)
* um exemplo é separado para teste, enquanto n-1 são usados para treino 


## Verificando overfitting 

In [None]:
# Build a synthetic classification dataset and report its dimensions.
from sklearn.datasets import make_classification

# Generator settings, collected in one place for readability.
dataset_options = dict(
    n_samples=10000,
    n_features=20,
    n_informative=5,
    n_redundant=15,
    random_state=1,
)
X, y = make_classification(**dataset_options)
# Show the shapes of the feature array and the label array.
print(X.shape, y.shape)

In [None]:
# Split the synthetic dataset into train and test sets.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
# Create the dataset (seeded, so the same data is generated on every run).
X, y = make_classification(n_samples=10000, n_features=20, n_informative=5, n_redundant=15, random_state=1)
# Hold out 30% for testing; the split is seeded as well so that the train/test
# accuracies computed from it are reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Summarize the shape of the train and test sets.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
# Tree depths to evaluate: every integer from 1 through 29.
values = list(range(1, 30))

In [None]:
# Accumulators for the accuracy obtained on the train and test sets.
train_scores = []
test_scores = []


In [None]:
# Start from empty lists so that re-running this cell does not keep appending
# to earlier results (which would make the lists longer than `values`).
train_scores, test_scores = [], []
for depth in values:
    # Configure a tree limited to the current depth; the seed makes the fitted
    # tree, and therefore the scores, the same on every run.
    model = DecisionTreeClassifier(max_depth=depth, random_state=42)
    # Fit the model on the training dataset.
    model.fit(X_train, y_train)
    # Accuracy on the data the model was trained on.
    train_yhat = model.predict(X_train)
    train_acc = accuracy_score(y_train, train_yhat)
    train_scores.append(train_acc)
    # Accuracy on the held-out test data.
    test_yhat = model.predict(X_test)
    test_acc = accuracy_score(y_test, test_yhat)
    test_scores.append(test_acc)
    # Summarize progress for this depth.
    print('>%d, train: %.3f, test: %.3f' % (depth, train_acc, test_acc))

In [None]:
# Plot the train and test accuracy curves against the evaluated tree depths.
for curve_scores, curve_label in ((train_scores, 'Train'), (test_scores, 'Test')):
    pyplot.plot(values, curve_scores, '-o', label=curve_label)
pyplot.legend()
pyplot.show()