In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml

In [None]:
# Fetch the "mnist_784" dataset (version 1) from OpenML.
# as_frame=False forces NumPy arrays: recent scikit-learn versions return a pandas
# DataFrame by default, on which the positional indexing used later in this notebook
# (X[idx, :] and x[train_idx]) raises instead of selecting rows.
mnist=fetch_openml("mnist_784",version=1,as_frame=False)

In [None]:
# Feature matrix as delivered by the dataset object.
X = mnist.data
# Labels cast to float so that arithmetic (e.g. the even/odd test with % 2) can be applied later.
y = mnist.target.astype(float)

In [None]:
# Show 30 randomly drawn samples (indices may repeat) in a 5x6 grid, each titled with its label.
idx_al = np.random.randint(0, X.shape[0], size=30)
fig, eixos = plt.subplots(5, 6, figsize=(20, 20))

for eixo, idx in zip(eixos.ravel(), idx_al):
    imagem = X[idx, :].reshape(28, 28)  # each row is reshaped into a 28x28 image
    rotulo = int(y[idx])
    eixo.imshow(imagem, cmap=plt.cm.Greys_r)
    # Pixel coordinates carry no information here, so hide the ticks.
    eixo.set_xticks([])
    eixo.set_yticks([])
    eixo.set_title("Digito" + str(rotulo), fontsize=15, fontweight="bold")
   

In [None]:
from sklearn.model_selection import KFold,train_test_split


In [None]:
# Hold out 25% of the samples for testing.
# A fixed random_state makes the split, and every metric and plot derived from it,
# reproducible across kernel restarts.
xtrain,xtest,ytrain,ytest=train_test_split(X,y,test_size=0.25,random_state=42)

In [None]:
# Sanity check: shape of the training features and shape of the test labels.
xtrain.shape,ytest.shape

In [None]:
# Recode the digit labels into a binary target: 1 for even digits, 0 for odd ones.
# NOTE: ytrain/ytest are overwritten, so this cell is not idempotent. Running it a second
# time maps 0 -> 1 and 1 -> 0, i.e. flips every label. Re-run the split cell before repeating it.
ytrain = np.where(np.mod(ytrain, 2) == 0, 1, 0)
ytest = np.where(np.mod(ytest, 2) == 0, 1, 0)

In [None]:
# Peek at the first 50 binarised training labels.
ytrain[:50]

In [None]:
# Peek at the first 50 binarised test labels.
ytest[:50]

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import recall_score,precision_score,f1_score,accuracy_score,roc_auc_score,roc_curve

### Determinando o melhor limiar

In [None]:
# Logistic-regression classifier with scikit-learn's default hyper-parameters (no arguments passed).
reg_log=LogisticRegression()

In [None]:
# Fit on the training split; ytrain is the binary even/odd target built earlier.
reg_log.fit(xtrain,ytrain)

In [None]:
# Display the predicted class probabilities for the training samples.
# The result is not stored; the next cell calls predict_proba again.
reg_log.predict_proba(xtrain)

In [None]:
# Keep column index 1 of predict_proba: with the 0/1 encoding applied to ytrain,
# this is the predicted probability of label 1 (even digit) for each training sample.
prob_1=reg_log.predict_proba(xtrain)[:,1]

In [None]:
# 19 evenly spaced candidate decision thresholds from 0.05 to 0.95 (both included).
limiar = np.linspace(start=0.05, stop=0.95, num=19)

In [None]:
# Precision and recall on the training set for every candidate threshold in `limiar`.
precision, recall = [], []

for corte in limiar:
    # Predict label 1 only when its probability is strictly above the threshold.
    pred_corte = (prob_1 > corte).astype(int)
    precision.append(precision_score(ytrain, pred_corte))
    recall.append(recall_score(ytrain, pred_corte))

In [None]:
# Precision and recall (computed on the training set) as a function of the decision threshold.
# Each curve is drawn exactly once and plotted against the threshold values themselves,
# so the x axis shows real thresholds rather than positional indices.
plt.figure(figsize=[10,5])
plt.plot(limiar,precision,c="red",lw=2,label="Curva Precision")
plt.plot(limiar,recall,c="blue",lw=2,label="Curva Recall")
plt.xticks(limiar, limiar.round(2), fontsize = 8)
plt.xlabel("Limiar")
plt.ylabel("Score")
plt.title("Precision X Recall",fontsize=16)
plt.legend(fontsize=15,bbox_to_anchor=[1,1])

plt.show()

##### Com as curvas plotadas, podemos definir que o melhor limiar para o nosso estudo será 0.5

### Iniciaremos a plotagem da curva ROC e da AUC

In [None]:
# Logistic regression with default settings, used for the ROC/AUC analysis below.
rl=LogisticRegression()

In [None]:
# Fit on the training split (same data and estimator configuration as reg_log above).
rl.fit(xtrain,ytrain)

In [None]:
# Probability of label 1 predicted by the logistic regression on each split.
rltrain_pred = rl.predict_proba(xtrain)[:, 1]
rlteste_pred = rl.predict_proba(xtest)[:, 1]

# roc_curve yields (false-positive rate, true-positive rate, thresholds).
fpr, vpr, the = roc_curve(ytrain, rltrain_pred)
fpr1, vpr1, the1 = roc_curve(ytest, rlteste_pred)

# Train and test ROC curves side by side.
fig, (eixo_treino, eixo_teste) = plt.subplots(1, 2, figsize=(16, 6))
eixo_treino.plot(fpr, vpr, c="red")
eixo_treino.set_title("Curva ROC de treino ", fontsize=14)
eixo_teste.plot(fpr1, vpr1, c="blue")
eixo_teste.set_title("Curva ROC de teste", fontsize=14)
plt.show()

# Area under each ROC curve.
auc_treino = roc_auc_score(ytrain, rltrain_pred)
print("A AUC para dados de treino foi de: ", auc_treino)

auc_teste = roc_auc_score(ytest, rlteste_pred)
print("A AUC para dados de teste foi de: ", auc_teste)



In [None]:
# k-nearest-neighbours classifier with scikit-learn's default hyper-parameters (no arguments passed).
knn=KNeighborsClassifier()

In [None]:
# Fit on the same training split used for the logistic regression.
knn.fit(xtrain,ytrain)

In [None]:
# Probability of label 1 predicted by the KNN classifier on each split.
knntrain_pred = knn.predict_proba(xtrain)[:, 1]
knnteste_pred = knn.predict_proba(xtest)[:, 1]

# roc_curve yields (false-positive rate, true-positive rate, thresholds).
fpr, vpr, the = roc_curve(ytrain, knntrain_pred)
fpr1, vpr1, the1 = roc_curve(ytest, knnteste_pred)

# One panel per split: (false-positive rate, true-positive rate, colour, title).
curvas = [
    (fpr, vpr, "red", "Curva ROC de treino "),
    (fpr1, vpr1, "blue", "Curva ROC de teste"),
]
fig, eixos = plt.subplots(1, 2, figsize=(16, 6))
for eixo, (taxa_fp, taxa_vp, cor, titulo) in zip(eixos, curvas):
    eixo.plot(taxa_fp, taxa_vp, c=cor)
    eixo.set_title(titulo, fontsize=14)
plt.show()

# Area under each ROC curve.
auc_treino = roc_auc_score(ytrain, knntrain_pred)
print("A AUC para dados de treino foi de: ", auc_treino)

auc_teste = roc_auc_score(ytest, knnteste_pred)
print("A AUC para dados de teste foi de: ", auc_teste)



### Introdução da validação cruzada manual

In [None]:
# 5-fold splitter created with KFold's default settings.
# NOTE(review): no later cell visible here reads this `kf`; validacao_cruzada_manual
# builds its own KFold from its num_folds argument.
kf=KFold(n_splits=5)

In [None]:
def validacao_cruzada_manual(classificador,x,y,num_folds,metrica):
    """Run a manual K-fold cross-validation and report the mean train/test metric.

    For every fold the estimator is fitted on the training part, predictions are
    made for both the training and the held-out part, and ``metrica`` is evaluated
    on each. The two means over all folds are printed and returned.

    Parameters
    ----------
    classificador : object
        Estimator exposing ``fit(X, y)`` (returning the fitted estimator) and
        ``predict(X)``. The same instance is refitted on every fold.
    x : array-like, one row per sample
        Features. Converted with ``np.asarray`` so that pandas DataFrames can be
        indexed by row position.
    y : array-like, one entry per sample
        Targets. Converted with ``np.asarray`` for the same reason.
    num_folds : int
        Number of folds passed to ``KFold(n_splits=...)``.
    metrica : callable
        Called as ``metrica(y_true, y_pred)``; must return a number.

    Returns
    -------
    tuple
        ``(mean metric on the training folds, mean metric on the test folds)``.
    """
    # KFold yields positional indices; plain arrays guarantee x[idx] selects rows
    # (a DataFrame would interpret the indices as column labels and fail).
    x = np.asarray(x)
    y = np.asarray(y)

    kf = KFold(n_splits=num_folds)
    metrica_train = []
    metrica_test = []
    for train_idx, test_idx in kf.split(x, y):
        xtrain_folds, ytrain_folds = x[train_idx], y[train_idx]
        xtest_folds, ytest_folds = x[test_idx], y[test_idx]

        reg = classificador.fit(xtrain_folds, ytrain_folds)
        metrica_train.append(metrica(ytrain_folds, reg.predict(xtrain_folds)))
        metrica_test.append(metrica(ytest_folds, reg.predict(xtest_folds)))

    media_train = np.mean(metrica_train)
    media_test = np.mean(metrica_test)
    print("A média da métrica solicitada nos dados de treino foi de:   ", media_train)
    print("A média da métrica solicitada nos dados de teste foi de:   ", media_test)
    return media_train, media_test
   
       

In [None]:
# 5-fold cross-validated F1 score of a default logistic regression, computed on the training split only.
validacao_cruzada_manual(
    classificador=LogisticRegression(),
    x=xtrain,
    y=ytrain,
    num_folds=5,
    metrica=f1_score,
)