# Repeated K Fold with multi-output classification

The intention of this notebook is to provide a way to compute the mean and standard deviation of the evaluation metrics for each repetition of a repeated k-fold cross-validation.

In [None]:
from collections import defaultdict

import pandas as pd
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, 
    plot_confusion_matrix, plot_roc_curve, plot_precision_recall_curve,
    multilabel_confusion_matrix, precision_recall_fscore_support
)
from sklearn.model_selection import RepeatedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder

## Dataset

We will use version 2 of the `reuters` dataset because it is a multi-output problem.

In [None]:
# Fetch the dataset from OpenML by its numeric id. `return_X_y=True` yields the
# features and the targets separately; `as_frame=True` yields pandas objects.
data, target = fetch_openml(
    data_id=41470,
    as_frame=True,
    return_X_y=True,
)

### The first 5 rows of the features

In [None]:
# Preview the first five feature rows.
data.head(n=5)

### The first 5 rows of the classes

In [None]:
# Preview the first five rows of the target columns.
target.head(n=5)

## Preprocessing

We will convert the class labels into numerical values.

In [None]:
# Single encoder instance, re-fitted for each target column in the
# cross-validation cell to turn class labels into integer codes.
le = LabelEncoder()

## Classifier Estimator

We will use an instance-based classifier, the K-Nearest Neighbors classifier.

In [None]:
# k-NN estimator: each prediction looks at the 10 nearest training samples;
# the neighbor search runs on 4 parallel jobs.
knn = KNeighborsClassifier(
    n_neighbors=10,
    n_jobs=4,
)

## Cross Validation

To reproduce the evaluation protocol of a published article, we will use 10-fold cross-validation with 30 repetitions.

In [None]:
# 10 folds repeated 30 times (300 train/test splits in total); the fixed
# random_state makes the splits reproducible.
rkf = RepeatedKFold(
    n_splits=10,
    n_repeats=30,
    random_state=2022,
)

Here we create lists to store the metrics of each fold, computed with `accuracy_score`, `precision_score`, `recall_score` and `multilabel_confusion_matrix`. The `defaultdict`s are used to collect the means and standard deviations.

In [None]:
# Per-fold metric values, one independent list per metric.
scores_acc, scores_precision, scores_recall, scores_mc = [], [], [], []

# TODO: Improve the average and standard deviation
# Per-repetition summaries; a missing key starts out as an empty list.
table_acc, table_precision, table_recall, table_cm = (
    defaultdict(list),
    defaultdict(list),
    defaultdict(list),
    defaultdict(list),
)

In [None]:
# Start from empty score lists so that re-running this cell does not append a
# second set of fold scores after the first one.
scores_acc.clear()
scores_precision.clear()
scores_recall.clear()

# Materialize the arrays and encode every label column once, outside the fold
# loop. Fitting the encoder on the full column (instead of on each training
# fold) keeps the label -> integer mapping identical across folds and prevents
# `le.transform` from failing when a class is absent from a training split.
features = data.values
labels = target.values
encoded_labels = np.empty(labels.shape)
for col in range(labels.shape[1]):
    encoded_labels[:, col] = le.fit_transform(labels[:, col])

for train, test in rkf.split(data, target):
    X_train, X_test = features[train], features[test]
    encoded_y_train, encoded_y_test = encoded_labels[train], encoded_labels[test]

    knn.fit(X_train, encoded_y_train)
    y_pred = knn.predict(X_test)

    # For multilabel targets scikit-learn's accuracy_score is the subset
    # accuracy: a sample counts only if all of its labels are predicted correctly.
    score_acc = accuracy_score(encoded_y_test, y_pred)
    scores_acc.append(score_acc)

    score_precision = precision_score(encoded_y_test, y_pred, average='micro', zero_division='warn')
    scores_precision.append(score_precision)

    score_recall = recall_score(encoded_y_test, y_pred, average='micro', zero_division='warn')
    # Append this fold's recall value (not the `scores_recall` list itself).
    scores_recall.append(score_recall)

# TODO: Put in a pandas DataFrame to be better visualized.
#     print(multilabel_confusion_matrix(encoded_y_test, y_pred))
#     print('*' * 80)
#     print(score_acc)
#     print('=' * 80)
#     print(score_precision)
#     print('-' * 80)
#     print(score_recall)
#     print('.' * 80)
#     print(precision_recall_fscore_support(encoded_y_test, y_pred, average=None, zero_division='warn',))

In [None]:
# `get_n_splits()` on a repeated splitter returns folds * repetitions, so the
# number of folds in one repetition is derived from the splitter itself rather
# than hard-coded.
total_folds = rkf.get_n_splits()
folds_per_repeat = total_folds // rkf.n_repeats

# (start, stop) slice bounds of each repetition inside the flat score lists.
repeat_bounds = [
    (start, start + folds_per_repeat)
    for start in range(0, total_folds, folds_per_repeat)
]

# Assign (rather than append) so that re-running this cell rebuilds the tables
# instead of duplicating their rows.
for table, metric, scores in (
    (table_acc, 'Accuracy', scores_acc),
    (table_precision, 'Precision', scores_precision),
    (table_recall, 'Recall', scores_recall),
):
    table[metric + '_mean'] = [np.mean(scores[lo:hi]) for lo, hi in repeat_bounds]
    table[metric + '_std'] = [np.std(scores[lo:hi]) for lo, hi in repeat_bounds]

# display(table_acc)
# display(table_precision)
# display(table_recall)