# Multi-output classification and validation

In [1]:
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelBinarizer

## Fetching data from [OpenML](https://www.openml.org)

There are not many datasets that have multi-output labels; one that I found is `reuters` v2.

In [2]:
# Download OpenML dataset 41470 as a pandas (features, labels) pair.
data, target = fetch_openml(
    data_id=41470,
    return_X_y=True,
    as_frame=True,
)

## Preview of the data

In [3]:
# Show the first five rows of the feature table.
data.head(n=5)

Unnamed: 0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,...,feature234,feature235,feature236,feature237,feature238,feature239,feature240,feature241,feature242,feature243
0,0.0,0.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
3,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,1.0,0.0,0.0,0.0,7.0,5.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Preview of the targets

In [4]:
# Show the first five rows of the label table (one column per label).
target.head(n=5)

Unnamed: 0,label1,label2,label3,label4,label5,label6,label7
0,False,False,False,True,False,False,False
1,False,False,False,True,False,False,True
2,False,True,False,False,False,False,False
3,False,False,True,False,False,False,False
4,False,False,True,False,False,False,True


## Selection

In this case we use one simple function (`train_test_split`) to split the data, but for proper validation it is recommended to use more robust techniques (for example, cross-validation).

In [5]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.3,
    random_state=42,
)

## Classifier Estimator

In scikit-learn, KNN supports multi-output targets by default, so there is no need to wrap it in another estimator.

In [6]:
# Each prediction is decided by the 10 nearest training samples.
knn = KNeighborsClassifier(n_neighbors=10)
# Fit on all label columns at once; leaving the call as the last expression
# displays the fitted estimator.
knn.fit(X=X_train, y=y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=10, p=2,
                     weights='uniform')

In [7]:
# Predict the labels of the held-out samples.
# Presumably one column per label, mirroring y_test; verify the shape if it matters.
y_pred = knn.predict(X=X_test)

## Validation

scikit-learn does not provide a metric for general multi-output (multiclass-multioutput) classification, so I flatten the true and predicted labels and compare them element by element. A simple metric, `accuracy_score`, is used.

In [8]:
# Flatten both label matrices into 1-D arrays so they can be compared entry by entry.
true = y_test.values.ravel()
pred = y_pred.ravel()

In [9]:
# Score the flattened labels directly. `true` and `pred` hold one entry per
# (sample, label) pair, so the result is the fraction of individual label
# assignments that were predicted correctly.
# No binarization step is needed: the binarized arrays were never used, and
# fitting separate LabelBinarizers on the truth and on the predictions would
# not guarantee a shared encoding.
accuracy_score(true, pred)

0.9083333333333333