# Comparaison des classifieurs 

In [None]:
from pathlib import Path
import numpy as np

import info9

In [2]:
# Tag names handed to the multiclass classifier. Index 4 ("I-PER") is the class
# the binary experiments in this notebook single out.
# NOTE(review): presumably a tag's position here equals its integer id in the
# HDF5 "true_labels" data — confirm against the data preparation step.
LABELS = [
    "O",
    "B-MISC", "I-MISC",
    "B-PER", "I-PER",
    "B-ORG", "I-ORG",
    "B-LOC", "I-LOC",
]

# Row caps passed as `row_limit` when loading the train / test splits.
N_ROWS_TRAIN = 10_000
N_ROWS_TEST = 5_000

# HDF5 inputs, relative to the notebook's working directory.
ftrain = Path("../data/train.hdf5")
ftesta = Path("../data/testa.hdf5")

In [3]:
# Module-level projection object shared by read_dataset / read_dataset_binary.
# NOTE(review): the arguments look like (input dim 1024, output dim 128,
# Gaussian matrix) — the loaded datasets do report 128 dimensions, but confirm
# the parameter meaning against info9.
# NOTE(review): no seed is passed; unless info9 seeds internally, the projection
# (and every metric below) may change between kernel restarts — confirm.
projecter = info9.RandomProjection(1024, 128, "Gaussian")

In [4]:
def read_dataset(fdataset, row_limit=500):
    """Load the first ``row_limit`` rows of an HDF5 file as a projected dataset.

    Reads the "representation" and "true_labels" entries of ``fdataset``,
    keeps the leading ``row_limit`` rows of each, passes the representations
    through the module-level ``projecter`` and wraps both in ``info9.Dataset``.

    Args:
        fdataset: Path to the HDF5 file (converted with ``str`` before use).
        row_limit: Upper bound of the ``[:row_limit]`` slice applied to both
            entries.

    Returns:
        An ``info9.Dataset`` built from the projected rows and their labels.
    """
    path = str(fdataset)
    # Slice before projecting so only the requested rows are projected.
    representation = info9.read_hdf5_dataset(path, "representation")[:row_limit]
    projected = projecter.project(representation)
    true_labels = info9.read_hdf5_dataset(path, "true_labels")[:row_limit]
    return info9.Dataset(projected, true_labels)

In [5]:
def read_dataset_binary(fdataset, row_limit=500, label_of_interest=4):
    """Load an HDF5 file as a projected dataset with 0/1 labels.

    Same loading steps as ``read_dataset``, except that each entry of
    "true_labels" is replaced by 1 when it equals ``label_of_interest`` and
    0 otherwise.

    Args:
        fdataset: Path to the HDF5 file (converted with ``str`` before use).
        row_limit: Upper bound of the ``[:row_limit]`` slice applied to both
            entries.
        label_of_interest: Label value mapped to 1; every other value maps
            to 0.

    Returns:
        An ``info9.Dataset`` built from the projected rows and the 0/1 labels.
    """
    source = str(fdataset)
    features = info9.read_hdf5_dataset(source, "representation")[:row_limit]
    projected = projecter.project(features)
    raw_labels = info9.read_hdf5_dataset(source, "true_labels")[:row_limit]
    # Element-wise comparison yields a boolean mask; cast it to integers.
    is_target = raw_labels == label_of_interest
    return info9.Dataset(projected, is_target.astype(int))

## Binary K-NN classification

In [6]:
# Binary (label_of_interest default, i.e. 4, vs. rest) train and test sets,
# capped at N_ROWS_TRAIN / N_ROWS_TEST rows respectively.
dtrain_bin = read_dataset_binary(ftrain, row_limit=N_ROWS_TRAIN)
dtesta_bin = read_dataset_binary(ftesta, row_limit=N_ROWS_TEST)

# Sanity check on the loaded training set.
# NOTE(review): the False flag presumably limits show() to a summary — confirm.
print(dtrain_bin.show(False))

Dataset with 10000 samples, and 128 dimensions.



In [34]:
# Binary k-NN classifier over the binary training set.
# NOTE(review): threshold=0.2 is presumably the share of positive neighbours
# needed to predict class 1 — confirm against info9 and say why 0.2 was chosen.
classifier_bin = info9.KnnClassificationBinary(
    k=10,
    dataset=dtrain_bin,
    threshold=0.2,
)

In [35]:
# Run the binary classifier over the whole binary test set; the result is
# used below as a confusion matrix (it exposes PrintEvaluation()).
confusion_matrix_bin = classifier_bin.estimate_all(dtesta_bin)

In [36]:
# Report the binary confusion matrix and the metrics PrintEvaluation() provides.
# Keep the header print first: PrintEvaluation() is opaque from here.
print("Confusion matrix after binary k-NN classification on I-PER:\n")
print(confusion_matrix_bin.PrintEvaluation())

Confusion matrix after binary k-NN classification on I-PER:

		Predicted
		0	1
Actual	0	4323	264
	1	66	347

Error rate		0.066
False alarm rate	0.057554
Detection rate		0.840194
F-score			0.677734
Precision		0.567921



## Multiclass K-NN classification

In [37]:
# Multiclass train and test sets (original labels kept), capped at
# N_ROWS_TRAIN / N_ROWS_TEST rows respectively.
dtrain = read_dataset(ftrain, row_limit=N_ROWS_TRAIN)
dtesta = read_dataset(ftesta, row_limit=N_ROWS_TEST)

# Sanity check on the loaded training set.
# NOTE(review): the False flag presumably limits show() to a summary — confirm.
print(dtrain.show(False))

Dataset with 10000 samples, and 128 dimensions.



In [38]:
# Multiclass k-NN classifier over the full-label training set; LABELS supplies
# the class names (they appear as the row/column headers of the matrix output).
classifier = info9.KnnClassificationMulticlass(
    k=10,
    dataset=dtrain,
    labels=LABELS,
)

In [39]:
# Run the multiclass classifier over the whole test set; the result is used
# below as a confusion matrix (PrintMatrix(), OneVsAllConfusionMatrix()).
cm = classifier.estimate_all(dtesta)

In [40]:
# Print the full class-by-class confusion matrix.
# NOTE(review): the header says "multilabel", but the classifier is
# KnnClassificationMulticlass — consider rewording the message to "multiclass"
# and re-running the cell so the stored output matches.
print("Full confusion matrix after multilabel k-nn classification:\n")
print(cm.PrintMatrix())

Full confusion matrix after multilabel k-nn classification:

	O	B-MISC	I-MISC	B-PER	I-PER	B-ORG	I-ORG	B-LOC	I-LOC	
O	2589	20	399	0	342	0	346	0	291	
B-MISC	0	0	0	0	0	0	0	0	0	
I-MISC	2	7	92	0	6	0	25	0	18	
B-PER	0	0	0	0	0	0	0	0	0	
I-PER	8	1	37	0	315	0	29	0	23	
B-ORG	0	0	0	0	0	0	0	0	0	
I-ORG	4	4	32	0	10	0	159	0	30	
B-LOC	0	0	0	0	0	0	0	0	0	
I-LOC	1	1	35	0	11	0	34	0	129	



In [41]:
# Collapse the multiclass confusion matrix to "I-PER vs. everything else" and
# print its evaluation, for comparison with the dedicated binary classifier.
# The class index is looked up by name in LABELS instead of hard-coding 4, so
# it stays tied to the label named in the header if LABELS is ever reordered.
print("One vs all for I-PER after multiclass K-NN:\n")
print(cm.OneVsAllConfusionMatrix(LABELS.index("I-PER")).PrintEvaluation())

One vs all for I-PER after multiclass K-NN:

		Predicted
		0	1
Actual	0	4218	369
	1	98	315

Error rate		0.0934
False alarm rate	0.0804447
Detection rate		0.762712
F-score			0.574294
Precision		0.460526

