In [1]:
import warnings
#warnings.filterwarnings('ignore', category=UndefinedMetricWarning, module='UndefinedMetricWarning')

from sklearn import datasets
from sklearn.metrics import classification_report

from sklearn import linear_model
from sklearn import svm
from sklearn import tree
from sklearn import tree  
from sklearn import naive_bayes
from sklearn import ensemble
import numpy as np
import time
import pandas as pd
from sklearn import dummy, metrics
from sklearn import model_selection as ms
from statistics import mean 

## Функция для загрузки датасетов

In [2]:
def load_dataset(loader):
    dataset = loader()
    data_frame = pd.DataFrame(dataset.data)
    target_frame = pd.DataFrame(dataset.target)
    
    return dataset.data, dataset.target

dataset_loaders = {
    'iris': datasets.load_iris,
    'digits': datasets.load_digits,
    'breast_canсer': datasets.load_breast_cancer,
    'wine': datasets.load_wine
}

## Генераторы классификаторов

In [7]:
def get_logistic_regression_model():
    return linear_model.LogisticRegression(random_state = 1, max_iter = 5000)

def get_svm_model():  
    hyper_params_svc = {'kernel': ['linear','poly','rbf'],
                        'gamma': [1e-3, 1e-4, 'scale'],
                        'C': [1, 10, 100]
                       }
    f1_scorer = metrics.make_scorer(metrics.f1_score, average='weighted')
    
    classifier_svc = svm.SVC()
    
    svm_grid = ms.GridSearchCV(classifier_svc, 
                               hyper_params_svc,  
                               refit=True,        
                               scoring=f1_scorer,  
                               cv=ms.StratifiedKFold(n_splits=8)
    )
    return svm_grid

def get_decision_tree_classifier_model():
    return tree.DecisionTreeClassifier(random_state=1, criterion='entropy')

def get_naive_bayes_model():
    return naive_bayes.MultinomialNB()

def get_random_forest_classifier_model():
    return ensemble.RandomForestClassifier(random_state=1, n_estimators=150, criterion='entropy', max_leaf_nodes=200)

def get_add_boost_classifier_model():
    return ensemble.AdaBoostClassifier(random_state=1, base_estimator=ensemble.RandomForestClassifier(random_state=1, n_estimators=150, criterion='entropy'))

def get_voting_classifier_model():
    return ensemble.VotingClassifier(estimators=[('1', get_decision_tree_classifier_model()), ('2', get_add_boost_classifier_model()), ('3', get_logistic_regression_model())])

def get_dummy_classifier_model():
    return dummy.DummyClassifier(strategy='most_frequent');

model_producers = {
    'Baseline': get_dummy_classifier_model,
    'LogisticRegression': get_logistic_regression_model,        
    'SVM': get_svm_model,
    'DecisionTreeClassifier': get_decision_tree_classifier_model,
    'NaiveBayes': get_naive_bayes_model,
    'RandomForestClassifier': get_random_forest_classifier_model,
    'AdaBoostClassifier': get_add_boost_classifier_model,
    'VotingClassifier': get_voting_classifier_model
}

## Фукнции для тестирования классификаторов

In [4]:
def test_model_and_print_results(producer, data, labels, folds = 3):
    model = producer()
    percentage_split_test(model, data, labels, 25)
    cross_validation_test(model, data, labels, folds)
    cross_validation_scoring(model, data, labels, folds)

def percentage_split_test(model, data, labels, percentage):
    train_data, test_data, train_labels, test_labels = ms.train_test_split(data, labels, test_size=percentage/100)
    
    model.fit(train_data, train_labels)
    predictions = model.predict(test_data)

    print_result(predictions, test_labels, 'Method: Percentage split with {}%'.format(percentage))

    
def cross_validation_test(model, data, labels, folds):
    # Фактически это всё собрано из предсказаний для тестовых значений разных фолдов
    predictions = ms.cross_val_predict(model, data, labels, cv=folds)
    
    print_result(predictions, labels, 'Method: Cross validation with {} folds'.format(folds))
    
    
def cross_validation_scoring(model, data, labels, folds):
    f1_scorer = metrics.make_scorer(metrics.f1_score, average='weighted')
    scoring = {'f1': f1_scorer}
    model_scores = ms.cross_validate (model,
                                      data, 
                                      labels,
                                      scoring=scoring,
                                      cv=folds,
                                      return_train_score=True)
    print('\tTable of each fit and test of the classifier')
    for (k,v) in model_scores.items():
        print(('\t{:10}: {}').format(k,v)) 
        
    print('\n\tAverage F1 for test: {:0.3f}'.format(mean(model_scores['test_f1'])))
    
    
def print_result(predictions, true_labels, header):
    def drop_useless_keys(dictionary):
        dictionary.pop('accuracy')
        dictionary.pop('macro avg')
        dictionary.pop('weighted avg')
        
    f1_total = 0
    
    metrics_dict = classification_report(true_labels, predictions, zero_division=1, output_dict=True, labels=np.unique(true_labels))
    drop_useless_keys(metrics_dict)

    print(' ' * 4 + header)
    print('\tClass: F1-Score')
    for label, metrics in metrics_dict.items():
        print('\t{:5}: {}'.format(label, metrics['f1-score']))
        f1_total += float(metrics['f1-score'])
        
    print('\n\tAverage F1: {}\n'.format(f1_total / len(metrics_dict)))
      

## MAIN

In [5]:
folds_for_models = {
    'iris': 3,
    'digits': 7,
    'breast_canсer': 7,
    'wine': 10
}
np.set_printoptions(formatter={'float': '{: 0.3f}'.format})

# Везде считаю только F меру, так как и так много текста получается

### IRIS

In [6]:
set_name = 'iris'
loader = dataset_loaders[set_name]
data, labels = load_dataset(loader)

for model_name, producer in model_producers.items():
    print('Testing {}\n'.format(model_name))
    test_model_and_print_results(producer, data, labels, folds_for_models[set_name])
    print('\n')

Testing Baseline

    Method: Percentage split with 25%
	Class: F1-Score
	0    : 0.0
	1    : 0.4489795918367347
	2    : 0.0

	Average F1: 0.14965986394557823

    Method: Cross validation with 3 folds
	Class: F1-Score
	0    : 0.32
	1    : 0.32
	2    : 0.32

	Average F1: 0.32

	Table of each fit and test of the classifier
	fit_time  : [ 0.000  0.000  0.000]
	score_time: [ 0.001  0.001  0.001]
	test_f1   : [ 0.155  0.155  0.155]
	train_f1  : [ 0.173  0.173  0.173]

	Average F1 for test: 0.155


Testing LogisticRegression

    Method: Percentage split with 25%
	Class: F1-Score
	0    : 1.0
	1    : 0.9375
	2    : 0.923076923076923

	Average F1: 0.953525641025641

    Method: Cross validation with 3 folds
	Class: F1-Score
	0    : 1.0
	1    : 0.9591836734693877
	2    : 0.9607843137254902

	Average F1: 0.9733226623982927

	Table of each fit and test of the classifier
	fit_time  : [ 0.032  0.014  0.015]
	score_time: [ 0.001  0.001  0.001]
	test_f1   : [ 0.980  0.960  0.980]
	train_f1  : [ 0.960

_Winner: LogisticRegression (25% 0.953, Cross 0.973)_

### DIGITS

In [7]:
set_name = 'digits'
loader = dataset_loaders[set_name]
data, labels = load_dataset(loader)

for model_name, producer in model_producers.items():
    print('Testing {}\n'.format(model_name))
    test_model_and_print_results(producer, data, labels, folds_for_models[set_name])
    print('\n')

Testing Baseline

    Method: Percentage split with 25%
	Class: F1-Score
	0    : 0.0
	1    : 0.0
	2    : 0.0
	3    : 0.0
	4    : 0.14814814814814814
	5    : 0.0
	6    : 0.0
	7    : 0.0
	8    : 0.0
	9    : 0.0

	Average F1: 0.014814814814814814

    Method: Cross validation with 7 folds
	Class: F1-Score
	0    : 0.0
	1    : 0.1187214611872146
	2    : 0.0
	3    : 0.18097447795823665
	4    : 0.0
	5    : 0.0
	6    : 0.0
	7    : 0.0
	8    : 0.0
	9    : 0.0

	Average F1: 0.029969593914545123

	Table of each fit and test of the classifier
	fit_time  : [ 0.001  0.000  0.001  0.000  0.001  0.000  0.001]
	score_time: [ 0.000  0.001  0.000  0.001  0.000  0.001  0.000]
	test_f1   : [ 0.019  0.019  0.019  0.019  0.019  0.019  0.019]
	train_f1  : [ 0.019  0.019  0.019  0.019  0.019  0.019  0.019]

	Average F1 for test: 0.019


Testing LogisticRegression

    Method: Percentage split with 25%
	Class: F1-Score
	0    : 1.0
	1    : 0.9863013698630138
	2    : 0.988235294117647
	3    : 0.9514563106796117
	

_Winner: SVM (25% 0.991, Cross 0.975)_

### BREAST_CANCER

In [8]:
set_name = 'breast_canсer'
loader = dataset_loaders[set_name]
data, labels = load_dataset(loader)

for model_name, producer in model_producers.items():
    print('Testing {}\n'.format(model_name))
    test_model_and_print_results(producer, data, labels, folds_for_models[set_name])
    print('\n')

Testing Baseline

    Method: Percentage split with 25%
	Class: F1-Score
	0    : 0.0
	1    : 0.7672413793103448

	Average F1: 0.3836206896551724

    Method: Cross validation with 7 folds
	Class: F1-Score
	0    : 0.0
	1    : 0.7710583153347732

	Average F1: 0.3855291576673866

	Table of each fit and test of the classifier
	fit_time  : [ 0.001  0.000  0.000  0.000  0.000  0.000  0.000]
	score_time: [ 0.000  0.001  0.001  0.001  0.001  0.001  0.000]
	test_f1   : [ 0.477  0.477  0.487  0.487  0.487  0.487  0.487]
	train_f1  : [ 0.485  0.485  0.483  0.483  0.483  0.483  0.483]

	Average F1 for test: 0.484


Testing LogisticRegression

    Method: Percentage split with 25%
	Class: F1-Score
	0    : 0.930232558139535
	1    : 0.9426751592356688

	Average F1: 0.9364538586876019

    Method: Cross validation with 7 folds
	Class: F1-Score
	0    : 0.9377990430622011
	1    : 0.9638888888888889

	Average F1: 0.950843965975545

	Table of each fit and test of the classifier
	fit_time  : [ 0.564  0.112

_Winner: RandomForestClassifier(25% 0.962, Cross 0.970)_

### WINE

In [6]:
set_name = 'wine'
loader = dataset_loaders[set_name]
data, labels = load_dataset(loader)

for model_name, producer in model_producers.items():
    print('Testing {}\n'.format(model_name))
    test_model_and_print_results(producer, data, labels, folds_for_models[set_name])
    print('\n')

Testing Baseline

    Method: Percentage split with 25%
	Class: F1-Score
	0    : 0.0
	1    : 0.6153846153846153
	2    : 0.0

	Average F1: 0.2051282051282051

    Method: Cross validation with 10 folds
	Class: F1-Score
	0    : 0.0
	1    : 0.570281124497992
	2    : 0.0

	Average F1: 0.19009370816599733

	Table of each fit and test of the classifier
	fit_time  : [ 0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000]
	score_time: [ 0.001  0.000  0.001  0.000  0.001  0.001  0.001  0.000  0.001  0.001]
	test_f1   : [ 0.218  0.218  0.218  0.218  0.218  0.218  0.218  0.218  0.240  0.301]
	train_f1  : [ 0.229  0.229  0.229  0.229  0.229  0.229  0.229  0.229  0.226  0.220]

	Average F1 for test: 0.228


Testing LogisticRegression

    Method: Percentage split with 25%
	Class: F1-Score
	0    : 0.9333333333333333
	1    : 0.9583333333333334
	2    : 1.0

	Average F1: 0.9638888888888889

    Method: Cross validation with 10 folds
	Class: F1-Score
	0    : 0.9747899159663865
	1    : 0.

_Winner: AdaBoostClassifier (25% 1.0, Cross 0.978)_