In [1]:
import operator
import os
import pathlib
import re
import time

import markdownTable
import matplotlib.pyplot
import numpy
import sklearn.ensemble
import sklearn.metrics
import sklearn.model_selection
import sklearn.neighbors
import sklearn.neural_network
import sklearn.preprocessing
import sklearn.svm
import sklearn.tree

from result import calculate_test, Result

In [2]:
# Experiment configuration read by the cells below.
cfg = dict(
    fold=5,  # number of shuffle splits and of grid-search CV folds
    n_labels=5,
    path_base="dataset",  # dataset directory, resolved relative to "../"
    path_out="out",  # root directory for the generated reports
    test_size=0.2,
    train_size=0.8,
)

In [3]:
# Grid-search space per estimator, keyed by the estimator's class name
# (the evaluation loop looks entries up via ``classifier.__class__.__name__``).
hyperparams = dict()

hyperparams["DecisionTreeClassifier"] = dict(
    criterion=["gini", "entropy"],
    splitter=["best", "random"],
    max_depth=[10, 100, 1000],
)

hyperparams["KNeighborsClassifier"] = dict(
    n_neighbors=[2, 4, 6, 8, 10],
    weights=["uniform", "distance"],
    metric=["euclidean", "manhattan"],
)

# NOTE(review): per the scikit-learn docs ``momentum`` only applies to
# solver="sgd", so the adam combinations are presumably evaluated several
# times with identical settings - confirm before trimming the grid.
hyperparams["MLPClassifier"] = dict(
    activation=["identity", "logistic", "tanh", "relu"],
    solver=["adam", "sgd"],
    learning_rate_init=[0.01, 0.001, 0.0001],
    momentum=[0.9, 0.4, 0.1],
)

hyperparams["RandomForestClassifier"] = dict(
    n_estimators=[200, 400, 600, 800, 1000],
    max_features=["sqrt", "log2"],
    criterion=["gini", "entropy"],
    max_depth=[10, 100, 1000],
)

hyperparams["SVC"] = dict(
    kernel=["linear", "poly", "rbf", "sigmoid"],
)
# Splitter that yields the train/test index pairs for the evaluation folds;
# the fixed random_state keeps the splits identical between runs.
index = sklearn.model_selection.StratifiedShuffleSplit(
    n_splits=cfg["fold"],
    train_size=cfg["train_size"],
    test_size=cfg["test_size"],
    random_state=1,
)

In [4]:
# Read the feature table: one row per sample, whitespace-separated numbers.
surf = numpy.loadtxt(os.path.join("../", cfg["path_base"], "surf64.txt"))
samples, n_features = surf.shape

# The last column is used as the target, every column before it as a feature.
# NOTE(review): the scaler is fit on all rows here, i.e. before any
# train/test split is made further down.
y = surf[:, -1]
x = sklearn.preprocessing.StandardScaler().fit_transform(surf[:, :-1])

print(x.shape, y.shape)

(375, 257) (375,)


In [14]:
def save_mean_std(cfg, list_result_fold, path):
    """Write the per-rule accuracy summary of one classifier to ``mean.md``.

    For each rule ("max", "prod", "sum", in that order) three markdown tables
    are written: the configuration used, the mean and standard deviation of
    the accuracy over that rule's results, and the best result of that rule.
    A final table names the rule with the highest mean accuracy.

    Args:
        cfg: mapping providing the keys "fold", "n_labels", "path_base",
            "path_out", "test_size" and "train_size".
        list_result_fold: iterable of result objects exposing the attributes
            ``rule``, ``fold`` and ``accuracy``.
        path: existing directory in which ``mean.md`` is (re)created.

    Raises:
        ValueError: if ``list_result_fold`` holds no result for one of the
            rules.
        Exception: every error is printed and then re-raised unchanged.
    """
    cfg_used = {
        key: str(cfg[key])
        for key in ("fold", "n_labels", "path_base", "path_out", "test_size", "train_size")
    }

    def write_table(file, row):
        # The substitution moves a trailing ``` onto its own line and appends
        # a blank line so consecutive tables stay separated in the file.
        table = markdownTable.markdownTable([row]).getMarkdown()
        file.write(re.sub(r"```$", "\n```\n\n", table))

    try:
        # The context manager closes the file; no explicit close() is needed.
        with open(os.path.join(path, "mean.md"), "w") as file:
            mean_accuracy_by_rule = {}
            for rule in ("max", "prod", "sum"):
                results_of_rule = [r for r in list_result_fold if r.rule == rule]
                if not results_of_rule:
                    raise ValueError(f"no results available for rule {rule!r}")

                write_table(file, cfg_used)

                accuracies = [r.accuracy for r in results_of_rule]
                mean_accuracy = numpy.mean(accuracies)
                std_deviation = numpy.std(accuracies)
                # The best result is searched among this rule's results only,
                # so the table matches the section it is written in.
                best_fold = max(results_of_rule, key=operator.attrgetter("accuracy"))

                write_table(file, {
                    "mean_accuracy": str(mean_accuracy),
                    "mean_accuracy_per": str(round(mean_accuracy * 100, 4)),
                    "std_deviation": str(std_deviation),
                })
                write_table(file, {
                    "best_fold": str(best_fold.fold),
                    "best_accuracy": str(best_fold.accuracy),
                    "best_accuracy_per": str(round(best_fold.accuracy * 100, 4)),
                })

                mean_accuracy_by_rule[rule] = mean_accuracy

            # On ties max() keeps the first rule in iteration order.
            best_rule = max(mean_accuracy_by_rule, key=mean_accuracy_by_rule.get)
            write_table(file, {
                "best_accuracy": str(round(mean_accuracy_by_rule[best_rule] * 100, 4)),
                "best_rule": str(best_rule),
            })
    except Exception as e:
        print(f"exception in {e}")
        raise

def save_confusion_matrix(classifier_name, dataset, list_result, path):
    """Save one confusion-matrix image per result into ``path``.

    Each file is named ``confusion_matrix-<rule>.png``. Every figure is
    created explicitly at 10x10 inches with a white background, so all images
    look the same and the global ``rcParams`` are left untouched.

    Args:
        classifier_name: classifier name shown in the figure title.
        dataset: dataset name shown in the figure title.
        list_result: iterable of result objects exposing the attributes
            ``rule``, ``accuracy`` and ``confusion_matrix``.
        path: existing directory that receives the PNG files.
    """
    # Raw strings: "\i" is not a valid escape sequence in a normal literal.
    labels = [
        r"$\it{Manekia}$",
        r"$\it{Ottonia}$",
        r"$\it{Peperomia}$",
        r"$\it{Piper}$",
        r"$\it{Pothomorphe}$",
    ]
    ticks = numpy.arange(len(labels))

    matplotlib.pyplot.ioff()
    for result in list_result:
        filename = f"confusion_matrix-{result.rule}.png"
        title = f"Confusion Matrix\ndataset: {dataset}, classifier: {classifier_name}\naccuracy: {round(result.accuracy * 100, 4)}, rule: {result.rule}"

        fig, ax = matplotlib.pyplot.subplots(figsize=(10, 10), facecolor="white")
        try:
            display = sklearn.metrics.ConfusionMatrixDisplay(result.confusion_matrix)
            display.plot(cmap="Reds", ax=ax)

            ax.set_title(title, pad=20)
            ax.set_xticks(ticks)
            ax.set_xticklabels(labels, rotation=45)
            ax.set_yticks(ticks)
            ax.set_yticklabels(labels)
            ax.set_ylabel("y_test", fontsize=12)
            ax.set_xlabel("y_pred", fontsize=12)
            fig.subplots_adjust(bottom=0.15, left=0.25)

            fig.savefig(os.path.join(path, filename))
        finally:
            # Release the figure even when plotting or saving fails.
            matplotlib.pyplot.close(fig)

def save_fold(classifier_name, dataset, list_result, path, time):
    """Write the results of one fold to ``out.md`` and save their figures.

    One markdown table is written per result (fold, rule, accuracy, accuracy
    in percent, elapsed time); afterwards the confusion matrices of the same
    results are saved next to the file.

    Args:
        classifier_name: classifier name forwarded to the figure titles.
        dataset: dataset name forwarded to the figure titles.
        list_result: iterable of result objects exposing the attributes
            ``fold``, ``rule`` and ``accuracy``.
        path: existing directory that receives ``out.md`` and the images.
        time: elapsed time of the fold in seconds; written as ``HH:MM:SS``.

    Raises:
        Exception: every error is printed and then re-raised unchanged.
    """
    # The ``time`` parameter shadows the ``time`` module in this scope, so the
    # formatting helpers are imported under their own names.
    from time import gmtime, strftime

    try:
        elapsed = strftime("%H:%M:%S", gmtime(time))
        # The context manager closes the file; no explicit close() is needed.
        with open(os.path.join(path, "out.md"), "w") as file:
            for result in list_result:
                row = {
                    "fold": str(result.fold),
                    "rule": str(result.rule),
                    "accuracy": str(result.accuracy),
                    "accuracy_per": str(round(result.accuracy * 100, 4)),
                    "time": elapsed,
                }
                table = markdownTable.markdownTable([row]).getMarkdown()
                file.write(re.sub(r"```$", "\n```\n\n", table))
        # Figures are rendered after the report is closed so the file handle
        # is not held open while plotting.
        save_confusion_matrix(classifier_name, dataset, list_result, path)
    except Exception as e:
        print(f"exception in {e}")
        raise

# Evaluate every classifier: tune it with a grid search, then refit the best
# estimator on each shuffle split and write per-fold and per-classifier
# reports below cfg["path_out"]/surf/<classifier name>/.
list_result_classifier = list()
for classifier in (sklearn.tree.DecisionTreeClassifier(random_state=1), sklearn.neighbors.KNeighborsClassifier(n_jobs=-1), sklearn.neural_network.MLPClassifier(random_state=1), sklearn.ensemble.RandomForestClassifier(random_state=1), sklearn.svm.SVC(random_state=1, probability=True)):
    # The class name selects the hyper-parameter grid and names the output directory.
    classifier_name = classifier.__class__.__name__

    # NOTE(review): the grid search is fit on the complete x, y, which also
    # contains the rows that serve as test data in the splits below.
    model = sklearn.model_selection.GridSearchCV(classifier, hyperparams[classifier_name], scoring="accuracy", cv=cfg["fold"])
    model.fit(x, y)

    best_classifier = model.best_estimator_
    # Stored for inspection; not read again inside this cell.
    best_params = model.best_params_

    list_result_fold = list()
    # Collected per fold; not read again inside this cell.
    list_time = list()

    path_classifier = os.path.join(cfg["path_out"], "surf", classifier_name)
    pathlib.Path(path_classifier).mkdir(parents=True, exist_ok=True)
    for fold, (index_train, index_test) in enumerate(index.split(x, y)):
        x_train, y_train = x[index_train], y[index_train]
        x_test, y_test = x[index_test], y[index_test]

        print(fold, classifier_name, x_train.shape, x_test.shape)

        # The measured time covers refitting plus predicting the test rows.
        start_time = time.time()
        best_classifier.fit(x_train, y_train)
        # Despite its name, y_pred holds the predict_proba output, not labels.
        y_pred = best_classifier.predict_proba(x_test)
        end_time = time.time()

        path_fold = os.path.join(path_classifier, str(fold))
        pathlib.Path(path_fold).mkdir(parents=True, exist_ok=True)

        # NOTE(review): `classifier` (the instance handed to GridSearchCV) is
        # passed here rather than `best_classifier` - confirm what
        # calculate_test uses it for.
        result_max_rule, result_prod_rule, result_sum_rule = calculate_test(cfg, classifier, fold, y_pred, y_test)

        final_time = end_time - start_time

        list_result_fold.append(result_max_rule)
        list_result_fold.append(result_prod_rule)
        list_result_fold.append(result_sum_rule)
        list_time.append(final_time)

        save_fold(classifier_name, "surf", (result_max_rule, result_prod_rule, result_sum_rule), path_fold, final_time)
    # Summarise all folds of this classifier, then add them to the overall list.
    save_mean_std(cfg, list_result_fold, path_classifier)
    list_result_classifier = list_result_classifier + list_result_fold

0 DecisionTreeClassifier (300, 257) (75, 257)
1 DecisionTreeClassifier (300, 257) (75, 257)
2 DecisionTreeClassifier (300, 257) (75, 257)
3 DecisionTreeClassifier (300, 257) (75, 257)
4 DecisionTreeClassifier (300, 257) (75, 257)
