In [1]:
import time

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.datasets import make_blobs
from sklearn.datasets import make_circles
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsOneClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from yellowbrick.classifier import ClassPredictionError
from yellowbrick.classifier import ClassificationReport
from yellowbrick.classifier import ROCAUC

In [2]:
# Classifiers under comparison. The mapping is keyed by the estimator instance
# itself and points to the short label used in plot titles and in the ranking.
# SVC is wrapped explicitly so both multiclass strategies can be compared.
models = dict([
    (OneVsRestClassifier(SVC()), "OneVsRest"),
    (OneVsOneClassifier(SVC()), "OneVsOne"),
    (KNeighborsClassifier(n_neighbors=5), "KNeighborsClassifier"),
    (DecisionTreeClassifier(), "CART"),
    (RandomForestClassifier(), "RandomForestClassifier"),
    (GaussianNB(), "GaussianNB"),
])

In [3]:
def models_quality(X_tr, y_tr, X_t, y_t, estimators=None):
  """Fit each classifier, plot its diagnostics and collect test accuracies.

  For every estimator the function fits it on the training split, predicts
  the test split and shows, in order: a scatter plot of the predictions with
  misclassified points marked by red crosses, then the yellowbrick
  ClassPredictionError, ClassificationReport and ROCAUC visualizers.

  Args:
    X_tr: training features, passed to ``fit``.
    y_tr: training labels.
    X_t: test features; columns 0 and 1 are used as plot coordinates, so it
      must support ``X_t[:, 0]`` style indexing.
    y_t: test labels.
    estimators: optional mapping ``{estimator: display_name}``. When omitted,
      the module-level ``models`` dict is used.

  Returns:
    dict mapping each display name to its test accuracy rounded to 2 decimals.
  """
  if estimators is None:
    estimators = models

  # Reference plot: the test sample coloured by its true labels. Shown right
  # away so it does not depend on the first plt.show() inside the loop.
  fig, ax = plt.subplots(figsize=(10, 5))
  sns.scatterplot(x=X_t[:,0], y=X_t[:,1], hue=y_t, ax=ax)
  ax.grid(ls="--")
  ax.set_title("Тестовая выборка")
  plt.show()
  plt.close(fig)

  top = {}  # display name -> rounded test accuracy
  for model, name in estimators.items():
    # Time only fit + predict; figure and visualizer set-up is kept out of
    # the measured window.
    start = time.perf_counter()
    model.fit(X_tr, y_tr)
    y_pred = model.predict(X_t)
    elapsed = time.perf_counter() - start

    accuracy = round(accuracy_score(y_t, y_pred), 2)
    top[name] = accuracy

    # Scatter plot of the predictions; misclassified points are overlaid
    # with red crosses.
    errors = y_pred != y_t
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.scatterplot(x=X_t[:,0], y=X_t[:,1], hue=y_pred, ax=ax)
    # `color` (not matplotlib's `c`) is the keyword seaborn documents; passing
    # `c` can clash with the `color` seaborn manages internally.
    sns.scatterplot(x=X_t[errors,0], y=X_t[errors,1], color='red', marker="X", ax=ax)
    ax.set_title(f"Результат классификации {name}\nВремя:{round(elapsed, 1)}секунд")
    ax.grid(linestyle="--")
    plt.show()
    plt.close(fig)

    # Per-class bar chart of prediction errors.
    visualizer = ClassPredictionError(model)
    visualizer.fit(X_tr, y_tr)
    visualizer.score(X_t , y_t)
    visualizer.title = f"Accuracy : {accuracy}"
    visualizer.show()

    # Table of the per-class classification metrics.
    visualizer_report = ClassificationReport(model)
    visualizer_report.fit(X_tr, y_tr)
    visualizer_report.score(X_t , y_t)
    visualizer_report.show()

    # ROC curves, with the micro and macro averages switched off.
    visualizer_roc = ROCAUC(model, micro=False , macro=False)
    visualizer_roc.fit(X_tr , y_tr)
    visualizer_roc.score(X_t , y_t)
    visualizer_roc.show()

    print("<--------------------------------------------------------------------------------------------------------------------->")
  return top

Посмотрим на эффективность алгоритмов на линейно разделимой выборке (четыре кластера, сгенерированных `make_blobs`).

In [None]:
# Synthetic 2-D data set: 10000 points drawn around four blob centres.
X, y = make_blobs(n_samples=10000, cluster_std=2.5, n_features=2, centers=4, random_state=42)
# Stratified 70/30 train/test split with a fixed seed for reproducibility.
X_tr, X_t, y_tr, y_t = train_test_split(X, y, test_size=0.3,
                                        random_state=42, stratify=y)
top = models_quality(X_tr, y_tr, X_t, y_t)
# Rank the models from highest to lowest test accuracy.
top = sorted(top.items(), key=lambda item: item[1], reverse=True)
# enumerate yields the rank directly instead of searching the list each time.
for rank, (name, accuracy) in enumerate(top, start=1):
  print(rank, ".", name, " : ", accuracy, sep="")

Output hidden; open in https://colab.research.google.com to view.

Теперь посмотрим на выборку, которую сложно разделить линейно (две концентрические окружности, сгенерированные `make_circles`).

In [None]:
# Synthetic 2-D data set: 10000 noisy points generated by make_circles
# (imported with the other dependencies in the notebook's import cell).
X, y = make_circles(n_samples=10000, noise=0.1, random_state=42)
# Stratified 70/30 train/test split with a fixed seed for reproducibility.
X_tr, X_t, y_tr, y_t = train_test_split(X, y, test_size=0.3,
                                        random_state=42, stratify=y)
top = models_quality(X_tr, y_tr, X_t, y_t)
# Rank the models from highest to lowest test accuracy.
top = sorted(top.items(), key=lambda item: item[1], reverse=True)
# enumerate yields the rank directly instead of searching the list each time.
for rank, (name, accuracy) in enumerate(top, start=1):
  print(rank, ".", name, " : ", accuracy, sep="")


Output hidden; open in https://colab.research.google.com to view.