In [1]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier 
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler
import pandas as pd

In [7]:
def _score_predictions(y_true, y_pred, multiclass):
    """Return ``[accuracy, precision, recall]`` for one set of predictions.

    When ``multiclass`` is true, precision and recall are computed with
    ``average='weighted'``; otherwise scikit-learn's default averaging for
    those metrics is used.
    """
    average_kwargs = {"average": "weighted"} if multiclass else {}
    return [
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred, **average_kwargs),
        recall_score(y_true, y_pred, **average_kwargs),
    ]


def classifier_comparison(X, y, multiclass=False, red_perf=False, random_state=None):
    """Fit five classifiers on one train/test split and tabulate their scores.

    The data is split once with ``train_test_split``; a random forest,
    Gaussian naive Bayes, AdaBoost, gradient boosting and an SVM are each
    fitted on the training part and scored on the held-out part.

    Parameters
    ----------
    X : array-like or DataFrame
        Feature matrix, passed straight to ``train_test_split``.
    y : array-like or Series
        Target labels, passed straight to ``train_test_split``.
    multiclass : bool, default False
        If True, precision and recall use ``average='weighted'``. If False,
        the metrics are called with their default averaging.
    red_perf : bool, default False
        Accepted so existing calls keep working; it is not read anywhere in
        this function.
        NOTE(review): the intended meaning is not visible here - confirm
        with the author or remove once callers are updated.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``train_test_split`` and to the three ensemble
        classifiers so a run can be repeated. ``None`` keeps the previous
        unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        One row per model ("RF", "Naive Bayes", "AdaBoost", "GradientBoost",
        "SVM") and the columns "Accuracy", "Precision", "Recall".
    """
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, random_state=random_state
    )

    # Scale data for use in the SVM so larger/more spread variables do not
    # dominate. The scaler is fitted on the training part only, so no
    # information from the test part leaks into it.
    scaler = StandardScaler().fit(x_train)
    x_train_std = scaler.transform(x_train)
    x_test_std = scaler.transform(x_test)

    # (row label, estimator, training features, test features).
    # Only the SVM is given the standardised features; the fit order matches
    # the row order of the returned table.
    candidates = [
        ("RF", RandomForestClassifier(random_state=random_state), x_train, x_test),
        ("Naive Bayes", GaussianNB(), x_train, x_test),
        ("AdaBoost", AdaBoostClassifier(random_state=random_state), x_train, x_test),
        (
            "GradientBoost",
            GradientBoostingClassifier(random_state=random_state),
            x_train,
            x_test,
        ),
        ("SVM", SVC(), x_train_std, x_test_std),
    ]

    labels = []
    results = []
    for label, classifier, train_features, test_features in candidates:
        classifier.fit(train_features, y_train)
        predictions = classifier.predict(test_features)
        labels.append(label)
        results.append(_score_predictions(y_test, predictions, multiclass))

    return pd.DataFrame(
        results, index=labels, columns=["Accuracy", "Precision", "Recall"]
    )

In [8]:
# Iris: features and target come from the bundled scikit-learn loader.
iris = load_iris()
X_iris = iris.data
y_iris = iris.target

# Heart data: the "condition" column is used as the target, the rest as features.
heart = pd.read_csv("../Datasets/heart_cleveland_upload.csv")
X_heart = heart.drop(columns="condition")
y_heart = heart["condition"]

# Star data: the "class" column is used as the target, the rest as features.
stars = pd.read_csv("../Datasets/star_classification.csv")
X_stars = stars.drop(columns="class")
y_stars = stars["class"]

In [52]:
# multiclass=True selects weighted-average precision and recall.
print(classifier_comparison(X_iris, y_iris, multiclass=True))

               Accuracy  Precision    Recall
RF             0.947368   0.947368  0.947368
Naive Bayes    0.921053   0.921952  0.921053
AdaBoost       0.868421   0.868758  0.868421
GradientBoost  0.894737   0.898661  0.894737
SVM            0.973684   0.975439  0.973684


In [53]:
# multiclass=True selects weighted-average precision and recall.
# red_perf=True is passed through, but classifier_comparison does not read it.
print(classifier_comparison(X_stars, y_stars, multiclass=True, red_perf=True))

  _warn_prf(average, modifier, msg_start, len(result))


               Accuracy  Precision   Recall
RF              0.97916   0.979046  0.97916
Naive Bayes     0.59648   0.442431  0.59648
AdaBoost        0.63756   0.645962  0.63756
GradientBoost   0.97844   0.978347  0.97844
SVM             0.95760   0.957872  0.95760


In [11]:
# multiclass stays at its default of False, so precision and recall use the
# metrics' default averaging.
print(classifier_comparison(X_heart, y_heart, multiclass=False))

               Accuracy  Precision    Recall
RF             0.786667   0.937500  0.681818
Naive Bayes    0.813333   0.875000  0.795455
AdaBoost       0.786667   0.937500  0.681818
GradientBoost  0.773333   0.909091  0.681818
SVM            0.773333   0.885714  0.704545
