In [37]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier 
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler
import pandas as pd

In [38]:
def _score_predictions(y_true, y_pred, multiclass):
    """Return ``[accuracy, f1]`` for one set of predictions.

    When ``multiclass`` is true the F1 score is computed with
    ``average='weighted'``; otherwise ``f1_score`` is called with its
    default averaging.
    """
    f1_kwargs = {"average": "weighted"} if multiclass else {}
    return [accuracy_score(y_true, y_pred), f1_score(y_true, y_pred, **f1_kwargs)]


def classifier_comparison(X, y, multiclass=False, red_perf=False, random_state=None):
    """Fit five classifiers on one train/test split and tabulate their scores.

    The data is split once with ``train_test_split`` (default proportions).
    Random forest, Gaussian naive Bayes, AdaBoost and gradient boosting are
    fit on the raw features; the SVM is fit on features standardised by a
    ``StandardScaler`` that was fitted on the training split only.

    Parameters
    ----------
    X
        Feature matrix accepted by ``train_test_split`` and the estimators.
    y
        Target labels aligned with ``X``.
    multiclass : bool, default False
        If true, report the weighted F1 score; otherwise use ``f1_score``'s
        default averaging.
    red_perf : bool, default False
        Currently unused. Kept so existing calls that pass it keep working.
    random_state : int or None, default None
        Seed forwarded to the split and to every estimator that accepts one,
        so a run can be reproduced. ``None`` keeps the unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        One row per classifier ("RF", "Naive Bayes", "AdaBoost",
        "GradientBoost", "SVM") with columns "Accuracy" and "F1 Score".
    """
    x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=random_state)

    # Standardise features for the SVM so larger/more spread variables do not
    # dominate. The scaler only ever sees the training split.
    scaler = StandardScaler().fit(x_train)
    x_train_std = scaler.transform(x_train)
    x_test_std = scaler.transform(x_test)

    # Each entry keeps the row label next to its estimator and the feature
    # matrices it should use, so labels cannot drift out of step with models.
    experiments = [
        ("RF", RandomForestClassifier(random_state=random_state), x_train, x_test),
        ("Naive Bayes", GaussianNB(), x_train, x_test),
        ("AdaBoost", AdaBoostClassifier(random_state=random_state), x_train, x_test),
        ("GradientBoost", GradientBoostingClassifier(random_state=random_state), x_train, x_test),
        ("SVM", SVC(random_state=random_state), x_train_std, x_test_std),
    ]

    labels, rows = [], []
    for label, model, train_features, test_features in experiments:
        model.fit(train_features, y_train)
        predictions = model.predict(test_features)
        rows.append(_score_predictions(y_test, predictions, multiclass))
        labels.append(label)

    return pd.DataFrame(rows, index=labels, columns=["Accuracy", "F1 Score"])

In [39]:
# Iris comes from sklearn.datasets; features and labels are read off the returned bunch.
iris = load_iris()
X_iris = iris["data"]
y_iris = iris["target"]

# Heart data: "condition" is the target column, every other column is a feature.
heart = pd.read_csv("../Datasets/heart_cleveland_upload.csv")
X_heart, y_heart = heart.drop(columns="condition"), heart["condition"]

# Star data: "class" is the target column, every other column is a feature.
# NOTE(review): all remaining columns are used as features - confirm the CSV
# holds no identifier-like columns that should be excluded.
stars = pd.read_csv("../Datasets/star_classification.csv")
X_stars, y_stars = stars.drop(columns="class"), stars["class"]

In [45]:
# Iris benchmark: multiclass=True selects the weighted F1 score.
print(classifier_comparison(X_iris, y_iris, multiclass=True))

               Accuracy  F1 Score
RF             0.973684  0.973645
Naive Bayes    0.973684  0.973645
AdaBoost       0.921053  0.920936
GradientBoost  0.973684  0.973645
SVM            0.947368  0.947368


In [46]:
# Star-classification benchmark with the weighted F1 score.
# red_perf=True is passed here, but classifier_comparison never reads that argument.
print(classifier_comparison(X_stars, y_stars, multiclass=True, red_perf=True))

               Accuracy  F1 Score
RF              0.97988  0.979736
Naive Bayes     0.60228  0.480350
AdaBoost        0.78924  0.731177
GradientBoost   0.97756  0.977346
SVM             0.96364  0.963459


In [47]:
# Heart-disease benchmark: multiclass=False keeps f1_score's default averaging.
print(classifier_comparison(X_heart, y_heart, multiclass=False))

               Accuracy  F1 Score
RF             0.840000  0.818182
Naive Bayes    0.853333  0.845070
AdaBoost       0.840000  0.823529
GradientBoost  0.800000  0.776119
SVM            0.826667  0.811594
