# Comparison of various classification models

This notebook uses the [Credit Card Fraud Detection dataset](https://www.kaggle.com/dalpozz/creditcardfraud) from [Kaggle](https://www.kaggle.com/).

In [None]:
from time import perf_counter, time

from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from utility import *

## Load and Split Data

In [None]:
# Make sure the CSV is available locally, then load it into `dataset` and
# `target`. Elapsed time is measured with perf_counter(), which is monotonic
# and high-resolution, so the figure is not disturbed by system clock changes.
start = perf_counter()
data_file = "data/creditcard.csv"
# Note: it is around 180MB
# NOTE(review): check_and_download comes from the project's `utility` module;
# presumably it only downloads when `data_file` is missing — confirm there.
check_and_download(
    data_file,
    "https://github.com/nsethi31/Kaggle-Data-Credit-Card-Fraud-Detection"
    "/raw/master/creditcard.csv",
)
dataset, target = load_dataset(data_file)
print(f"Loaded data in {perf_counter() - start} seconds")

## Prepare data for models

In [None]:
# Hold out 20% of the samples for testing. The split is seeded so that it is
# repeatable, and stratified on `target` so that both partitions keep the same
# class proportions — fraud data is typically heavily imbalanced, and an
# unstratified split can leave the test set with too few positive samples.
# NOTE(review): assumes `target` holds the class labels — confirm against
# load_dataset in the `utility` module.
start = perf_counter()
x_train, x_test, y_train, y_test = train_test_split(
    dataset, target, test_size=0.2, random_state=42, stratify=target
)
print(f"Training set size {len(x_train)}, Testing set size: {len(x_test)}")
print(f"Prepared data for models in {perf_counter() - start} seconds")

## Run models over data

In [None]:
# Train and evaluate each classifier on the same train/test split, recording
# one score entry and one label per model for the comparison plot.

# Seed passed to every estimator that accepts `random_state`, so that repeated
# runs of the notebook produce the same scores.
RANDOM_STATE = 42

models = {
    "GNB": GaussianNB(),
    "DT": DecisionTreeClassifier(max_depth=5, random_state=RANDOM_STATE),
    "MLP": MLPClassifier(alpha=1.0, random_state=RANDOM_STATE),
    # "LSVC": SVC(kernel="linear", C=0.025), # very slow as there is too much data
    "NN": KNeighborsClassifier(),
    "RF": RandomForestClassifier(
        max_depth=5, n_estimators=10, max_features=1, random_state=RANDOM_STATE
    ),
    "ABC": AdaBoostClassifier(random_state=RANDOM_STATE),
    "SGD": SGDClassifier(random_state=RANDOM_STATE),
}

# Parallel lists: scores[i] is the evaluation result for the model names[i].
scores = []
names = []
for name, model in models.items():
    print(f"Running {name}")

    # Time training and prediction separately with the monotonic clock.
    start = perf_counter()
    fitted_model = model.fit(x_train, y_train)
    print(f"Training time: {perf_counter() - start} seconds")

    start = perf_counter()
    y_predicted = fitted_model.predict(x_test)
    print(f"Testing time: {perf_counter() - start} seconds")

    # NOTE(review): `display` comes from the project's `utility` module;
    # presumably it reports the metrics and returns the score — confirm there.
    scores.append(display(y_test, y_predicted))
    names.append(name)

## Scatter plot scores of all the models

In [None]:
# Plot the per-model results gathered by the model loop: `scores` holds one
# entry per model and `names` holds the matching model labels.
# NOTE(review): plot_scores comes from the project's `utility` module; the
# section heading says it draws a scatter plot — confirm there.
plot_scores(scores, names)