Daniel Rocha Ruiz, MSc in Data Science and Business Analytics

# Set-up
## Import packages

In [None]:
# general packages
import pandas as pd
import numpy as np

# scikit-learn
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

# classifiers
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

## Load data

In [None]:
# Load the dataset from a Parquet file. The path is relative, so it is
# resolved against the notebook's current working directory.
# The modeling cells below treat column 0 as the target and all remaining
# columns as features.
df = pd.read_parquet("dataset.parquet")

# Modeling
## Prepare the data

In [26]:
# Seed NumPy's global random number generator for this session.
np.random.seed(42)

# Column 0 is the target; everything to its right is a feature.
# y stays a one-column DataFrame here and is flattened to 1-D further down.
y = df.iloc[:, :1]
X = df.iloc[:, 1:]

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
print("X_train", X_train.shape)
print("X_test", X_test.shape)

# Min-max scale the four measurement columns and pass every other column
# through unchanged. The transformer is fitted on the training rows only, so
# the scaling ranges never see the test set.
ct = ColumnTransformer(
    transformers=[
        (
            'transformer1',
            MinMaxScaler(),
            ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g'],
        ),
    ],
    remainder='passthrough',
).fit(X_train)

# Apply the fitted transformer to both partitions (train first, then test).
X_train_transform, X_test_transform = (
    ct.transform(part) for part in (X_train, X_test)
)
print("X_train_transform", X_train_transform.shape)
print("X_test_transform", X_test_transform.shape)

# Flatten the one-column target frames into 1-D arrays, the shape the
# estimators' fit/score methods expect for labels.
y_train, y_test = (
    part.to_numpy().ravel() for part in (y_train, y_test)
)
print("y_train", y_train.shape)
print("y_test", y_test.shape)

X_train (266, 9)
X_test (67, 9)
X_train_transform (266, 9)
X_test_transform (67, 9)
y_train (266,)
y_test (67,)


## Model training
We try out multiple classifiers available in the Scikit-Learn package.

In [40]:
# Display names and estimators are kept as two parallel lists: names[i] is the
# label printed for classifiers[i] by the evaluation loop.
names = [
    "Nearest Neighbors",
    "Linear SVM",
    "RBF SVM",
    "Gaussian Process",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    "AdaBoost",
    "Naive Bayes",
    "Logistic",
]

# Estimators with internal randomness get an explicit random_state so that
# re-running this cell (or the fit cell) on its own reproduces the same scores,
# instead of depending on how far the global NumPy RNG has advanced since it
# was seeded in an earlier cell.
# NOTE: scores recorded from earlier, unseeded runs may differ slightly.
classifiers = [
    KNeighborsClassifier(20),  # 20 nearest neighbours
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),  # default RBF kernel
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5, random_state=42),
    RandomForestClassifier(
        max_depth=5, n_estimators=10, max_features=2, random_state=42
    ),
    MLPClassifier(alpha=1, max_iter=1000, random_state=42),
    AdaBoostClassifier(random_state=42),
    GaussianNB(),
    LogisticRegression(),
]

# zip() silently truncates to the shorter list, so a mismatch would mislabel
# or drop models without any error; fail loudly instead.
assert len(names) == len(classifiers), "names and classifiers must have the same length"

## View results

In [41]:
# Train every candidate on the scaled training set and report its mean
# accuracy on the held-out test set, one line per model.
for name, clf in zip(names, classifiers):
    # fit() returns the fitted estimator itself, so scoring can be chained.
    score = clf.fit(X_train_transform, y_train).score(X_test_transform, y_test)
    print(name, "{:.1%}".format(score))

Nearest Neighbors 85.1%
Linear SVM 74.6%
RBF SVM 98.5%
Gaussian Process 98.5%
Decision Tree 91.0%
Random Forest 100.0%
Neural Net 100.0%
AdaBoost 95.5%
Naive Bayes 77.6%
Logistic 100.0%
