In [2]:
from lazypredict.Supervised import LazyClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn import preprocessing
import pandas as pd
import numpy as np


In [None]:
data = load_breast_cancer()
X = data.data
y= data.target

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=.5,random_state =123)

clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = clf.fit(X_train, X_test, y_train, y_test)

print(models)

In [18]:
df = pd.read_csv("./data/training_landmarks.csv")
df_np = df.to_numpy(copy=True)

X = df_np[:, 1:-1]
y= df_np[:, -1]

# Encode y into numerical labels
le = preprocessing.LabelEncoder()
y = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=.8,random_state =123)

# Imputing missing values AFTER the data split
X_train = SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(X_train)
X_test = SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(X_test)

clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models,predictions = clf.fit(X_train, X_test, y_train, y_test)

print(models)

100%|██████████| 29/29 [00:03<00:00,  7.80it/s]

                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
AdaBoostClassifier                 1.00               1.00     1.00      1.00   
GaussianNB                         1.00               1.00     1.00      1.00   
BaggingClassifier                  1.00               1.00     1.00      1.00   
LinearDiscriminantAnalysis         1.00               1.00     1.00      1.00   
ExtraTreesClassifier               1.00               1.00     1.00      1.00   
RandomForestClassifier             0.96               0.96     0.96      0.96   
BernoulliNB                        0.93               0.93     0.93      0.93   
LogisticRegression                 0.86               0.87     0.87      0.86   
SVC                                0.86               0.86     0.86      0.86   
RidgeClassifier                    0.79               0.78     0.78      0.79   
LinearSVC                   




In [3]:
# Use ham's data as testing data
train_data = pd.read_csv("./data/training_landmarks.csv").to_numpy(copy=True)
test_data = pd.read_csv("./data/testing_landmarks.csv").to_numpy(copy=True)

X_train = train_data[:, 1:-1]
y_train = train_data[:, -1]
X_test = test_data[:, 1:-1]
y_test = test_data[:, -1]

# Encode y into numerical labels
le = preprocessing.LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.fit_transform(y_test)

X_train, y_train, y_test

(array([[0.515096128, 0.292093754, 0.504475772, ..., 0.924711823,
         0.516159475, 0.956537724],
        [0.455881566, 0.300834924, 0.469460577, ..., 0.902029693,
         0.445862234, 0.885526538],
        [0.515096128, 0.292093754, 0.504475772, ..., 0.924711823,
         0.516159475, 0.956537724],
        ...,
        [0.496946275, 0.29766351, 0.484862566, ..., 0.911811292,
         0.545915365, 0.905646861],
        [0.562297046, 0.294404089, 0.552963018, ..., 0.889972568,
         0.604453743, 0.916724324],
        [0.520139694, 0.300726354, 0.531307757, ..., 0.923831046,
         0.449077696, 0.927506685]], dtype=object),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 array([0, 0, 0, 0, 0, 0, 1, 1, 1]))

In [5]:
# Imputing missing values AFTER the data split
X_train = SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(X_train)
X_test = SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(X_test)

clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

print(models)

100%|██████████| 29/29 [00:02<00:00, 10.06it/s]

                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
LinearSVC                          1.00               1.00     1.00      1.00   
LinearDiscriminantAnalysis         1.00               1.00     1.00      1.00   
NuSVC                              1.00               1.00     1.00      1.00   
CalibratedClassifierCV             1.00               1.00     1.00      1.00   
DecisionTreeClassifier             1.00               1.00     1.00      1.00   
RandomForestClassifier             1.00               1.00     1.00      1.00   
SVC                                1.00               1.00     1.00      1.00   
ExtraTreesClassifier               1.00               1.00     1.00      1.00   
LogisticRegression                 1.00               1.00     1.00      1.00   
BaggingClassifier                  1.00               1.00     1.00      1.00   
RidgeClassifierCV           




# Actual Model Training

Simply train a random forest model.

In [6]:
# Use ham's data as testing data
train_data = pd.read_csv("./data/training_landmarks.csv").to_numpy(copy=True)
test_data = pd.read_csv("./data/testing_landmarks.csv").to_numpy(copy=True)

X_train = train_data[:, 1:-1]
y_train = train_data[:, -1]
X_test = test_data[:, 1:-1]
y_test = test_data[:, -1]

# Encode y into numerical labels
le = preprocessing.LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.fit_transform(y_test)

# Imputing missing values AFTER the data split
X_train = SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(X_train)
X_test = SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(X_test)

from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(y_pred)

from sklearn.metrics import classification_report
target_names = ['Ready', 'Dab']
print(classification_report(y_test, y_pred, target_names=target_names))

[0 0 0 0 1 0 1 1 1]
              precision    recall  f1-score   support

       Ready       1.00      0.83      0.91         6
         Dab       0.75      1.00      0.86         3

    accuracy                           0.89         9
   macro avg       0.88      0.92      0.88         9
weighted avg       0.92      0.89      0.89         9



In [7]:
# Save the model to a pkl file for deployment
import pickle
pickle.dump(clf, open("./models/classifier.pkl", 'wb'))