In [1]:
import pandas as pd
from sklearn.metrics import classification_report

# Load iris dataset

Download here: https://www.kaggle.com/uciml/iris

In [2]:
iris_full = pd.read_csv("dataset/Iris.csv")

In [3]:
iris_full = iris_full.set_index("Id")

In [4]:
iris_full.head()

Unnamed: 0_level_0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,5.1,3.5,1.4,0.2,Iris-setosa
2,4.9,3.0,1.4,0.2,Iris-setosa
3,4.7,3.2,1.3,0.2,Iris-setosa
4,4.6,3.1,1.5,0.2,Iris-setosa
5,5.0,3.6,1.4,0.2,Iris-setosa


In [5]:
iris_full["Species"] = pd.Categorical(iris_full["Species"])
iris_full["Species_code"] = iris_full["Species"].cat.codes

In [6]:
target_names = iris_full["Species"].unique()

In [7]:
target_names

['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
Categories (3, object): ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']

# Take sample

In [8]:
# take sample as training data
data_train = iris_full.sample(frac=0.8, random_state=786)
data_train = data_train.drop("Species", axis=1) # we alread make Species code 
# test data
data_test = iris_full.drop(data_train.index)

In [9]:
data_train.count()

SepalLengthCm    120
SepalWidthCm     120
PetalLengthCm    120
PetalWidthCm     120
Species_code     120
dtype: int64

In [10]:
data_test.count()

SepalLengthCm    30
SepalWidthCm     30
PetalLengthCm    30
PetalWidthCm     30
Species          30
Species_code     30
dtype: int64

In [11]:
iris_full.count()

SepalLengthCm    150
SepalWidthCm     150
PetalLengthCm    150
PetalWidthCm     150
Species          150
Species_code     150
dtype: int64

In [12]:
print('Data for Modeling: ' + str(data_train.shape))
print('Unseen Data For Predictions ' + str(data_test.shape))

Data for Modeling: (120, 5)
Unseen Data For Predictions (30, 6)


# Setup classifier environment

In [13]:
from pycaret.classification import *

In [33]:
exp_clf_iris = setup(data = data_train, target = 'Species_code',  session_id=123)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,Species_code
2,Target Type,Multiclass
3,Label Encoded,
4,Original Data,"(120, 5)"
5,Missing Values,False
6,Numeric Features,4
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


In [34]:
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
nb,Naive Bayes,0.9639,0.9954,0.9611,0.9759,0.9631,0.9444,0.95,0.003
qda,Quadratic Discriminant Analysis,0.9625,0.9954,0.95,0.9531,0.953,0.9405,0.9497,0.003
lda,Linear Discriminant Analysis,0.9514,0.9954,0.9444,0.9666,0.9494,0.9249,0.9326,0.004
knn,K Neighbors Classifier,0.9403,0.9954,0.9278,0.9509,0.9384,0.9073,0.9127,0.006
lr,Logistic Regression,0.9389,0.9954,0.9389,0.9551,0.9368,0.9086,0.9166,0.009
et,Extra Trees Classifier,0.9389,0.9912,0.9278,0.9374,0.9294,0.9049,0.9176,0.044
dt,Decision Tree Classifier,0.9264,0.9512,0.9222,0.9426,0.9264,0.8872,0.8946,0.003
ada,Ada Boost Classifier,0.9264,0.9791,0.9167,0.9343,0.9264,0.8863,0.8899,0.017
gbc,Gradient Boosting Classifier,0.9264,0.985,0.9167,0.9343,0.9264,0.8863,0.8899,0.036
xgboost,Extreme Gradient Boosting,0.9264,0.9797,0.9167,0.9343,0.9264,0.8863,0.8899,0.017


# Let create model and do on test data

## create model 

In [16]:
nb_model = create_model("nb")

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,0.8889,1.0,0.8889,0.9259,0.8889,0.8302,0.8462
3,0.875,1.0,0.8333,0.9167,0.8667,0.8,0.8208
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0
5,0.875,0.9542,0.8889,0.9167,0.875,0.814,0.8333
6,1.0,1.0,1.0,1.0,1.0,1.0,1.0
7,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [35]:
nb_model

GaussianNB(priors=None, var_smoothing=1e-09)

In [36]:
qda_model = create_model("qda")

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,0.75,1.0,0.6667,0.625,0.6667,0.6,0.6708
4,0.875,0.9542,0.8333,0.9062,0.8631,0.8049,0.826
5,1.0,1.0,1.0,1.0,1.0,1.0,1.0
6,1.0,1.0,1.0,1.0,1.0,1.0,1.0
7,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [37]:
qda_model

QuadraticDiscriminantAnalysis(priors=None, reg_param=0.0,
                              store_covariance=False, tol=0.0001)

## prepare test data

In [38]:
data_test.head()

Unnamed: 0_level_0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species,Species_code
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
6,5.4,3.9,1.7,0.4,Iris-setosa,0
12,4.8,3.4,1.6,0.2,Iris-setosa,0
21,5.4,3.4,1.7,0.2,Iris-setosa,0
24,5.1,3.3,1.7,0.5,Iris-setosa,0
31,4.8,3.1,1.6,0.2,Iris-setosa,0


In [39]:
test_x = data_test.drop(["Species", "Species_code"], axis=1)

In [40]:
test_y = data_test[["Species_code"]]

In [41]:
test_x.head()

Unnamed: 0_level_0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
6,5.4,3.9,1.7,0.4
12,4.8,3.4,1.6,0.2
21,5.4,3.4,1.7,0.2
24,5.1,3.3,1.7,0.5
31,4.8,3.1,1.6,0.2


In [42]:
test_y.head()

Unnamed: 0_level_0,Species_code
Id,Unnamed: 1_level_1
6,0
12,0
21,0
24,0
31,0


## Evaluation

On Naive Bayes model

In [43]:
# finalize_model ready for deploy in production
final_nb_model = finalize_model(nb_model)

In [44]:
y_pred_1 = final_nb_model.predict(test_x)

In [45]:
y_pred_1

array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2,
       2, 2, 2, 1, 2, 2, 2, 2], dtype=int8)

In [46]:
print(classification_report(test_y, y_pred_1))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         6
           1       0.92      1.00      0.96        11
           2       1.00      0.92      0.96        13

    accuracy                           0.97        30
   macro avg       0.97      0.97      0.97        30
weighted avg       0.97      0.97      0.97        30



In [47]:
y_pred_1

array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2,
       2, 2, 2, 1, 2, 2, 2, 2], dtype=int8)

On qda model 

In [48]:
# finalize_model ready for deploy in production
final_qda_model = finalize_model(qda_model)

In [49]:
y_pred_2 = final_qda_model.predict(test_x)

In [50]:
print(classification_report(test_y, y_pred_2))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         6
           1       1.00      1.00      1.00        11
           2       1.00      1.00      1.00        13

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30

