In [1]:
import pandas as pd
import numpy as np

cols = ["sepal-length","sepal-width","petal-length","petal-width","flower-class"]

dataset = pd.read_excel("4_data.xlsm", names=cols)


In [2]:
#data cleaning
def dataclean(d_set):
    for x in d_set:
        data_without_str = []
        for i in d_set[x]:
            if i != "  ":
              data_without_str.append(i)

        data_without_str_ds = pd.Series(data_without_str)
        # mean 
        #replace_value = np.mean(data_without_str_ds)
        # mode
        replace_value = np.array(data_without_str_ds.value_counts().keys())[0]
        d_set[x] = d_set[x].replace("  ",replace_value)

dataclean(dataset)



In [3]:
# splitting dataset into train and test
predictors = dataset[["sepal-length","sepal-width","petal-length","petal-width"]]
label = dataset["flower-class"]

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(predictors, label, random_state=14, test_size=0.2)


In [4]:
# dara preprocessing
from sklearn.preprocessing import StandardScaler
std_scl = StandardScaler()
x_train = std_scl.fit_transform(x_train)
x_train

array([[-1.16185448,  1.01438206, -1.4354799 , -1.14287075],
       [-0.42117225, -0.81112214,  0.20486485, -1.27227286],
       [ 0.56640406, -0.35474609,  0.26142846,  0.15115037],
       [-0.29772521,  3.06807428, -1.32235268, -1.01346864],
       [-0.05083113,  0.32981798,  0.54424652,  0.79816092],
       [-1.28530152,  0.10162996, -1.32235268, -1.40167498],
       [-0.05083113, -0.12655806,  0.71393736, -1.27227286],
       [-0.54461929, -1.49568621, -0.07795321, -0.23705597],
       [ 1.18363925,  0.32981798,  1.05331903,  1.44517148],
       [ 1.18363925,  0.10162996,  0.60081013,  0.40995459],
       [ 0.31950998, -0.58293411,  0.54424652,  0.79816092],
       [-1.65564264,  0.32981798, -1.37891629, -1.27227286],
       [-0.66806633,  1.92713416, -1.4354799 , -1.01346864],
       [ 0.07261591, -0.81112214,  0.71393736,  0.5393567 ],
       [ 1.06019222, -0.12655806,  0.9401918 ,  1.18636726],
       [ 2.17121556, -0.12655806,  1.27957348,  1.44517148],
       [-1.16185448,  0.

In [5]:
# model development and training
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import  GaussianNB

knn_clf_model = KNeighborsClassifier(n_neighbors=15)
knn_clf_model.fit(x_train, y_train)

gnb_clf_model = GaussianNB()
gnb_clf_model.fit(x_train, y_train)



GaussianNB(priors=None, var_smoothing=1e-09)

In [6]:
# accuracy and scoring
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, classification_report
from sklearn.model_selection import cross_val_score


In [7]:
# model ranking according tp their performance criteria
def modelRange(model_acc_data):
    model_df = pd.DataFrame(model_acc_data)

    print("================ > Model Ranking < =====================")
    # based on Accuracy Score
    print("------------------ Based on Accuracy Score ------------------------")
    print(model_df[['model name', 'Accuracy Score', 'Cross Validation Mean Score']].sort_values("Accuracy Score", ascending=False))

    # based on Cross Validation Mean Score
    print("------------------ Based on Cross Validation Mean Score ------------------------")
    print(model_df[['model name', 'Accuracy Score', 'Cross Validation Mean Score']].sort_values("Cross Validation Mean Score",ascending=False))


In [8]:
# each model performance measuring
def getmodelAccuracyReport(model_list, x_test, y_test):
    print("================ > Model Accuracy < =====================")
    model_matrix = []
    for each_model in model_list:
        model_performance_metrix = {}
        model_name = each_model['name']
        model = each_model['model']

        # prediction
        print("=============================================================")
        model_predict = model.predict(x_test)
        model_performance_metrix['model name'] = model_name
        print("Model Name : ", model_name)
        print("-------------------------------------------------------------")
        acu_score = accuracy_score(y_test, model_predict)
        print("Accuracy Score : ",acu_score)
        model_performance_metrix['Accuracy Score'] = acu_score
        
        con_mat = confusion_matrix(y_test, model_predict)
        print("Confusion Matrix \n",con_mat)
        model_performance_metrix['Confusion Matrix'] = con_mat

        clf_report = classification_report(y_test, model_predict)
        print("Classification Report \n", clf_report)
        model_performance_metrix['Classification Report'] = clf_report


        scores = cross_val_score(model, predictors, label, cv=10)
        mean_score = np.mean(scores)
        print("Cross Validation Score : ", scores, "Mean Score : ", mean_score)
        model_performance_metrix['Cross Validation Mean Score'] = mean_score

  
        
        
        model_matrix.append(model_performance_metrix)
    return model_matrix


In [9]:
models = [
    {
        "name" : "KNN",
        "model" : knn_clf_model
    },
    {
        "name": "Gauss-NB",
        "model": gnb_clf_model
    }

]


In [10]:

# calling model performance
model_acc_data = getmodelAccuracyReport(models, x_train, y_train)

# calling the model ranking
modelRange(model_acc_data)


Model Name :  KNN
-------------------------------------------------------------
Accuracy Score :  0.9411764705882353
Confusion Matrix 
 [[35  0  0]
 [ 0 40  1]
 [ 0  6 37]]
Classification Report 
                  precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        35
Iris-versicolor       0.87      0.98      0.92        41
 Iris-virginica       0.97      0.86      0.91        43

      micro avg       0.94      0.94      0.94       119
      macro avg       0.95      0.95      0.94       119
   weighted avg       0.95      0.94      0.94       119

Cross Validation Score :  [0.86666667 0.8        1.         0.93333333 1.         0.86666667
 0.93333333 1.         1.         0.85714286] Mean Score :  0.9257142857142858
Model Name :  Gauss-NB
-------------------------------------------------------------
Accuracy Score :  0.9159663865546218
Confusion Matrix 
 [[35  0  0]
 [ 0 35  6]
 [ 0  4 39]]
Classification Report 
                  precision 