In [1]:
import pandas as pd
import numpy as np

cols = ["Pregnancies",
"Glucose",
"BloodPressure",
"SkinThickness",
"Insulin",
"BMI",
"DiabetesPedigreeFunction",
"Age",
"Outcome"]


In [2]:
dataset = pd.read_excel("3_data.xlsm", names=cols)

In [3]:
#data cleaning
def dataclean(d_set):
    for x in d_set:
        data_without_str = []
        for i in d_set[x]:
            if i != "  ":
              data_without_str.append(i)

        data_without_str_ds = pd.Series(data_without_str)
        # mean 
        replace_value = np.mean(data_without_str_ds)
        # mode
        #replace_value = np.array(data_without_str_ds.value_counts().keys())[0]
        d_set[x] = d_set[x].replace("  ",replace_value)

dataclean(dataset)



In [4]:
# splitting dataset into train and test
predictors = dataset[["Pregnancies",
"Glucose",
"BloodPressure",
"SkinThickness",
"Insulin",
"BMI",
"DiabetesPedigreeFunction",
"Age"]]
label = dataset["Outcome"]
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(predictors, label, random_state=14, test_size=0.2)



In [5]:
# dara preprocessing
from sklearn.preprocessing import StandardScaler
std_scl = StandardScaler()
x_train_pre = std_scl.fit_transform(x_train)
x_train = x_train_pre

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [6]:
# model development and training
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import  GaussianNB

knn_clf_model = KNeighborsClassifier(n_neighbors=15)
knn_clf_model.fit(x_train, y_train)

gnb_clf_model = GaussianNB()
gnb_clf_model.fit(x_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [7]:
# accuracy and scoring
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, classification_report
from sklearn.model_selection import cross_val_score


In [8]:
# model ranking according tp their performance criteria
def modelRange(model_acc_data):
    model_df = pd.DataFrame(model_acc_data)

    print("================ > Model Ranking < =====================")
    # based on Accuracy Score
    print("------------------ Based on Accuracy Score ------------------------")
    print(model_df[['model name', 'Accuracy Score', 'Roc Auc', 'Cross Validation Mean Score']].sort_values("Accuracy Score", ascending=False))

    # based on ROC AUC
    print("------------------ Based on ROC AUC Score ------------------------")
    print(model_df[['model name', 'Accuracy Score', 'Roc Auc' ,'Cross Validation Mean Score']].sort_values("Roc Auc",ascending=False))

    # based on Cross Validation Mean Score
    print("------------------ Based on Cross Validation Mean Score ------------------------")
    print(model_df[['model name', 'Accuracy Score', 'Roc Auc', 'Cross Validation Mean Score']].sort_values("Cross Validation Mean Score",ascending=False))


In [9]:
# each model performance measuring
def getmodelAccuracyReport(model_list, x_test, y_test):
    print("================ > Model Accuracy < =====================")
    model_matrix = []
    for each_model in model_list:
        model_performance_metrix = {}
        model_name = each_model['name']
        model = each_model['model']

        # prediction
        print("=============================================================")
        model_predict = model.predict(x_test)
        model_performance_metrix['model name'] = model_name
        print("Model Name : ", model_name)
        print("-------------------------------------------------------------")
        acu_score = accuracy_score(y_test, model_predict)
        print("Accuracy Score : ",acu_score)
        model_performance_metrix['Accuracy Score'] = acu_score

        roc_auc = roc_auc_score(y_test, model_predict)
        print("ROC AUC Score : ", roc_auc)
        model_performance_metrix['Roc Auc'] = roc_auc

        con_mat = confusion_matrix(y_test, model_predict)
        print("Confusion Matrix \n",con_mat)
        model_performance_metrix['Confusion Matrix'] = con_mat

        clf_report = classification_report(y_test, model_predict)
        print("Classification Report \n", clf_report)
        model_performance_metrix['Classification Report'] = clf_report


        scores = cross_val_score(model, predictors, label, cv=10)
        mean_score = np.mean(scores)
        print("Cross Validation Score : ", scores, "Mean Score : ", mean_score)
        model_performance_metrix['Cross Validation Mean Score'] = mean_score

  
        
        
        model_matrix.append(model_performance_metrix)
    return model_matrix

models = [
    {
        "name" : "KNN",
        "model" : knn_clf_model
    },
    {
        "name": "Gauss-NB",
        "model": gnb_clf_model
    }

]

In [10]:
# calling model performance
model_acc_data = getmodelAccuracyReport(models, x_train, y_train)

# calling the model ranking
modelRange(model_acc_data)


Model Name :  KNN
-------------------------------------------------------------
Accuracy Score :  0.766721044045677
ROC AUC Score :  0.7072486696390047
Confusion Matrix 
 [[362  47]
 [ 96 108]]
Classification Report 
               precision    recall  f1-score   support

           0       0.79      0.89      0.84       409
           1       0.70      0.53      0.60       204

   micro avg       0.77      0.77      0.77       613
   macro avg       0.74      0.71      0.72       613
weighted avg       0.76      0.77      0.76       613

Cross Validation Score :  [0.74025974 0.76623377 0.7012987  0.61038961 0.68831169 0.77922078
 0.76623377 0.81578947 0.73684211 0.76315789] Mean Score :  0.7367737525632263
Model Name :  Gauss-NB
-------------------------------------------------------------
Accuracy Score :  0.7569331158238173
ROC AUC Score :  0.714655544369337
Confusion Matrix 
 [[344  65]
 [ 84 120]]
Classification Report 
               precision    recall  f1-score   support

     