In [43]:
 import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
pd.pandas.set_option("display.max_columns", None)
# Create Dataframe
df1 = pd.read_csv("diabetes.csv")
# Print shape of dataset
print(df1.shape)

(768, 9)


In [44]:
df1[['Glucose','BloodPressure','SkinThickness','Insulin','BMI']] = df1[['Glucose','BloodPressure','SkinThickness','Insulin','BMI']].replace(0,np.NaN)

from sklearn.impute import KNNImputer


# Initialize the imputer
imputer = KNNImputer(n_neighbors=3)

# Impute the null values
imputed_df = pd.DataFrame(imputer.fit_transform(df1), columns=df1.columns)
imputed_df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.0,148.0,72.0,35.000000,125.333333,33.6,0.627,50.0,1.0
1,1.0,85.0,66.0,29.000000,66.666667,26.6,0.351,31.0,0.0
2,8.0,183.0,64.0,30.000000,195.000000,23.3,0.672,32.0,1.0
3,1.0,89.0,66.0,23.000000,94.000000,28.1,0.167,21.0,0.0
4,0.0,137.0,40.0,35.000000,168.000000,43.1,2.288,33.0,1.0
...,...,...,...,...,...,...,...,...,...
763,10.0,101.0,76.0,48.000000,180.000000,32.9,0.171,63.0,0.0
764,2.0,122.0,70.0,27.000000,166.666667,36.8,0.340,27.0,0.0
765,5.0,121.0,72.0,23.000000,112.000000,26.2,0.245,30.0,0.0
766,1.0,126.0,60.0,35.333333,120.666667,30.1,0.349,47.0,1.0


In [45]:
 X=imputed_df.drop('Outcome',axis=1)
y=imputed_df['Outcome']

In [46]:
from sklearn.preprocessing import PowerTransformer

pt=PowerTransformer(method='yeo-johnson')
transform_features=['Insulin','DiabetesPedigreeFunction','Age']
X_copy=pt.fit_transform(X[transform_features])

In [47]:
X_copy=pd.DataFrame(X_copy,columns=transform_features)

In [48]:
X_copy.skew(axis=0,skipna=True)

Insulin                     0.003128
DiabetesPedigreeFunction    0.142321
Age                         0.150219
dtype: float64

In [49]:
from sklearn.preprocessing import  StandardScaler, PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

Std=StandardScaler()
numeric_features=['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age']
Transformer=['Insulin','DiabetesPedigreeFunction','Age']
transform_pipe=Pipeline(steps=[('PowerTransorm',PowerTransformer(method='yeo-johnson'))])
preprocessor=ColumnTransformer(transformers=[
    ('Power_transform',transform_pipe,Transformer),
    ('StandardScaler',Std,numeric_features)
])
X=preprocessor.fit_transform(X)

In [50]:
 from imblearn.combine import SMOTETomek,SMOTEENN

smt=SMOTEENN(random_state=42,sampling_strategy='minority')
print("before sampling target data has 0 and 1 with ",np.bincount(y)," values and difference between them is",abs(np.diff((np.bincount(y)))))
X_res,y_res=smt.fit_resample(X,y)
print("after sampling target data has 0 and 1 with  ",np.bincount(y_res)," values and difference between them is",abs(np.diff((np.bincount(y_res)))))

before sampling target data has 0 and 1 with  [500 268]  values and difference between them is [232]
after sampling target data has 0 and 1 with   [262 329]  values and difference between them is [67]


In [51]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X_res,y_res,test_size=0.3,random_state=42)

In [52]:
from sklearn.ensemble import GradientBoostingClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
# from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report,ConfusionMatrixDisplay, \
                            precision_score, recall_score, f1_score, roc_auc_score,roc_curve
# from xgboost import XGBClassifier
# from catboost import CatBoostClassifier

In [53]:
def evaluate_clf(true, predicted):
    acc = accuracy_score(true, predicted) # Calculate Accuracy
    precision = precision_score(true, predicted) # Calculate Precision
    recall = recall_score(true, predicted)  # Calculate Recall
    f1 = f1_score(true, predicted) # Calculate F1-score
    roc_auc = roc_auc_score(true, predicted) #Calculate Roc
    return acc, precision, recall, f1 , roc_auc

In [54]:
models = {
    # "Decision Tree": DecisionTreeClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
}

In [55]:
def evaluate_models(X, y, models):
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=42)

    models_list = []
    accuracy_list = []
    auc= []

    for i in range(len(list(models))):
        model = list(models.values())[i]
        model.fit(X_train, y_train) # Train model
           # Make predictions
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        # Training set performance
        model_train_accuracy,model_train_precision,model_train_recall,model_train_f1,model_train_rocauc_score=evaluate_clf(y_train ,y_train_pred)


        # Test set performance
        model_test_accuracy,model_test_precision,model_test_recall,model_test_f1,model_test_rocauc_score=evaluate_clf(y_test, y_test_pred)

        print(list(models.keys())[i])
        models_list.append(list(models.keys())[i]) ## Append to the list of models
        print('Model performance for Test set')
        print('- Accuracy: {:.4f}'.format(model_test_accuracy))
        accuracy_list.append(model_test_accuracy) ## Append to the list of accuracy
        print('- Roc Auc Score: {:.4f}'.format(model_test_rocauc_score))
        auc.append(model_test_rocauc_score)
        print('='*35)
        print('\n')

    report=pd.DataFrame(list(zip(models_list, accuracy_list)), columns=['Model Name', 'Accuracy']).sort_values(by=['Accuracy'], ascending=False)

    return report

In [56]:
Model_report=evaluate_models(X_res,y_res,models)
Model_report

Gradient Boosting
Model performance for Test set
- Accuracy: 0.9607
- Roc Auc Score: 0.9594




Unnamed: 0,Model Name,Accuracy
0,Gradient Boosting,0.960674
