In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn
import scikitplot

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn import metrics
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
from pylab import rcParams
rcParams['figure.figsize'] = 10, 8

# Load the dataset from a CSV file located in the notebook's working directory.
diabetes_df = pd.read_csv('diabetes.csv')

# Optional quick looks at the frame, left disabled:
# diabetes_df.head()
# diabetes_df.info()

# Summary statistics for the frame's columns.
print(diabetes_df.describe())

# seaborn.countplot(diabetes_df['Pregnancies']) # there are zeros, but they do not look like outliers

# seaborn.heatmap(diabetes_df.corr()) # no clear correlation

# seaborn.distplot(diabetes_df['BloodPressure']) 

# print(diabetes_df[diabetes_df['BloodPressure']==0].value_counts().sum()) # total 35 zero outliers

# seaborn.distplot(diabetes_df['Glucose']) 

# print(diabetes_df[diabetes_df['Glucose']==0].value_counts().sum()) #total 5 zero outliers

# diabetes_df['Glucose']=diabetes_df['Glucose'].replace(0,120.89)




In [None]:
# Columns whose distributions are inspected here; the list is reused by the
# zero-count and median-replacement cells further down.
cols=['Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction']

fig_subobj,axes=plt.subplots(2,3, figsize=(10,10))

# histplot is the axes-level plotter and honours ``ax``. displot is
# figure-level: it ignores ``ax`` and opens a new figure on every call, which
# left this 2x3 grid empty and produced six separate figures instead.
for ax, col in zip(axes.flat, cols):
    # Zero readings in these columns are treated as outliers and are replaced
    # with a median in a later cell.
    seaborn.histplot(diabetes_df[col], kde=True, ax=ax)
    ax.set_title(col)

fig_subobj.tight_layout()
plt.show()




In [None]:
# Count the entries in each inspected column that are exactly zero.
# Summing the boolean mask counts matches directly. The previous
# DataFrame.value_counts() approach grouped on every column and, by default,
# dropped rows that have a NaN in any column, which could under-count.
for col in cols:
    print(f'Number of zeroes in {col}:', (diabetes_df[col] == 0).sum())

In [None]:
# Replace zero readings in each inspected column with that column's median.
# The median is taken over the NON-zero entries only: the zeros are the values
# being treated as invalid, so including them would drag the fill value
# towards zero (severely so for a column that is mostly zeros).
for col in cols:
    nonzero_values = diabetes_df.loc[diabetes_df[col] != 0, col]
    if nonzero_values.empty:
        # Nothing valid to estimate a median from; leave the column untouched
        # rather than filling it with NaN.
        continue
    diabetes_df[col] = diabetes_df[col].replace(0, nonzero_values.median())
    

In [None]:
# Re-count exact zeros per inspected column; after the median replacement
# every count is expected to be 0.
# Summing the boolean mask counts matches directly. The previous
# DataFrame.value_counts() approach grouped on every column and, by default,
# dropped rows that have a NaN in any column, which could under-count.
for col in cols:
    print(f'Number of zeroes in {col}:', (diabetes_df[col] == 0).sum())

In [None]:
# First eight columns are the features; column 8 is the target.
# The target is selected as a 1-D Series because scikit-learn estimators
# expect a 1-D y (a single-column DataFrame makes fit() emit a
# DataConversionWarning).
# NOTE(review): assumes the frame has exactly nine columns, so column 8 is the
# only column the original ``iloc[:, 8:]`` slice selected - confirm.
features=diabetes_df.iloc[:,:8]
Y=diabetes_df.iloc[:,8]

# Split BEFORE scaling. The row assignment depends only on random_state, so
# the same rows land in train/test as when the split came after scaling.
X_train_raw,X_test_raw,Y_train,Y_test=train_test_split(features,Y,test_size=0.20,random_state=7)

# Fit the scaler on the training rows only, then apply it to the test rows.
# Fitting on the full data leaks test-set mean/variance into training.
SScaler=StandardScaler()
X_train=SScaler.fit_transform(X_train_raw)
X_test=SScaler.transform(X_test_raw)

# Keep ``X`` available as the scaled full feature matrix (as before), now
# scaled with the statistics learned from the training rows.
X=SScaler.transform(features)


In [None]:
def model_results(model,Y_pred, Y_prob,class1_prob):
    """Print scores and draw diagnostic plots for one classifier.

    Reads ``X_train``, ``Y_train`` and ``Y_test`` from the notebook's global
    namespace, so the split cell must have been run first.

    Parameters
    ----------
    model
        Estimator passed to scikit-plot's learning-curve plot together with
        ``X_train`` and ``Y_train``.
    Y_pred
        Predicted labels; compared with ``Y_test`` for the accuracy score and
        the normalized confusion matrix.
    Y_prob
        Probability array passed to scikit-plot's ROC-curve plot.
    class1_prob
        Scores passed to ``roc_auc_score`` against ``Y_test``.

    Returns
    -------
    None
        Results are printed and plotted.
    """
    plot_size = (7, 5)  # shared size for the three diagnostic figures

    accuracy = accuracy_score(Y_test, Y_pred)
    print('accuracy score:', accuracy)

    auc = roc_auc_score(Y_test, class1_prob)
    print('roc_auc_score is:', round(auc, 5))

    scikitplot.metrics.plot_confusion_matrix(Y_test, Y_pred, normalize=True, figsize=plot_size)
    scikitplot.estimators.plot_learning_curve(model, X_train, Y_train, figsize=plot_size)
    scikitplot.metrics.plot_roc_curve(Y_test, Y_prob, curves=['each_class'], figsize=plot_size)
    
    
    
    
    

In [None]:
# Baseline model: logistic regression with scikit-learn's default settings.
logistic_regression = LogisticRegression()
logistic_regression.fit(X_train, Y_train)

Y_pred = logistic_regression.predict(X_test)
Y_prob = logistic_regression.predict_proba(X_test)

# Pull column 1 out of every probability row; it is used as the ROC AUC score.
class1_prob = [row[1] for row in Y_prob]

model_results(logistic_regression, Y_pred, Y_prob, class1_prob)



In [None]:
# Variant: L1-penalised logistic regression (liblinear solver) with balanced
# class weights.
logistic_regression1 = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    class_weight='balanced',
)
logistic_regression1.fit(X_train, Y_train)

Y_pred = logistic_regression1.predict(X_test)
Y_prob = logistic_regression1.predict_proba(X_test)

# Pull column 1 out of every probability row; it is used as the ROC AUC score.
class1_prob = [row[1] for row in Y_prob]

model_results(logistic_regression1, Y_pred, Y_prob, class1_prob)


In [None]:
# Random forest with default hyper-parameters. random_state pins the forest's
# internal randomness so reruns report the same scores.
Randomforest=RandomForestClassifier(random_state=7)
Randomforest.fit(X_train,Y_train)
Y_pred=Randomforest.predict(X_test)

# Take probabilities from THIS model. Previously Y_prob was not recomputed in
# this cell, so the AUC and ROC curve were built from whatever Y_prob an
# earlier cell had left behind and described a different model.
Y_prob=Randomforest.predict_proba(X_test)
class1_prob=Y_prob[:,1]  # column 1 of predict_proba, used as the ROC AUC score

model_results(Randomforest,Y_pred,Y_prob,class1_prob)

In [None]:
# Decision tree with default hyper-parameters. random_state pins the tree's
# internal randomness so reruns report the same scores.
Decision_tree=DecisionTreeClassifier(random_state=7)
Decision_tree.fit(X_train,Y_train)
Y_pred=Decision_tree.predict(X_test)

# Take probabilities from THIS model. Previously Y_prob was not recomputed in
# this cell, so the AUC and ROC curve were built from whatever Y_prob an
# earlier cell had left behind and described a different model.
Y_prob=Decision_tree.predict_proba(X_test)
class1_prob=Y_prob[:,1]  # column 1 of predict_proba, used as the ROC AUC score

model_results(Decision_tree,Y_pred,Y_prob,class1_prob)


In [None]:
# Gaussian naive Bayes with default settings.
Gaus=GaussianNB()
Gaus.fit(X_train,Y_train)
Y_pred=Gaus.predict(X_test)

# Take probabilities from THIS model. Previously Y_prob was not recomputed in
# this cell, so the AUC and ROC curve were built from whatever Y_prob an
# earlier cell had left behind and described a different model.
Y_prob=Gaus.predict_proba(X_test)
class1_prob=Y_prob[:,1]  # column 1 of predict_proba, used as the ROC AUC score

model_results(Gaus,Y_pred,Y_prob,class1_prob)

In [None]:
# k-nearest-neighbours classifier with default settings.
Knbrs=KNeighborsClassifier()
Knbrs.fit(X_train,Y_train)
Y_pred=Knbrs.predict(X_test)

# Take probabilities from THIS model. Previously Y_prob was not recomputed in
# this cell, so the AUC and ROC curve were built from whatever Y_prob an
# earlier cell had left behind and described a different model.
Y_prob=Knbrs.predict_proba(X_test)
class1_prob=Y_prob[:,1]  # column 1 of predict_proba, used as the ROC AUC score

model_results(Knbrs,Y_pred,Y_prob,class1_prob)

In [None]:
# RBF-kernel support vector classifier.
# probability=True is required for SVC to expose predict_proba; random_state
# pins the internal randomness of that probability calibration so reruns
# report the same scores.
Decision_svc=SVC(kernel='rbf',probability=True,random_state=7)
Decision_svc.fit(X_train,Y_train)
Y_pred=Decision_svc.predict(X_test)

# Take probabilities from THIS model. Previously Y_prob was not recomputed in
# this cell, so the AUC and ROC curve were built from whatever Y_prob an
# earlier cell had left behind and described a different model.
Y_prob=Decision_svc.predict_proba(X_test)
class1_prob=Y_prob[:,1]  # column 1 of predict_proba, used as the ROC AUC score

model_results(Decision_svc,Y_pred,Y_prob,class1_prob)

In [None]:
# Random forest with hand-picked hyper-parameters (depth-limited trees, every
# split may consider up to 8 features). random_state pins the forest's
# internal randomness so reruns report the same scores.
Randomforest1=RandomForestClassifier(max_depth=5, n_estimators=100, max_features=8, random_state=7)
Randomforest1.fit(X_train,Y_train)
Y_pred=Randomforest1.predict(X_test)

# Take probabilities from THIS model. Previously Y_prob was not recomputed in
# this cell, so the AUC and ROC curve were built from whatever Y_prob an
# earlier cell had left behind and described a different model.
Y_prob=Randomforest1.predict_proba(X_test)
class1_prob=Y_prob[:,1]  # column 1 of predict_proba, used as the ROC AUC score

model_results(Randomforest1,Y_pred,Y_prob,class1_prob)