In [1]:
# Importing essential libraries
import numpy as np
import pandas as pd
import pickle

In [2]:
# Loading the dataset
df = pd.read_csv('diabetes.csv')

In [3]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,2,138,62,35,0,33.6,0.127,47,1
1,0,84,82,31,125,38.2,0.233,23,0
2,0,145,0,0,0,44.2,0.63,31,1
3,0,135,68,42,250,42.3,0.365,24,1
4,1,139,62,41,480,40.7,0.536,21,0


In [4]:
# Renaming DiabetesPedigreeFunction as DPF
df = df.rename(columns={'DiabetesPedigreeFunction':'DPF'})

In [5]:
# Replacing the 0 values from ['Glucose','BloodPressure','SkinThickness','Insulin','BMI'] by NaN
df_copy = df.copy(deep=True)
df_copy[['Glucose','BloodPressure','SkinThickness','Insulin','BMI']] = df_copy[['Glucose','BloodPressure','SkinThickness','Insulin','BMI']].replace(0,np.NaN)

In [6]:
df_copy.isnull().sum()

Pregnancies        0
Glucose           13
BloodPressure     90
SkinThickness    573
Insulin          956
BMI               28
DPF                0
Age                0
Outcome            0
dtype: int64

In [7]:
# Replacing NaN value by mean, median depending upon distribution
df_copy['Glucose'].fillna(df_copy['Glucose'].mean(), inplace=True)
df_copy['BloodPressure'].fillna(df_copy['BloodPressure'].mean(), inplace=True)
df_copy['SkinThickness'].fillna(df_copy['SkinThickness'].median(), inplace=True)
df_copy['Insulin'].fillna(df_copy['Insulin'].median(), inplace=True)
df_copy['BMI'].fillna(df_copy['BMI'].median(), inplace=True)

In [8]:
df_copy.isnull().sum()

Pregnancies      0
Glucose          0
BloodPressure    0
SkinThickness    0
Insulin          0
BMI              0
DPF              0
Age              0
Outcome          0
dtype: int64

In [9]:
# Model Building
from sklearn.model_selection import train_test_split
X = df.drop(columns='Outcome')
y = df['Outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

In [25]:
# Creating Random Forest Model
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=20)
classifier.fit(X_train, y_train)

RandomForestClassifier(n_estimators=20)

In [26]:
print("Accuracy Of Model :",classifier.score(X_train,y_train))

Accuracy Of Model : 0.999375


In [27]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
pred=classifier.predict(X_test)
print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))

from sklearn.metrics import accuracy_score
accuracy_score(y_test,pred) #same ans will come as of confusion matrix

[[272   0]
 [  7 121]]
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       272
           1       1.00      0.95      0.97       128

    accuracy                           0.98       400
   macro avg       0.99      0.97      0.98       400
weighted avg       0.98      0.98      0.98       400



0.9825

In [28]:
from sklearn.linear_model import LogisticRegression
lor=LogisticRegression()
lor.fit(X_train,y_train)
print("Accuracy Of Model :",lor.score(X_train,y_train))

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
pred=lor.predict(X_test)
print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))

from sklearn.metrics import accuracy_score
accuracy_score(y_test,pred) #same ans will come as of confusion matrix


Accuracy Of Model : 0.7825
[[245  27]
 [ 60  68]]
              precision    recall  f1-score   support

           0       0.80      0.90      0.85       272
           1       0.72      0.53      0.61       128

    accuracy                           0.78       400
   macro avg       0.76      0.72      0.73       400
weighted avg       0.78      0.78      0.77       400



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.7825

In [29]:
print("Predication of model :",lor.predict([[1,140,75,30,200,25,0.20,30]]))

Predication of model : [0]


In [30]:
from xgboost import XGBClassifier
clf=XGBClassifier()
clf.fit(X_train,y_train)
print("Accuracy Of Model :",clf.score(X_train,y_train))

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
pred=clf.predict(X_test)
print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))

from sklearn.metrics import accuracy_score
accuracy_score(y_test,pred) #same ans will come as of confusion matrix


Accuracy Of Model : 0.90375
[[253  19]
 [ 36  92]]
              precision    recall  f1-score   support

           0       0.88      0.93      0.90       272
           1       0.83      0.72      0.77       128

    accuracy                           0.86       400
   macro avg       0.85      0.82      0.84       400
weighted avg       0.86      0.86      0.86       400



0.8625

In [31]:
# Creating a pickle file for the classifier
filename = 'diabetes-prediction-rfc-model.pkl'
pickle.dump(classifier, open(filename, 'wb'))