# Diabetes Prediction Model

In [70]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn  import svm
from sklearn.metrics import accuracy_score

# Data Collection and Analysis

PIMA Diabetes Dataset

In [71]:
diabetes= pd.read_csv('diabetes.csv')
diabetes.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [72]:
#no of rows and columns
diabetes.shape

(768, 9)

In [73]:
#statistical measures
diabetes.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [74]:
diabetes['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

0 -> Non-Diabetic

1 -> Diabetic

In [75]:
diabetes.groupby('Outcome').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


In [76]:
#Separating Data (Features) and Labels (1 and 0 )
X=diabetes.drop(columns='Outcome',axis=1)
Y=diabetes['Outcome']
X = pd.DataFrame(X.values)

# Data Standardization


In [77]:
 #scaler=StandardScaler()

In [78]:
X = pd.DataFrame(X.values)
#scaler.fit(X)

In [79]:
#standardized_data = scaler.transform(X)
#print(standardized_data)

In [80]:
#X=standardized_data

# Training and Test Data


In [81]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y, test_size=0.2, stratify=Y , random_state=2 )
print(X.shape,X_train.shape,X_test.shape)

(768, 8) (614, 8) (154, 8)


# Training The Data

In [82]:
model=LogisticRegression()
model.fit(X_train, Y_train)

#training the SVM classifier
classifier=svm.SVC(kernel='linear')
classifier.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Evaluation of Model Accuracy


Logistic Regression



In [83]:
#Finding Accuracy on training data using Logistic Regression
X_train_pred=model.predict(X_train)
training_data_accuracy=accuracy_score(X_train_pred,Y_train)
training_data_accuracy

0.7850162866449512

In [84]:
#Finding Accuracy on test data using Logistic Regression
X_test_pred=model.predict(X_test)
test_data_accuracy=accuracy_score(X_test_pred,Y_test)
test_data_accuracy

0.7532467532467533

Support Vector Machine

In [85]:
#Finding Accuracy on training data using SVM
X_train_pred=classifier.predict(X_train)
training_data_accuracy=accuracy_score(X_train_pred,Y_train)
training_data_accuracy

0.7833876221498371

In [86]:
#Finding Accuracy on test data using SVM
X_test_pred=classifier.predict(X_test)
test_data_accuracy=accuracy_score(X_test_pred,Y_test)
test_data_accuracy

0.7727272727272727

# Making a Predictive System

In [87]:
#6,103,72,32,190,37.7,0.324,55    ---     0
#8,196,76,29,280,37.5,0.605,57    ---     1
input_data = (6,103,72,32,190,37.7,0.324,55)
#changing list to numpy array
input_data_np_arr=np.asarray(input_data)

# reshape the np array as we are predicting for one instance
input_data_reshaped = input_data_np_arr.reshape(1,-1)

prediction = classifier.predict(input_data_reshaped_std)

if(prediction[0]==1):
  print('Person has diabetes.')
else:
  print('Person does not have diabetes.')

Person does not have diabetes.


## Saving Trained Model

In [88]:
import pickle

In [89]:
file_name = 'trained_model.sav'
pickle.dump(classifier, open(file_name, 'wb')) #writing in binary mode

## Loading the saved model

In [90]:
loaded_model=pickle.load(open('trained_model.sav','rb')) #reading in binary mode

In [91]:
#6,103,72,32,190,37.7,0.324,55    ---     0
#8,196,76,29,280,37.5,0.605,57    ---     1
input_data = (6,103,72,32,190,37.7,0.324,55)
#changing list to numpy array
input_data_np_arr=np.asarray(input_data)

# reshape the np array as we are predicting for one instance
input_data_reshaped = input_data_np_arr.reshape(1,-1)

prediction = loaded_model.predict(input_data_reshaped_std)
if(prediction[0]==1):
  print('Person has diabetes.')
else:
  print('Person does not have diabetes.')

Person does not have diabetes.
