#### Importing Libraries

In [116]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pickle

#### Reading data

In [117]:
data=pd.read_csv("./Datasets/diabetes_data.csv")

In [118]:
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [119]:
data.tail()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.34,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1
767,1,93,70,31,0,30.4,0.315,23,0


#### Data Analysis

In [120]:
data.shape

(768, 9)

In [121]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [122]:
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [123]:
data["Outcome"].value_counts()

Outcome
0    500
1    268
Name: count, dtype: int64

##### 0 ----> Does not have Diabetes
##### 1 ----> Has Diabetes

In [124]:
# checking for missing values
data.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

#### Splitting independent and dependent variables

In [125]:
X=data.drop('Outcome',axis=1)
Y=data["Outcome"]

In [126]:
X.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [127]:
Y.head()

0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64

#### Splitting training and testing data

In [128]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=7)
print(X_train.shape,X_test.shape,Y_train.shape,Y_test.shape)

(614, 8) (154, 8) (614,) (154,)


#### Model training

In [129]:
diabetes_p_model=RandomForestClassifier(n_estimators=200)
diabetes_p_model.fit(X_train,Y_train)

#### Model Evaluation

In [130]:
Train_data_prediction=diabetes_p_model.predict(X_train)
Train_data_accuracy=accuracy_score(Y_train,Train_data_prediction)
print("Train data accuracy:",Train_data_accuracy)
Test_data_prediction=diabetes_p_model.predict(X_test)
Test_data_accuracy=accuracy_score(Y_test,Test_data_prediction)
print("Test data accuracy:",Test_data_accuracy)

Train data accuracy: 1.0
Test data accuracy: 0.7922077922077922


#### Predictive System

In [131]:
input=(2,120,70,22,105,27.8,0.3,45)
# convert to array
input_array=np.asarray(input)
# reshaping the array
reshaped_input=input_array.reshape(1,-1)
# predicting the output
prediction=diabetes_p_model.predict(reshaped_input)
print(prediction)
if (prediction[0]==0):
    print("The person does not have Diabetes.")
else:
    print("The person has Diabetes.")

[0]
The person does not have Diabetes.




#### Saving the model

In [132]:
pickle.dump(diabetes_p_model,open('Diabetes_Prediction.sav','wb'))

In [133]:
# loading the model
pickle.load(open('Diabetes_Prediction.sav','rb'))

In [134]:
for column in X_train.columns:
    print(column)

Pregnancies
Glucose
BloodPressure
SkinThickness
Insulin
BMI
DiabetesPedigreeFunction
Age
