In [22]:
import numpy as np
import pandas as pd

In [23]:
# Read the diabetes CSV from the working directory, then preview the first five rows.
diabetes = pd.read_csv(filepath_or_buffer='diabetes.csv')
diabetes.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [24]:
# (rows, columns) of the loaded frame.
diabetes.shape

(768, 9)

In [25]:
# Summary statistics, flipped so each feature is a row and each statistic a column.
diabetes.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Pregnancies,768.0,3.845052,3.369578,0.0,1.0,3.0,6.0,17.0
Glucose,768.0,120.894531,31.972618,0.0,99.0,117.0,140.25,199.0
BloodPressure,768.0,69.105469,19.355807,0.0,62.0,72.0,80.0,122.0
SkinThickness,768.0,20.536458,15.952218,0.0,0.0,23.0,32.0,99.0
Insulin,768.0,79.799479,115.244002,0.0,0.0,30.5,127.25,846.0
BMI,768.0,31.992578,7.88416,0.0,27.3,32.0,36.6,67.1
DiabetesPedigreeFunction,768.0,0.471876,0.331329,0.078,0.24375,0.3725,0.62625,2.42
Age,768.0,33.240885,11.760232,21.0,24.0,29.0,41.0,81.0
Outcome,768.0,0.348958,0.476951,0.0,0.0,0.0,1.0,1.0


In [26]:
# This notebook treats a 0 in these measurement columns as a missing reading,
# so it is swapped for NaN and the imputers below can fill it in.
#
# The replacement is assigned back to the frame instead of calling
# `diabetes[col].replace(..., inplace=True)`: that form mutates a temporary
# Series, raises a FutureWarning on pandas 2.x and leaves `diabetes` untouched
# once Copy-on-Write is active. Assigning back is also safe to re-run.
ZERO_AS_MISSING_COLS = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
diabetes[ZERO_AS_MISSING_COLS] = diabetes[ZERO_AS_MISSING_COLS].replace(0, np.nan)

In [27]:
# Count the NaNs per column now that zeros have been masked.
diabetes.isna().sum()

Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [28]:
# scikit-learn transformers expect 2-D input: turn the single column into an (n, 1) array.
arr = np.reshape(diabetes['SkinThickness'].values, (-1, 1))
arr.shape

(768, 1)

In [29]:
from sklearn.impute import SimpleImputer # provides basic strategies for imputing missing values

In [30]:
# Fill the SkinThickness gaps with the most frequent observed value of that column.
imp = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
diabetes['SkinThickness'] = imp.fit_transform(diabetes[['SkinThickness']].values)

In [31]:
# Re-check the per-column NaN counts after imputing SkinThickness.
diabetes.isna().sum()

Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness                 0
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [32]:
# Fill the Glucose gaps with the column median, then re-check the NaN counts.
imp = SimpleImputer(strategy='median', missing_values=np.nan)
diabetes['Glucose'] = imp.fit_transform(diabetes[['Glucose']].values)
diabetes.isna().sum()

Pregnancies                   0
Glucose                       0
BloodPressure                35
SkinThickness                 0
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [33]:
# Fill the BloodPressure gaps with the column mean, then re-check the NaN counts.
imp = SimpleImputer(strategy='mean', missing_values=np.nan)
diabetes['BloodPressure'] = imp.fit_transform(diabetes[['BloodPressure']].values)
diabetes.isna().sum()

Pregnancies                   0
Glucose                       0
BloodPressure                 0
SkinThickness                 0
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [35]:
# Fill the BMI gaps with a fixed value, then re-check the NaN counts.
# 32 equals the BMI 50% figure that describe() reported earlier (before zeros were masked).
imp = SimpleImputer(strategy='constant', fill_value=32, missing_values=np.nan)
diabetes['BMI'] = imp.fit_transform(diabetes[['BMI']].values)
diabetes.isna().sum()

Pregnancies                   0
Glucose                       0
BloodPressure                 0
SkinThickness                 0
Insulin                     374
BMI                           0
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [37]:
#multivariate feature imputation
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
# IterativeImputer models each feature that has missing values as a function of the other features, in an iterative round-robin fashion: it fits a regressor to predict the missing values.

In [44]:
# Separate the target column from the predictors and display the predictors.
diabetes_label = diabetes.loc[:, 'Outcome']
diabetes_features = diabetes.drop('Outcome', axis=1)
diabetes_features

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148.0,72.0,35.0,,33.6,0.627,50
1,1,85.0,66.0,29.0,,26.6,0.351,31
2,8,183.0,64.0,32.0,,23.3,0.672,32
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101.0,76.0,48.0,180.0,32.9,0.171,63
764,2,122.0,70.0,27.0,,36.8,0.340,27
765,5,121.0,72.0,23.0,112.0,26.2,0.245,30
766,1,126.0,60.0,32.0,,30.1,0.349,47


In [45]:
# Multivariate imputer with a fixed seed; the high iteration cap gives it room to converge.
imp = IterativeImputer(random_state=0, max_iter=10000)

In [46]:
# Fit the iterative imputer on the predictor columns only (Outcome was dropped above).
imp.fit(diabetes_features)

IterativeImputer(max_iter=10000, random_state=0)

In [48]:
# Apply the fitted imputer; the result is a plain array, so column labels are restored in the next step.
diabetes_features_arr = imp.transform(X=diabetes_features)

In [51]:
# Wrap the imputed array back into a DataFrame.
# Both the column labels and the row index are carried over from the frame that
# was imputed: the later pd.concat(..., axis=1) with diabetes_label aligns on the
# index, so a fresh 0..n-1 index would misalign rows whenever the source frame
# does not use the default RangeIndex.
diabetes_features = pd.DataFrame(
    diabetes_features_arr,
    index=diabetes_features.index,
    columns=diabetes_features.columns,
)
diabetes_features.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6.0,148.0,72.0,35.0,219.028414,33.6,0.627,50.0
1,1.0,85.0,66.0,29.0,70.34155,26.6,0.351,31.0
2,8.0,183.0,64.0,32.0,270.573172,23.3,0.672,32.0
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0
4,0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0


In [52]:
# Put the imputed predictors and the Outcome column side by side again.
diabetes = pd.concat(objs=[diabetes_features, diabetes_label], axis='columns')
diabetes.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.0,148.0,72.0,35.0,219.028414,33.6,0.627,50.0,1
1,1.0,85.0,66.0,29.0,70.34155,26.6,0.351,31.0,0
2,8.0,183.0,64.0,32.0,270.573172,23.3,0.672,32.0,1
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0
4,0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0,1


In [54]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [56]:
# Hold out 20% of the rows for evaluation.
# random_state is pinned so the split, and therefore the scores reported below,
# is the same on every Restart & Run All.
# NOTE(review): the imputers earlier in this notebook were fit on the full
# dataset before this split, so held-out rows influenced the imputed values;
# consider splitting first and fitting the imputers on the training rows only.
x_train, x_test, y_train, y_test = train_test_split(
    diabetes_features, diabetes_label, test_size=0.2, random_state=0
)

In [57]:
# Shallow tree (depth <= 4) to limit overfitting.
# random_state is fixed because the tree permutes candidate features at each
# split, so an unseeded fit can differ between runs on identical data.
clf = DecisionTreeClassifier(max_depth=4, random_state=0)

In [58]:
# Train the tree on the training split only.
clf.fit(x_train, y_train)

DecisionTreeClassifier(max_depth=4)

In [60]:
# Accuracy on the rows the model was trained on; compare with the held-out accuracy below.
clf.score(x_train, y_train)

0.8143322475570033

In [61]:
# Predicted class labels for the held-out rows.
y_pred = clf.predict(X=x_test)

In [62]:
from sklearn.metrics import accuracy_score

# Held-out accuracy. scikit-learn metrics take (y_true, y_pred) in that order;
# accuracy happens to be symmetric, but the documented order is kept so the
# call stays correct if it is swapped for an asymmetric metric.
accuracy_score(y_test, y_pred)

0.7467532467532467

In [63]:
def _ask_number(prompt):
    """Keep prompting until the reply parses as a float, then return it."""
    while True:
        reply = input(prompt)
        try:
            return float(reply)
        except ValueError:
            # Re-ask instead of letting a bad value fail later inside predict().
            print(f'{reply!r} is not a number, please try again.')


# Collect the eight model features, in the same order as the training columns.
# Values are converted to float here so the model receives numbers, not the
# raw strings returned by input().
pregnancy = _ask_number('Enter no. of pregnancies you have had: ')
glucose = _ask_number('Amount of glucose: ')
BloodPressure = _ask_number('Enter your blood pressure: ')
skinthickness = _ask_number('Enter your skin thickness: ')
insulin = _ask_number('Enter your blood insulin: ')
bmi = _ask_number('Enter your body mass index: ')
diabetespedigreefunction = _ask_number('Enter your diabetes pedigree function: ')
age = _ask_number('Enter your age: ')


Enter no. of pregnancy you have had:1
Amount of glucose: 89
Enter your blood : 80
Enter your skin thickness: 32
Enter your blood insulin: 95
Enter your body mass index35
Enter your diabetes pedigree function:0.627
Enter your blood age: 22


In [82]:
# Build a one-row frame that carries the training column names, so the model
# sees the same labelled, numeric layout it was fitted on. Passing a bare list
# of strings relied on implicit string-to-number conversion and triggers
# scikit-learn's "X does not have valid feature names" warning.
# The list order matches the column order of x_train.
patient = pd.DataFrame(
    [[pregnancy, glucose, BloodPressure, skinthickness, insulin, bmi, diabetespedigreefunction, age]],
    columns=x_train.columns,
).astype(float)
result = clf.predict(patient)
print(result)
# predict() returns an array; test its single element rather than the array itself.
if result[0] == 0:
    print('You do not have diabetes')
else:
    print('You have diabetes')



[0]
You donot have diabetes
