In [43]:
"""
Diabetes is a disease which occurs when 
the blood glucose level becomes 
high, which ultimately leads to other 
health problems such as heart diseases, kidney disease etc. 

Diabetes is caused mainly due to the 
consumption of highly processed 
food, bad consumption habits etc. 

According to WHO, the number of 
people with diabetes has been increased 
over the years.
"""

'\nDiabetes is a disease which occurs when \nthe blood glucose level becomes \nhigh, which ultimately leads to other \nhealth problems such as heart diseases, kidney disease etc. \n\nDiabetes is caused mainly due to the \nconsumption of highly processed \nfood, bad consumption habits etc. \n\nAccording to WHO, the number of \npeople with diabetes has been increased \nover the years.\n'

In [44]:
import pandas as pd
diabetes = pd.read_csv("C:/Users/ganapathi raju/Desktop/Machine Learning/diabetes.csv")
diabetes

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


In [45]:
print(diabetes.columns)

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')


In [46]:
print(diabetes.head())

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [47]:
print("Diabetes data set dimensions : {}".format(diabetes.shape))

Diabetes data set dimensions : (768, 9)


In [48]:
""" 
‘Outcome’ is the column which we are 
going to predict , patient is 
diabetic or not. 
1 means the person is diabetic and 0 means person is not. 
"""
print(diabetes.groupby('Outcome').size())

Outcome
0    500
1    268
dtype: int64


In [49]:
"""
factors to consider in the data cleaning process.
1. Duplicate or irrelevant observations.
2. Bad labeling of data, same category occurring multiple times.
3. Missing or null data points.
4. Unexpected outliers.
"""

'\nfactors to consider in the data cleaning process.\n1. Duplicate or irrelevant observations.\n2. Bad labeling of data, same category occurring multiple times.\n3. Missing or null data points.\n4. Unexpected outliers.\n'

In [50]:
print(diabetes.isnull().sum())

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [51]:
""" 
Blood pressure : By observing the data, there are 0 values for 
blood pressure. 

And it is evident that the readings of the data set seems wrong 
because a living person cannot have diastolic blood pressure of zero. 
"""
print("Total : ", diabetes[diabetes.BloodPressure == 0].shape[0])

Total :  35


In [53]:
print(diabetes)   

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
5              5      116             74              0        0  25.6   
6              3       78             50             32       88  31.0   
7             10      115              0              0        0  35.3   
8              2      197             70             45      543  30.5   
9              8      125             96              0        0   0.0   
10             4      110             92              0        0  37.6   
11            10      168             74              0        0  38.0   
12            10      139             

In [55]:
# Plasma glucose levels : Even after fasting glucose level would not be as low as zero.

print("Total : ", diabetes[diabetes.Glucose == 0].shape[0])

Total :  5


In [56]:
print(diabetes[diabetes.Glucose == 0].groupby('Outcome')['Age'].count())

Outcome
0    3
1    2
Name: Age, dtype: int64


In [57]:
# Skin Fold Thickness : For normal people skin fold thickness can’t be less than 10 mm better yet zero.

print("Total : ", diabetes[diabetes.SkinThickness == 0].shape[0])

Total :  227


In [58]:
print(diabetes[diabetes.SkinThickness == 0].groupby('Outcome')['Age'].count())

Outcome
0    139
1     88
Name: Age, dtype: int64


In [59]:
#BMI : Should not be 0 or close to zero unless the person is really underweight which could be life threatening.

print("Total : ", diabetes[diabetes.BMI == 0].shape[0])

Total :  11


In [60]:
print(diabetes[diabetes.BMI == 0].groupby('Outcome')['Age'].count())

Outcome
0    9
1    2
Name: Age, dtype: int64


In [61]:
#Insulin : In a rare situation a person can have zero insulin
print("Total : ", diabetes[diabetes.Insulin == 0].shape[0])

Total :  374


In [62]:
print(diabetes[diabetes.Insulin == 0].groupby('Outcome')['Age'].count())

Outcome
0    236
1    138
Name: Age, dtype: int64


In [None]:
""" handle invalid data values :
Ignore/remove these cases : This is not actually possible in most 
cases because that  would mean losing valuable information.

And in this case “skin thickness” and “insulin” columns means 
have a lot of invalid points. 
But it might work for “BMI”, “glucose ”and “blood pressure” 
data points.

Put average/mean values : This might work for some data sets, 
but in our case putting a mean value to the blood pressure column would send a wrong signal to the model.
  
Avoid using features : It is possible to not use the features with a lot of 
invalid values for the model. This may work for “skin thickness” but its hard to predict that.
"""

In [63]:
#remove the rows which the “BloodPressure”, “BMI” and “Glucose” #are zero.
diabetes_mod = diabetes[(diabetes.BloodPressure != 0) & (diabetes.BMI != 0) & (diabetes.Glucose != 0)]
print(diabetes_mod.shape)

(724, 9)


In [None]:
""" Feature Engineering
Feature engineering is the process of transforming the gathered data 
into features that better 
represent the problem that we are trying to solve to the model, 
to improve its performance and 
accuracy.
"""

In [64]:
feature_names = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI',                  'DiabetesPedigreeFunction', 'Age']
X = diabetes_mod[feature_names]
y = diabetes_mod.Outcome

In [65]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [66]:
models = []
models.append(('KNN', KNeighborsClassifier()))
models.append(('SVC', SVC()))
models.append(('LR', LogisticRegression()))
models.append(('DT', DecisionTreeClassifier()))
models.append(('GNB', GaussianNB()))
models.append(('RF', RandomForestClassifier()))
models.append(('GB', GradientBoostingClassifier()))

In [67]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = diabetes_mod.Outcome, random_state=0)

In [68]:
names = []
scores = []
for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    scores.append(accuracy_score(y_test, y_pred))
    names.append(name)
tr_split = pd.DataFrame({'Name': names, 'Score': scores})
print(tr_split)



  Name     Score
0  KNN  0.729282
1  SVC  0.657459
2   LR  0.767956
3   DT  0.723757
4  GNB  0.734807
5   RF  0.767956
6   GB  0.773481


