### Import Packages



In [36]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statistics
import plotly.express as px
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import sys, os

### Load Dataset



In [2]:
#Original scientific paper concluded that 'Age', ‘Family_diabetes’, ‘Physically active’, ‘Regular Medicine’ and ‘Pdiabetes’ were the most important factors and prove useful to predicting whether or not someone has diabetes. They used a ratio of 75:25.

In [37]:
diabetes = pd.read_csv("./Data/diabetes_dataset__2019.csv")

In [4]:
diabetes.head()

Unnamed: 0,Age,Gender,Family_Diabetes,highBP,PhysicallyActive,BMI,Smoking,Alcohol,Sleep,SoundSleep,RegularMedicine,JunkFood,Stress,BPLevel,Pregancies,Pdiabetes,UriationFreq,Diabetic
0,50-59,Male,no,yes,one hr or more,39.0,no,no,8.0,6.0,no,occasionally,sometimes,high,0.0,0,not much,no
1,50-59,Male,no,yes,less than half an hr,28.0,no,no,8.0,6.0,yes,very often,sometimes,normal,0.0,0,not much,no
2,40-49,Male,no,no,one hr or more,24.0,no,no,6.0,6.0,no,occasionally,sometimes,normal,0.0,0,not much,no
3,50-59,Male,no,no,one hr or more,23.0,no,no,8.0,6.0,no,occasionally,sometimes,normal,0.0,0,not much,no
4,40-49,Male,no,no,less than half an hr,27.0,no,no,8.0,8.0,no,occasionally,sometimes,normal,0.0,0,not much,no


In [5]:
diabetes.tail()

Unnamed: 0,Age,Gender,Family_Diabetes,highBP,PhysicallyActive,BMI,Smoking,Alcohol,Sleep,SoundSleep,RegularMedicine,JunkFood,Stress,BPLevel,Pregancies,Pdiabetes,UriationFreq,Diabetic
948,60 or older,Male,yes,yes,more than half an hr,27.0,no,no,6.0,5.0,yes,occasionally,sometimes,high,0.0,0.0,quite often,yes
949,60 or older,Male,no,yes,none,23.0,no,no,6.0,5.0,yes,occasionally,sometimes,high,0.0,0.0,not much,no
950,60 or older,Male,no,yes,less than half an hr,27.0,no,yes,6.0,5.0,yes,occasionally,very often,high,0.0,0.0,not much,no
951,60 or older,Female,yes,yes,one hr or more,30.0,no,no,7.0,4.0,yes,occasionally,sometimes,high,2.0,0.0,quite often,yes
952,$$$$$$,,,,,,,,,,,,,,,,,


In [6]:
print(diabetes.columns)
print("")
print(diabetes.shape)

Index(['Age', 'Gender', 'Family_Diabetes', 'highBP', 'PhysicallyActive', 'BMI',
       'Smoking', 'Alcohol', 'Sleep', 'SoundSleep', 'RegularMedicine',
       'JunkFood', 'Stress', 'BPLevel', 'Pregancies', 'Pdiabetes',
       'UriationFreq', 'Diabetic'],
      dtype='object')

(953, 18)


In [7]:
diabetes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 953 entries, 0 to 952
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Age               953 non-null    object 
 1   Gender            952 non-null    object 
 2   Family_Diabetes   952 non-null    object 
 3   highBP            952 non-null    object 
 4   PhysicallyActive  952 non-null    object 
 5   BMI               948 non-null    float64
 6   Smoking           952 non-null    object 
 7   Alcohol           952 non-null    object 
 8   Sleep             952 non-null    float64
 9   SoundSleep        952 non-null    float64
 10  RegularMedicine   952 non-null    object 
 11  JunkFood          952 non-null    object 
 12  Stress            952 non-null    object 
 13  BPLevel           952 non-null    object 
 14  Pregancies        910 non-null    float64
 15  Pdiabetes         951 non-null    object 
 16  UriationFreq      952 non-null    object 
 1

### **Data Cleaning**



. 


In [38]:
#cleaning the data -- dropping missing and duplicate values for sanity
diabetes.dropna(inplace = True)
diabetes = diabetes.reset_index(drop=True)

length = len(diabetes.index) #save length of array of later
length

905

In [39]:
#converting categorical variables into dummy variables
col_category = ['Gender','Family_Diabetes','highBP','PhysicallyActive','Smoking','Alcohol','RegularMedicine','JunkFood','Stress','BPLevel','Pdiabetes','UriationFreq']
for col in col_category:
    diabetes = pd.concat([diabetes.drop(col, axis=1), pd.get_dummies(diabetes[col], prefix=col, prefix_sep='_')], axis=1)

In [40]:
#coverting target variable (yes/no) into 0s and 1s
diabetes['Diabetic'] = diabetes['Diabetic'].replace('no', 0)
diabetes['Diabetic'] = diabetes['Diabetic'].replace(' no', 0)
diabetes['Diabetic'] = diabetes['Diabetic'].replace('yes', 1)
diabetes['Diabetic']

0      0
1      0
2      0
3      0
4      0
      ..
900    1
901    1
902    0
903    0
904    1
Name: Diabetic, Length: 905, dtype: int64

### Exploratory Data Analysis



In [41]:
#the relationship between pregnancies and glucose levels
import plotly.express as px
fig = px.violin(diabetes, x="Gender_Female", y="Age",color="Gender_Female",
            title=" Age, Gender and Body Mass Index of Diabetic Patitents")

fig.show()

In [0]:
#possibly a pie chart

In [43]:
#BMI, age, diabetic
df = diabetes
fig = px.bar(df, x="Diabetic", y="BMI", barmode="group", color = "Age", title="BMI in Diabetics and Non-Diabetics relating to age group")
fig.show()

In [44]:
# Stress always and Quality Sleep
df = diabetes
fig = px.violin(df, x="Stress_always", y="SoundSleep", color= "Diabetic", title="Quality sleep in relation to stress")
fig.show()

In [45]:
# Stress always and Quality Sleep
df = diabetes
fig = px.violin(df, x="Stress_not at all", y="SoundSleep", color= "Diabetic", title="Quality sleep in relation to stress")
fig.show()

In [47]:
# Blood Pressure and Alcohol
df = diabetes
fig = px.bar(df, x="Alcohol_yes", y="BPLevel_high", color="BPLevel_high") 
fig.update_layout(
    title="Alcohol and blood pressure",)

fig.show()

In [48]:
# Gender and BMI
df= diabetes
fig = px.bar(df, x="Gender_Female", y="BMI", color="BMI")

fig.update_layout(
    title="Gender and Body Mass Index",)

fig.show()

In [49]:
# BMI and Gender
df= diabetes
fig = px.violin(df, x="Gender_Male", y="BMI", color="Gender_Male")

fig.update_layout(
    title="BMI of Males",)

fig.show()

In [50]:
# sleep and Age
df= diabetes
fig = px.scatter(df, x="Sleep", y="Age", color="Age")

fig.update_layout(
    title="Sleep concived through different Age categories",)

fig.show()

In [51]:
df = diabetes
fig = px.bar(df, x="Diabetic", y="BMI", barmode='group', color = "Diabetic", height=400, title='BMI of diabetics compared to non-diabetics')
fig.show()

In [52]:
#BMI, age, diabetic
fig = px.bar(df, x="Stress_not at all", y="Age", color = "Age", 
             title="")
fig.show()

In [53]:
#BMI, age, diabetic
fig = px.bar(df, x="Stress_always", y="Gender_Female", barmode="group", color = "Age", title="BMI in Diabetics and Non-Diabetics relating to age group")
fig.show()

In [55]:

fig = px.bar(df, x="Diabetic", y="Stress_always", barmode="group", color = "Age", title="BMI in Diabetics and Non-Diabetics relating to age group")
fig.show()

In [56]:
#BMI, age, diabetic
fig = px.bar(df, x="JunkFood_very often", y="Diabetic", barmode="group", color = "Diabetic",
             title="Determining if Junkfood affects Diabetic People")
fig.show()

In [57]:
#Age and Physical Activity
fig = px.violin(df, x= "PhysicallyActive_one hr or more", y="Age", color="Diabetic", title = 'Physical Activity Relating to Age in Diabetic and Non Diabetic People')
fig.show()

In [58]:
#Age and Physical Activity
fig = px.violin(df, x= "PhysicallyActive_none", y="Age", color="Diabetic")
fig.show()

In [59]:
#Age and Physical Activity
fig = px.violin(df, x= "PhysicallyActive_less than half an hr", y="Age", color="Diabetic")
fig.show()

In [60]:
#BMI, age, diabetic
fig = px.bar(df, x="Stress_always", y="Smoking_yes", barmode="group", color = "Stress_always", title="Does Smoking affect Stress levels?")
fig.show()

In [61]:
#Family_diabetes Pdiabetes 
fig = px.bar(df, x= "Family_Diabetes_yes", y="Pdiabetes_yes", barmode = 'group', color="Diabetic")
fig.show()

In [62]:
#Family_diabetes Pdiabetes 
fig = px.violin(df, x= "RegularMedicine_yes", y="Age", color="Diabetic")
fig.show()

In [63]:
df = diabetes
fig = px.bar(df, x="Stress_very often", y="SoundSleep", barmode="group", color = "Age", title=" Sleep Concived Affecting Stress")
fig.show()

### Model fitting



In [0]:
X = diabetes["BMI"].to_numpy().reshape(-1, 1)
y = diabetes["Pdiabetes_yes"].to_numpy().reshape(-1, 1)

In [11]:
diabetes

Unnamed: 0,Age,Gender,Family_Diabetes,highBP,PhysicallyActive,BMI,Smoking,Alcohol,Sleep,SoundSleep,RegularMedicine,JunkFood,Stress,BPLevel,Pregancies,Pdiabetes,UriationFreq,Diabetic
0,50-59,Male,no,yes,one hr or more,39.0,no,no,8.0,6.0,no,occasionally,sometimes,high,0.0,0,not much,no
1,50-59,Male,no,yes,less than half an hr,28.0,no,no,8.0,6.0,yes,very often,sometimes,normal,0.0,0,not much,no
2,40-49,Male,no,no,one hr or more,24.0,no,no,6.0,6.0,no,occasionally,sometimes,normal,0.0,0,not much,no
3,50-59,Male,no,no,one hr or more,23.0,no,no,8.0,6.0,no,occasionally,sometimes,normal,0.0,0,not much,no
4,40-49,Male,no,no,less than half an hr,27.0,no,no,8.0,8.0,no,occasionally,sometimes,normal,0.0,0,not much,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
948,60 or older,Male,yes,yes,more than half an hr,27.0,no,no,6.0,5.0,yes,occasionally,sometimes,high,0.0,0,quite often,yes
949,60 or older,Male,no,yes,none,23.0,no,no,6.0,5.0,yes,occasionally,sometimes,high,0.0,0,not much,no
950,60 or older,Male,no,yes,less than half an hr,27.0,no,yes,6.0,5.0,yes,occasionally,very often,high,0.0,0,not much,no
951,60 or older,Female,yes,yes,one hr or more,30.0,no,no,7.0,4.0,yes,occasionally,sometimes,high,2.0,0,quite often,yes


In [21]:
#split data into training and testing
train, test = train_test_split(diabetes, test_size = 0.25, shuffle = True)

In [0]:
train.columns[0]

In [24]:
xcols = train.columns

In [25]:
x_train = train[xcols]
x_test = test[xcols]

In [26]:
y_train = train['Diabetic']
y_test = test['Diabetic']

In [69]:
#SVMS
from sklearn import svm
from sklearn.model_selection import train_test_split, GridSearchCV
clf = svm.SVC()
clf.fit(x_train,y_train)
clf.score(x_test, y_test.ravel())
print("Accuracy: {}%".format(clf.score(x_test, y_test) * 100 ))

Accuracy: 92.07048458149781%


In [68]:
#PARAMETERS TO SCALE VARIABLES FROM 0-1
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# kernel: ‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’
test = make_pipeline(StandardScaler(), SVC(kernel = 'poly', gamma='auto'))
test.fit(x_train, y_train)
test.score(x_test, y_test.ravel())

0.9779735682819384

In [66]:
#RANDOM FORESTS
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(x_train, y_train)
y_pred = rfc.predict(x_test)
rfc.score(x_test, y_test.ravel())
print("Accuracy: {}%".format(rfc.score(x_test, y_test) * 100 ))

Accuracy: 100.0%


In [31]:
y_test.value_counts()

0    166
1     61
Name: Diabetic, dtype: int64

In [32]:
#PARAMETERS


In [67]:
#LOGISTIC REGRESSION
from sklearn.linear_model import LogisticRegression
reg = LogisticRegression()
reg.fit(x_train, y_train)
y_pred = reg.predict(x_test)
reg.score(x_test, y_test.ravel())
print("Accuracy: {}%".format(reg.score(x_test, y_test) * 100 ))

Accuracy: 100.0%



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression

