In [78]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score

import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
pio.default_templates = 'plotly_dark'
from plotly.subplots import make_subplots

In [5]:
train = pd.read_csv('dataset_37_diabetes.csv')

In [28]:
# Information about the columns and diabetes in general to understand about diabetes and the tests done to check for type1 and type2 diabetes
# Type 1 diabetes is often diagnosed more rapidly because it usually develops rapidly and involves high blood sugar levels and symptoms
# Type 2 diabetes develops often in over the course of years. Most common diabetes and usually develops during adulthood
# Diabetes can cause pregnancy complications and is usually recommended to test withtin 24~28 weeks of pregnancy.


# preg = Number of times pregnant
# plas = Plasma glucose concentration in a 2 hours in an oral glucose test
    # This measures how much the body is able to absorb the gluose(sugar). This is usually done to test diabetes during pregnency
# pres = Diastolic blood pressure(mm Hg) - this cannot be 0 because no one has 0 blood pressure when they are alive. 
# skin = Triceps skin folding thickness(mm)
# insu = 2 Hour serum insulin(mu U/ml) // Test is used to find out the levels of insulin in blood, Basically whether the patient is making enough insulin or not 
# mass = body mass index(weight in kgs) - BMI, it is considered overweight when BMI is greater than 25
# pedi = Diabetes pedigree function
# age = Age of patient
# class = Target(whether patient has diabetes or not)

In [7]:
# Finding all the count of the zeros 
zeros = (train==0).astype(int).sum()
zeros

preg     111
plas       5
pres      35
skin     227
insu     374
mass      11
pedi       0
age        0
class      0
dtype: int64

In [8]:
# Replacing the 3 main variables i think that cannot have zeros as a value
cols = ['mass', 'pres', 'plas']
train[cols] = train[cols].replace({0: np.nan})

In [10]:
# Checking to see if how many of Nans there are
train.isna().sum()

preg      0
plas      5
pres     35
skin      0
insu      0
mass     11
pedi      0
age       0
class     0
dtype: int64

In [11]:
# Filling in the Nans with the respective means of age and mass
train['mass'].fillna(train.groupby('age')['mass'].transform('mean'), inplace=True)
train['plas'].fillna(train.groupby('mass')['plas'].transform('mean'), inplace=True)
train['pres'].fillna(train.groupby('mass')['pres'].transform('mean'), inplace=True)
train['pres'].fillna(train.groupby('age')['pres'].transform('mean'), inplace=True)
train['plas'].fillna(train.groupby('age')['plas'].transform('mean'), inplace=True)

In [12]:
train.isna().sum()

preg     0
plas     0
pres     0
skin     0
insu     0
mass     0
pedi     0
age      0
class    0
dtype: int64

In [13]:
# Changing the features into numerical values for training
class_rename = {'tested_positive': 1, 'tested_negative': 0}
train['class'] = train['class'].map(class_rename)

In [14]:
train

Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age,class
0,6,148.0,72.0,35,0,33.6,0.627,50,1
1,1,85.0,66.0,29,0,26.6,0.351,31,0
2,8,183.0,64.0,0,0,23.3,0.672,32,1
3,1,89.0,66.0,23,94,28.1,0.167,21,0
4,0,137.0,40.0,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101.0,76.0,48,180,32.9,0.171,63,0
764,2,122.0,70.0,27,0,36.8,0.340,27,0
765,5,121.0,72.0,23,112,26.2,0.245,30,0
766,1,126.0,60.0,0,0,30.1,0.349,47,1


In [15]:
# Giving the x and y training datasets accordingly
x_train = train.drop(['class'], axis=1)
y_train = train['class']

In [16]:
# Using RandomForest classifier for this machine learning algorithm
rf_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1)

In [17]:
# Splitting data into training and test data set for training
X_train, x_test, Y_train, y_test = train_test_split(x_train, y_train, test_size=0.2, random_state=3)

In [18]:
rf_clf_nocvs = rf_clf.fit(X_train, Y_train)

In [19]:
rf_clf_cvs = cross_val_score(rf_clf, X_train, Y_train, cv=10, scoring='accuracy').mean()

In [21]:
rf_clf_nocvs

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [42]:
 rf_clf_nocvs.feature_importances_

array([0.08193706, 0.28150925, 0.0836224 , 0.06642187, 0.07572218,
       0.15819981, 0.11996991, 0.13261753])

In [43]:
# Reorganizing the feature importance into a list of name of variable and its importance
X_columns = train.columns
final = list(zip(X_columns, rf_clf_nocvs.feature_importances_))

In [61]:
# Function for the key to sort the list by the second element in the list which is the feature importance values
def second(elem):
    return elem[1]

In [74]:
# Sorting the list
final.sort(key=second)
final

[('skin', 0.06642187041216731),
 ('insu', 0.07572217640899996),
 ('preg', 0.08193706194622317),
 ('pres', 0.08362239985313313),
 ('pedi', 0.11996990948509556),
 ('age', 0.1326175260266687),
 ('mass', 0.1581998085166616),
 ('plas', 0.28150924735105054)]

In [76]:
# Checking to see if it has been sorted properly
imp_df = pd.DataFrame(final, columns=['Variables', 'Importance'])
imp_df

Unnamed: 0,Variables,Importance
0,skin,0.066422
1,insu,0.075722
2,preg,0.081937
3,pres,0.083622
4,pedi,0.11997
5,age,0.132618
6,mass,0.1582
7,plas,0.281509


In [80]:
# And finally visualizing the feature importance into a horizontal bar graph using plotly.
fig = px.bar(imp_df, x='Importance', y='Variables', orientation='h')
fig.update_layout(title='Feature importance of Variables')
fig.show()

In [None]:
# From the above we can see that based on how I did my data preprocessing and using RF classifier, the 'plas', 'mass' and 'age' came out to be the 
# top 3 most important features. 