In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline


In [2]:
df = pd.read_csv("diabetes.csv")

In [3]:
df.sample(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
213,0,140,65,26,130,42.6,0.431,24,1
587,6,103,66,0,0,24.3,0.249,29,0
634,10,92,62,0,0,25.9,0.167,31,0
670,6,165,68,26,168,33.6,0.631,49,0
115,4,146,92,0,0,31.2,0.539,61,1


In [4]:
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


# training without binnig:

In [6]:
# x = df.iloc[:, 0:-1]
x = df.drop(columns="Outcome")

y= df.iloc[:, -1]


In [7]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=None)

In [8]:
dt = DecisionTreeClassifier()

In [9]:
dt.fit(x_train, y_train)

In [10]:
y_pred = dt.predict(x_test)

In [11]:
print(f"Accuracy score is: {accuracy_score(y_pred, y_test)*100}")

Accuracy score is: 77.92207792207793


In [12]:
print(f"Mean accuracy score: {np.mean(cross_val_score(dt, x,y, cv=10))*100}")

Mean accuracy score: 71.47983595352015


# now binning and training:

In [13]:
trf1 = ColumnTransformer([
    ("Glucose", KBinsDiscretizer(n_bins=10, encode="ordinal", strategy="quantile"), ["Glucose"]),
    ("BMI", KBinsDiscretizer(n_bins=10, encode="ordinal", strategy="quantile"), ["BMI"]),
    ("Age", KBinsDiscretizer(n_bins=10, encode="ordinal", strategy="quantile"), ["Age"]),
    ("Insulin", KBinsDiscretizer(n_bins=10, encode="ordinal", strategy="quantile"), ["Insulin"])
],
    remainder="passthrough"
)

In [14]:
trf2 = DecisionTreeClassifier()

In [15]:
pipe = Pipeline([
    ("transformer", trf1),
    ("classifier", trf2)
])

In [16]:
pipe.fit(x_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [17]:
y_pred = pipe.predict(x_test)

In [18]:
print(f"Accuracy score is: {accuracy_score(y_pred, y_test)*100}")

Accuracy score is: 66.66666666666666


# mean accuracy score:

In [22]:
print(f"Mean accuracy score: {np.mean(cross_val_score(dt, x,y, cv=10))*100}")

Mean accuracy score: 71.22351332877648
