In [1]:
# Import the required libraries
import numpy as np
import pandas as pd 

In [2]:
# Load the diabetes data set from the CSV file next to the notebook.
df = pd.read_csv(filepath_or_buffer="diabetes.csv")

In [3]:
# Collect the names of every column that holds at least one NULL value.
# NOTE(review): only real NaN/None entries are detected here. The row previews
# further down show 0 in Insulin and SkinThickness; if zeros stand for missing
# measurements they are not caught by isnull() -- confirm.
columnwithnull = df.isnull().any().loc[lambda has_null: has_null].index.tolist()

In [4]:
# Display the list of column names that contain NULL values
# (an empty list means no column has any).
columnwithnull

[]

In [5]:
# Print each column name that contains NULL values, one per line
# (prints nothing when the list is empty).
for column_name in columnwithnull:
    print(column_name)

In [6]:
# Mean imputation: each column listed in columnwithnull has its missing
# entries replaced by that column's own mean.
for column in columnwithnull:
    column_mean = df[column].mean()
    df[column] = df[column].fillna(column_mean)

In [7]:
# Count the NULL values remaining in `df`, column by column.
df.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [8]:
# Preview the first five rows (index 0 to 4).
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [9]:
# 3) Feature Engineering
# Well-chosen derived variables can improve model performance,
# but every new variable has to be logically justified.
# For this data set, new categorical variables are derived from the BMI, Insulin and Glucose columns.

In [10]:
# Derive a categorical BMI class for every row:
#   BMI < 18.5    -> "Underweight"
#   18.5 - 24.9   -> "Normal"
#   24.9 - 29.9   -> "Overweight"
#   29.9 - 34.9   -> "Obesity 1"
#   34.9 - 39.9   -> "Obesity 2"
#   BMI > 39.9    -> "Obesity 3"
NewBMI = pd.Series(["Underweight", "Normal", "Overweight", "Obesity 1", "Obesity 2", "Obesity 3"], dtype = "category")
# Start from an all-missing categorical column that shares NewBMI's categories.
# The previous `df["NewBMI"] = NewBMI` was an index-aligned assignment that
# stamped the six labels onto the rows labelled 0-5 as placeholders; any of
# those rows matching no range below would silently keep a wrong class.
df["NewBMI"] = pd.Categorical([None] * len(df), categories=NewBMI.cat.categories)
df.loc[df["BMI"] < 18.5, "NewBMI"] = NewBMI[0]
# `>=` closes the gap at exactly 18.5, which previously matched neither
# "< 18.5" nor "> 18.5" and was therefore left unclassified.
df.loc[(df["BMI"] >= 18.5) & (df["BMI"] <= 24.9), "NewBMI"] = NewBMI[1]
df.loc[(df["BMI"] > 24.9) & (df["BMI"] <= 29.9), "NewBMI"] = NewBMI[2]
df.loc[(df["BMI"] > 29.9) & (df["BMI"] <= 34.9), "NewBMI"] = NewBMI[3]
df.loc[(df["BMI"] > 34.9) & (df["BMI"] <= 39.9), "NewBMI"] = NewBMI[4]
df.loc[df["BMI"] > 39.9, "NewBMI"] = NewBMI[5]

In [11]:
# Preview the first rows to check the new NewBMI column.
df.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,NewBMI
0,6,148,72,35,0,33.6,0.627,50,1,Obesity 1
1,1,85,66,29,0,26.6,0.351,31,0,Overweight
2,8,183,64,0,0,23.3,0.672,32,1,Normal
3,1,89,66,23,94,28.1,0.167,21,0,Overweight
4,0,137,40,35,168,43.1,2.288,33,1,Obesity 3


In [12]:
# A categorical variable creation process is performed according to the insulin value.
def set_insulin(row):
    """Classify the insulin reading of a single record.

    Parameters
    ----------
    row : mapping-like
        A record (for example one DataFrame row) with an "Insulin" entry.

    Returns
    -------
    str
        "Normal" when 16 <= Insulin <= 166 (both bounds inclusive),
        "Abnormal" otherwise.
    """
    insulin = row["Insulin"]
    in_normal_range = 16 <= insulin <= 166
    return "Normal" if in_normal_range else "Abnormal"

In [13]:
# Label every row with set_insulin and attach the result to the dataframe
# as the NewInsulinScore column.
df = df.assign(
    NewInsulinScore=lambda frame: frame.apply(set_insulin, axis=1)
)

df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,NewBMI,NewInsulinScore
0,6,148,72,35,0,33.6,0.627,50,1,Obesity 1,Abnormal
1,1,85,66,29,0,26.6,0.351,31,0,Overweight,Abnormal
2,8,183,64,0,0,23.3,0.672,32,1,Normal,Abnormal
3,1,89,66,23,94,28.1,0.167,21,0,Overweight,Normal
4,0,137,40,35,168,43.1,2.288,33,1,Obesity 3,Abnormal


In [14]:
# Bin the Glucose readings into categorical levels:
#   Glucose <= 70   -> "Low"
#   70 - 99         -> "Normal"
#   99 - 126        -> "Overweight"
#   Glucose > 126   -> "Secret"
# "High" stays in the category list although no range below assigns it:
# later cells select the resulting NewGlucose_* dummy columns by name, and
# that column set depends on the categories declared here.
NewGlucose = pd.Series(["Low", "Normal", "Overweight", "Secret", "High"], dtype = "category")
# Start from an all-missing categorical column that shares NewGlucose's
# categories. The previous `df["NewGlucose"] = NewGlucose` was an index-aligned
# assignment that stamped the five labels onto the rows labelled 0-4 as
# placeholders; a row there matching no range (e.g. a missing Glucose value)
# would silently keep a wrong level.
df["NewGlucose"] = pd.Categorical([None] * len(df), categories=NewGlucose.cat.categories)
df.loc[df["Glucose"] <= 70, "NewGlucose"] = NewGlucose[0]
df.loc[(df["Glucose"] > 70) & (df["Glucose"] <= 99), "NewGlucose"] = NewGlucose[1]
df.loc[(df["Glucose"] > 99) & (df["Glucose"] <= 126), "NewGlucose"] = NewGlucose[2]
df.loc[df["Glucose"] > 126, "NewGlucose"] = NewGlucose[3]

In [15]:
# Preview the first rows to check the new NewGlucose column.
df.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,NewBMI,NewInsulinScore,NewGlucose
0,6,148,72,35,0,33.6,0.627,50,1,Obesity 1,Abnormal,Secret
1,1,85,66,29,0,26.6,0.351,31,0,Overweight,Abnormal,Normal
2,8,183,64,0,0,23.3,0.672,32,1,Normal,Abnormal,Secret
3,1,89,66,23,94,28.1,0.167,21,0,Overweight,Normal,Normal
4,0,137,40,35,168,43.1,2.288,33,1,Obesity 3,Abnormal,Secret


In [16]:
# 4) One Hot Encoding
# Categorical variables in the data set must be converted into numerical values before modelling.
# Label Encoding and One Hot Encoding are the usual methods for this; the cells that follow use One Hot Encoding.

In [17]:
# One-hot encode the three engineered categorical features. drop_first=True
# drops the first level of each feature, which is meant to avoid the
# dummy-variable trap.
# NOTE(review): for NewGlucose the dropped level is "High", which none of the
# binning ranges assigns. The four remaining NewGlucose_* dummies therefore
# appear to always sum to 1 -- confirm whether the trap is really avoided
# for that feature.
df = pd.get_dummies(
    data=df,
    columns=["NewBMI", "NewInsulinScore", "NewGlucose"],
    drop_first=True,
)

In [18]:
# Preview the first rows to check the one-hot encoded columns.
df.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,NewBMI_Obesity 1,NewBMI_Obesity 2,NewBMI_Obesity 3,NewBMI_Overweight,NewBMI_Underweight,NewInsulinScore_Normal,NewGlucose_Low,NewGlucose_Normal,NewGlucose_Overweight,NewGlucose_Secret
0,6,148,72,35,0,33.6,0.627,50,1,True,False,False,False,False,False,False,False,False,True
1,1,85,66,29,0,26.6,0.351,31,0,False,False,False,True,False,False,False,True,False,False
2,8,183,64,0,0,23.3,0.672,32,1,False,False,False,False,False,False,False,False,False,True
3,1,89,66,23,94,28.1,0.167,21,0,False,False,False,True,False,True,False,True,False,False
4,0,137,40,35,168,43.1,2.288,33,1,False,False,True,False,False,False,False,False,False,True


In [19]:
# Set the one-hot dummy columns aside in their own frame; they are excluded
# from scaling and joined back onto the scaled numeric features later.
categorical_df = df.loc[
    :,
    [
        "NewBMI_Obesity 1",
        "NewBMI_Obesity 2",
        "NewBMI_Obesity 3",
        "NewBMI_Overweight",
        "NewBMI_Underweight",
        "NewInsulinScore_Normal",
        "NewGlucose_Low",
        "NewGlucose_Normal",
        "NewGlucose_Overweight",
        "NewGlucose_Secret",
    ],
]

In [20]:
# Preview the first rows of the dummy-column frame.
categorical_df.head(n=5)

Unnamed: 0,NewBMI_Obesity 1,NewBMI_Obesity 2,NewBMI_Obesity 3,NewBMI_Overweight,NewBMI_Underweight,NewInsulinScore_Normal,NewGlucose_Low,NewGlucose_Normal,NewGlucose_Overweight,NewGlucose_Secret
0,True,False,False,False,False,False,False,False,False,True
1,False,False,False,True,False,False,False,True,False,False
2,False,False,False,False,False,False,False,False,False,True
3,False,False,False,True,False,True,False,True,False,False
4,False,False,True,False,False,False,False,False,False,True


In [21]:
# Target vector: the Outcome column.
y = df.loc[:, "Outcome"]
# Numeric feature matrix: every column except the target and the one-hot
# dummies (the dummies are held separately in categorical_df).
X = df.drop(
    columns=[
        "Outcome",
        "NewBMI_Obesity 1",
        "NewBMI_Obesity 2",
        "NewBMI_Obesity 3",
        "NewBMI_Overweight",
        "NewBMI_Underweight",
        "NewInsulinScore_Normal",
        "NewGlucose_Low",
        "NewGlucose_Normal",
        "NewGlucose_Overweight",
        "NewGlucose_Secret",
    ]
)
# Keep the column and row labels so the frame can be rebuilt after scaling.
cols, index = X.columns, X.index

In [22]:
# Preview the numeric feature matrix before scaling.
X.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [23]:
# Standardizing the variables can improve model performance.
# Several methods exist, such as "Normalize", "MinMax", "Robust" and "Scale";
# this notebook uses scikit-learn's RobustScaler.
# NOTE(review): the scaler is fitted on every row of X. If a train/test split
# happens later in the notebook, confirm that fitting before the split is intended.
from sklearn.preprocessing import RobustScaler
transformer = RobustScaler()
scaled_values = transformer.fit_transform(X)
# The scaler returns a bare array, so restore the original column and row labels.
X = pd.DataFrame(scaled_values, columns=cols, index=index)

In [24]:
# Preview the numeric features after scaling.
X.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.6,0.751515,0.0,0.375,-0.239686,0.172043,0.665359,1.235294
1,-0.4,-0.775758,-0.333333,0.1875,-0.239686,-0.580645,-0.056209,0.117647
2,1.0,1.6,-0.444444,-0.71875,-0.239686,-0.935484,0.783007,0.176471
3,-0.4,-0.678788,-0.333333,0.0,0.499018,-0.419355,-0.537255,-0.470588
4,-0.6,0.484848,-1.777778,0.375,1.08055,1.193548,5.007843,0.235294


In [25]:
# Join the unscaled one-hot dummy columns onto the scaled numeric features
# (column-wise, aligned on the shared row index).
X = pd.concat(objs=[X, categorical_df], axis="columns")

In [26]:
# Preview the final feature matrix (scaled numeric columns plus dummies).
X.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,NewBMI_Obesity 1,NewBMI_Obesity 2,NewBMI_Obesity 3,NewBMI_Overweight,NewBMI_Underweight,NewInsulinScore_Normal,NewGlucose_Low,NewGlucose_Normal,NewGlucose_Overweight,NewGlucose_Secret
0,0.6,0.751515,0.0,0.375,-0.239686,0.172043,0.665359,1.235294,True,False,False,False,False,False,False,False,False,True
1,-0.4,-0.775758,-0.333333,0.1875,-0.239686,-0.580645,-0.056209,0.117647,False,False,False,True,False,False,False,True,False,False
2,1.0,1.6,-0.444444,-0.71875,-0.239686,-0.935484,0.783007,0.176471,False,False,False,False,False,False,False,False,False,True
3,-0.4,-0.678788,-0.333333,0.0,0.499018,-0.419355,-0.537255,-0.470588,False,False,False,True,False,True,False,True,False,False
4,-0.6,0.484848,-1.777778,0.375,1.08055,1.193548,5.007843,0.235294,False,False,True,False,False,False,False,False,False,True


In [27]:
# Preview the first target values.
y.head(n=5)

0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64