In [2]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB # import Naive Bayes model Gaussian (asumsi data terdistribusi normal)
from sklearn.svm import SVC # import SVM classifier
from sklearn.ensemble import VotingClassifier # import model Voting
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

dbt = pd.read_csv('../dataset/diabetes.csv')
dbt.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
from sklearn.impute import SimpleImputer

feature_columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
for column in feature_columns:
    print("============================================")
    print(f"{column} ==> Missing zeros : {len(dbt.loc[dbt[column] == 0])}")

fill_values = SimpleImputer(missing_values=0, strategy="mean", copy=False)

dbt[feature_columns] = fill_values.fit_transform(dbt[feature_columns])

dbt.head()

Pregnancies ==> Missing zeros : 111
Glucose ==> Missing zeros : 5
BloodPressure ==> Missing zeros : 35
SkinThickness ==> Missing zeros : 227
Insulin ==> Missing zeros : 374
BMI ==> Missing zeros : 11
DiabetesPedigreeFunction ==> Missing zeros : 0
Age ==> Missing zeros : 0


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.0,148.0,72.0,35.0,155.548223,33.6,0.627,50.0,1
1,1.0,85.0,66.0,29.0,155.548223,26.6,0.351,31.0,0
2,8.0,183.0,64.0,29.15342,155.548223,23.3,0.672,32.0,1
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0
4,4.494673,137.0,40.0,35.0,168.0,43.1,2.288,33.0,1


In [4]:
# split dan scale data

from sklearn.preprocessing import StandardScaler
std = StandardScaler()

X = dbt[feature_columns]
y = dbt.Outcome

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

X_train_std = std.fit_transform(X_train)
X_test_std = std.transform(X_test)

In [9]:
# Esemble Voting data diabetes

from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

logreg = LogisticRegression(max_iter=1000, random_state=42)
svm_poly = SVC(kernel='poly', random_state=42)
dt = DecisionTreeClassifier(random_state=42)

voting = VotingClassifier(estimators=[('LogisticRegression', logreg), ('SVM-Polynomial', svm_poly), ('DecisionTree', dt)], voting='hard')

voting.fit(X_train, y_train)

y_pred_test = voting.predict(X_test)
y_pred_train = voting.predict(X_train)

acc_test = accuracy_score(y_test, y_pred_test)
acc_train = accuracy_score(y_train, y_pred_train)

# Print hasil evaluasi
print('Esemble voting')
print('========================')
print('Hard voting')
print(f"Akurasi train set: {acc_test*100:.2f}%")
print(f"Akurasi tes set: {acc_train*100:.2f}%")


Esemble voting
Hard voting
Akurasi train set: 73.38%
Akurasi tes set: 81.76%
