In [21]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score

In [25]:
# Read the Iris table from data/iris.csv, then separate the label column
# ('Species') from the remaining columns, which are used as features.
iris_data = pd.read_csv('data/iris.csv')
y_iris = iris_data['Species']
X_iris = iris_data.drop(columns='Species')

In [26]:
# Load dataset wbc.csv and separate the label column ('diagnosis') from the features.
wbc_data = pd.read_csv('data/wbc.csv')
X_wbc = wbc_data.drop(columns=['diagnosis'])
y_wbc = wbc_data['diagnosis']

# The WBC evaluation cell further down reads `X_wbc_clean`, but no other cell in this
# notebook creates it, so a fresh "Restart & Run All" would stop with a NameError.
# Only columns that contain no values at all are removed here (a no-op when there are
# none); any remaining gaps are left for the imputer inside `evaluate_models`.
# NOTE(review): the cell that originally built X_wbc_clean is not part of the notebook —
# confirm this matches the intended cleaning (the saved output reports 31 feature columns).
X_wbc_clean = X_wbc.dropna(axis=1, how='all')

In [27]:
# The evaluation function has to be defined before the cells that call it.
def evaluate_models(X, y, test_size=0.3, random_state=42):
    """Fit Random Forest, AdaBoost and a stacked ensemble; return their hold-out accuracies.

    The data is split once into a training and a test partition. Every model is
    fitted on the training partition and scored with plain accuracy on the test
    partition. No imputation or scaling is applied to ``X`` here.

    Note: a later cell in this notebook defines another function with the same
    name. Once that cell has run, ``evaluate_models`` refers to the later version.

    Parameters
    ----------
    X : array-like
        Feature matrix, forwarded unchanged to ``train_test_split`` and the estimators.
    y : array-like
        Class labels aligned with the rows of ``X``.
    test_size : float or int, default=0.3
        Forwarded to ``train_test_split``.
    random_state : int, default=42
        Seed used for the split and for every estimator that accepts one.

    Returns
    -------
    tuple of float
        ``(rf_acc, ada_acc, stack_acc)``: accuracy of Random Forest, AdaBoost and
        the stacking classifier on the test partition, in that order.
    """
    # The sklearn names used below come from the notebook's import cell; importing
    # them again inside the function only duplicated that cell.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    def _holdout_accuracy(model):
        # Fit on the training partition, score on the held-out partition.
        model.fit(X_train, y_train)
        return accuracy_score(y_test, model.predict(X_test))

    # Random Forest
    rf_acc = _holdout_accuracy(RandomForestClassifier(random_state=random_state))

    # AdaBoost
    ada_acc = _holdout_accuracy(AdaBoostClassifier(random_state=random_state))

    # Stacking: fresh (unfitted) base estimators, combined by logistic regression.
    base_estimators = [
        ('rf', RandomForestClassifier(random_state=random_state)),
        ('ada', AdaBoostClassifier(random_state=random_state)),
    ]
    stack_acc = _holdout_accuracy(
        StackingClassifier(estimators=base_estimators, final_estimator=LogisticRegression())
    )

    return rf_acc, ada_acc, stack_acc

In [11]:
# Evaluate the three ensembles on the Iris data, then report each accuracy and
# how far the stacking score is from the two individual models.
rf_acc_iris, ada_acc_iris, stack_acc_iris = evaluate_models(X_iris, y_iris)
print(
    "Akurasi untuk dataset Iris:",
    f"Random Forest: {rf_acc_iris:.4f}",
    f"AdaBoost: {ada_acc_iris:.4f}",
    f"Stacking: {stack_acc_iris:.4f}",
    f"Perbedaan Akurasi Stacking dan Random Forest: {stack_acc_iris - rf_acc_iris:.4f}",
    f"Perbedaan Akurasi Stacking dan AdaBoost: {stack_acc_iris - ada_acc_iris:.4f}",
    sep="\n",
)

Akurasi untuk dataset Iris:
Random Forest: 1.0000
AdaBoost: 1.0000
Stacking: 1.0000
Perbedaan Akurasi Stacking dan Random Forest: 0.0000
Perbedaan Akurasi Stacking dan AdaBoost: 0.0000


In [37]:
# Definition of the model evaluation function (with missing-value handling).
def evaluate_models(X, y):
    """Impute missing features, then score Random Forest, AdaBoost and a stacked ensemble.

    Steps:
      1. Print how many missing values ``X`` and ``y`` contain.
      2. Split into train/test partitions (stratified on ``y``, fixed seed, default
         test size of ``train_test_split``).
      3. Fit a mean imputer on the training partition only and apply it to both
         partitions, so no statistic of the test rows reaches the models.
      4. Fit each model on the training partition and compute accuracy on the test
         partition.

    Parameters
    ----------
    X : array-like or DataFrame
        Feature matrix; may contain NaN. Mean imputation requires numeric columns.
    y : array-like or Series
        Class labels aligned with the rows of ``X``.

    Returns
    -------
    tuple of float
        ``(rf_accuracy, ada_accuracy, stacking_accuracy)``.

    Raises
    ------
    AssertionError
        If NaN values are still present after imputation.
    """
    # pd.isnull works for DataFrames and arrays, including non-numeric columns
    # (np.isnan raises on those); np.asarray(...).sum() reduces it to one total
    # instead of a per-column Series.
    missing_in_X = int(np.asarray(pd.isnull(X)).sum())
    print(f"Jumlah nilai NaN di X: {missing_in_X}")
    print(f"Jumlah nilai NaN di y: {pd.isnull(y).sum()}")  # pd.isnull because y may be non-numeric

    # Split first so the imputer never sees the test rows.
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

    # Replace NaN with the column means learned from the training partition.
    imputer = SimpleImputer(strategy='mean')
    X_train = imputer.fit_transform(X_train)
    X_test = imputer.transform(X_test)

    # Make sure no NaN survived imputation.
    assert not (np.isnan(X_train).any() or np.isnan(X_test).any()), "Terdapat nilai NaN di X setelah imputasi"

    def _holdout_accuracy(model):
        # Fit on the training partition, score on the held-out partition.
        model.fit(X_train, y_train)
        return accuracy_score(y_test, model.predict(X_test))

    # Random Forest model
    rf_accuracy = _holdout_accuracy(RandomForestClassifier(n_estimators=10, random_state=42))

    # AdaBoost model
    ada_accuracy = _holdout_accuracy(AdaBoostClassifier(n_estimators=50, random_state=42))

    # Stacking model: first-layer estimators combined by a logistic-regression final estimator.
    layer_one_estimators = [
        ('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
        ('knn', KNeighborsClassifier(n_neighbors=5)),
    ]
    stacking_accuracy = _holdout_accuracy(
        StackingClassifier(estimators=layer_one_estimators, final_estimator=LogisticRegression())
    )

    return rf_accuracy, ada_accuracy, stacking_accuracy

In [40]:
# Sanity check: show the dimensions of X_wbc_clean / y_wbc and the number of missing labels.
print(
    f"Dimensi X_wbc_clean: {X_wbc_clean.shape}",
    f"Dimensi y_wbc: {y_wbc.shape}",
    f"Jumlah nilai NaN di y_wbc: {y_wbc.isnull().sum()}",
    sep="\n",
)

# Evaluate the models on the WBC data after missing values have been handled,
# then report each accuracy and the gap between stacking and the single models.
rf_acc_wbc, ada_acc_wbc, stack_acc_wbc = evaluate_models(X_wbc_clean, y_wbc)
print(
    "\nAkurasi untuk dataset WBC:",
    f"Random Forest: {rf_acc_wbc:.4f}",
    f"AdaBoost: {ada_acc_wbc:.4f}",
    f"Stacking: {stack_acc_wbc:.4f}",
    f"Perbedaan Akurasi Stacking dan Random Forest: {stack_acc_wbc - rf_acc_wbc:.4f}",
    f"Perbedaan Akurasi Stacking dan AdaBoost: {stack_acc_wbc - ada_acc_wbc:.4f}",
    sep="\n",
)

Dimensi X_wbc_clean: (569, 31)
Dimensi y_wbc: (569,)
Jumlah nilai NaN di y_wbc: 0
Jumlah nilai NaN di X: 0
Jumlah nilai NaN di y: 0

Akurasi untuk dataset WBC:
Random Forest: 0.9510
AdaBoost: 0.9650
Stacking: 0.9510
Perbedaan Akurasi Stacking dan Random Forest: 0.0000
Perbedaan Akurasi Stacking dan AdaBoost: -0.0140
