# Klasifikasi DT, Ensemble Learning, dan Random Forest — Kelompok 3

Fitur yang digunakan: **mean radius**, **mean texture**, **mean perimeter**

Dataset: `sklearn.datasets.load_breast_cancer()`

## 1. Setup & Data Loading

In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, VotingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    ConfusionMatrixDisplay, confusion_matrix, precision_recall_fscore_support, accuracy_score
)
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Load the breast-cancer dataset and keep only the three columns assigned to
# this group (Kelompok 3).
data = load_breast_cancer()
selected_features = ['mean radius', 'mean texture', 'mean perimeter']  # Group 3 (Kelompok 3)

# Translate the feature names into column positions of `data.data`.
selected_idx = list(map(list(data.feature_names).index, selected_features))

y = data.target
X = data.data[:, selected_idx]

# Hold out 20% of the rows for testing; stratify so both splits keep the
# class ratio of `y`.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Shared 4-fold stratified splitter used by the cross-validation cells.
cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

print('Features:', selected_features)
print('Train size:', X_train.shape, 'Test size:', X_test.shape)


## 2. Decision Tree Classifier — 4-fold CV (Gini vs Entropy)

In [None]:

# Compare the two split criteria of a Decision Tree with cross-validation on
# the training split only (the test split is not touched in this cell).
dt_gini = DecisionTreeClassifier(criterion='gini', random_state=42)
dt_entropy = DecisionTreeClassifier(criterion='entropy', random_state=42)

# Both runs receive the same `cv` splitter, so the per-fold scores line up
# row by row in the table below.
scores_gini = cross_val_score(dt_gini, X_train, y_train, cv=cv, scoring='accuracy')
scores_entropy = cross_val_score(dt_entropy, X_train, y_train, cv=cv, scoring='accuracy')

# Per-fold accuracy, one row per fold (folds numbered from 1).
# NOTE: pandas is already imported as `pd` in the setup cell, so it is not
# re-imported here.
dt_cv_df = pd.DataFrame({
    'Fold': np.arange(1, len(scores_gini) + 1),
    'Gini': scores_gini,
    'Entropy': scores_entropy,
})

# Mean and (population, ddof=0) standard deviation across the folds.
dt_cv_summary = pd.DataFrame({
    'Criterion': ['Gini', 'Entropy'],
    'Mean Accuracy': [scores_gini.mean(), scores_entropy.mean()],
    'Std Accuracy': [scores_gini.std(), scores_entropy.std()],
})

# Last expression of the cell: the notebook displays both tables.
dt_cv_df, dt_cv_summary


In [None]:

# Bar chart of the mean cross-validated accuracy for each split criterion.
fig, ax = plt.subplots()
ax.bar(dt_cv_summary['Criterion'], dt_cv_summary['Mean Accuracy'])
ax.set_title("Decision Tree CV Accuracy (Gini vs Entropy) - Kelompok 3")
ax.set_xlabel("Criterion")
ax.set_ylabel("Mean Accuracy (4-fold)")
fig.tight_layout()
plt.show()


## 3. Ensemble (Pasting) vs Single Decision Tree — Akurasi

In [None]:

# Baseline: one unpruned Decision Tree fitted on the full training split.
dt_single = DecisionTreeClassifier(random_state=42)
dt_single.fit(X_train, y_train)
acc_single = accuracy_score(y_test, dt_single.predict(X_test))

# Pasting ensemble: 100 trees, each fitted on 100 training rows drawn
# WITHOUT replacement (bootstrap=False is what makes this pasting rather
# than bagging).
#
# The base estimator is passed positionally on purpose: the keyword was
# renamed from `base_estimator` to `estimator` in scikit-learn 1.2 and the
# old name was removed in 1.4, so either keyword breaks on some version,
# while the first positional argument works on all of them.
pasting = BaggingClassifier(
    DecisionTreeClassifier(random_state=42),
    n_estimators=100,
    max_samples=100,
    bootstrap=False,
    random_state=42,
)
pasting.fit(X_train, y_train)
acc_pasting = accuracy_score(y_test, pasting.predict(X_test))

# Side-by-side test accuracy of the two models.
pasting_df = pd.DataFrame({
    'Model': ['DecisionTree (single)', 'Pasting Ensemble'],
    'Test Accuracy': [acc_single, acc_pasting],
})
pasting_df


In [None]:

# Bar chart comparing held-out accuracy of the single tree and the pasting
# ensemble.
fig, ax = plt.subplots()
ax.bar(pasting_df['Model'], pasting_df['Test Accuracy'])
ax.set_title("Pasting Ensemble vs Single Tree - Test Accuracy")
ax.set_xlabel("Model")
ax.set_ylabel("Accuracy")
plt.xticks(rotation=15)  # tilt the model names so they do not overlap
fig.tight_layout()
plt.show()


## 4. Voting Classifier (Soft) — Logistic Regression + Decision Tree

In [None]:

# Two base learners for the soft-voting ensemble: an unpruned tree and a
# logistic regression that standardises its inputs first.
dt_for_vote = DecisionTreeClassifier(random_state=42)
logreg = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(max_iter=1000, random_state=42)),
])

# voting='soft' averages the predicted class probabilities of the members.
voting = VotingClassifier(
    estimators=[('lr', logreg), ('dt', dt_for_vote)],
    voting='soft',
)
voting.fit(X_train, y_train)

# Evaluate on the held-out split; rows of `cm` are true classes, columns are
# predicted classes.
y_pred_vote = voting.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_vote)
cm


In [None]:

# Plot the soft-voting confusion matrix with human-readable class names.
# The names come from the dataset object loaded in the setup cell instead of
# calling load_breast_cancer() a second time.
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=data.target_names)

# Create the axes up front and hand them to disp.plot(); calling
# plt.figure() first and then disp.plot() without `ax` leaves an extra,
# empty figure behind because plot() opens its own figure in that case.
fig, ax = plt.subplots()
disp.plot(ax=ax, values_format='d')
ax.set_title("Confusion Matrix - Soft Voting (LogReg + Decision Tree)")
fig.tight_layout()
plt.show()


In [None]:

# Per-class precision / recall / F1 / support for the soft-voting
# predictions, reported in the fixed label order [0, 1].
prec, rec, f1, support = precision_recall_fscore_support(
    y_test,
    y_pred_vote,
    labels=[0, 1],
)

# One row per class; the arrays above are already ordered like `labels`.
pr_columns = {
    'Class (0=malignant, 1=benign)': [0, 1],
    'Precision': prec,
    'Recall': rec,
    'F1-score': f1,
    'Support': support,
}
pr_df = pd.DataFrame(pr_columns)
pr_df


## 5. Random Forest vs Single Decision Tree — Akurasi & Feature Importance

In [None]:

# Random Forest of 100 unrestricted-depth trees, evaluated on the held-out
# split and compared with the single tree fitted earlier (`acc_single`).
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    random_state=42,
)
rf.fit(X_train, y_train)

rf_test_pred = rf.predict(X_test)
acc_rf = accuracy_score(y_test, rf_test_pred)

rf_compare_df = pd.DataFrame({
    'Model': ['DecisionTree (single)', 'RandomForest (n=100)'],
    'Test Accuracy': [acc_single, acc_rf],
})
rf_compare_df


In [None]:

# Wrap each model's importances in a Series labelled by feature name.
dt_importance, rf_importance = (
    pd.Series(fitted.feature_importances_, index=selected_features)
    for fitted in (dt_single, rf)
)

# Side-by-side table: one row per selected feature.
fi_df = pd.DataFrame({
    'Feature': selected_features,
    'DT_Importance': dt_importance.to_numpy(),
    'RF_Importance': rf_importance.to_numpy(),
})
fi_df


In [None]:

# Bar chart of the Random Forest importances, one bar per selected feature.
fig, ax = plt.subplots()
ax.bar(rf_importance.index, rf_importance.values)
ax.set_title("Random Forest Feature Importances (Kelompok 3)")
ax.set_xlabel("Feature")
ax.set_ylabel("Importance")
plt.xticks(rotation=15)  # tilt the feature names so they do not overlap
fig.tight_layout()
plt.show()
