In [1]:
# -------------------------------
# Imports
# -------------------------------
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Base models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Boosting
from xgboost import XGBClassifier

In [2]:
# -------------------------------
# Load dataset
# -------------------------------
# Fetch the Iris bunch once, then expose its feature matrix and label vector
# under the names the later cells read (X, y).
iris = load_iris()
X = iris.data
y = iris.target

In [3]:

# Hold out 30% of the samples as a test set. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# 🌳 1. Decision Tree (Baseline)

Pros ✅
	•	Easy to interpret.
	•	Fast and simple.
	•	Handles non-linear data.

Cons ❌
	•	High variance (overfits easily).
	•	Usually less accurate on its own than an ensemble of trees.

In [5]:
# Baseline model: a single decision tree with default hyperparameters and a fixed seed.
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predict on the held-out split; y_pred_dt is read again by the final results cell.
y_pred_dt = dt.predict(X_test)
print("Decision Tree Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_dt))

Decision Tree Accuracy: 0.9333333333333333


# 🌲 2. Random Forest (Bagging)

Pros ✅
	•	Reduces overfitting (bagging many trees).
	•	More stable than single decision tree.
	•	Works well in most tasks.

Cons ❌
	•	Less interpretable than a single tree.
	•	Can be computationally expensive with many trees.

In [6]:
# Bagged ensemble of 100 trees, seeded so the run is repeatable.
rf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predict on the held-out split; y_pred_rf is read again by the final results cell.
y_pred_rf = rf.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_rf))

Random Forest Accuracy: 0.8888888888888888


# 🔥 3. XGBoost (Boosting)

Pros ✅
	•	Often the most accurate in practice (used in Kaggle wins).
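	•	Addresses both bias and variance: each new tree corrects the errors of the previous ones (lower bias), while shrinkage and regularization limit overfitting (lower variance).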
	•	Built-in regularization to prevent overfitting.

Cons ❌
	•	More complex to tune.
	•	Training can be slower than Random Forest.

In [9]:
# Gradient-boosted trees; multiclass log-loss is passed explicitly as the evaluation metric.
xgb = XGBClassifier(random_state=42, eval_metric='mlogloss').fit(X_train, y_train)

# Predict on the held-out split; y_pred_xgb is read again by the final results cell.
y_pred_xgb = xgb.predict(X_test)
print("XGBoost Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_xgb))

XGBoost Accuracy: 0.9333333333333333


# 🗳️ 4. Voting Classifier

Pros ✅
	•	Combines different model types (diversity = strength).
	•	Reduces risk of relying on one weak model.

Cons ❌
	•	Does not learn how much weight to give each model: by default every model's vote counts equally, and any weights must be set by hand.
	•	Can underperform if base models are too similar.




In [10]:
# Three different model families, each with a fixed seed.
dt_clf = DecisionTreeClassifier(random_state=42)
svm_clf = SVC(probability=True, random_state=42)  # probability=True so the SVM can supply class probabilities
log_clf = LogisticRegression(max_iter=1000, random_state=42)

# Soft voting averages the members' predicted class probabilities
# (hard voting would instead take a majority vote over predicted labels).
voting_clf = VotingClassifier(
    voting='soft',
    estimators=[('lr', log_clf), ('svm', svm_clf), ('dt', dt_clf)],
).fit(X_train, y_train)

# Predict on the held-out split; y_pred_vote is read again by the final results cell.
y_pred_vote = voting_clf.predict(X_test)
print("Voting Classifier Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_vote))

Voting Classifier Accuracy: 0.9555555555555556


# 🏗️ 5. Stacking Classifier

Pros ✅
	•	Meta-model learns which base model to trust more.
	•	Often best performance when base models are diverse.

Cons ❌
	•	More complex & slower (nested models).
	•	Harder to interpret.


In [11]:
# Base learners whose predictions become the inputs of the meta-model.
estimators = [
    ('rf', RandomForestClassifier(random_state=42, n_estimators=50)),
    ('svm', SVC(random_state=42, probability=True)),
]

# A logistic regression is fitted on top of the base learners' outputs.
stack_clf = StackingClassifier(
    final_estimator=LogisticRegression(random_state=42, max_iter=1000),
    estimators=estimators,
).fit(X_train, y_train)

# Predict on the held-out split; y_pred_stack is read again by the final results cell.
y_pred_stack = stack_clf.predict(X_test)
print("Stacking Classifier Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_stack))

Stacking Classifier Accuracy: 0.9555555555555556


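# Final Results

Note: the test split holds only 45 samples, so each misclassified sample changes accuracy by about 2.2 percentage points. The gaps between the models below amount to one to three predictions on a single split, so treat the ranking as illustrative rather than conclusive.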

In [12]:
# Score every model's stored test-set predictions against the same labels,
# keyed by the display name used in the summary table.
results = {
    model_name: accuracy_score(y_test, predictions)
    for model_name, predictions in (
        ("Decision Tree", y_pred_dt),
        ("Random Forest", y_pred_rf),
        ("XGBoost", y_pred_xgb),
        ("Voting", y_pred_vote),
        ("Stacking", y_pred_stack),
    )
}

# One row per model, single 'Accuracy' column.
df_results = pd.DataFrame.from_dict(
    results,
    orient='index',
    columns=['Accuracy'],
)
print(df_results)

               Accuracy
Decision Tree  0.933333
Random Forest  0.888889
XGBoost        0.933333
Voting         0.955556
Stacking       0.955556
