# Bagging Classifier
The Bagging Classifier (Bootstrap Aggregating) is an ensemble learning method that improves the stability and accuracy of machine learning algorithms by combining multiple models. It works by drawing multiple bootstrap samples from the original dataset (random subsets sampled with replacement) and training a base estimator on each sample. The final prediction is made by aggregating the predictions of all base estimators, typically through majority voting.

## Advantages:
- Reduced Overfitting: By averaging multiple models, Bagging reduces overfitting and variance.
- Improved Accuracy: Combines multiple models to achieve higher accuracy compared to a single model.
- Parallel Training: Base estimators can be trained in parallel, speeding up the training process.
- Robustness: Works well with a variety of models and is robust to noisy data.

## Disadvantages:
- Increased Complexity: More complex and computationally intensive compared to using a single model.
- Resource Intensive: Requires significant computational resources and memory, especially for large datasets.
- Model Interpretability: The ensemble model is harder to interpret compared to a single decision tree or simpler model.

## Use Cases:
- Finance: Fraud detection, credit scoring, and risk assessment.
- Healthcare: Disease prediction and patient outcome analysis.
- Marketing: Customer segmentation, churn prediction, and recommendation systems.
- E-commerce: Product recommendation, inventory forecasting, and sales prediction.

## Scaling (depends on the base estimator)
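Bagging itself neither requires nor forbids feature scaling; whether scaling is needed depends on the base estimator. With the default base estimator (a decision tree), scaling is not necessary because tree splits are not sensitive to the scale of the features. However, if the base estimator is sensitive to feature scale (e.g., SVM), then scaling is necessary.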

## Encoding (necessary)
Categorical data must be encoded into numerical values.

# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from scipy.stats import uniform, loguniform
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.datasets import make_regression

In [None]:
# Read Breast_Cancer.csv, then separate the 'diagnosis' target column (y)
# from the remaining columns, which serve as the feature matrix (x).
df = pd.read_csv('Breast_Cancer.csv')
y = df['diagnosis']
x = df.drop(columns='diagnosis')

In [None]:
# Hold out 20% of the rows as a test split; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# 1. Bagging with the Default Estimator (Decision Tree)

## Grid Search

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Bagging classifier wrapping a decision tree. n_estimators=50 is only the
# starting value: the grid below also searches over 'n_estimators'.
bagging_clas = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=50,
    random_state=42,
)

# Exhaustive grid of 4 x 4 x 3 = 48 candidates. Keys prefixed with
# 'estimator__' are forwarded to the wrapped DecisionTreeClassifier.
param_grid = {
    'n_estimators': [10, 50, 100, 200],
    'estimator__max_depth': [None, 10, 20, 30],
    'estimator__min_samples_split': [2, 5, 10],
}

# 5-fold cross-validated grid search; n_jobs=-1 runs the fits in parallel.
grid_search = GridSearchCV(
    estimator=bagging_clas,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)

# Evaluate every candidate on the training split.
grid_search.fit(x_train, y_train)

In [None]:
# Summarise the fitted grid_search object: the index of the winning
# candidate, its hyperparameters, and its cross-validated score.
for label, value in (
    ("Best Hyperparameter Index:", grid_search.best_index_),
    ("Best Hyperparameters:", grid_search.best_params_),
    ("Best Cross-Validated Score:", grid_search.best_score_),
):
    print(label, value)

In [None]:
# Keep the best estimator found by the decision-tree grid search as `model`.
model = grid_search.best_estimator_
# Left disabled: uncomment to predict on the held-out test split.
# y_pred = model.predict(x_test)

## Randomized Search

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier

# Bagging classifier wrapping a decision tree. n_estimators=50 is only the
# starting value: the search below also samples 'n_estimators'.
bagging_clas = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=50,
    random_state=42,
)

# Candidate values to sample from (4 x 6 x 4 = 96 combinations). Keys prefixed
# with 'estimator__' are forwarded to the wrapped DecisionTreeClassifier.
param_dist = {
    'n_estimators': [10, 50, 100, 200],
    'estimator__max_depth': [None, 10, 20, 30, 40, 50],
    'estimator__min_samples_split': [2, 5, 10, 15],
}

# Randomized search: 50 sampled candidates, 5-fold cross-validation, parallel
# fits, and a fixed seed so the sampled candidates are reproducible.
random_search = RandomizedSearchCV(
    estimator=bagging_clas,
    param_distributions=param_dist,
    n_iter=50,
    cv=5,
    n_jobs=-1,
    random_state=42,
)

# Evaluate the sampled candidates on the training split.
random_search.fit(x_train, y_train)

In [None]:
# Summarise the fitted random_search object: the index of the winning
# candidate, its hyperparameters, and its cross-validated score.
for label, value in (
    ("Best Hyperparameter Index:", random_search.best_index_),
    ("Best Hyperparameters:", random_search.best_params_),
    ("Best Cross-Validated Score:", random_search.best_score_),
):
    print(label, value)

In [None]:
# Keep the best estimator found by the decision-tree randomized search as `model`.
model = random_search.best_estimator_
# Left disabled: uncomment to predict on the held-out test split.
# y_pred = model.predict(x_test)

## Train BaggingClassifier without search

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Fixed-hyperparameter alternative to the searches above: 50 bagged decision
# trees, each limited to depth 10 and needing at least 5 samples to split.
model = BaggingClassifier(
    estimator=DecisionTreeClassifier(min_samples_split=5, max_depth=10),
    n_estimators=50,
    random_state=42,
)
# Fitting is left disabled; uncomment to train on the training split.
# model.fit(x_train, y_train)

# 2. Bagging with a Single Estimator (Support Vector Classifier)

## Grid Search

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Bagging classifier whose base estimator is a support vector classifier.
bagging_clas_SVC = BaggingClassifier(estimator=SVC(), random_state=42)

# Exhaustive grid of 4 x 4 x 15 x 3 = 720 candidates. Keys prefixed with
# 'estimator__' are forwarded to the wrapped SVC. The gamma axis combines the
# two string presets with 13 log-spaced values from 1e-9 to 1e3.
param_grid = {
    'n_estimators': [10, 50, 100, 200],
    'estimator__C': [0.1, 1, 10, 100],
    'estimator__gamma': ['scale', 'auto', *np.logspace(-9, 3, 13)],
    'estimator__kernel': ['linear', 'poly', 'rbf'],
}

# 5-fold cross-validated grid search; n_jobs=-1 runs the fits in parallel.
grid_search = GridSearchCV(
    estimator=bagging_clas_SVC,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)

# Evaluate every candidate on the training split.
grid_search.fit(x_train, y_train)

In [None]:
# Summarise the fitted grid_search object: the index of the winning
# candidate, its hyperparameters, and its cross-validated score.
for label, value in (
    ("Best Hyperparameter Index:", grid_search.best_index_),
    ("Best Hyperparameters:", grid_search.best_params_),
    ("Best Cross-Validated Score:", grid_search.best_score_),
):
    print(label, value)

In [None]:
# Keep the best estimator found by the SVC grid search as `model`.
model = grid_search.best_estimator_
# Left disabled: uncomment to predict on the held-out test split.
# y_pred = model.predict(x_test)

## Randomized Search

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC

# Bagging classifier whose base estimator is a support vector classifier.
bagging_clas_SVC = BaggingClassifier(estimator=SVC(), random_state=42)

# Candidate values to sample from (4 x 4 x 15 x 3 = 720 combinations). Keys
# prefixed with 'estimator__' are forwarded to the wrapped SVC. The gamma axis
# combines the two string presets with 13 log-spaced values from 1e-9 to 1e3.
param_dist = {
    'n_estimators': [10, 50, 100, 200],
    'estimator__C': [0.1, 1, 10, 100],
    'estimator__gamma': ['scale', 'auto', *np.logspace(-9, 3, 13)],
    'estimator__kernel': ['linear', 'poly', 'rbf'],
}

# Randomized search: 50 sampled candidates, 5-fold cross-validation, parallel
# fits, and a fixed seed so the sampled candidates are reproducible.
random_search = RandomizedSearchCV(
    estimator=bagging_clas_SVC,
    param_distributions=param_dist,
    n_iter=50,
    cv=5,
    n_jobs=-1,
    random_state=42,
)

# Evaluate the sampled candidates on the training split.
random_search.fit(x_train, y_train)

In [None]:
# Summarise the fitted random_search object: the index of the winning
# candidate, its hyperparameters, and its cross-validated score.
for label, value in (
    ("Best Hyperparameter Index:", random_search.best_index_),
    ("Best Hyperparameters:", random_search.best_params_),
    ("Best Cross-Validated Score:", random_search.best_score_),
):
    print(label, value)

In [None]:
# Keep the best estimator found by the SVC randomized search as `model`.
model = random_search.best_estimator_
# Left disabled: uncomment to predict on the held-out test split.
# y_pred = model.predict(x_test)

## Train BaggingClassifier without search

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC

# Fixed-hyperparameter alternative to the searches above: 50 bagged
# linear-kernel SVCs with C=100.
model = BaggingClassifier(
    estimator=SVC(C=100, gamma=1, kernel='linear'),
    n_estimators=50,
    random_state=42,
)
# Fitting is left disabled; uncomment to train on the training split.
# model.fit(x_train, y_train)

# 3. Bagging with Multiple Estimators (SVC, Decision Tree, GaussianNB)

## Grid Search

In [None]:
from sklearn.ensemble import BaggingClassifier, VotingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# The three base classifiers, used as-is (no scaler pipeline around them).
svc = SVC()
decision_tree = DecisionTreeClassifier()
gaussian_NB = GaussianNB()

# Combine the three classifiers in a voting ensemble. The tuple names are the
# prefixes used in the parameter grid below.
voting_regressor = VotingClassifier(
    estimators=[
        ('SVC', svc),
        ('decision_tree', decision_tree),
        ('gaussian_NB', gaussian_NB),
    ]
)

# Bag the voting ensemble.
bagging_clas_voting = BaggingClassifier(
    estimator=voting_regressor,
    random_state=42,
)

# Exhaustive grid: 4 x 3 x 3 x 3 x 4 x 3 x 10 = 12,960 candidates, each fitted
# 5 times by the cross-validation below. Keys follow the pattern
# 'estimator__<voting member name>__<parameter>'.
param_grid = {
    'n_estimators': [10, 50, 100, 200],
    'estimator__SVC__C': [0.1, 1, 10],
    'estimator__SVC__gamma': [0.1, 0.5, 1],
    'estimator__SVC__kernel': ['linear', 'poly', 'rbf'],
    'estimator__decision_tree__max_depth': [None, 10, 20, 30],
    'estimator__decision_tree__min_samples_split': [2, 5, 10],
    'estimator__gaussian_NB__var_smoothing': np.logspace(-9, 0, 10),
}

# 5-fold cross-validated grid search; n_jobs=-1 runs the fits in parallel.
grid_search_voting = GridSearchCV(
    estimator=bagging_clas_voting,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)

# Evaluate every candidate on the training split.
grid_search_voting.fit(x_train, y_train)

In [None]:
# Report the outcome of the voting-ensemble grid search. The results live in
# grid_search_voting (fitted in the previous cell); grid_search belongs to the
# earlier single-estimator sections and would show stale results here.
print("Best Hyperparameter Index:", grid_search_voting.best_index_)
print("Best Hyperparameters:", grid_search_voting.best_params_)
print("Best Cross-Validated Score:", grid_search_voting.best_score_)

In [None]:
# Keep the best bagged voting model. It must come from grid_search_voting, the
# search fitted in this section, not from the earlier grid_search object.
model = grid_search_voting.best_estimator_
# Left disabled: uncomment to predict on the held-out test split.
# y_pred = model.predict(x_test)

## Randomized Search

In [None]:
from sklearn.ensemble import BaggingClassifier, VotingClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# The three base classifiers, used as-is (no scaler pipeline around them).
svc = SVC()
decision_tree = DecisionTreeClassifier()
gaussian_NB = GaussianNB()

# Combine the three classifiers in a voting ensemble. The tuple names are the
# prefixes used in the parameter distributions below.
voting_regressor = VotingClassifier(
    estimators=[
        ('SVC', svc),
        ('decision_tree', decision_tree),
        ('gaussian_NB', gaussian_NB),
    ]
)

# Bag the voting ensemble.
bagging_clas_voting = BaggingClassifier(
    estimator=voting_regressor,
    random_state=42,
)

# Candidate values to sample from. Keys follow the pattern
# 'estimator__<voting member name>__<parameter>'.
param_dist = {
    'n_estimators': [10, 50, 100, 200],
    'estimator__SVC__C': [0.1, 1, 10, 100],
    'estimator__SVC__gamma': [0.1, 0.2, 0.5, 1.0],
    'estimator__SVC__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'estimator__decision_tree__max_depth': [None, 10, 20, 30, 40, 50],
    'estimator__decision_tree__min_samples_split': [2, 5, 10, 15],
    'estimator__gaussian_NB__var_smoothing': np.logspace(-9, 0, 10),
}

# Randomized search: 50 sampled candidates, 5-fold cross-validation, parallel
# fits, and a fixed seed so the sampled candidates are reproducible.
random_search_voting = RandomizedSearchCV(
    estimator=bagging_clas_voting,
    param_distributions=param_dist,
    n_iter=50,
    cv=5,
    n_jobs=-1,
    random_state=42,
)

# Evaluate the sampled candidates on the training split.
random_search_voting.fit(x_train, y_train)

In [None]:
# Report the outcome of the voting-ensemble randomized search. The results
# live in random_search_voting (fitted in the previous cell); random_search
# belongs to the earlier single-estimator sections and would show stale
# results here.
print("Best Hyperparameter Index:", random_search_voting.best_index_)
print("Best Hyperparameters:", random_search_voting.best_params_)
print("Best Cross-Validated Score:", random_search_voting.best_score_)

In [None]:
# Keep the best bagged voting model. It must come from random_search_voting,
# the search fitted in this section, not from the earlier random_search object.
model = random_search_voting.best_estimator_
# Left disabled: uncomment to predict on the held-out test split.
# y_pred = model.predict(x_test)

## Train BaggingClassifier without search

In [None]:
# Import every class this cell uses so it runs on its own, without relying on
# imports made by the search cells earlier in this section.
from sklearn.ensemble import BaggingClassifier, VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Base classifiers with fixed hyperparameters (no scaler pipeline around them)
svc = SVC(kernel='linear', gamma=1, C=1)
decision_tree = DecisionTreeClassifier(max_depth=5, min_samples_split=2)
gaussian_NB = GaussianNB(var_smoothing=0.001)

# Create the VotingClassifier that combines the three classifiers
voting_regressor = VotingClassifier(estimators=[
    ('SVC', svc),
    ('decision_tree', decision_tree),
    ('gaussian_NB', gaussian_NB)
])

# Bag 50 copies of the voting ensemble; the fixed seed keeps the run reproducible
model = BaggingClassifier(estimator=voting_regressor, n_estimators=50, random_state=42)
# Fitting is left disabled; uncomment to train on the training split.
# model.fit(x_train, y_train)

