In [1]:
# Import necessary libraries
from sklearn.linear_model import *
from sklearn.metrics import accuracy_score,log_loss
from sklearn.linear_model import  LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.compose import *
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier,DecisionTreeRegressor
from sklearn.model_selection import *
from sklearn.ensemble import VotingClassifier, BaggingClassifier
from sklearn.discriminant_analysis import *
from sklearn.preprocessing import *
from sklearn.svm import SVC
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import warnings
import os

warnings.simplefilter('ignore')

os.chdir('D:\Datasets')

kyp = pd.read_csv('kyphosis.csv')
kyp

Unnamed: 0,Kyphosis,Age,Number,Start
0,absent,71,3,5
1,absent,158,3,14
2,present,128,4,5
3,absent,2,5,1
4,absent,1,4,15
...,...,...,...,...
76,present,157,3,13
77,absent,26,7,13
78,absent,120,2,13
79,present,42,7,6


In [2]:
# Assign the dataset without the 'Kyphosis' column to 'x'
x = kyp.drop('Kyphosis', axis=1)

# Assign the 'Kyphosis' column to 'y'
y = kyp['Kyphosis']

# Split the data into training and testing sets, with a test size of 30% and stratified sampling based on the 'Kyphosis' column
# This ensures that the class distribution in the training and testing sets is similar to the original dataset
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, stratify=y, random_state=24)

# Create a Logistic Regression model
lr = LogisticRegression()

# Create a Bagging Classifier using the Logistic Regression model as the base estimator
# Set the number of estimators to 15 and enable out-of-bag (OOB) score calculation
bagg = BaggingClassifier(estimator=lr, n_estimators=15, random_state=24, oob_score=True)

# Fit the Bagging Classifier to the training data
bagg.fit(x_train, y_train)

# Print the out-of-bag (OOB) score, which is an estimate of the model's performance on unseen data
print("OOB Score:", bagg.oob_score_)

OOB Score: 0.8035714285714286


In [3]:
# Use the trained Bagging Classifier to predict the classes of the test data
y_pred = bagg.predict(x_test)

# Get the predicted probabilities of the positive class (1) for the test data
y_pred_prob = bagg.predict_proba(x_test)[:, 1]

# Calculate and print the accuracy score of the predictions
print("Accuracy Score:", accuracy_score(y_test, y_pred))

# Calculate and print the log loss of the predicted probabilities
print("Log Loss:", log_loss(y_test, y_pred_prob))

Accuracy Score: 0.76
Log Loss: 0.440608539745545


In [29]:
# Print the parameters of the Bagging Classifier
print(bagg.get_params())

{'base_estimator': 'deprecated', 'bootstrap': True, 'bootstrap_features': False, 'estimator__C': 1.0, 'estimator__class_weight': None, 'estimator__dual': False, 'estimator__fit_intercept': True, 'estimator__intercept_scaling': 1, 'estimator__l1_ratio': None, 'estimator__max_iter': 100, 'estimator__multi_class': 'auto', 'estimator__n_jobs': None, 'estimator__penalty': 'l2', 'estimator__random_state': None, 'estimator__solver': 'lbfgs', 'estimator__tol': 0.0001, 'estimator__verbose': 0, 'estimator__warm_start': False, 'estimator': LogisticRegression(), 'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 15, 'n_jobs': None, 'oob_score': True, 'random_state': 24, 'verbose': 0, 'warm_start': False}


# hyperparameter tuning

In [4]:
# Create a Stratified K-Fold cross-validation object with 5 splits, shuffling the data, and a random state of 24
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)

# Create a Bagging Classifier with Logistic Regression as the base estimator
# Set the number of estimators to 15 and enable out-of-bag (OOB) score calculation
bagg = BaggingClassifier(estimator=lr, n_estimators=15, random_state=24, oob_score=True)

# Fit the Bagging Classifier to the training data
bagg.fit(x_train, y_train)

# Define the parameter grid for the Logistic Regression model within the Bagging Classifier
params = {
    'estimator__penalty': ['L2', None],
    'estimator__C': np.linspace(0.001, 10, 5),
    'estimator__solver': ['lbfgs', 'newton-cg', 'newton-cholesky', 'sag']
}

# Create a GridSearchCV object to perform cross-validation and hyperparameter tuning
# Use the Bagging Classifier as the estimator, the parameter grid defined above, and the Stratified K-Fold cross-validation
# Set the scoring metric to 'neg_log_loss' (negative log loss) to optimize for log loss
gcv = GridSearchCV(bagg, param_grid=params, cv=kfold, verbose=3, scoring='neg_log_loss')

# Fit the GridSearchCV object to the entire dataset (x and y)
gcv.fit(x, y)

# Print the best parameters found by the GridSearchCV
print("Best Parameters:", gcv.best_params_)

# Print the best score (negative log loss) found by the GridSearchCV
print("Best Score:", gcv.best_score_)

Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV 1/5] END estimator__C=0.001, estimator__penalty=L2, estimator__solver=lbfgs;, score=nan total time=   0.0s
[CV 2/5] END estimator__C=0.001, estimator__penalty=L2, estimator__solver=lbfgs;, score=nan total time=   0.0s
[CV 3/5] END estimator__C=0.001, estimator__penalty=L2, estimator__solver=lbfgs;, score=nan total time=   0.0s
[CV 4/5] END estimator__C=0.001, estimator__penalty=L2, estimator__solver=lbfgs;, score=nan total time=   0.0s
[CV 5/5] END estimator__C=0.001, estimator__penalty=L2, estimator__solver=lbfgs;, score=nan total time=   0.0s
[CV 1/5] END estimator__C=0.001, estimator__penalty=L2, estimator__solver=newton-cg;, score=nan total time=   0.0s
[CV 2/5] END estimator__C=0.001, estimator__penalty=L2, estimator__solver=newton-cg;, score=nan total time=   0.0s
[CV 3/5] END estimator__C=0.001, estimator__penalty=L2, estimator__solver=newton-cg;, score=nan total time=   0.0s
[CV 4/5] END estimator__C=0.001, estim

# multiple estimator

In [5]:
# Create a Gaussian Naive Bayes classifier
nb = GaussianNB()

# Create a Stratified K-Fold cross-validation object with 5 splits, shuffling the data, and a random state of 24
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)

# Create a Support Vector Classifier (SVC) with probability estimates enabled and a random state of 24
svm = SVC(random_state=24, probability=True)

# Create a Decision Tree Classifier with a random state of 24
dtc = DecisionTreeClassifier(random_state=24)

# Create a Logistic Regression model
lr = LogisticRegression()

# Create a Bagging Classifier with a random state of 24
bagg = BaggingClassifier(random_state=24)

# Define the parameter grid for the Bagging Classifier
params = {
    'estimator': [svm, nb, lr, dtc],
    'n_estimators': [10, 15]
}

# Create a GridSearchCV object to perform cross-validation and hyperparameter tuning
# Use the Bagging Classifier as the estimator, the parameter grid defined above, the Stratified K-Fold cross-validation,
# set the verbosity to 3, and use the negative log loss as the scoring metric
gcv_m = GridSearchCV(bagg, param_grid=params, cv=kfold, verbose=3, scoring='neg_log_loss')

# Fit the GridSearchCV object to the entire dataset (x and y)
gcv_m.fit(x, y)

# Print the best parameters found by the GridSearchCV
print("Best Parameters:", gcv_m.best_params_)

# Print the best score (negative log loss) found by the GridSearchCV
print("Best Score:", gcv_m.best_score_)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV 1/5] END estimator=SVC(probability=True, random_state=24), n_estimators=10;, score=-0.525 total time=   0.0s
[CV 2/5] END estimator=SVC(probability=True, random_state=24), n_estimators=10;, score=-0.452 total time=   0.0s
[CV 3/5] END estimator=SVC(probability=True, random_state=24), n_estimators=10;, score=-0.413 total time=   0.0s
[CV 4/5] END estimator=SVC(probability=True, random_state=24), n_estimators=10;, score=-0.455 total time=   0.0s
[CV 5/5] END estimator=SVC(probability=True, random_state=24), n_estimators=10;, score=-0.529 total time=   0.0s
[CV 1/5] END estimator=SVC(probability=True, random_state=24), n_estimators=15;, score=-0.524 total time=   0.0s
[CV 2/5] END estimator=SVC(probability=True, random_state=24), n_estimators=15;, score=-0.438 total time=   0.0s
[CV 3/5] END estimator=SVC(probability=True, random_state=24), n_estimators=15;, score=-0.417 total time=   0.0s
[CV 4/5] END estimator=SVC(probabili