In [2]:

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.utils import resample
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [4]:

# Read the dataset from disk into a DataFrame
file_path = 'your_data.csv'  # Replace with the path to your CSV file
df = pd.read_csv(file_path)

# Separate the label from the predictors: 'smoking' is the target column,
# every remaining column is used as a feature
y = df['smoking']
X = df.drop(columns='smoking')

# Two-stage split: hold out 30% of the rows first, then cut that hold-out in
# half, which yields 70% train / 15% validation / 15% test.
# random_state pins both shuffles so the partition is repeatable.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_validate, X_test, y_validate, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Report the (features, target) shapes of each partition
print("Train set shape:", X_train.shape, y_train.shape)
print("Validation set shape:", X_validate.shape, y_validate.shape)
print("Test set shape:", X_test.shape, y_test.shape)


Train set shape: (111479, 11) (111479,)
Validation set shape: (23888, 11) (23888,)
Test set shape: (23889, 11) (23889,)


In [5]:
# Inputs expected from earlier cells: X_train, y_train, X_validate, y_validate, X_test, y_test

# Number of bootstrap-trained gradient boosting classifiers in the ensemble.
# Passed as `num_base_models` to bagged_gradient_boosting; every base model runs
# its own grid search, so training time grows with this value.
num_base_models = 10  # You can adjust this based on your preference

class _BaggedEnsemble:
    """Majority-vote wrapper around the fitted base models.

    Exposes ``predict`` so it can be used wherever a single fitted classifier
    was used before; the individual tuned models are kept in ``estimators_``.
    """

    def __init__(self, estimators):
        self.estimators_ = list(estimators)

    def predict(self, X):
        """Return, for each row of X, the label predicted by most base models."""
        return _majority_vote([model.predict(X) for model in self.estimators_])


def _majority_vote(predictions):
    """Pick, per sample, the label that the most base models predicted.

    predictions: sequence of equal-length 1-D label arrays, one per model.
    Works for any label set (not only 0/1). Ties are resolved in favour of the
    smallest label, because argmax returns the first maximum over sorted labels.
    """
    stacked = np.asarray(predictions)  # shape: (n_models, n_samples)
    labels = np.unique(stacked)        # sorted distinct labels
    # One row of vote counts per candidate label
    votes = np.stack([(stacked == label).sum(axis=0) for label in labels])
    return labels[votes.argmax(axis=0)]


def bagged_gradient_boosting(X_train, y_train, X_validate, y_validate, num_base_models, random_state=None):
    """Train a bagged ensemble of grid-searched gradient boosting classifiers.

    Each base model is fitted on a bootstrap sample (drawn with replacement) of
    the training data, with its learning rate and number of estimators chosen by
    5-fold cross-validated grid search on accuracy.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_validate : validation features to predict on.
    y_validate : accepted for interface compatibility; not used here.
    num_base_models : int
        Number of bootstrap models to train; must be at least 1.
    random_state : int or None, default None
        Seed for the bootstrap draws and the base estimators. None keeps the
        run non-deterministic, as before.

    Returns
    -------
    (ensemble, ensemble_predictions_validate)
        ``ensemble`` has a ``predict(X)`` method that majority-votes over ALL
        base models (fitted models are available in ``ensemble.estimators_``).
        ``ensemble_predictions_validate`` is the majority-vote prediction for
        X_validate, expressed in the original label values.

    Raises
    ------
    ValueError
        If ``num_base_models`` is smaller than 1.
    """
    if num_base_models < 1:
        raise ValueError("num_base_models must be at least 1")

    # The search space is the same for every base model, so build it once
    param_grid = {'learning_rate': [0.001, 0.01, 0.1, 0.2],  # Example values for learning rate
                  'n_estimators': [50, 100, 200]}  # Additional hyperparameters

    # A single generator hands out one integer seed per base model, so a fixed
    # random_state reproduces the whole ensemble while samples still differ
    rng = np.random.RandomState(random_state)

    fitted_models = []
    predictions = []

    for _ in range(num_base_models):
        seed = int(rng.randint(np.iinfo(np.int32).max))

        # Bootstrap sampling (sampling with replacement)
        X_sampled, y_sampled = resample(X_train, y_train, replace=True, random_state=seed)

        # Hyperparameter tuning with grid search.
        # NOTE: a bootstrap sample contains repeated rows, so the same row can
        # fall in both the fitting and held-out folds; treat the inner CV scores
        # as optimistic.
        grid_search = GridSearchCV(GradientBoostingClassifier(random_state=seed),
                                   param_grid, cv=5, scoring='accuracy')
        grid_search.fit(X_sampled, y_sampled)
        best_model = grid_search.best_estimator_

        # Keep every tuned model, not just the last one, and record its votes
        fitted_models.append(best_model)
        predictions.append(best_model.predict(X_validate))

    # Combine predictions through majority voting for validation
    ensemble_predictions_validate = _majority_vote(predictions)

    return _BaggedEnsemble(fitted_models), ensemble_predictions_validate



In [None]:
# Fit the ensemble on the training split and collect its validation predictions
trained_model, validation_predictions = bagged_gradient_boosting(
    X_train=X_train,
    y_train=y_train,
    X_validate=X_validate,
    y_validate=y_validate,
    num_base_models=num_base_models,
)






In [None]:
# Score the object returned by bagged_gradient_boosting on the held-out test
# split; it only needs to provide a predict() method
test_predictions = trained_model.predict(X_test)

# Fraction of test rows whose predicted label matches the true label
accuracy_test = accuracy_score(y_true=y_test, y_pred=test_predictions)
print(f"Bagged Gradient Boosting Test Accuracy: {accuracy_test}")