* Imported Packages

In [2]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score

* Shared Variables

In [3]:
with open("shared_variables.pkl", "rb") as f:
    columns = pickle.load(f)
    X_train = pickle.load(f)
    y_train = pickle.load(f)
    X_test = pickle.load(f)
    y_test = pickle.load(f)

* Models
    1. Bagging (With Grid Search)

In [7]:
# Function to implement bagging with Logistic Regression and grid search
def bagging(X_train, y_train, num_models, param_grid):
    models = []

    for _ in range(num_models):
        # Randomly sample with replacement
        indices = np.random.choice(len(X_train), size=len(X_train), replace=True)
        X_subset = X_train.iloc[indices]
        y_subset = y_train.iloc[indices]
        
        base_model = LogisticRegression(max_iter=1000)

        grid_search = GridSearchCV(base_model, param_grid, scoring='accuracy', cv=5)
        grid_search.fit(X_subset, y_subset)

        # Use the best estimator from grid search
        best_model = grid_search.best_estimator_
    
        models.append(best_model)

    return models

# Function to make predictions using bagging
def predict_bagging(models, X_test):
    predictions = np.zeros((len(X_test), len(models)))  

    for i, model in enumerate(models):
        predictions[:, i] = model.predict(X_test)

    # Use majority voting for classification
    ensemble_predictions = np.apply_along_axis(lambda x: np.bincount(x.astype(int)).argmax(), 
                                               axis=1, arr=predictions)

    return ensemble_predictions


param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'solver': ['lbfgs', 'liblinear']}

# Train bagging ensemble
num_models = 10 
models = bagging(X_train, y_train, num_models, param_grid)

# Make predictions
ensemble_predictions = predict_bagging(models, X_test)

# Evaluate Accuracy
accuracy = accuracy_score(y_test, ensemble_predictions)
print(f"Accuracy on test data: {accuracy}")

Accuracy on test data: 0.6676294528862656


2. Boosting

In [8]:
def initialize_weights(n_samples):
    return np.ones(n_samples) / n_samples

def update_weights(weights, alpha, y_true, y_pred):
    incorrect = (y_true != y_pred).astype(int)
    updated_weights = weights * np.exp(alpha * incorrect)
    return updated_weights / np.sum(updated_weights)

def adaboost(X, y, n_estimators):
    n_samples, n_features = X.shape 
    weights = initialize_weights(n_samples)
    models = []
    alphas = []

    for _ in range(n_estimators):
        # Train a weak learner (Decision Tree in this case)
        model = DecisionTreeClassifier(max_depth=1)
        model.fit(X, y, sample_weight=weights)

        # Make predictions
        y_pred = model.predict(X)

        # Calculate error and alpha
        error = np.sum(weights * (y_pred != y)) / np.sum(weights)
        alpha = 0.5 * np.log((1 - error) / error)

        # Update weights
        weights = update_weights(weights, alpha, y, y_pred)

        # Save the model and alpha
        models.append(model)
        alphas.append(alpha)

    return models, alphas

def adaboost_predict(models, alphas, X):
    n_samples = X.shape[0]
    predictions = np.zeros(n_samples)

    for model, alpha in zip(models, alphas):
        predictions += alpha * model.predict(X)

    return np.sign(predictions)

# Train AdaBoost
n_estimators = 5
models, alphas = adaboost(X_train, y_train, n_estimators)

# Make predictions on test data
y_pred_train = adaboost_predict(models, alphas, X_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred_train)
print(f"Accuracy on test data: {accuracy}")

Accuracy on test data: 0.5517183641006321


3. Random Forests

In [4]:
def bootstrap_sample(data, labels):
    n_samples = len(data)
    indices = np.random.choice(n_samples, size=n_samples, replace=True)
    return data.iloc[indices], labels.iloc[indices]

def random_forest(X, y, n_estimators, max_features=None):
    #n_samples, n_features = X.shape
    models = []

    for i in range(n_estimators):
        # Create a bootstrap sample
        bootstrap_data, bootstrap_labels = bootstrap_sample(X, y)

        # Train a decision tree on the bootstrap sample
        model = DecisionTreeClassifier(max_features=max_features)
        model.fit(bootstrap_data, bootstrap_labels)

        # Add the trained model to the ensemble
        models.append(model)

    return models

def random_forest_predict(models, X):
    predictions = np.zeros((X.shape[0], len(models)))

    for i, model in enumerate(models):
        predictions[:, i] = model.predict(X)

    final_predictions = np.apply_along_axis(lambda x: np.bincount(x.astype(int)).argmax(), 
                                            axis=1, arr=predictions)

    return final_predictions

n_estimators = 5
max_features = 'sqrt'
models = random_forest(X_train, y_train, n_estimators, max_features)

y_pred_test = random_forest_predict(models, X_test)
accuracy_test = accuracy_score(y_test, y_pred_test)

print(f"Accuracy on test data: {accuracy_test}")

Accuracy on test data: 0.6297877684289841


4. Random Forests (With BayesSearch)

In [None]:
import numpy as np
np.int = int
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from skopt import BayesSearchCV

def bootstrap_sample(data, labels):
    n_samples = len(data)
    indices = np.random.choice(n_samples, size=n_samples, replace=True)
    return data.iloc[indices], labels.iloc[indices]

def random_forest(X, y, n_estimators, max_features=None, param_space=None):
    #n_samples, n_features = X.shape
    models = []

    for i in range(n_estimators):
        # Create a bootstrap sample
        bootstrap_data, bootstrap_labels = bootstrap_sample(X, y)

        # Train a decision tree on the bootstrap sample
        base_model = DecisionTreeClassifier(max_features=max_features)

        # Define the Bayesian optimization object
        opt = BayesSearchCV(base_model, param_space, n_iter=50, cv=5, n_jobs=-1)

        # Fit the Bayesian optimization object to the training data
        opt.fit(bootstrap_data, bootstrap_labels)

        # Get the best hyperparameters
        best_model = opt.best_estimator_

        # Add the trained model to the ensemble
        models.append(best_model)

    return models

def random_forest_predict(models, X):
    predictions = np.zeros((X.shape[0], len(models)))

    for i, model in enumerate(models):
        predictions[:, i] = model.predict(X)

    final_predictions = np.apply_along_axis(lambda x: np.bincount(x.astype(np.int64)).argmax(), axis=1, arr=predictions)

    return final_predictions

param_space = {          
    'max_depth': (1, 20),               # Maximum depth of the trees
    'min_samples_split': (2, 10),       # Minimum samples required to split an internal node
    'min_samples_leaf': (1, 10),        # Minimum number of samples required to be at a leaf node
}

n_estimators = 5
max_features = 'sqrt'
models = random_forest(X_train, y_train, n_estimators, max_features, param_space)

y_pred_test = random_forest_predict(models, X_test)
accuracy_test = accuracy_score(y_test, y_pred_test)

print(f"Accuracy on test data: {accuracy_test}")