In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import xgboost as xgb

df = pd.read_csv("D:\Capstone\Databases\ML CSV\CUDB_VFDB_combined.csv")

In [None]:
df.shape

(12775, 12)

In [None]:
# Separate the data into two classes
class_0 = df[df["rhythm"] == 0]
class_1 = df[df["rhythm"] == 1]

# Undersample class 0 to match the number of samples in class 1
class_0_undersampled = class_0.sample(n=len(class_1), random_state=42)

# Concatenate the undersampled class 0 with class 1
balanced_df = pd.concat([class_0_undersampled, class_1])

# Shuffle the concatenated DataFrame to mix the rows
#balanced_df = balanced_df.sample(frac=1, random_state=100)

In [None]:
print(balanced_df['rhythm'].value_counts())

rhythm
0    2399
1    2399
Name: count, dtype: int64


In [None]:
#X = balanced_df.drop(columns=['std_dev','rhythm','STE'])
X = balanced_df.iloc[:, :-1]
y = balanced_df.iloc[:, -1]

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state= 100)

In [None]:
import xgboost as xgb
'''
model = xgb.XGBClassifier(colsample_bytree = 0.816432335230161,
gamma = 0,
learning_rate = 0.060309950161179966,
max_depth = 10,
n_estimators = 500,
reg_alpha = 0,
reg_lambda = 7,
subsample = 0.9946951308154663)
'''
'''
model = xgb.XGBClassifier(colsample_bytree = 1,
gamma = 0,
learning_rate = 0.17901706233426795,
max_depth = 10,
n_estimators = 366,
reg_alpha = 0,
reg_lambda = 5,
subsample = 1)
'''
model = xgb.XGBClassifier()
model.fit(X_train,y_train)
y_predict = model.predict(X_test)
y_train_predict = model.predict(X_train)
from sklearn.metrics import accuracy_score
print('Train accuracy',accuracy_score(y_train, y_train_predict))
print('Test accuracy',accuracy_score(y_test,y_predict))

Train accuracy 1.0
Test accuracy 0.9333333333333333


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix

# Fit the model
model = xgb.XGBClassifier()
model.fit(X_train, y_train)

# Predictions
y_predict_train = model.predict(X_train)
y_predict_test = model.predict(X_test)

# Metrics calculation
accuracy_train = accuracy_score(y_train, y_predict_train)
accuracy_test = accuracy_score(y_test, y_predict_test)
sensitivity = recall_score(y_test, y_predict_test)
specificity = recall_score(y_test, y_predict_test, pos_label=0)
precision = precision_score(y_test, y_predict_test)
npv = precision_score(y_test, y_predict_test, pos_label=0)
auc_roc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])

# Convert to percentages
accuracy_train *= 100
accuracy_test *= 100
sensitivity *= 100
specificity *= 100
precision *= 100
npv *= 100
auc_roc *= 100

# Print results
print(f'Train Accuracy: {accuracy_train:.2f}%')
print(f'Test Accuracy: {accuracy_test:.2f}%')
print(f'Sensitivity: {sensitivity:.2f}%')
print(f'Specificity: {specificity:.2f}%')
print(f'Precision: {precision:.2f}%')
print(f'NPV: {npv:.2f}%')
print(f'AUC-ROC: {auc_roc:.2f}%')


Train Accuracy: 100.00%
Test Accuracy: 93.33%
Sensitivity: 93.53%
Specificity: 93.14%
Precision: 93.14%
NPV: 93.53%
AUC-ROC: 98.17%


In [None]:
# Calculate the confusion matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_predict)

# Create a heatmap visualization using Seaborn
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, cmap="Blues", fmt="d")
plt.title("Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.xticks(rotation=45, ha="right")
plt.yticks(rotation=0)
plt.show()

# **Bayesian Optimization (skopt)**

In [None]:
from skopt import BayesSearchCV
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# Define the hyperparameter search space
param_space = {
    'n_estimators': (50, 500),
    'max_depth': (3, 10),
    'learning_rate': (0.01, 1.0, 'log-uniform'),
    'subsample': (0.5, 1.0),
    'colsample_bytree': (0.5, 1.0),
    'gamma': (0, 5),
    'reg_alpha': (0, 10),
    'reg_lambda': (0, 10),
}

# Define the XGBoost classifier
clf = XGBClassifier()

# Initialize BayesSearchCV with the classifier, parameter space, and number of iterations
opt = BayesSearchCV(
    clf,
    param_space,
    n_iter=50,  # Number of iterations
    cv=5,       # Cross-validation folds
    n_jobs=-1,  # Number of CPU cores to use (-1 uses all available)
    scoring='accuracy',  # Scoring metric
    random_state=42
)
np.int = int

# Fit the BayesSearchCV to the data
opt.fit(X_train, y_train)

# Print the best parameters found
print("Best parameters found by Bayesian optimization:")
print(opt.best_params_)

# Evaluate the best model
y_pred = opt.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of the best model:", accuracy)


Best parameters found by Bayesian optimization:
OrderedDict([('colsample_bytree', 1.0), ('gamma', 0), ('learning_rate', 0.16887504568911432), ('max_depth', 10), ('n_estimators', 500), ('reg_alpha', 0), ('reg_lambda', 10), ('subsample', 1.0)])
Accuracy of the best model: 0.94375


In [None]:
from sklearn.metrics import confusion_matrix, roc_auc_score, precision_score, recall_score

# Calculate confusion matrix
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

# Calculate accuracy
accuracy = (tp + tn) / (tp + tn + fp + fn) * 100

# Calculate sensitivity (recall)
sensitivity = tp / (tp + fn) * 100

# Calculate specificity
specificity = tn / (tn + fp) * 100

# Calculate precision
precision = precision_score(y_test, y_pred) * 100

# Calculate NPV
npv = tn / (tn + fn) * 100

# Calculate AUC-ROC
auc_roc = roc_auc_score(y_test, opt.predict_proba(X_test)[:, 1]) * 100

# Print the results
print(f"Accuracy: {accuracy:.2f}%")
print(f"Sensitivity: {sensitivity:.2f}%")
print(f"Specificity: {specificity:.2f}%")
print(f"Precision: {precision:.2f}%")
print(f"NPV: {npv:.2f}%")
print(f"AUC-ROC: {auc_roc:.2f}%")


Accuracy: 94.38%
Sensitivity: 95.35%
Specificity: 93.43%
Precision: 93.37%
NPV: 95.39%
AUC-ROC: 98.45%


# **Bayesian Optimization (Hyperopt)**

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from hyperopt import hp, tpe, fmin, Trials



# Define the objective function to minimize (negative accuracy)
def objective(params):
    # Convert float hyperparameters to integers
    params["n_estimators"] = int(params["n_estimators"])
    params["max_depth"] = int(params["max_depth"])

    # Initialize XGBoost classifier with given hyperparameters
    clf = xgb.XGBClassifier(**params)

    # Train the classifier
    clf.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = clf.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)

    # Return negative accuracy (to minimize)
    return -accuracy

# Define the search space
space = {
    'n_estimators': hp.quniform('n_estimators', 50, 500, 1),
    'max_depth': hp.quniform('max_depth', 3, 10, 1),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(1.0)),
    'subsample': hp.uniform('subsample', 0.5, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0),
    'gamma': hp.uniform('gamma', 0, 5),
    'reg_alpha': hp.uniform('reg_alpha', 0, 10),
    'reg_lambda': hp.uniform('reg_lambda', 0, 10)
}

# Initialize trials object to keep track of optimization history
trials = Trials()

# Run Bayesian optimization
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=50, trials=trials)

# Convert best parameters to integer where needed
best["n_estimators"] = int(best["n_estimators"])
best["max_depth"] = int(best["max_depth"])

# Print best hyperparameters
print("Best Hyperparameters:", best)

# Train the model with the best hyperparameters
best_clf = xgb.XGBClassifier(**best)
best_clf.fit(X_train, y_train)

# Evaluate the model
y_pred = best_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


100%|███████████████████████████████████████████████| 50/50 [00:08<00:00,  5.94trial/s, best loss: -0.9395833333333333]
Best Hyperparameters: {'colsample_bytree': 0.9180735836279931, 'gamma': 0.040176865265566145, 'learning_rate': 0.09204233967922984, 'max_depth': 9, 'n_estimators': 487, 'reg_alpha': 4.675886120647191, 'reg_lambda': 3.475975666606203, 'subsample': 0.7699154692375176}
Accuracy: 0.9395833333333333


In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score

# Calculate confusion matrix
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred) * 100

# Calculate sensitivity (recall)
sensitivity = (tp / (tp + fn)) * 100

# Calculate specificity
specificity = (tn / (tn + fp)) * 100

# Calculate precision
precision = (tp / (tp + fp)) * 100

# Calculate NPV
npv = (tn / (tn + fn)) * 100

# Calculate AUC-ROC
auc_roc = roc_auc_score(y_test, y_pred) * 100

# Print results
print(f"Accuracy: {accuracy:.2f}%")
print(f"Sensitivity: {sensitivity:.2f}%")
print(f"Specificity: {specificity:.2f}%")
print(f"Precision: {precision:.2f}%")
print(f"NPV: {npv:.2f}%")
print(f"AUC-ROC: {auc_roc:.2f}%")


Accuracy: 93.96%
Sensitivity: 94.71%
Specificity: 93.22%
Precision: 93.14%
NPV: 94.78%
AUC-ROC: 93.97%


# **RandomizedSearch**

In [None]:
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb

# Define the XGBoost classifier
clf = xgb.XGBClassifier()

# Define the hyperparameter search space (same as for Bayesian optimization)
param_space = {
    'n_estimators': range(50, 501),
    'max_depth': range(3, 11),
    'learning_rate': [0.01, 0.1, 0.2, 0.4, 1.0],
    'subsample': [0.5, 0.8, 1.0],
    'colsample_bytree': [0.5, 0.8,1.0],
    'gamma': [0, 1, 2, 3, 4, 5],
    'reg_alpha': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'reg_lambda': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
}

# Define the RandomizedSearchCV object with 50 iterations
random_search = RandomizedSearchCV(
    clf,
    param_distributions=param_space,
    n_iter=150,
    scoring='accuracy',
    cv=5,
    n_jobs=-1
)

# Perform Randomized Search
random_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = random_search.best_params_

# Print the best hyperparameters
print("Best Hyperparameters:", best_params)



Best Hyperparameters: {'subsample': 0.8, 'reg_lambda': 1, 'reg_alpha': 1, 'n_estimators': 416, 'max_depth': 10, 'learning_rate': 0.2, 'gamma': 0, 'colsample_bytree': 1.0}


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix

# Train the XGBoost model with the best hyperparameters
best_xgb_model = xgb.XGBClassifier(**best_params)
best_xgb_model.fit(X_train, y_train)

# Predict the labels
y_pred = best_xgb_model.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
sensitivity = recall_score(y_test, y_pred)  # Sensitivity is also known as recall
specificity = accuracy_score(y_test, y_pred, normalize=True)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
npv = tn / (tn + fn)  # Negative Predictive Value
auc_roc = roc_auc_score(y_test, y_pred)

# Print the values of evaluation metrics in percentages up to two decimal places
print("Accuracy: {:.2f}%".format(accuracy * 100))
print("Precision: {:.2f}%".format(precision * 100))
print("Sensitivity (Recall): {:.2f}%".format(sensitivity * 100))
print("Specificity: {:.2f}%".format(specificity * 100))
print("NPV (Negative Predictive Value): {:.2f}%".format(npv * 100))
print("AUC-ROC: {:.2f}%".format(auc_roc * 100))


Accuracy: 93.65%
Precision: 92.92%
Sensitivity (Recall): 94.29%
Specificity: 93.65%
NPV (Negative Predictive Value): 94.38%
AUC-ROC: 93.66%


# **PSO**

In [None]:
# PSO
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from pyswarm import pso

# Define the XGBoost classifier
clf = xgb.XGBClassifier()

# Define the function to optimize (minimize)
def objective_function(hyperparameters):
    # Convert hyperparameters to dictionary
    param_dict = {
        'n_estimators': int(hyperparameters[0]),
        'max_depth': int(hyperparameters[1]),
        'learning_rate': hyperparameters[2],
        'subsample': hyperparameters[3],
        'colsample_bytree': hyperparameters[4],
        'gamma': hyperparameters[5],
        'reg_alpha': hyperparameters[6],
        'reg_lambda': hyperparameters[7]
    }

    # Set hyperparameters for the classifier
    clf.set_params(**param_dict)

    # Train the classifier
    clf.fit(X_train, y_train)

    # Predict on the test set
    y_pred = clf.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)

    # PSO minimizes the objective function, so we return negative accuracy
    return -accuracy

# Define the bounds for each hyperparameter
lb = [50, 3, 0.01, 0.5, 0.5, 0, 0, 0]  # Lower bounds
ub = [500, 10, 1.0, 1.0, 1.0, 5, 10, 10]  # Upper bounds

# Run PSO to optimize hyperparameters
best_hyperparameters, _ = pso(objective_function, lb, ub, swarmsize=10, maxiter=50)

# Convert hyperparameters to dictionary
best_param_dict = {
    'n_estimators': int(best_hyperparameters[0]),
    'max_depth': int(best_hyperparameters[1]),
    'learning_rate': best_hyperparameters[2],
    'subsample': best_hyperparameters[3],
    'colsample_bytree': best_hyperparameters[4],
    'gamma': best_hyperparameters[5],
    'reg_alpha': best_hyperparameters[6],
    'reg_lambda': best_hyperparameters[7]
}

# Set the best hyperparameters for the classifier
clf.set_params(**best_param_dict)

# Train the classifier with the best hyperparameters
clf.fit(X_train, y_train)

# Predict on the test set with the best classifier
y_pred = clf.predict(X_test)

# Calculate accuracy with the best classifier
accuracy = accuracy_score(y_test, y_pred)

print("Best Accuracy:", accuracy)
print("Best Hyperparameters:", best_param_dict)


Stopping search: maximum iterations reached --> 50
Best Accuracy: 0.9458333333333333
Best Hyperparameters: {'n_estimators': 299, 'max_depth': 7, 'learning_rate': 0.2619945862958502, 'subsample': 0.6218596122214207, 'colsample_bytree': 0.9870653101081508, 'gamma': 0.041294248394420405, 'reg_alpha': 1.1706355025369426, 'reg_lambda': 8.339934590841537}


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix

# Predict probabilities
y_pred_proba = clf.predict_proba(X_test)[:, 1]

# Predictions
y_pred = clf.predict(X_test)

# Confusion matrix
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred) * 100

# Calculate sensitivity (true positive rate)
sensitivity = recall_score(y_test, y_pred) * 100

# Calculate specificity (true negative rate)
specificity = (tn / (tn + fp)) * 100

# Calculate precision
precision = precision_score(y_test, y_pred) * 100

# Calculate NPV (Negative Predictive Value)
npv = (tn / (tn + fn)) * 100

# Calculate AUC-ROC
auc_roc = roc_auc_score(y_test, y_pred_proba) * 100

# Print the evaluation metrics
print("Accuracy: {:.2f}%".format(accuracy))
print("Sensitivity (Recall): {:.2f}%".format(sensitivity))
print("Specificity: {:.2f}%".format(specificity))
print("Precision: {:.2f}%".format(precision))
print("NPV (Negative Predictive Value): {:.2f}%".format(npv))
print("AUC-ROC: {:.2f}%".format(auc_roc))


Accuracy: 94.17%
Sensitivity (Recall): 94.93%
Specificity: 93.43%
Precision: 93.35%
NPV (Negative Predictive Value): 94.99%
AUC-ROC: 97.78%


# **Simualted annealing**

In [None]:
pip install simanneal


In [None]:
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from scipy.optimize import dual_annealing

# Define the XGBoost classifier
clf = xgb.XGBClassifier()

# Load your data and split into train and test sets
# Replace X_train, X_test, y_train, and y_test with your actual data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

# Define the objective function
def objective_function(hyperparameters):
    param_dict = {
        'n_estimators': int(hyperparameters[0]),
        'max_depth': int(hyperparameters[1]),
        'learning_rate': hyperparameters[2],
        'subsample': hyperparameters[3],
        'colsample_bytree': hyperparameters[4],
        'gamma': hyperparameters[5],
        'reg_alpha': hyperparameters[6],
        'reg_lambda': hyperparameters[7]
    }

    # Set hyperparameters for the classifier
    clf.set_params(**param_dict)

    # Train the classifier
    clf.fit(X_train, y_train)

    # Predict on the test set
    y_pred = clf.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)

    # Return negative accuracy since scipy minimizes the objective function
    return -accuracy

# Define the bounds for each hyperparameter
bounds = [(50, 500),  # n_estimators
          (3, 10),    # max_depth
          (0.01, 1.0),  # learning_rate
          (0.5, 1.0),  # subsample
          (0.5, 1.0),  # colsample_bytree
          (0, 5),      # gamma
          (0, 10),     # reg_alpha
          (0, 10)]     # reg_lambda

# Run simulated annealing to optimize hyperparameters
result = dual_annealing(objective_function, bounds, maxiter=50)

# Get the best hyperparameters
best_hyperparameters = result.x

# Convert best hyperparameters to dictionary
best_param_dict = {
    'n_estimators': int(best_hyperparameters[0]),
    'max_depth': int(best_hyperparameters[1]),
    'learning_rate': best_hyperparameters[2],
    'subsample': best_hyperparameters[3],
    'colsample_bytree': best_hyperparameters[4],
    'gamma': best_hyperparameters[5],
    'reg_alpha': best_hyperparameters[6],
    'reg_lambda': best_hyperparameters[7]
}

# Set the best hyperparameters for the classifier
clf.set_params(**best_param_dict)

# Train the classifier with the best hyperparameters
clf.fit(X_train, y_train)

# Predict on the test set with the best classifier
y_pred = clf.predict(X_test)

# Calculate accuracy with the best classifier
accuracy = accuracy_score(y_test, y_pred)

print("Best Accuracy:", accuracy)
print("Best Hyperparameters:", best_param_dict)


Best Accuracy: 0.9427083333333334
Best Hyperparameters: {'n_estimators': 330, 'max_depth': 6, 'learning_rate': 0.22418002399966896, 'subsample': 0.6414961479604244, 'colsample_bytree': 0.9148824531584978, 'gamma': 0.13911383599042892, 'reg_alpha': 2.830417461693287, 'reg_lambda': 0.1601114198565483}


In [None]:
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, auc, precision_score, recall_score

# Define a function to calculate sensitivity, specificity, precision, and NPV
def calculate_metrics(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    tn, fp, fn, tp = cm.ravel()

    sensitivity = tp / (tp + fn) * 100
    specificity = tn / (tn + fp) * 100
    precision = tp / (tp + fp) * 100
    npv = tn / (tn + fn) * 100

    return sensitivity, specificity, precision, npv

# Predict probabilities on the test set
y_pred_proba = clf.predict_proba(X_test)[:, 1]

# Calculate ROC curve and AUC
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

# Calculate accuracy, sensitivity, specificity, precision, and NPV
accuracy = accuracy_score(y_test, y_pred) * 100
sensitivity, specificity, precision, npv = calculate_metrics(y_test, y_pred)

print("Accuracy: %.2f%%" % accuracy)
print("Sensitivity: %.2f%%" % sensitivity)
print("Specificity: %.2f%%" % specificity)
print("Precision: %.2f%%" % precision)
print("NPV: %.2f%%" % npv)
print("AUC-ROC: %.2f%%" % (roc_auc * 100))


Accuracy: 94.27%
Sensitivity: 95.14%
Specificity: 93.43%
Precision: 93.36%
NPV: 95.19%
AUC-ROC: 98.17%


# **Genetic Algorithm**

In [None]:
pip install geneticalgorithm

Collecting geneticalgorithm
  Obtaining dependency information for geneticalgorithm from https://files.pythonhosted.org/packages/ac/d2/fb9061239eaeee5c0373844f27f43514f33201bc08aea54d65b437402966/geneticalgorithm-1.0.2-py3-none-any.whl.metadata
  Downloading geneticalgorithm-1.0.2-py3-none-any.whl.metadata (25 kB)
Collecting func-timeout (from geneticalgorithm)
  Downloading func_timeout-4.3.5.tar.gz (44 kB)
     ---------------------------------------- 0.0/44.3 kB ? eta -:--:--
     --------------------------- ------------ 30.7/44.3 kB 1.3 MB/s eta 0:00:01
     ---------------------------------------- 44.3/44.3 kB 1.1 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Downloading geneticalgorithm-1.0.2-py3-none-any.whl (16 kB)
Building wheels for collected packages: func-timeout
  Building wheel for func-timeout (setup.py): started
  Building wheel for func-timeout (setup.py): finished with status 'done'
  Created whe

In [None]:
#using deap lib
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from deap import base, creator, tools, algorithms

# Define the XGBoost classifier
clf = xgb.XGBClassifier()

# Define the function to optimize (maximize)
def objective_function(hyperparameters):
    # Convert hyperparameters to dictionary
    param_dict = {
        'n_estimators': int(hyperparameters[0]),
        'max_depth': int(hyperparameters[1]),
        'learning_rate': hyperparameters[2],
        'subsample': hyperparameters[3],
        'colsample_bytree': hyperparameters[4],
        'gamma': hyperparameters[5],
        'reg_alpha': hyperparameters[6],
        'reg_lambda': hyperparameters[7]
    }

    # Set hyperparameters for the classifier
    clf.set_params(**param_dict)

    # Train the classifier
    clf.fit(X_train, y_train)

    # Predict on the test set
    y_pred = clf.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)

    # Return negative accuracy to maximize
    return -accuracy

# Define the bounds for each hyperparameter
varbound = np.array([[50, 500],  # n_estimators
                     [3, 10],    # max_depth
                     [0.01, 1.0],  # learning_rate
                     [0.5, 1.0],  # subsample
                     [0.5, 1.0],  # colsample_bytree
                     [0, 5],     # gamma
                     [0, 10],    # reg_alpha
                     [0, 10]])   # reg_lambda

# Define the creator for the fitness and individual
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)

# Initialize toolbox
toolbox = base.Toolbox()

# Register attributes to the individual
toolbox.register("attr_float", np.random.uniform, -1, 1)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_float, n=8)

# Register the individual and population
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

# Register the evaluation function
toolbox.register("evaluate", objective_function)

# Register the crossover operator
toolbox.register("mate", tools.cxBlend, alpha=0.5)

# Register the mutation operator
toolbox.register("mutate", tools.mutGaussian, mu=0, sigma=0.2, indpb=0.1)

# Register the selection operator
toolbox.register("select", tools.selTournament, tournsize=3)

# Define main function to run genetic algorithm
def main():
    # Initialize population
    population = toolbox.population(n=10)

    # Define statistics to gather
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("max", np.max)

    # Define the algorithm
    algorithms.eaSimple(population, toolbox, cxpb=0.5, mutpb=0.2, ngen=50, stats=stats, verbose=True)

    # Get the best individual
    best_individual = tools.selBest(population, k=1)[0]

    return best_individual

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Run the genetic algorithm
best_individual = main()

# Convert the best individual to hyperparameters
best_hyperparameters = [int(val) if idx == 0 else val for idx, val in enumerate(best_individual)]

# Convert hyperparameters to dictionary
best_param_dict = {
    'n_estimators': int(best_hyperparameters[0]),
    'max_depth': int(best_hyperparameters[1]),
    'learning_rate': best_hyperparameters[2],
    'subsample': best_hyperparameters[3],
    'colsample_bytree': best_hyperparameters[4],
    'gamma': best_hyperparameters[5],
    'reg_alpha': best_hyperparameters[6],
    'reg_lambda': best_hyperparameters[7]
}

# Set the best hyperparameters for the classifier
clf.set_params(**best_param_dict)

# Train the classifier with the best hyperparameters
clf.fit(X_train, y_train)

# Predict on the test set with the best classifier
y_pred = clf.predict(X_test)

# Calculate accuracy with the best classifier
accuracy = accuracy_score(y_test, y_pred)

print("Best Accuracy:", accuracy)
print("Best Hyperparameters:", best_param_dict)


In [None]:
pip install geatpy


In [None]:
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from geatpy import GeneticAlgorithm

# Define the XGBoost classifier
clf = xgb.XGBClassifier()

# Define the function to optimize (maximize)
def objective_function(solution, *args):
    X_train, X_test, y_train, y_test = args

    # Convert the solution to hyperparameters
    n_estimators = int(solution[0])
    max_depth = int(solution[1])
    learning_rate = solution[2]
    subsample = solution[3]
    colsample_bytree = solution[4]
    gamma = solution[5]
    reg_alpha = solution[6]
    reg_lambda = solution[7]

    # Set hyperparameters for the classifier
    clf.set_params(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate,
                   subsample=subsample, colsample_bytree=colsample_bytree,
                   gamma=gamma, reg_alpha=reg_alpha, reg_lambda=reg_lambda)

    # Train the classifier
    clf.fit(X_train, y_train)

    # Predict on the test set
    y_pred = clf.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)

    return accuracy

# Define the bounds for each hyperparameter
lb = [50, 3, 0.01, 0.5, 0.5, 0, 0, 0]  # Lower bounds
ub = [500, 10, 1.0, 1.0, 1.0, 5, 10, 10]  # Upper bounds

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

# Set the parameters for the genetic algorithm
problem = 'R'  # Real-coded optimization problem
max_iter = 50  # Maximum number of iterations

# Create an instance of the genetic algorithm
ga = GeneticAlgorithm(problem, None, lb, ub, NIND=50, MAXGEN=max_iter)

# Run the genetic algorithm
best_solution, best_fitness = ga.run(objective_function, X_train, X_test, y_train, y_test)

print("Best Solution:", best_solution)
print("Best Fitness (Accuracy):", best_fitness)


ModuleNotFoundError: No module named 'geatpy'

In [None]:
pip uninstall Cython


In [None]:
pip uninstall geatpy

Note: you may need to restart the kernel to use updated packages.


