In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm

df = pd.read_csv("D:\Capstone\Databases\ML CSV\CUDB_VFDB_combined.csv")

In [None]:
df.shape

(12775, 12)

In [None]:
# Separate the data into two classes
class_0 = df[df["rhythm"] == 0]
class_1 = df[df["rhythm"] == 1]

# Undersample class 0 to match the number of samples in class 1
class_0_undersampled = class_0.sample(n=len(class_1), random_state=42)

# Concatenate the undersampled class 0 with class 1
balanced_df = pd.concat([class_0_undersampled, class_1])

# Shuffle the concatenated DataFrame to mix the rows
balanced_df = balanced_df.sample(frac=1, random_state=100)

In [None]:
print(balanced_df['rhythm'].value_counts())

rhythm
1    2399
0    2399
Name: count, dtype: int64


In [None]:
#X = balanced_df.drop(columns=['std_dev','rhythm'])
X = balanced_df.iloc[:, :-1]
y = balanced_df.iloc[:, -1]

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state= 100)

In [None]:
import xgboost as xgb

model = xgb.XGBClassifier(colsample_bytree = 0.816432335230161,
gamma = 0,
learning_rate = 0.060309950161179966,
max_depth = 10,
n_estimators = 500,
reg_alpha = 0,
reg_lambda = 7,
subsample = 0.9946951308154663)

#model = xgb.XGBClassifier()
model.fit(X_train,y_train)
y_predict = model.predict(X_test)
y_train_predict = model.predict(X_train)
from sklearn.metrics import accuracy_score
print('Train accuracy',accuracy_score(y_train, y_train_predict))
print('Test accuracy',accuracy_score(y_test,y_predict))

Train accuracy 0.9994788952579469
Test accuracy 0.95


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import xgboost as xgb

# Read the data
df = pd.read_csv("D:/Capstone/Databases/ML CSV/CUDB_VFDB_combined.csv")

# Separate features and target variable
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Undersample the majority class to balance the dataset
class_0 = df[df["rhythm"] == 0]
class_1 = df[df["rhythm"] == 1]
class_0_undersampled = class_0.sample(n=len(class_1), random_state=42)
balanced_df = pd.concat([class_0_undersampled, class_1])
balanced_df = balanced_df.sample(frac=1, random_state=100)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

# Train Random Forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Generate predictions from Random Forest model
rf_predictions = rf_model.predict(X_test)

# Train XGBoost model
xgb_model = xgb.XGBClassifier(
    colsample_bytree=0.816432335230161,
    gamma=0,
    learning_rate=0.060309950161179966,
    max_depth=10,
    n_estimators=500,
    reg_alpha=0,
    reg_lambda=7,
    subsample=0.9946951308154663
)
xgb_model.fit(X_train, y_train)

# Generate predictions from XGBoost model
xgb_predictions = xgb_model.predict(X_test)

# Convert hybrid predictions to binary
hybrid_predictions_binary = (rf_predictions + xgb_predictions) / 2
hybrid_predictions_binary[hybrid_predictions_binary < 0.5] = 0
hybrid_predictions_binary[hybrid_predictions_binary >= 0.5] = 1

# Evaluate the performance of the hybrid model
hybrid_accuracy = accuracy_score(y_test, hybrid_predictions_binary)
print('Hybrid Model Accuracy:', hybrid_accuracy)



Hybrid Model Accuracy: 0.9432485322896281


In [None]:
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.metrics import confusion_matrix

# Confusion matrix for hybrid model
tn, fp, fn, tp = confusion_matrix(y_test, hybrid_predictions_binary).ravel()

# Calculate sensitivity
sensitivity = tp / (tp + fn)

# Calculate specificity
specificity = tn / (tn + fp)

# Calculate precision
precision = tp / (tp + fp)

# Calculate NPV (Negative Predictive Value)
npv = tn / (tn + fn)
# Calculate AUC-ROC score for hybrid model
auc_roc = roc_auc_score(y_test, hybrid_predictions_binary)
print("AUC-ROC Score for Hybrid Model:", auc_roc)
# Print the calculated metrics
print("Sensitivity:", sensitivity)
print("Specificity:", specificity)
print("Precision:", precision)
print("NPV (Negative Predictive Value):", npv)

AUC-ROC Score for Hybrid Model: 0.9094938600815224
Sensitivity: 0.8548057259713702
Specificity: 0.9641819941916747
Precision: 0.8495934959349594
NPV (Negative Predictive Value): 0.9655841008240427


In [None]:
accuracy_score(y_test,rf_predictions)

0.938160469667319

In [None]:
accuracy_score(y_test,xgb_predictions)

0.9436399217221135

# **Bayesian optimization for RF**

In [None]:
from skopt import BayesSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score

# Define the objective function
def objective(params):
    rf = RandomForestClassifier(**params)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return -accuracy  # Minimize negative accuracy

# Define the search space
param_space = {
    'n_estimators': (50, 500),
    'max_depth': (3, 10),
    'min_samples_split': (2, 20),
    'min_samples_leaf': (1, 10),
    'max_features': (0.1, 1.0),
}

# Initialize the BayesSearchCV object
opt = BayesSearchCV(
    RandomForestClassifier(),
    param_space,
    n_iter=50,
    scoring='accuracy',
    cv=5,
    random_state=42,
    n_jobs=-1
)
np.int = int

# Fit the BayesSearchCV object
opt.fit(X_train, y_train)

# Get the best hyperparameters
best_params = opt.best_params_
print("Best Hyperparameters:", best_params)
