In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE  # Consider using SMOTE or ADASYN
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve, auc, classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Load the pre-cleaned dataset. Forward slashes are used on purpose: the
# previous backslash form relied on the invalid escape sequences "\p" and "\h"
# and could only be resolved on Windows.
df = pd.read_csv('datasets/processed/heart_no_nans.csv')

In [None]:
# Prepare the data: every column except 'HadHeartAttack' is a feature,
# and 'HadHeartAttack' itself is the target.
X = df.drop('HadHeartAttack', axis=1)
y = df['HadHeartAttack']

# Hold out 35% of the rows for testing; the fixed seed keeps the split
# reproducible. train_test_split is provided by the notebook's import cell.
# NOTE(review): the split is not stratified; if the target is imbalanced,
# consider stratify=y (and apply it to every cell that repeats this split so
# the models keep being scored on the same test rows).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=42)

# Min-max scaling (alternative to StandardScaler). The scaler is fitted on the
# training split only and then applied to both splits.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Resample the scaled training split with SMOTE to handle class imbalance.
# The test split is left untouched.
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

# Class counts after SMOTE, with each count written inside its bar.
plt.figure(figsize=(10, 6))
ax = sns.countplot(x=y_train_smote, palette='Set2')
for bars in ax.containers:
    ax.bar_label(bars, label_type='center', rotation=0, color='white')
ax.set_title("Distribution After SMOTE", size=14)
plt.show()

# Base logistic regression; the grid search below overrides the tuned
# hyperparameters.
logreg = LogisticRegression(
    solver='liblinear',
    penalty='l2',
    class_weight='balanced',
    max_iter=10000,
    random_state=42,
)

# Hyperparameter tuning with Stratified KFold and GridSearchCV.
# The grid is a list of sub-grids so that every candidate is a valid
# LogisticRegression configuration:
#   * 'liblinear' does not support the elastic-net penalty, and
#   * 'elasticnet' (saga only) needs an explicit l1_ratio.
# A single crossed dict made every elastic-net candidate fail to fit, so the
# penalty was never actually evaluated.
shared_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # Tuning regularization strength
    'max_iter': [100, 500, 1000, 10000],  # Experiment with different iteration limits
}
param_grid = [
    # Plain L2 penalty works with both solvers
    {**shared_grid, 'penalty': ['l2'], 'solver': ['liblinear', 'saga']},
    # ElasticNet as an alternative penalty: saga only, with an even L1/L2 mix
    {**shared_grid, 'penalty': ['elasticnet'], 'solver': ['saga'], 'l1_ratio': [0.5]},
]

# Five shuffled, stratified folds with a fixed seed for cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive search over param_grid, ranking candidates by accuracy.
# NOTE(review): the search is fitted on data that was oversampled before the
# folds were drawn, so validation folds contain synthetic rows and the CV
# score is likely optimistic; resampling inside an imblearn Pipeline would
# avoid that.
grid_search = GridSearchCV(
    estimator=logreg,
    param_grid=param_grid,
    cv=skf,
    scoring='accuracy',
)
grid_search.fit(X_train_smote, y_train_smote)

# Winning estimator as exposed by the fitted search
best_model = grid_search.best_estimator_

# Step 3: Predict and Evaluate on the scaled (not resampled) test split
y_pred_log_k = best_model.predict(X_test_scaled)
y_pred_proba_log_k = best_model.predict_proba(X_test_scaled)
positive_scores = y_pred_proba_log_k[:, 1]  # column 1 of predict_proba

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred_log_k)
print('Confusion Matrix:')
print(cm)

# Precision-recall curve and the trapezoidal area under it (AUC-PR)
precision, recall, thresholds = precision_recall_curve(y_test, positive_scores)
auc_pr = auc(recall, precision)

# One-column summary table: metric name -> value
metric_values = {
    'Accuracy': accuracy_score(y_test, y_pred_log_k),
    'Precision': precision_score(y_test, y_pred_log_k, average='binary'),
    'Recall': recall_score(y_test, y_pred_log_k, average='binary'),
    'F1-score': f1_score(y_test, y_pred_log_k, average='binary'),
    'AUC-PR': auc_pr,
}
KSMOTE_log_eval = pd.DataFrame(
    {"Logistic_Regression_KSMOTE": list(metric_values.values())},
    index=list(metric_values),
)

# Display evaluation results
print(KSMOTE_log_eval)

# Plot Precision-Recall Curve
pr_fig, pr_ax = plt.subplots(figsize=(8, 6))
pr_label = "Precision-Recall Curve (AUC-PR = %0.2f)" % auc_pr
pr_ax.plot(recall, precision, marker='.', color='b', label=pr_label)
pr_ax.set_xlabel('Recall')
pr_ax.set_ylabel('Precision')
pr_ax.set_title('Precision-Recall Curve')
pr_ax.legend()
plt.show()

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve, auc, classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from imblearn.over_sampling import ADASYN
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import VotingClassifier

# Features are every column except the target 'HadHeartAttack'
X = df.drop(columns='HadHeartAttack')
y = df['HadHeartAttack']

# 65/35 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.35, random_state=42
)

# Min-max scaling, fitted on the training split only and applied to both
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Resample the scaled training split with ADASYN to handle class imbalance
adasyn = ADASYN(sampling_strategy='auto', random_state=42)
X_train_adasyn, y_train_adasyn = adasyn.fit_resample(X_train_scaled, y_train)

# Class counts after ADASYN, with each count written inside its bar
plt.figure(figsize=(10, 6))
ax = sns.countplot(x=y_train_adasyn, palette='Set2')
for bars in ax.containers:
    ax.bar_label(bars, label_type='center', rotation=0, color='white')
ax.set_title("Distribution After ADASYN", size=14)
plt.show()

# Base logistic regression; the grid search below overrides the tuned
# hyperparameters.
logreg = LogisticRegression(
    solver='liblinear',
    penalty='l2',
    class_weight='balanced',
    max_iter=10000,
    random_state=42,
)

# Hyperparameter tuning with Stratified KFold and GridSearchCV.
# The grid is a list of sub-grids so that every candidate is a valid
# LogisticRegression configuration:
#   * 'liblinear' does not support the elastic-net penalty, and
#   * 'elasticnet' (saga only) needs an explicit l1_ratio.
# A single crossed dict made every elastic-net candidate fail to fit, so the
# penalty was never actually evaluated.
shared_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # Tuning regularization strength
    'max_iter': [100, 500, 1000, 10000],  # Experiment with different iteration limits
}
param_grid = [
    # Plain L2 penalty works with both solvers
    {**shared_grid, 'penalty': ['l2'], 'solver': ['liblinear', 'saga']},
    # ElasticNet as an alternative penalty: saga only, with an even L1/L2 mix
    {**shared_grid, 'penalty': ['elasticnet'], 'solver': ['saga'], 'l1_ratio': [0.5]},
]

# Five shuffled, stratified folds with a fixed seed for cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive search over param_grid, ranking candidates by accuracy.
# NOTE(review): the search is fitted on data that was oversampled before the
# folds were drawn, so validation folds contain synthetic rows and the CV
# score is likely optimistic; resampling inside an imblearn Pipeline would
# avoid that.
grid_search = GridSearchCV(
    estimator=logreg,
    param_grid=param_grid,
    cv=skf,
    scoring='accuracy',
)
grid_search.fit(X_train_adasyn, y_train_adasyn)

# Winning estimator as exposed by the fitted search
best_model = grid_search.best_estimator_

# Step 3: Predict and Evaluate on the scaled (not resampled) test split
y_pred_log_k = best_model.predict(X_test_scaled)
y_pred_proba_log_k = best_model.predict_proba(X_test_scaled)
positive_scores = y_pred_proba_log_k[:, 1]  # column 1 of predict_proba

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred_log_k)
print('Confusion Matrix:')
print(cm)

# Precision-recall curve and the trapezoidal area under it (AUC-PR)
precision, recall, thresholds = precision_recall_curve(y_test, positive_scores)
auc_pr = auc(recall, precision)

# Area under the ROC curve, from the same scores
roc_auc = roc_auc_score(y_test, positive_scores)
print("ROC-AUC:", roc_auc)

# One-column summary table: metric name -> value
metric_values = {
    'Accuracy': accuracy_score(y_test, y_pred_log_k),
    'Precision': precision_score(y_test, y_pred_log_k, average='binary'),
    'Recall': recall_score(y_test, y_pred_log_k, average='binary'),
    'F1-score': f1_score(y_test, y_pred_log_k, average='binary'),
    'AUC-PR': auc_pr,
    'AUC-ROC': roc_auc,
}
log_eval = pd.DataFrame(
    {"Logistic_Regression_ADASYN": list(metric_values.values())},
    index=list(metric_values),
)

# Display evaluation results
print(log_eval)

# Plot Precision-Recall Curve
pr_fig, pr_ax = plt.subplots(figsize=(8, 6))
pr_label = "Precision-Recall Curve (AUC-PR = %0.2f)" % auc_pr
pr_ax.plot(recall, precision, marker='.', color='b', label=pr_label)
pr_ax.set_xlabel('Recall')
pr_ax.set_ylabel('Precision')
pr_ax.set_title('Precision-Recall Curve')
pr_ax.legend()
plt.show()

# Plot ROC Curve
# roc_curve is not in this cell's own sklearn.metrics import; it comes from
# the notebook's first import cell, so the call resolves on a fresh kernel.
# The third return value (the thresholds) is not needed for the plot.
fpr, tpr, _ = roc_curve(y_test, y_pred_proba_log_k[:, 1])
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='b', label="ROC Curve (AUC-ROC = %0.2f)" % roc_auc)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import precision_recall_curve, auc, classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
from imblearn.over_sampling import ADASYN, SMOTE
import matplotlib.pyplot as plt
import seaborn as sns

# Features are every column except the target 'HadHeartAttack'
X = df.drop(columns='HadHeartAttack')
y = df['HadHeartAttack']

# 65/35 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.35, random_state=42
)

# Min-max scaling, fitted on the training split only and applied to both
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Resample the scaled training split with ADASYN to handle class imbalance
adasyn = ADASYN(sampling_strategy='auto', random_state=42)
X_train_adasyn, y_train_adasyn = adasyn.fit_resample(X_train_scaled, y_train)

# Class counts after ADASYN, with each count written inside its bar
plt.figure(figsize=(10, 6))
ax = sns.countplot(x=y_train_adasyn, palette='Set2')
for bars in ax.containers:
    ax.bar_label(bars, label_type='center', rotation=0, color='white')
ax.set_title("Distribution After ADASYN", size=14)
plt.show()

# Base learners, all seeded for reproducibility
logreg = LogisticRegression(
    solver='liblinear',
    penalty='l2',
    class_weight='balanced',
    max_iter=10000,
    random_state=42,
)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
gb = GradientBoostingClassifier(random_state=42)

# Soft-voting ensemble over the three learners; the names given here are the
# prefixes used to address each learner's parameters.
base_learners = [('logreg', logreg), ('rf', rf), ('gb', gb)]
voting_clf = VotingClassifier(estimators=base_learners, voting='soft')

# Hyperparameter tuning with Stratified KFold and GridSearchCV.
# The VotingClassifier itself is the estimator being searched, so its
# sub-estimators are addressed as '<estimator name>__<parameter>' using the
# names given in `estimators` ('logreg', 'rf', 'gb'). An extra 'voting_clf__'
# prefix would only be valid if the ensemble were a step inside a Pipeline;
# here it makes set_params reject every candidate.
param_grid = {
    'logreg__C': [0.001, 0.01, 0.1, 1, 10],  # Tuning regularization strength
    'logreg__max_iter': [100, 500, 1000],  # Experiment with different iteration limits
    'rf__n_estimators': [100, 200, 300],  # Random forest parameter tuning
    'gb__n_estimators': [100, 200, 300],  # Gradient boosting parameter tuning
}

# Five shuffled, stratified folds with a fixed seed for cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive search over param_grid, ranking candidates by accuracy; runs on
# all available cores and logs progress.
# NOTE(review): the search is fitted on data that was oversampled before the
# folds were drawn, so validation folds contain synthetic rows and the CV
# score is likely optimistic; resampling inside an imblearn Pipeline would
# avoid that.
grid_search = GridSearchCV(
    estimator=voting_clf,
    param_grid=param_grid,
    cv=skf,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train_adasyn, y_train_adasyn)

# Winning estimator as exposed by the fitted search
best_model = grid_search.best_estimator_

# Step 3: Predict and Evaluate on the scaled (not resampled) test split
y_pred_log_k = best_model.predict(X_test_scaled)
y_pred_proba_log_k = best_model.predict_proba(X_test_scaled)
positive_scores = y_pred_proba_log_k[:, 1]  # column 1 of predict_proba

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred_log_k)
print('Confusion Matrix:')
print(cm)

# Precision-recall curve and the trapezoidal area under it (AUC-PR)
precision, recall, thresholds = precision_recall_curve(y_test, positive_scores)
auc_pr = auc(recall, precision)

# Area under the ROC curve, from the same scores
roc_auc = roc_auc_score(y_test, positive_scores)
print("ROC-AUC:", roc_auc)

# One-column summary table: metric name -> value
metric_values = {
    'Accuracy': accuracy_score(y_test, y_pred_log_k),
    'Precision': precision_score(y_test, y_pred_log_k, average='binary'),
    'Recall': recall_score(y_test, y_pred_log_k, average='binary'),
    'F1-score': f1_score(y_test, y_pred_log_k, average='binary'),
    'AUC-PR': auc_pr,
    'AUC-ROC': roc_auc,
}
log_eval = pd.DataFrame(
    {"Voting_Classifier": list(metric_values.values())},
    index=list(metric_values),
)

# Display evaluation results
print(log_eval)

# Plot Precision-Recall Curve
pr_fig, pr_ax = plt.subplots(figsize=(8, 6))
pr_label = "Precision-Recall Curve (AUC-PR = %0.2f)" % auc_pr
pr_ax.plot(recall, precision, marker='.', color='b', label=pr_label)
pr_ax.set_xlabel('Recall')
pr_ax.set_ylabel('Precision')
pr_ax.set_title('Precision-Recall Curve')
pr_ax.legend()
plt.show()

# Plot ROC Curve (the thresholds returned by roc_curve are not needed)
fpr, tpr, _ = roc_curve(y_test, positive_scores)
roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_label = "ROC Curve (AUC-ROC = %0.2f)" % roc_auc
roc_ax.plot(fpr, tpr, color='b', label=roc_label)
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve')
roc_ax.legend()
plt.show()