In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from xgboost import XGBClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score, auc
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif, RFE


In [None]:
# Locations of the raw training and test CSV files.
train_url = "https://raw.githubusercontent.com/zetomic/dataset/main/train.csv"
test_url = "https://raw.githubusercontent.com/zetomic/dataset/main/test.csv"

# Read both files into DataFrames, training set first.
train_data, test_data = (pd.read_csv(csv_url) for csv_url in (train_url, test_url))

In [None]:



# Baseline XGBoost pipeline: reload -> one-hot encode -> median impute ->
# stratified 80/20 split -> small grid search -> test predictions + ROC plot.

# Reload the raw CSVs so the cell always starts from unprocessed data. The
# steps below overwrite train_data/test_data with encoded frames, so without
# the reload a re-run (or a run after another preprocessing cell) would find
# no object columns left to encode.
train_data = pd.read_csv(train_url)
test_data = pd.read_csv(test_url)

# Separate numeric and categorical columns
numeric_cols = train_data.select_dtypes(include=['number']).columns.tolist()
categorical_cols = train_data.select_dtypes(include=['object']).columns.tolist()

# One-hot encode the categorical variables; the encoder is fitted on train only.
# The `sparse=False` keyword was renamed to `sparse_output` in scikit-learn 1.2
# and later removed, so the default sparse result is densified with .toarray()
# instead, which works on either side of the rename.
encoder = OneHotEncoder(drop='first')
encoded_features = encoder.fit_transform(train_data[categorical_cols]).toarray()
encoded_names = encoder.get_feature_names_out(categorical_cols)
train_encoded = pd.DataFrame(encoded_features, columns=encoded_names, index=train_data.index)
train_data = pd.concat([train_data.drop(columns=categorical_cols), train_encoded], axis=1)

encoded_features_test = encoder.transform(test_data[categorical_cols]).toarray()
test_encoded = pd.DataFrame(encoded_features_test, columns=encoded_names, index=test_data.index)
test_data = pd.concat([test_data.drop(columns=categorical_cols), test_encoded], axis=1)

# Handle missing values: medians come from the training frame and are reused for test
train_medians = train_data.median()
train_data = train_data.fillna(train_medians)
test_data = test_data.fillna(train_medians)

# Ensure test has exactly the training columns (minus the target), in the same
# order; columns missing from test are created and filled with 0.
test_data = test_data.reindex(columns=train_data.columns.drop('hospital_death'), fill_value=0)

# RecordID is only used as the submission key below, so it is kept out of the
# model inputs (the last cell of this notebook drops it before modelling too).
feature_cols = [col for col in train_data.columns if col not in ('hospital_death', 'RecordID')]
X_all = train_data[feature_cols]
y_all = train_data['hospital_death']
X_test = test_data[feature_cols]

# Use StratifiedShuffleSplit for a stratified train-validation split
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, val_index = next(sss.split(X_all, y_all))
X_train, X_val = X_all.iloc[train_index], X_all.iloc[val_index]
y_train, y_val = y_all.iloc[train_index], y_all.iloc[val_index]

# Standardize the data. The XGBoost model below is trained on the unscaled
# frames; the scaled arrays are only computed here, as in the original cell.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
test_data_scaled = scaler.transform(X_test)

# Define the hyperparameters and their possible values for XGBoost
param_grid = {
    'n_estimators': [200],
    'max_depth': [7],
    'learning_rate': [0.05],
    'min_child_weight': [1, 2, 3],
    'subsample': [0.7],
}

# Initialize the XGBoost classifier and the grid search
clf = XGBClassifier(random_state=42)
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='accuracy')

# Fit the model
grid_search.fit(X_train, y_train)

# Get the best parameters from the grid search
best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")

# GridSearchCV (refit=True by default) has already retrained the best
# configuration on all of X_train, so reuse it instead of fitting again.
best_xgb = grid_search.best_estimator_

# Predict and compute accuracy on the validation set
y_val_pred = best_xgb.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Accuracy on validation set with best parameters: {val_accuracy * 100:.2f}%")

# Predict on the test set
y_test_proba = best_xgb.predict_proba(X_test)[:, 1]

# Save the predictions to a CSV file with maximum precision
record_ids = test_data['RecordID'].astype(int)
output_df = pd.DataFrame({'RecordID': record_ids, 'hospital_death': y_test_proba})
output_df.to_csv('target-xgboost.csv', index=False, float_format='%.15f')

print("Probability predictions for the test set with high precision are saved in target-xgboost.csv")

# Compute ROC curve and ROC AUC from the positive-class probabilities
y_val_prob = best_xgb.predict_proba(X_val)[:, 1]
fpr, tpr, thresholds = roc_curve(y_val, y_val_prob)
roc_auc = auc(fpr, tpr)

# Plot ROC curve
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance diagonal
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc='lower right')
plt.show()

In [None]:


# NOTE(review): this cell runs the same baseline XGBoost pipeline as the
# preceding cell; consider deleting one of the two.

# Reload the raw CSVs so the cell always starts from unprocessed data. The
# steps below overwrite train_data/test_data with encoded frames, so when run
# after another preprocessing cell there would be no object columns left to
# encode.
train_data = pd.read_csv(train_url)
test_data = pd.read_csv(test_url)

# Separate numeric and categorical columns
numeric_cols = train_data.select_dtypes(include=['number']).columns.tolist()
categorical_cols = train_data.select_dtypes(include=['object']).columns.tolist()

# One-hot encode the categorical variables; the encoder is fitted on train only.
# The `sparse=False` keyword was renamed to `sparse_output` in scikit-learn 1.2
# and later removed, so the default sparse result is densified with .toarray()
# instead, which works on either side of the rename.
encoder = OneHotEncoder(drop='first')
encoded_features = encoder.fit_transform(train_data[categorical_cols]).toarray()
encoded_names = encoder.get_feature_names_out(categorical_cols)
train_encoded = pd.DataFrame(encoded_features, columns=encoded_names, index=train_data.index)
train_data = pd.concat([train_data.drop(columns=categorical_cols), train_encoded], axis=1)

encoded_features_test = encoder.transform(test_data[categorical_cols]).toarray()
test_encoded = pd.DataFrame(encoded_features_test, columns=encoded_names, index=test_data.index)
test_data = pd.concat([test_data.drop(columns=categorical_cols), test_encoded], axis=1)

# Handle missing values: medians come from the training frame and are reused for test
train_medians = train_data.median()
train_data = train_data.fillna(train_medians)
test_data = test_data.fillna(train_medians)

# Ensure test has exactly the training columns (minus the target), in the same
# order; columns missing from test are created and filled with 0.
test_data = test_data.reindex(columns=train_data.columns.drop('hospital_death'), fill_value=0)

# RecordID is only used as the submission key below, so it is kept out of the
# model inputs (the last cell of this notebook drops it before modelling too).
feature_cols = [col for col in train_data.columns if col not in ('hospital_death', 'RecordID')]
X_all = train_data[feature_cols]
y_all = train_data['hospital_death']
X_test = test_data[feature_cols]

# Use StratifiedShuffleSplit for a stratified train-validation split
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, val_index = next(sss.split(X_all, y_all))
X_train, X_val = X_all.iloc[train_index], X_all.iloc[val_index]
y_train, y_val = y_all.iloc[train_index], y_all.iloc[val_index]

# Standardize the data. The XGBoost model below is trained on the unscaled
# frames; the scaled arrays are only computed here, as in the original cell.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
test_data_scaled = scaler.transform(X_test)

# Define the hyperparameters and their possible values for XGBoost
param_grid = {
    'n_estimators': [200],
    'max_depth': [7],
    'learning_rate': [0.05],
    'min_child_weight': [1, 2, 3],
    'subsample': [0.7],
}

# Initialize the XGBoost classifier and the grid search
clf = XGBClassifier(random_state=42)
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='accuracy')

# Fit the model
grid_search.fit(X_train, y_train)

# Get the best parameters from the grid search
best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")

# GridSearchCV (refit=True by default) has already retrained the best
# configuration on all of X_train, so reuse it instead of fitting again.
best_xgb = grid_search.best_estimator_

# Predict and compute accuracy on the validation set
y_val_pred = best_xgb.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Accuracy on validation set with best parameters: {val_accuracy * 100:.2f}%")

# Predict on the test set
y_test_proba = best_xgb.predict_proba(X_test)[:, 1]

# Save the predictions to a CSV file with maximum precision
record_ids = test_data['RecordID'].astype(int)
output_df = pd.DataFrame({'RecordID': record_ids, 'hospital_death': y_test_proba})
output_df.to_csv('target-xgboost.csv', index=False, float_format='%.15f')

print("Probability predictions for the test set with high precision are saved in target-xgboost.csv")

# Compute ROC curve and ROC AUC from the positive-class probabilities
y_val_prob = best_xgb.predict_proba(X_val)[:, 1]
fpr, tpr, thresholds = roc_curve(y_val, y_val_prob)
roc_auc = auc(fpr, tpr)

# Plot ROC curve
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance diagonal
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc='lower right')
plt.show()

In [None]:


# Class-weighted XGBoost tuned on ROC AUC, then combined with random forest,
# logistic regression and SVC in a soft-voting ensemble.

# Reload the raw CSVs so the cell always starts from unprocessed data. The
# steps below overwrite train_data/test_data with imputed and encoded frames,
# so when run after another preprocessing cell there would be no object
# columns left for the categorical imputer and encoder.
train_data = pd.read_csv(train_url)
test_data = pd.read_csv(test_url)

# Separate numeric and categorical columns; the target is not imputed
numeric_cols = [
    col for col in train_data.select_dtypes(include=['number']).columns
    if col != 'hospital_death'
]
categorical_cols = train_data.select_dtypes(include=['object']).columns.tolist()

# Impute missing values
# For numerical columns, use median imputation (statistics learned on train)
num_imputer = SimpleImputer(strategy='median')
train_data[numeric_cols] = num_imputer.fit_transform(train_data[numeric_cols])
test_data[numeric_cols] = num_imputer.transform(test_data[numeric_cols])

# For categorical columns, use mode imputation
cat_imputer = SimpleImputer(strategy='most_frequent')
train_data[categorical_cols] = cat_imputer.fit_transform(train_data[categorical_cols])
test_data[categorical_cols] = cat_imputer.transform(test_data[categorical_cols])

# One-hot encode categorical variables; the encoder is fitted on train only.
# The `sparse=False` keyword was renamed to `sparse_output` in scikit-learn 1.2
# and later removed, so the default sparse result is densified with .toarray()
# instead, which works on either side of the rename.
encoder = OneHotEncoder(drop='first')
encoded_features = encoder.fit_transform(train_data[categorical_cols]).toarray()
encoded_names = encoder.get_feature_names_out(categorical_cols)
train_encoded = pd.DataFrame(encoded_features, columns=encoded_names, index=train_data.index)
train_data = pd.concat([train_data.drop(columns=categorical_cols), train_encoded], axis=1)

encoded_features_test = encoder.transform(test_data[categorical_cols]).toarray()
test_encoded = pd.DataFrame(encoded_features_test, columns=encoded_names, index=test_data.index)
test_data = pd.concat([test_data.drop(columns=categorical_cols), test_encoded], axis=1)

# Ensure test has exactly the training columns (minus the target), in the same
# order; columns missing from test are created and filled with 0.
test_data = test_data.reindex(columns=train_data.columns.drop('hospital_death'), fill_value=0)

# RecordID is only used as the submission key below, so it is kept out of the
# model inputs (the last cell of this notebook drops it before modelling too).
feature_cols = [col for col in train_data.columns if col not in ('hospital_death', 'RecordID')]
X_all = train_data[feature_cols]
y_all = train_data['hospital_death']
X_test = test_data[feature_cols]

# Use StratifiedShuffleSplit for a stratified train-validation split
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, val_index = next(sss.split(X_all, y_all))
X_train, X_val = X_all.iloc[train_index], X_all.iloc[val_index]
y_train, y_val = y_all.iloc[train_index], y_all.iloc[val_index]

# Standardize the data (scaler fitted on the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
test_data_scaled = scaler.transform(X_test)

# Class imbalance ratio (negatives / positives), measured on the training
# split only so the validation rows do not influence the model configuration.
class_ratio = float((y_train == 0).sum() / (y_train == 1).sum())

# Define the hyperparameters and their possible values for XGBoost
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 7, 9],
    'learning_rate': [0.01, 0.05, 0.1],
    'min_child_weight': [1, 2, 3],
    'subsample': [0.7, 0.8, 0.9],
    'scale_pos_weight': [class_ratio]
}

# Initialize the XGBoost classifier
clf_xgb = XGBClassifier(random_state=42)

# Initialize GridSearchCV with roc_auc scoring instead of accuracy
grid_search = GridSearchCV(estimator=clf_xgb, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='roc_auc')

# Fit the model
grid_search.fit(X_train_scaled, y_train)

# Get the best parameters from the grid search
best_params = grid_search.best_params_
print(f"Best Parameters for XGBoost: {best_params}")

# GridSearchCV (refit=True by default) has already retrained the best
# configuration on the whole training split, so reuse it instead of fitting again.
best_xgb = grid_search.best_estimator_

# Initialize other classifiers
clf_rf = RandomForestClassifier(random_state=42)
clf_lr = LogisticRegression(random_state=42)
clf_svc = SVC(probability=True, random_state=42)  # probability=True is needed for soft voting

# Voting Classifier (averages the predicted class probabilities)
voting_clf = VotingClassifier(
    estimators=[('xgb', best_xgb), ('rf', clf_rf), ('lr', clf_lr), ('svc', clf_svc)],
    voting='soft'
)

# Fit the voting classifier
voting_clf.fit(X_train_scaled, y_train)

# Predict and compute accuracy on the validation set
y_val_pred = voting_clf.predict(X_val_scaled)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Accuracy on validation set with ensemble model: {val_accuracy * 100:.2f}%")

# Also report ROC AUC: it is the metric the grid search optimises and is not
# dominated by the majority class the way accuracy is.
y_val_prob = voting_clf.predict_proba(X_val_scaled)[:, 1]
print(f"ROC AUC on validation set with ensemble model: {roc_auc_score(y_val, y_val_prob):.4f}")

# Predict on the test set
y_test_proba = voting_clf.predict_proba(test_data_scaled)[:, 1]

# Save the predictions to a CSV file with maximum precision
record_ids = test_data['RecordID'].astype(int)
output_df = pd.DataFrame({'RecordID': record_ids, 'hospital_death': y_test_proba})
output_df.to_csv('target-ensemble.csv', index=False, float_format='%.15f')

print("Probability predictions for the test set with high precision are saved in target-ensemble.csv")


In [None]:
# PCA-reduced features fed to XGBoost, CatBoost, a soft-voting ensemble and a
# stacking ensemble; writes one prediction file per model plus ROC curve data.

# Reload the raw CSVs so the cell always starts from unprocessed data. The
# steps below drop RecordID and overwrite train_data/test_data, so without the
# reload a re-run fails on the missing column and a run after another
# preprocessing cell finds no object columns left to encode.
train_data = pd.read_csv(train_url)
test_data = pd.read_csv(test_url)

# Keep the real record identifiers for the output files before the ID column is
# removed from the features; the positional index is not the RecordID.
record_ids = test_data['RecordID'].astype(int)
train_data = train_data.drop(columns=['RecordID'])
test_data = test_data.drop(columns=['RecordID'])

# Separate numeric and categorical columns
numeric_cols = train_data.select_dtypes(include=['number']).columns.tolist()
categorical_cols = train_data.select_dtypes(include=['object']).columns.tolist()

# One-hot encode the categorical variables; the encoder is fitted on train only.
# The `sparse=False` keyword was renamed to `sparse_output` in scikit-learn 1.2
# and later removed, so the default sparse result is densified with .toarray()
# instead, which works on either side of the rename.
encoder = OneHotEncoder(drop='first')
encoded_features = encoder.fit_transform(train_data[categorical_cols]).toarray()
encoded_names = encoder.get_feature_names_out(categorical_cols)
train_encoded = pd.DataFrame(encoded_features, columns=encoded_names, index=train_data.index)
train_data = pd.concat([train_data.drop(columns=categorical_cols), train_encoded], axis=1)

encoded_features_test = encoder.transform(test_data[categorical_cols]).toarray()
test_encoded = pd.DataFrame(encoded_features_test, columns=encoded_names, index=test_data.index)
test_data = pd.concat([test_data.drop(columns=categorical_cols), test_encoded], axis=1)

# Handle missing values with the per-column training mode. DataFrame.mode()
# returns a DataFrame, and fillna() with a DataFrame aligns on the row index as
# well as the columns, so the first mode row is taken as a Series to fill every row.
train_modes = train_data.mode().iloc[0]
train_data = train_data.fillna(train_modes)
test_data = test_data.fillna(train_modes)

# Ensure test has exactly the training columns (minus the target), in the same
# order; columns missing from test are created and filled with 0.
test_data = test_data.reindex(columns=train_data.columns.drop('hospital_death'), fill_value=0)

# Use StratifiedShuffleSplit for a stratified train-validation split
X_all = train_data.drop(columns='hospital_death')
y_all = train_data['hospital_death']
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, val_index = next(sss.split(X_all, y_all))
X_train, X_val = X_all.iloc[train_index], X_all.iloc[val_index]
y_train, y_val = y_all.iloc[train_index], y_all.iloc[val_index]

# Standardize the data (scaler fitted on the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
test_data_scaled = scaler.transform(test_data)

# The original label-encoding loop over categorical_cols was removed: those
# columns are replaced by their one-hot columns above, so the loop never ran.

# Principal Component Analysis (PCA)
pca = PCA(n_components=10)  # Adjust the number of components
X_train_pca = pca.fit_transform(X_train_scaled)
X_val_pca = pca.transform(X_val_scaled)
test_data_pca = pca.transform(test_data_scaled)

# ----- XGBoost -----
param_grid_xgboost = {
    'n_estimators': [195],
    'max_depth': [7],
    'learning_rate': [0.05],
    'min_child_weight': [1],
    'subsample': [0.7],
}
clf_xgboost = XGBClassifier(random_state=42)
grid_search_xgboost = GridSearchCV(estimator=clf_xgboost, param_grid=param_grid_xgboost, cv=3, n_jobs=-1, verbose=2, scoring='accuracy')
grid_search_xgboost.fit(X_train_pca, y_train)

best_params_xgboost = grid_search_xgboost.best_params_
print(f"Best Parameters for XGBoost: {best_params_xgboost}")

# GridSearchCV (refit=True by default) has already retrained the best
# configuration on the whole training split, so reuse it instead of fitting again.
best_xgboost = grid_search_xgboost.best_estimator_

y_val_pred_xgboost = best_xgboost.predict(X_val_pca)
val_accuracy_xgboost = accuracy_score(y_val, y_val_pred_xgboost)
print(f"Accuracy on validation set with best parameters for XGBoost: {val_accuracy_xgboost * 100:.2f}%")

# NOTE(review): earlier cells write target-xgboost.csv with a plain
# `hospital_death` column; confirm which header the consumer of this file expects.
y_test_proba_xgboost = best_xgboost.predict_proba(test_data_pca)[:, 1]
output_df_xgboost = pd.DataFrame({'RecordID': record_ids, 'hospital_death_xgboost': y_test_proba_xgboost})
output_df_xgboost.to_csv('target-xgboost.csv', index=False, float_format='%.15f')

print("Probability predictions for the test set with high precision are saved in target-xgboost.csv")

# ----- CatBoost -----
clf_catboost = CatBoostClassifier(random_state=42, verbose=0)
param_grid_catboost = {
    'n_estimators': [185],
    'max_depth': [5, 7, 10],
    'learning_rate': [0.05, 0.1],
    # NOTE(review): intended as the counterpart of XGBoost's min_child_weight;
    # confirm CatBoost honours it with the default tree growing policy.
    'min_child_samples': [1, 2, 3],
    'subsample': [0.7, 0.8],
}
grid_search_catboost = GridSearchCV(estimator=clf_catboost, param_grid=param_grid_catboost, cv=3, n_jobs=-1, verbose=2, scoring='accuracy')
grid_search_catboost.fit(X_train_pca, y_train)

best_params_catboost = grid_search_catboost.best_params_
print(f"Best Parameters for CatBoost: {best_params_catboost}")

# Same as for XGBoost: the refitted best estimator is reused.
best_catboost = grid_search_catboost.best_estimator_

y_val_pred_catboost = best_catboost.predict(X_val_pca)
val_accuracy_catboost = accuracy_score(y_val, y_val_pred_catboost)
print(f"Accuracy on validation set with best parameters for CatBoost: {val_accuracy_catboost * 100:.2f}%")

y_test_proba_catboost = best_catboost.predict_proba(test_data_pca)[:, 1]
output_df_catboost = pd.DataFrame({'RecordID': record_ids, 'hospital_death_catboost': y_test_proba_catboost})
output_df_catboost.to_csv('target-catboost.csv', index=False, float_format='%.15f')

print("Probability predictions for the test set with high precision are saved in target-catboost.csv")

# ----- Voting Classifier -----
voting_clf = VotingClassifier(estimators=[('xgboost', best_xgboost), ('catboost', best_catboost)], voting='soft')
voting_clf.fit(X_train_pca, y_train)
y_val_pred_voting = voting_clf.predict(X_val_pca)
val_accuracy_voting = accuracy_score(y_val, y_val_pred_voting)
print(f"Accuracy on validation set with Voting Classifier: {val_accuracy_voting * 100:.2f}%")

y_test_proba_voting = voting_clf.predict_proba(test_data_pca)[:, 1]
output_df_voting = pd.DataFrame({'RecordID': record_ids, 'hospital_death_voting': y_test_proba_voting})
output_df_voting.to_csv('target-voting.csv', index=False, float_format='%.15f')

print("Probability predictions for the test set with Voting Classifier are saved in target-voting.csv")

# ----- Stacking Classifier -----
estimators = [('xgboost', best_xgboost), ('catboost', best_catboost)]
stacking_clf = StackingClassifier(estimators=estimators, final_estimator=ExtraTreesClassifier(random_state=42))
stacking_clf.fit(X_train_pca, y_train)

y_val_pred_stacking = stacking_clf.predict(X_val_pca)
val_accuracy_stacking = accuracy_score(y_val, y_val_pred_stacking)
print(f"Accuracy on validation set with Stacking Classifier: {val_accuracy_stacking * 100:.2f}%")

y_test_proba_stacking = stacking_clf.predict_proba(test_data_pca)[:, 1]
output_df_stacking = pd.DataFrame({'RecordID': record_ids, 'hospital_death_stacking': y_test_proba_stacking})
output_df_stacking.to_csv('target-stacking.csv', index=False, float_format='%.15f')

print("Probability predictions for the test set with Stacking Classifier are saved in target-stacking.csv")

# ----- ROC curves -----
# Curves are built from the positive-class probabilities. Hard 0/1 predictions
# would reduce each curve to a single operating point.
# Each entry: (CSV column suffix, legend name, line colour, fitted model)
roc_models = [
    ('XGBoost', 'XGBoost', 'darkorange', best_xgboost),
    ('CatBoost', 'CatBoost', 'green', best_catboost),
    ('Voting', 'Voting Classifier', 'blue', voting_clf),
    ('Stacking', 'Stacking Classifier', 'red', stacking_clf),
]

fig, ax = plt.subplots(figsize=(10, 8))
roc_columns = {}
for suffix, legend_name, colour, model in roc_models:
    val_scores = model.predict_proba(X_val_pca)[:, 1]
    fpr_model, tpr_model, _ = roc_curve(y_val, val_scores)
    roc_auc_model = auc(fpr_model, tpr_model)
    ax.plot(fpr_model, tpr_model, color=colour, lw=2, label=f'{legend_name} ROC curve (area = {roc_auc_model:.2f})')
    # Wrapped in Series because the curves have different numbers of
    # thresholds; pandas pads the shorter columns with NaN.
    roc_columns[f'FPR_{suffix}'] = pd.Series(fpr_model)
    roc_columns[f'TPR_{suffix}'] = pd.Series(tpr_model)

ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance diagonal
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc='lower right')
plt.show()

# Save ROC curve data to CSV
roc_data = pd.DataFrame(roc_columns)
roc_data.to_csv('roc_data.csv', index=False)

print("ROC curve data is saved in roc_data.csv")
