In [None]:
import optuna
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import joblib
from scipy.stats import skew

# sklearn
from sklearn.model_selection import (
    cross_val_score, KFold, StratifiedKFold,
    train_test_split, RandomizedSearchCV, GridSearchCV)
from sklearn.metrics import (
    precision_score, recall_score, confusion_matrix, roc_auc_score,
    RocCurveDisplay, accuracy_score, f1_score, classification_report,
    balanced_accuracy_score, fbeta_score, precision_recall_curve, roc_curve, PrecisionRecallDisplay)
from sklearn.preprocessing import (
    StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder,
    OrdinalEncoder, PowerTransformer)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


# imblearn
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline

import warnings
warnings.filterwarnings('ignore')

<br>

### Loading data

In [None]:
# Load the pre-split base data; the tuple order pairs each name with its file.
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
X_train, X_test, y_train, y_test = (
    pd.read_pickle(path)
    for path in ('X_train.pkl', 'X_test.pkl', 'y_train.pkl', 'y_test.pkl')
)

# Load the age & sex matched data the same way.
X_train_matched, X_test_matched, y_train_matched, y_test_matched = (
    pd.read_pickle(path)
    for path in (
        'X_train_matched.pkl', 'X_test_matched.pkl',
        'y_train_matched.pkl', 'y_test_matched.pkl',
    )
)


# Tuning budget shared by every Optuna study in this notebook.
n_iterations = 100     # number of trials per hyperparameter search
cv_folds = 5           # number of cross-validation folds used inside each trial

<br>

### Scoring metric = ROC AUC

In [None]:
# base data hyperparameter tuning
def objective(trial):
    """Optuna objective: mean cross-validated ROC AUC on the base training data.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the LightGBM hyperparameters.

    Returns
    -------
    float
        Mean ``roc_auc`` over the ``cv_folds`` validation folds of
        ``X_train`` / ``y_train``.
    """
    params = {
        # search space, sampled once per trial
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_bin': trial.suggest_int('max_bin', 128, 1024),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'num_leaves': trial.suggest_int('num_leaves', 8, 128),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-3, 1000, log=True),
        'min_sum_hessian_in_leaf': trial.suggest_float('min_sum_hessian_in_leaf', 1e-4, 10, log=True),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 5000),
        # fixed settings, identical for every trial
        'n_jobs': -1,
        'bagging_freq': 1,
        'force_row_wise': True,
        'bagging_seed': 2024,
        'verbosity': -100,
        'extra_trees': False}

    model = lgb.LGBMClassifier(**params)

    # Stratified folds preserve the class proportions of y_train in every
    # split, which plain KFold does not guarantee for a classification target.
    cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=1)
    # Folds run sequentially (n_jobs=1); the model itself already uses all cores.
    scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='roc_auc', n_jobs=1)
    return scores.mean()

# create and run the study
# TPE is Optuna's default sampler; seeding it explicitly makes the sequence of
# suggested trials repeatable, so the reported best parameters can be regenerated.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=2024),
)
study.optimize(objective, n_trials=n_iterations, show_progress_bar=True)

# report the best hyperparameters and their cross-validation score
print("\nBest hyperparameters found:")
for param, value in study.best_params.items():
    print(f"{param}: {value}")
print(f"\nBest cross-validation score: {study.best_value:.4f}")

In [None]:
# base data model evaluation

# Hyperparameters copied from an earlier run of the tuning cell above
# (hard-coded, so this cell does not depend on the `study` object).
base_params = {
    'n_estimators': 368,
    'max_bin': 612,
    'learning_rate': 0.04492929029938169,
    'max_depth': 4,
    'num_leaves': 69,
    'bagging_fraction': 0.8930333984571698,
    'feature_fraction': 0.6723092127660762,
    'lambda_l2': 101.66071203673786,
    'min_sum_hessian_in_leaf': 0.0035990279823698043,
    'min_data_in_leaf': 24,

    # fixed settings (same as in the tuning objective):
    'n_jobs': -1,
    'bagging_freq': 1,
    'force_row_wise': True,
    'bagging_seed': 2024,
    'verbosity': -100,
    'extra_trees': False,
}

# train the final model on the full base training set
base_model = lgb.LGBMClassifier(**base_params)
base_model.fit(X_train, y_train)

# predict hard labels and positive-class probabilities for the test set
y_pred = base_model.predict(X_test)
y_pred_proba = base_model.predict_proba(X_test)[:, 1]

# evaluation metrics: label-based metrics use y_pred, AUROC uses the probabilities
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
auroc = roc_auc_score(y_test, y_pred_proba)


print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"AUROC: {auroc:.4f}")
print("\nConfusion Matrix \nTN FP \nFN TP:")
print(conf_matrix)

# feature importance, sorted ascending so the most important bar is drawn on top
feature_importance = base_model.feature_importances_
feature_names = X_train.columns if hasattr(X_train, 'columns') else [f'Feature_{i}' for i in range(X_train.shape[1])]
sorted_idx = np.argsort(feature_importance)
fig, ax = plt.subplots(figsize=(5, 5))
ax.barh(range(len(sorted_idx)), feature_importance[sorted_idx])
ax.set_yticks(range(len(sorted_idx)))
ax.set_yticklabels([feature_names[i] for i in sorted_idx])
ax.set_xlabel('Feature Importance')
ax.set_title('LightGBM Feature Importance')
fig.tight_layout()
plt.show()

# ROC curve
# Pass the axes explicitly: without `ax`, the sklearn display opens its own
# figure, leaving the sized figure empty and ignoring the requested figsize.
fig, ax = plt.subplots(figsize=(3, 3))
RocCurveDisplay.from_predictions(y_test, y_pred_proba, ax=ax)
ax.set_title('ROC Curve')
ax.plot([0, 1], [0, 1], 'k--', label='0.5')  # chance-level diagonal
ax.legend()
plt.show()

# precision recall curve (same explicit-axes handling as the ROC curve)
fig, ax = plt.subplots(figsize=(3, 3))
PrecisionRecallDisplay.from_predictions(y_test, y_pred_proba, ax=ax)
ax.set_title('Precision-Recall Curve')
ax.legend()
plt.show()

<br>

In [None]:
# Randomly undersample the training set.
# Despite the `_smote` suffix on the result names, this is random
# undersampling (RandomUnderSampler), not SMOTE oversampling.
# sampling_strategy=0.5 is the target minority/majority ratio after resampling
# (per the imbalanced-learn docs for a float strategy on a binary target).
undersampler = RandomUnderSampler(sampling_strategy=0.5, random_state=1)
X_train_smote, y_train_smote = undersampler.fit_resample(X_train, y_train)

# Show the class counts before and after resampling.
print("before undersampling:", y_train.value_counts(), sep="\n")
print("\nafter undersampling:", y_train_smote.value_counts(), sep="\n")

In [None]:
# undersampled hyperparameter tuning

def objective(trial):
    """Optuna objective: mean cross-validated ROC AUC on the undersampled training data.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the LightGBM hyperparameters.

    Returns
    -------
    float
        Mean ``roc_auc`` over the ``cv_folds`` validation folds of
        ``X_train_smote`` / ``y_train_smote`` (the randomly undersampled
        training set).
    """
    params = {
        # search space, sampled once per trial
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_bin': trial.suggest_int('max_bin', 128, 1024),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'num_leaves': trial.suggest_int('num_leaves', 8, 128),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-3, 1000, log=True),
        'min_sum_hessian_in_leaf': trial.suggest_float('min_sum_hessian_in_leaf', 1e-4, 10, log=True),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 5000),
        # fixed settings, identical for every trial
        'n_jobs': -1,
        'bagging_freq': 1,
        'force_row_wise': True,
        'bagging_seed': 2024,
        'verbosity': -100,
        'extra_trees': False}

    model = lgb.LGBMClassifier(**params)

    # Stratified folds preserve the class proportions of y_train_smote in every
    # split, which plain KFold does not guarantee for a classification target.
    cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=1)
    # Folds run sequentially (n_jobs=1); the model itself already uses all cores.
    scores = cross_val_score(model, X_train_smote, y_train_smote, cv=cv, scoring='roc_auc', n_jobs=1)
    return scores.mean()

# create and run the study
# TPE is Optuna's default sampler; seeding it explicitly makes the sequence of
# suggested trials repeatable, so the reported best parameters can be regenerated.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=2024),
)
study.optimize(objective, n_trials=n_iterations, show_progress_bar=True)

# report the best hyperparameters and their cross-validation score
print("\nBest hyperparameters found:")
for param, value in study.best_params.items():
    print(f"{param}: {value}")
print(f"\nBest cross-validation score: {study.best_value:.4f}")

In [None]:
# undersampled model evaluation

# Hyperparameters copied from an earlier run of the tuning cell above
# (hard-coded, so this cell does not depend on the `study` object).
smote_params = {
    'n_estimators': 381,
    'max_bin': 369,
    'learning_rate': 0.030027550861638556,
    'max_depth': 3,
    'num_leaves': 87,
    'bagging_fraction': 0.9336151881344432,
    'feature_fraction': 0.6670365046716105,
    'lambda_l2': 0.6386045468782642,
    'min_sum_hessian_in_leaf': 0.01048928153287772,
    'min_data_in_leaf': 687,

    # fixed settings (same as in the tuning objective):
    'n_jobs': -1,
    'bagging_freq': 1,
    'force_row_wise': True,
    'bagging_seed': 2024,
    'verbosity': -100,
    'extra_trees': False}

# train the final model on the undersampled training set
smote_model = lgb.LGBMClassifier(**smote_params)
smote_model.fit(X_train_smote, y_train_smote)

# predict on the original (not resampled) test set
y_pred = smote_model.predict(X_test)
y_pred_proba = smote_model.predict_proba(X_test)[:, 1]

# evaluation metrics: label-based metrics use y_pred, AUROC uses the probabilities
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
auroc = roc_auc_score(y_test, y_pred_proba)


print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"AUROC: {auroc:.4f}")
print("\nConfusion Matrix \nTN FP \nFN TP:")
print(conf_matrix)

# feature importance, sorted ascending so the most important bar is drawn on top
feature_importance = smote_model.feature_importances_
# Check the same object whose columns are read (previously this tested X_train).
feature_names = X_train_smote.columns if hasattr(X_train_smote, 'columns') else [f'Feature_{i}' for i in range(X_train_smote.shape[1])]
sorted_idx = np.argsort(feature_importance)
fig, ax = plt.subplots(figsize=(5, 5))
ax.barh(range(len(sorted_idx)), feature_importance[sorted_idx])
ax.set_yticks(range(len(sorted_idx)))
ax.set_yticklabels([feature_names[i] for i in sorted_idx])
ax.set_xlabel('Feature Importance')
ax.set_title('LightGBM Feature Importance')
fig.tight_layout()
plt.show()

# ROC curve
# Pass the axes explicitly: without `ax`, the sklearn display opens its own
# figure, leaving the sized figure empty and ignoring the requested figsize.
fig, ax = plt.subplots(figsize=(3, 3))
RocCurveDisplay.from_predictions(y_test, y_pred_proba, ax=ax)
ax.set_title('ROC Curve')
ax.plot([0, 1], [0, 1], 'k--', label='0.5')  # chance-level diagonal
ax.legend()
plt.show()

# precision recall curve (same explicit-axes handling as the ROC curve)
fig, ax = plt.subplots(figsize=(3, 3))
PrecisionRecallDisplay.from_predictions(y_test, y_pred_proba, ax=ax)
ax.set_title('Precision-Recall Curve')
ax.legend()
plt.show()

<br>

In [None]:
# age/sex matched hyperparameter tuning

def objective(trial):
    """Optuna objective: mean cross-validated ROC AUC on the age/sex matched training data.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the LightGBM hyperparameters.

    Returns
    -------
    float
        Mean ``roc_auc`` over the ``cv_folds`` validation folds of
        ``X_train_matched`` / ``y_train_matched``.
    """
    params = {
        # search space, sampled once per trial
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_bin': trial.suggest_int('max_bin', 128, 1024),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'num_leaves': trial.suggest_int('num_leaves', 8, 128),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-3, 1000, log=True),
        'min_sum_hessian_in_leaf': trial.suggest_float('min_sum_hessian_in_leaf', 1e-4, 10, log=True),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 5000),
        # fixed settings, identical for every trial
        'n_jobs': -1,
        'bagging_freq': 1,
        'force_row_wise': True,
        'bagging_seed': 2024,
        'verbosity': -100,
        'extra_trees': False}

    model = lgb.LGBMClassifier(**params)

    # Stratified folds preserve the class proportions of y_train_matched in
    # every split, which plain KFold does not guarantee for a classification target.
    cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=1)
    # Folds run sequentially (n_jobs=1); the model itself already uses all cores.
    scores = cross_val_score(model, X_train_matched, y_train_matched, cv=cv, scoring='roc_auc', n_jobs=1)
    return scores.mean()

# create and run the study
# TPE is Optuna's default sampler; seeding it explicitly makes the sequence of
# suggested trials repeatable, so the reported best parameters can be regenerated.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=2024),
)
study.optimize(objective, n_trials=n_iterations, show_progress_bar=True)

# report the best hyperparameters and their cross-validation score
print("\nBest hyperparameters found:")
for param, value in study.best_params.items():
    print(f"{param}: {value}")
print(f"\nBest cross-validation score: {study.best_value:.4f}")

In [None]:
# age/sex matched model evaluation

# Hyperparameters copied from an earlier run of the tuning cell above
# (hard-coded, so this cell does not depend on the `study` object).
matched_params = {
    'n_estimators': 297,
    'max_bin': 698,
    'learning_rate': 0.03664083987716067,
    'max_depth': 7,
    'num_leaves': 61,
    'bagging_fraction': 0.9609593575826182,
    'feature_fraction': 0.5204793037683734,
    'lambda_l2': 154.4619926699515,
    'min_sum_hessian_in_leaf': 0.004812351346173821,
    'min_data_in_leaf': 2202,

    # fixed settings (same as in the tuning objective):
    'n_jobs': -1,
    'bagging_freq': 1,
    'force_row_wise': True,
    'bagging_seed': 2024,
    'verbosity': -100,
    'extra_trees': False,
}

# train the final model on the matched training set
matched_model = lgb.LGBMClassifier(**matched_params)
matched_model.fit(X_train_matched, y_train_matched)

# predict hard labels and positive-class probabilities for the matched test set
y_pred = matched_model.predict(X_test_matched)
y_pred_proba = matched_model.predict_proba(X_test_matched)[:, 1]

# evaluation metrics: label-based metrics use y_pred, AUROC uses the probabilities
precision = precision_score(y_test_matched, y_pred)
recall = recall_score(y_test_matched, y_pred)
conf_matrix = confusion_matrix(y_test_matched, y_pred)
auroc = roc_auc_score(y_test_matched, y_pred_proba)


print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"AUROC: {auroc:.4f}")
print("\nConfusion Matrix \nTN FP \nFN TP:")
print(conf_matrix)

# feature importance, sorted ascending so the most important bar is drawn on top
feature_importance = matched_model.feature_importances_
feature_names = X_train_matched.columns if hasattr(X_train_matched, 'columns') else [f'Feature_{i}' for i in range(X_train_matched.shape[1])]
sorted_idx = np.argsort(feature_importance)
fig, ax = plt.subplots(figsize=(5, 5))
ax.barh(range(len(sorted_idx)), feature_importance[sorted_idx])
ax.set_yticks(range(len(sorted_idx)))
ax.set_yticklabels([feature_names[i] for i in sorted_idx])
ax.set_xlabel('Feature Importance')
ax.set_title('LightGBM Feature Importance')
fig.tight_layout()
plt.show()

# ROC curve
# Pass the axes explicitly: without `ax`, the sklearn display opens its own
# figure, leaving the sized figure empty and ignoring the requested figsize.
fig, ax = plt.subplots(figsize=(3, 3))
RocCurveDisplay.from_predictions(y_test_matched, y_pred_proba, ax=ax)
ax.set_title('ROC Curve')
ax.plot([0, 1], [0, 1], 'k--', label='0.5')  # chance-level diagonal
ax.legend()
plt.show()

# precision recall curve (same explicit-axes handling as the ROC curve)
fig, ax = plt.subplots(figsize=(3, 3))
PrecisionRecallDisplay.from_predictions(y_test_matched, y_pred_proba, ax=ax)
ax.set_title('Precision-Recall Curve')
ax.legend()
plt.show()

<br>

### Scoring metric = Recall

In [None]:
# base data hyperparameter tuning

def objective(trial):
    """Optuna objective: mean cross-validated recall on the base training data.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the LightGBM hyperparameters.

    Returns
    -------
    float
        Mean ``recall`` over the ``cv_folds`` validation folds of
        ``X_train`` / ``y_train``.
    """
    params = {
        # search space, sampled once per trial
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_bin': trial.suggest_int('max_bin', 128, 1024),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'num_leaves': trial.suggest_int('num_leaves', 8, 128),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-3, 1000, log=True),
        'min_sum_hessian_in_leaf': trial.suggest_float('min_sum_hessian_in_leaf', 1e-4, 10, log=True),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 5000),
        # fixed settings, identical for every trial
        'n_jobs': -1,
        'bagging_freq': 1,
        'force_row_wise': True,
        'bagging_seed': 2024,
        'verbosity': -100,
        'extra_trees': False}

    model = lgb.LGBMClassifier(**params)

    # Stratified folds preserve the class proportions of y_train in every
    # split; recall is computed on the positives only, so the number of
    # positives per fold should not be left to an unstratified shuffle.
    cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=1)
    # Folds run sequentially (n_jobs=1); the model itself already uses all cores.
    scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='recall', n_jobs=1)
    return scores.mean()

# create and run the study
# TPE is Optuna's default sampler; seeding it explicitly makes the sequence of
# suggested trials repeatable, so the reported best parameters can be regenerated.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=2024),
)
study.optimize(objective, n_trials=n_iterations, show_progress_bar=True)

# report the best hyperparameters and their cross-validation score
print("\nBest hyperparameters found:")
for param, value in study.best_params.items():
    print(f"{param}: {value}")
print(f"\nBest cross-validation score: {study.best_value:.4f}")

In [None]:
# base data model evaluation

# Hyperparameters copied from an earlier run of the recall-tuning cell above
# (hard-coded, so this cell does not depend on the `study` object).
base_params = {
    'n_estimators': 299,
    'max_bin': 352,
    'learning_rate': 0.2997976527843294,
    'max_depth': 9,
    'num_leaves': 105,
    'bagging_fraction': 0.8152704911597977,
    'feature_fraction': 0.8378133204831323,
    'lambda_l2': 1.4869979936367044,
    'min_sum_hessian_in_leaf': 0.4590025239961025,
    'min_data_in_leaf': 261,

    # fixed settings (same as in the tuning objective):
    'n_jobs': -1,
    'bagging_freq': 1,
    'force_row_wise': True,
    'bagging_seed': 2024,
    'verbosity': -100,
    'extra_trees': False,
}

# train the final model on the full base training set
base_model = lgb.LGBMClassifier(**base_params)
base_model.fit(X_train, y_train)

# predict hard labels and positive-class probabilities for the test set
y_pred = base_model.predict(X_test)
y_pred_proba = base_model.predict_proba(X_test)[:, 1]

# evaluation metrics: label-based metrics use y_pred, AUROC uses the probabilities
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
auroc = roc_auc_score(y_test, y_pred_proba)


print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"AUROC: {auroc:.4f}")
print("\nConfusion Matrix \nTN FP \nFN TP:")
print(conf_matrix)

# feature importance, sorted ascending so the most important bar is drawn on top
feature_importance = base_model.feature_importances_
feature_names = X_train.columns if hasattr(X_train, 'columns') else [f'Feature_{i}' for i in range(X_train.shape[1])]
sorted_idx = np.argsort(feature_importance)
fig, ax = plt.subplots(figsize=(5, 5))
ax.barh(range(len(sorted_idx)), feature_importance[sorted_idx])
ax.set_yticks(range(len(sorted_idx)))
ax.set_yticklabels([feature_names[i] for i in sorted_idx])
ax.set_xlabel('Feature Importance')
ax.set_title('LightGBM Feature Importance')
fig.tight_layout()
plt.show()

# ROC curve
# Pass the axes explicitly: without `ax`, the sklearn display opens its own
# figure, leaving the sized figure empty and ignoring the requested figsize.
fig, ax = plt.subplots(figsize=(3, 3))
RocCurveDisplay.from_predictions(y_test, y_pred_proba, ax=ax)
ax.set_title('ROC Curve')
ax.plot([0, 1], [0, 1], 'k--', label='0.5')  # chance-level diagonal
ax.legend()
plt.show()

# precision recall curve (same explicit-axes handling as the ROC curve)
fig, ax = plt.subplots(figsize=(3, 3))
PrecisionRecallDisplay.from_predictions(y_test, y_pred_proba, ax=ax)
ax.set_title('Precision-Recall Curve')
ax.legend()
plt.show()

<br>

In [None]:
# undersampled hyperparameter tuning

def objective(trial):
    """Optuna objective: mean cross-validated recall on the undersampled training data.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the LightGBM hyperparameters.

    Returns
    -------
    float
        Mean ``recall`` over the ``cv_folds`` validation folds of
        ``X_train_smote`` / ``y_train_smote`` (the randomly undersampled
        training set).
    """
    params = {
        # search space, sampled once per trial
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_bin': trial.suggest_int('max_bin', 128, 1024),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'num_leaves': trial.suggest_int('num_leaves', 8, 128),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-3, 1000, log=True),
        'min_sum_hessian_in_leaf': trial.suggest_float('min_sum_hessian_in_leaf', 1e-4, 10, log=True),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 5000),
        # fixed settings, identical for every trial
        'n_jobs': -1,
        'bagging_freq': 1,
        'force_row_wise': True,
        'bagging_seed': 2024,
        'verbosity': -100,
        'extra_trees': False}

    model = lgb.LGBMClassifier(**params)

    # Stratified folds preserve the class proportions of y_train_smote in every
    # split; recall is computed on the positives only, so the number of
    # positives per fold should not be left to an unstratified shuffle.
    cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=1)
    # Folds run sequentially (n_jobs=1); the model itself already uses all cores.
    scores = cross_val_score(model, X_train_smote, y_train_smote, cv=cv, scoring='recall', n_jobs=1)
    return scores.mean()

# create and run the study
# TPE is Optuna's default sampler; seeding it explicitly makes the sequence of
# suggested trials repeatable, so the reported best parameters can be regenerated.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=2024),
)
study.optimize(objective, n_trials=n_iterations, show_progress_bar=True)

# report the best hyperparameters and their cross-validation score
print("\nBest hyperparameters found:")
for param, value in study.best_params.items():
    print(f"{param}: {value}")
print(f"\nBest cross-validation score: {study.best_value:.4f}")

In [None]:
# undersampled model evaluation

# Hyperparameters copied from an earlier run of the recall-tuning cell above
# (hard-coded, so this cell does not depend on the `study` object).
smote_params = {
    'n_estimators': 316,
    'max_bin': 155,
    'learning_rate': 0.09157073221081566,
    'max_depth': 8,
    'num_leaves': 71,
    'bagging_fraction': 0.8910733061393177,
    'feature_fraction': 0.8656518984470933,
    'lambda_l2': 993.4066861978613,
    'min_sum_hessian_in_leaf': 0.45634063289049837,
    'min_data_in_leaf': 1095,

    # fixed settings (same as in the tuning objective):
    'n_jobs': -1,
    'bagging_freq': 1,
    'force_row_wise': True,
    'bagging_seed': 2024,
    'verbosity': -100,
    'extra_trees': False}

# train the final model on the undersampled training set
smote_model = lgb.LGBMClassifier(**smote_params)
smote_model.fit(X_train_smote, y_train_smote)

# predict on the original (not resampled) test set
y_pred = smote_model.predict(X_test)
y_pred_proba = smote_model.predict_proba(X_test)[:, 1]

# evaluation metrics: label-based metrics use y_pred, AUROC uses the probabilities
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
auroc = roc_auc_score(y_test, y_pred_proba)


print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"AUROC: {auroc:.4f}")
print("\nConfusion Matrix \nTN FP \nFN TP:")
print(conf_matrix)

# feature importance, sorted ascending so the most important bar is drawn on top
feature_importance = smote_model.feature_importances_
feature_names = X_train_smote.columns if hasattr(X_train_smote, 'columns') else [f'Feature_{i}' for i in range(X_train_smote.shape[1])]
sorted_idx = np.argsort(feature_importance)
fig, ax = plt.subplots(figsize=(5, 5))
ax.barh(range(len(sorted_idx)), feature_importance[sorted_idx])
ax.set_yticks(range(len(sorted_idx)))
ax.set_yticklabels([feature_names[i] for i in sorted_idx])
ax.set_xlabel('Feature Importance')
ax.set_title('LightGBM Feature Importance')
fig.tight_layout()
plt.show()

# ROC curve
# Pass the axes explicitly: without `ax`, the sklearn display opens its own
# figure, leaving the sized figure empty and ignoring the requested figsize.
fig, ax = plt.subplots(figsize=(3, 3))
RocCurveDisplay.from_predictions(y_test, y_pred_proba, ax=ax)
ax.set_title('ROC Curve')
ax.plot([0, 1], [0, 1], 'k--', label='0.5')  # chance-level diagonal
ax.legend()
plt.show()

# precision recall curve (same explicit-axes handling as the ROC curve)
fig, ax = plt.subplots(figsize=(3, 3))
PrecisionRecallDisplay.from_predictions(y_test, y_pred_proba, ax=ax)
ax.set_title('Precision-Recall Curve')
ax.legend()
plt.show()

<br>

In [None]:
# age/sex matched hyperparameter tuning

def objective(trial):
    """Optuna objective: mean cross-validated recall on the age/sex matched training data.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the LightGBM hyperparameters.

    Returns
    -------
    float
        Mean ``recall`` over the ``cv_folds`` validation folds of
        ``X_train_matched`` / ``y_train_matched``.
    """
    params = {
        # search space, sampled once per trial
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_bin': trial.suggest_int('max_bin', 128, 1024),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'num_leaves': trial.suggest_int('num_leaves', 8, 128),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-3, 1000, log=True),
        'min_sum_hessian_in_leaf': trial.suggest_float('min_sum_hessian_in_leaf', 1e-4, 10, log=True),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 5000),
        # fixed settings, identical for every trial
        'n_jobs': -1,
        'bagging_freq': 1,
        'force_row_wise': True,
        'bagging_seed': 2024,
        'verbosity': -100,
        'extra_trees': False}

    model = lgb.LGBMClassifier(**params)

    # Stratified folds preserve the class proportions of y_train_matched in
    # every split; recall is computed on the positives only, so the number of
    # positives per fold should not be left to an unstratified shuffle.
    cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=1)
    # Folds run sequentially (n_jobs=1); the model itself already uses all cores.
    scores = cross_val_score(model, X_train_matched, y_train_matched, cv=cv, scoring='recall', n_jobs=1)
    return scores.mean()

# create and run the study
# TPE is Optuna's default sampler; seeding it explicitly makes the sequence of
# suggested trials repeatable, so the reported best parameters can be regenerated.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=2024),
)
study.optimize(objective, n_trials=n_iterations, show_progress_bar=True)

# report the best hyperparameters and their cross-validation score
print("\nBest hyperparameters found:")
for param, value in study.best_params.items():
    print(f"{param}: {value}")
print(f"\nBest cross-validation score: {study.best_value:.4f}")

In [None]:
# age/sex matched model evaluation

# Hyperparameters copied from an earlier run of the recall-tuning cell above
# (hard-coded, so this cell does not depend on the `study` object).
matched_params = {
    'n_estimators': 312,
    'max_bin': 733,
    'learning_rate': 0.28642682596893726,
    'max_depth': 12,
    'num_leaves': 123,
    'bagging_fraction': 0.6412630511112203,
    'feature_fraction': 0.7374540315935075,
    'lambda_l2': 0.6432538398292775,
    'min_sum_hessian_in_leaf': 1.1693341208203987,
    'min_data_in_leaf': 324,

    # fixed settings (same as in the tuning objective):
    'n_jobs': -1,
    'bagging_freq': 1,
    'force_row_wise': True,
    'bagging_seed': 2024,
    'verbosity': -100,
    'extra_trees': False,
}

# train the final model on the matched training set
matched_model = lgb.LGBMClassifier(**matched_params)
matched_model.fit(X_train_matched, y_train_matched)

# predict hard labels and positive-class probabilities for the matched test set
y_pred = matched_model.predict(X_test_matched)
y_pred_proba = matched_model.predict_proba(X_test_matched)[:, 1]

# evaluation metrics: label-based metrics use y_pred, AUROC uses the probabilities
precision = precision_score(y_test_matched, y_pred)
recall = recall_score(y_test_matched, y_pred)
conf_matrix = confusion_matrix(y_test_matched, y_pred)
auroc = roc_auc_score(y_test_matched, y_pred_proba)


print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"AUROC: {auroc:.4f}")
print("\nConfusion Matrix \nTN FP \nFN TP:")
print(conf_matrix)

# feature importance, sorted ascending so the most important bar is drawn on top
feature_importance = matched_model.feature_importances_
feature_names = X_train_matched.columns if hasattr(X_train_matched, 'columns') else [f'Feature_{i}' for i in range(X_train_matched.shape[1])]
sorted_idx = np.argsort(feature_importance)
fig, ax = plt.subplots(figsize=(5, 5))
ax.barh(range(len(sorted_idx)), feature_importance[sorted_idx])
ax.set_yticks(range(len(sorted_idx)))
ax.set_yticklabels([feature_names[i] for i in sorted_idx])
ax.set_xlabel('Feature Importance')
ax.set_title('LightGBM Feature Importance')
fig.tight_layout()
plt.show()

# ROC curve
# Pass the axes explicitly: without `ax`, the sklearn display opens its own
# figure, leaving the sized figure empty and ignoring the requested figsize.
fig, ax = plt.subplots(figsize=(3, 3))
RocCurveDisplay.from_predictions(y_test_matched, y_pred_proba, ax=ax)
ax.set_title('ROC Curve')
ax.plot([0, 1], [0, 1], 'k--', label='0.5')  # chance-level diagonal
ax.legend()
plt.show()

# precision recall curve (same explicit-axes handling as the ROC curve)
fig, ax = plt.subplots(figsize=(3, 3))
PrecisionRecallDisplay.from_predictions(y_test_matched, y_pred_proba, ax=ax)
ax.set_title('Precision-Recall Curve')
ax.legend()
plt.show()