___
# Classification of ED triage notes for self-harm
___

In [1]:
import pandas as pd
from scipy.stats import uniform

from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.feature_extraction.text import TfidfVectorizer 

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier

from sklearn.pipeline import make_pipeline

import pickle

# Project imports
from self_harm_triage_notes.config import *
from self_harm_triage_notes.dev_utils import get_stopwords

# Toolbox imports
from ml_health_toolbox.dev_utils import *
from ml_health_toolbox.eval_utils import *

In [2]:
# Suppress warnings (including in worker processes started via spawn)
import os
# Ensure child processes inherit this setting
os.environ['PYTHONWARNINGS'] = 'ignore'

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Set-up
# Development data
dev_data_filename = "lvrh_2012_2022"

# Test data
test_data_filename = "lvrh_2012_2022"

# Classification label and features
label = 'SH' #'SH_SI'
features = 'entities'
palette = ['seagreen','tomato']
    
# Undersampling
undersample = False
n_controls = 10000

# Dimensionality reduction
reduce_dim = False
lsa_method = 'svd'
n_components = 100

# Additional features
add_length = False
add_metadata = False

___
## Data preparation

In [None]:
# Load the development set
df = pd.read_parquet(proc_data_dir / (dev_data_filename + "_normalised.parquet"), engine="pyarrow")

# Keep data for development
cut_off_year = 2017
df = df[df.year <= cut_off_year].copy()

# Rename label column
df.rename(columns={label: 'y'}, inplace=True)

# Class names
classes = df.y.cat.categories

# Define features and labels
X = df[features]
y = df.y
groups = df.uid

# Define CV strategy
cv = get_cv_strategy(groups=groups, n_splits=N_SPLITS)

# Dictionary to store metrics
metrics_dict = {}

df.y.value_counts(normalize=True).sort_index().round(4)*100

y
Not self-harm    98.62
Self-harm         1.38
Name: proportion, dtype: float64

___
# Model development

### Quick & dirty trying a few algorithms

In [None]:
vectorizer = TfidfVectorizer(stop_words=get_stopwords(), token_pattern=r'\S+')
selector = SelectPercentile(score_func=chi2, percentile=5)

clfs = (
    MultinomialNB(),
    LogisticRegression(solver='saga', max_iter=5000, class_weight='balanced', random_state=42),
#     RandomForestClassifier(class_weight="balanced", random_state=42),
#     GradientBoostingClassifier(random_state=42),
#     XGBClassifier(objective='binary:logistic', random_state=42),
    LGBMClassifier(force_row_wise=True, random_state=42, verbose=-1),
)

for clf in clfs:
    pipeline = make_pipeline(vectorizer, selector, clf)
    score_cv(pipeline, X, y.cat.codes, groups, len(y.cat.categories), cv=cv)

### Proper model comparison with nested CV 

In [None]:
pipelines = []
search_modes = []
param_grids = []

vectorizer = TfidfVectorizer(stop_words='english')
selector = SelectPercentile(score_func=chi2)

# # MNB
# clf = MultinomialNB()
# pipeline = make_pipeline(vectorizer, selector, clf)
# pipelines.append(pipeline)

# search_modes.append('random')

# param_grid = {
#     'tfidfvectorizer__ngram_range': ((1, 1), (1, 2), (1, 3), (2, 2)), 
#     'tfidfvectorizer__max_df': (0.8, 0.9, 1.0),
#     'tfidfvectorizer__min_df': (10, 20),
#     'tfidfvectorizer__sublinear_tf': (True, False),
#     'selectpercentile__percentile': (0.1, 0.5, 1, 5),
#     'multinomialnb__alpha': (1e-5, 1e-2, 1e-1, 0.25, 0.5, 1, 10, 50, 100),
#     }
# param_grids.append(param_grid)

# # Logistic Regression
# clf = LogisticRegression(solver='saga', max_iter=5000, class_weight='balanced', random_state=42)
# pipeline = make_pipeline(vectorizer, selector,clf)
# pipelines.append(pipeline)

# search_modes.append('random')

# param_grid = {
#     'tfidfvectorizer__ngram_range': ((1, 1), (1, 2), (1, 3), (2, 2)), 
#     'tfidfvectorizer__max_df': (0.8, 0.9, 1.0),
#     'tfidfvectorizer__min_df': (10, 20),
#     'tfidfvectorizer__sublinear_tf': (True, False),
#     'selectpercentile__percentile': (0.1, 0.5, 1, 5),
#     'logisticregression__C': (1e-5, 1e-2, 1e-1, 0.25, 0.5, 1, 10, 50, 100),
#     'logisticregression__l1_ratio': (1.0, 0.75, 0.5, 0.25, 0.0),
#     }
# param_grids.append(param_grid)

# LightGBM
clf = LGBMClassifier(force_row_wise=True, random_state=42, verbose=-1)
pipeline = make_pipeline(vectorizer, selector, clf)
pipelines.append(pipeline)

search_modes.append('random')

num_leaves = [5, 10, 30, 50, 100]
max_depth = [-1, 3, 4, 5, 6, 7, 10]
learning_rate = [0.01, 0.1, 0.25]
n_estimators = [300, 350, 400, 450, 500]
min_child_samples = range(1, 50)
subsample = uniform(0.1, 1.0)
colsample_bytree = uniform(0.1, 1.0) 
reg_alpha = uniform(0.0, 1.0)
reg_lambda = uniform(0.0, 1.0)

param_grid = {
    'tfidfvectorizer__ngram_range': ((1, 1), (1, 2), (1, 3), (2, 2)), 
    'tfidfvectorizer__max_df': (0.8, 0.9, 1.0),
    'tfidfvectorizer__min_df': (10, 20),
    'tfidfvectorizer__sublinear_tf': (True, False),
    'selectpercentile__percentile': (0.1, 0.5, 1, 5),
    'lgbmclassifier__num_leaves': num_leaves,
    'lgbmclassifier__max_depth': max_depth,
    'lgbmclassifier__learning_rate': learning_rate,
    'lgbmclassifier__n_estimators': n_estimators,
    'lgbmclassifier__min_child_samples': min_child_samples,
    'lgbmclassifier__subsample': subsample,
    'lgbmclassifier__colsample_bytree': colsample_bytree,
    'lgbmclassifier__reg_alpha': reg_alpha,
    'lgbmclassifier__reg_lambda': reg_lambda,
}
param_grids.append(param_grid)

In [None]:
for pipeline, search_mode, param_grid in zip(pipelines, search_modes, param_grids):
        benchmark_nested_cv(pipeline, search_mode, param_grid, X, y.cat.codes, groups, len(y.cat.categories), cv=cv)

### Tune the final model

In [6]:
# LightGBM
vectorizer = TfidfVectorizer(stop_words=get_stopwords(), token_pattern=r'\S+')
selector = SelectPercentile(score_func=chi2)
clf = LGBMClassifier(force_row_wise=True, random_state=42, verbose=-1)
pipeline = make_pipeline(vectorizer, selector, clf)

num_leaves = [5, 10, 30, 50, 100]
max_depth = [-1, 3, 4, 5, 6, 7, 10]
learning_rate = [0.1]
n_estimators = [300, 400, 500]
min_child_samples = range(1, 50)
subsample = uniform(0.5, 1.0)
colsample_bytree = uniform(0.5, 1.0) 
reg_alpha = uniform(0.0, 1.0)
reg_lambda = uniform(0.0, 1.0)

param_grid = {
    'tfidfvectorizer__ngram_range': ((1, 1), (1, 2), (1, 3), (2, 2)), 
    'tfidfvectorizer__max_df': (0.8, 0.9, 1.0),
    'tfidfvectorizer__min_df': (10, 20),
    'tfidfvectorizer__sublinear_tf': (True, False),
    'selectpercentile__percentile': uniform(0.1, 15.0),
    'lgbmclassifier__num_leaves': num_leaves,
    'lgbmclassifier__max_depth': max_depth,
    'lgbmclassifier__learning_rate': learning_rate,
    'lgbmclassifier__n_estimators': n_estimators,
    'lgbmclassifier__min_child_samples': min_child_samples,
    'lgbmclassifier__subsample': subsample,
    'lgbmclassifier__colsample_bytree': colsample_bytree,
    'lgbmclassifier__reg_alpha': reg_alpha,
    'lgbmclassifier__reg_lambda': reg_lambda,
}

search_result = search_params(pipeline, 'random', param_grid, X, y.cat.codes, groups, cv=cv, scoring='average_precision')
tuned_model = search_result.best_estimator_

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/miniforge3/conda-bld/bld/rattler-build_liblightgbm_1768022048/work/src/io/config_auto.cpp, line 367 .

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/miniforge3/conda-bld/bld/rattler-build_liblightgbm_1768022048/work/src/io/config_auto.cpp, line 367 .

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/miniforge3/conda-bld/bld/rattler-build_liblightgbm_1768022048/work/src/io/config_auto.cpp, line 367 .

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/miniforge3/conda-bld/bld/rattler-build_liblightgbm_1768022048/work/src/io/config_auto.cpp, line 367 .

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/miniforge3/conda-bld/bld/rattler-build_liblightgbm_1768022048/work/src/io/config_auto.cpp, line 367 .

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/miniforge3/conda-

Best for current fold: 0.798 using {'lgbmclassifier__colsample_bytree': 0.974039388225595, 'lgbmclassifier__learning_rate': 0.1, 'lgbmclassifier__max_depth': 5, 'lgbmclassifier__min_child_samples': 1, 'lgbmclassifier__n_estimators': 500, 'lgbmclassifier__num_leaves': 10, 'lgbmclassifier__reg_alpha': 0.42367924416491276, 'lgbmclassifier__reg_lambda': 0.959474192232794, 'lgbmclassifier__subsample': 0.7051575051445674, 'selectpercentile__percentile': 15.026537562825311, 'tfidfvectorizer__max_df': 0.9, 'tfidfvectorizer__min_df': 10, 'tfidfvectorizer__ngram_range': (1, 1), 'tfidfvectorizer__sublinear_tf': True}
0.798 (+/- 0.008) with: {'lgbmclassifier__colsample_bytree': 0.974039388225595, 'lgbmclassifier__learning_rate': 0.1, 'lgbmclassifier__max_depth': 5, 'lgbmclassifier__min_child_samples': 1, 'lgbmclassifier__n_estimators': 500, 'lgbmclassifier__num_leaves': 10, 'lgbmclassifier__reg_alpha': 0.42367924416491276, 'lgbmclassifier__reg_lambda': 0.959474192232794, 'lgbmclassifier__subsample

>**Optimising on data from 2012-2013**
>
> Best for current fold: 0.802 using {'lgbmclassifier__colsample_bytree': 0.9405774799966728, 'lgbmclassifier__learning_rate': 0.1, 'lgbmclassifier__max_depth': 7, 'lgbmclassifier__min_child_samples': 11, 'lgbmclassifier__n_estimators': 500, 'lgbmclassifier__num_leaves': 10, 'lgbmclassifier__reg_alpha': 0.6204779482919983, 'lgbmclassifier__reg_lambda': 0.8801005165506667, 'lgbmclassifier__subsample': 0.6210162242125827, 'selectpercentile__percentile': 8.32817725843034, 'tfidfvectorizer__max_df': 0.8, 'tfidfvectorizer__min_df': 10, 'tfidfvectorizer__ngram_range': (1, 3), 'tfidfvectorizer__sublinear_tf': True}
>
>**Optimising on data from 2012-2015**
>
> Best for current fold: 0.810 using {'lgbmclassifier__colsample_bytree': 0.5588539725482633, 'lgbmclassifier__learning_rate': 0.1, 'lgbmclassifier__max_depth': 3, 'lgbmclassifier__min_child_samples': 29, 'lgbmclassifier__n_estimators': 400, 'lgbmclassifier__num_leaves': 30, 'lgbmclassifier__reg_alpha': 0.48597218005935294, 'lgbmclassifier__reg_lambda': 0.41562677309363316, 'lgbmclassifier__subsample': 0.9090799309941685, 'selectpercentile__percentile': 14.594225226814705, 'tfidfvectorizer__max_df': 0.9, 'tfidfvectorizer__min_df': 20, 'tfidfvectorizer__ngram_range': (1, 2), 'tfidfvectorizer__sublinear_tf': True}
>
>**Optimising on data from 2012-2016**
>
> Best for current fold: 0.834 using {'lgbmclassifier__colsample_bytree': 0.655905267913607, 'lgbmclassifier__learning_rate': 0.1, 'lgbmclassifier__max_depth': 4, 'lgbmclassifier__min_child_samples': 18, 'lgbmclassifier__n_estimators': 500, 'lgbmclassifier__num_leaves': 10, 'lgbmclassifier__reg_alpha': 0.17131732035590963, 'lgbmclassifier__reg_lambda': 0.24096322122260094, 'lgbmclassifier__subsample': 0.9426285422415861, 'selectpercentile__percentile': 6.23691613733805, 'tfidfvectorizer__max_df': 1.0, 'tfidfvectorizer__min_df': 10, 'tfidfvectorizer__ngram_range': (1, 3), 'tfidfvectorizer__sublinear_tf': True}
>
>**Optimising on data from 2012-2017**
>
> Best for current fold: 0.850 using {'lgbmclassifier__colsample_bytree': 0.9612825197147109, 'lgbmclassifier__learning_rate': 0.1, 'lgbmclassifier__max_depth': 10, 'lgbmclassifier__min_child_samples': 44, 'lgbmclassifier__n_estimators': 500, 'lgbmclassifier__num_leaves': 30, 'lgbmclassifier__reg_alpha': 0.263826877804582, 'lgbmclassifier__reg_lambda': 0.9057130463033352, 'lgbmclassifier__subsample': 0.8735760255497855, 'selectpercentile__percentile': 6.232260239759076, 'tfidfvectorizer__max_df': 1.0, 'tfidfvectorizer__min_df': 20, 'tfidfvectorizer__ngram_range': (1, 3), 'tfidfvectorizer__sublinear_tf': False}
>
>**Optimising on data from 2012-2017, no text normalisation**
>
> Best for current fold: 0.802 using {'lgbmclassifier__colsample_bytree': 0.6898019252162189, 'lgbmclassifier__learning_rate': 0.1, 'lgbmclassifier__max_depth': -1, 'lgbmclassifier__min_child_samples': 23, 'lgbmclassifier__n_estimators': 500, 'lgbmclassifier__num_leaves': 50, 'lgbmclassifier__reg_alpha': 0.17745092381651095, 'lgbmclassifier__reg_lambda': 0.570683845277106, 'lgbmclassifier__subsample': 0.8348503956617689, 'selectpercentile__percentile': 3.4489321623180476, 'tfidfvectorizer__max_df': 0.8, 'tfidfvectorizer__min_df': 20, 'tfidfvectorizer__ngram_range': (1, 2), 'tfidfvectorizer__sublinear_tf': False}


### Tuned model

In [None]:
vectorizer = TfidfVectorizer(stop_words=get_stopwords(), token_pattern=r'\S+')
selector = SelectPercentile(score_func=chi2)
clf = LGBMClassifier(force_row_wise=True, random_state=42, verbose=-1)
pipeline = make_pipeline(vectorizer, selector, clf)
params = {'lgbmclassifier__colsample_bytree': 0.9612825197147109, 'lgbmclassifier__learning_rate': 0.1, 'lgbmclassifier__max_depth': 10, 'lgbmclassifier__min_child_samples': 44, 'lgbmclassifier__n_estimators': 500, 'lgbmclassifier__num_leaves': 30, 'lgbmclassifier__reg_alpha': 0.263826877804582, 'lgbmclassifier__reg_lambda': 0.9057130463033352, 'lgbmclassifier__subsample': 0.8735760255497855, 'selectpercentile__percentile': 6.232260239759076, 'tfidfvectorizer__max_df': 1.0, 'tfidfvectorizer__min_df': 20, 'tfidfvectorizer__ngram_range': (1, 3), 'tfidfvectorizer__sublinear_tf': False}
tuned_model = pipeline.set_params(**params)

 ### Evaluate the tuned model

In [None]:
# Make predictions for each CV fold
y_proba = predict_cv(tuned_model, X, y.cat.codes, groups, cv=cv)

# Calculate discrimination metrics for each CV fold
cv_generator = cv.split(X, y.cat.codes, groups)
calculate_discrimination_metrics(y, y_proba, generator=cv_generator)

### Calibrate the tuned model

In [None]:
# Calibrate the model
tuned_model_calibrated, y_proba_calibrated = calibrate_cv(tuned_model, X, y.cat.codes, groups, cv=cv, method='isotonic', ensemble=True)

# Calculate discrimination metrics for calibrated model for each CV fold
cv_generator = cv.split(X, y.cat.codes, groups)
calculate_discrimination_metrics(y, y_proba_calibrated, generator=cv_generator)

In [None]:
# Plot calibration curves to compare the uncalibrate and calibrated vesions
plot_calibration_curve(y, y_proba, y_proba_calibrated, results_dir=results_dir, filename=dev_data_filename + " CV")

### Select the best model

In [None]:
# Use the uncalibrated or calibrated version
best_model = tuned_model#_calibrated
y_proba = y_proba#_calibrated

# # Record discrimination metrics for each CV fold
# cv_generator = cv.split(X, y.cat.codes, groups)
# metrics_dict['Development set, cross-validation'] = calculate_discrimination_metrics(y, y_proba, generator=cv_generator)

### Estimate model discrimination

In [None]:
# Plot ROC curves for CV folds
cv_generator = cv.split(X, y.cat.codes, groups)
plot_roc_curve(y, y_proba, generator=cv_generator, metrics=metrics_dict['Development set, cross-validation']['discrimination'], palette=palette, results_dir=results_dir, filename=dev_data_filename + " CV")

# Plot PR curves for CV folds
cv_generator = cv.split(X, y.cat.codes, groups)
plot_pr_curve(y, y_proba, generator=cv_generator, metrics=metrics_dict['Development set, cross-validation']['discrimination'], palette=palette, results_dir=results_dir, filename=dev_data_filename + " CV")

### Estimate model classification

In [None]:
# Find an optimal threshold for each CV fold
y_pred = np.zeros_like(y.cat.codes)
cv_generator = cv.split(X, y.cat.codes, groups)
for _, test_idx in cv_generator:
    threshold = calculate_optimal_threshold(y.cat.codes[test_idx], y_proba[test_idx,1])
    y_pred[test_idx] = probability_to_label(y_proba[test_idx], threshold=threshold)

In [None]:
classification_metrics = [
        {
        'name': 'precision', 
        'func': calculate_precision_score,
        },
        {
        'name': 'recall', 
        'func': calculate_recall_score,
        },
    ]
cv_generator = cv.split(X, y.cat.codes, groups)
scores = calculate_classification_metrics(y, y_pred, classification_metrics, generator=cv_generator)

In [None]:
scores[(scores.class_.cat.codes==1) & (scores.metric=="precision")].value.agg(['mean', 'std']).round(3)

In [None]:
scores[(scores.class_.cat.codes==1) & (scores.metric=="recall")].value.agg(['mean', 'std']).round(3)

In [None]:
scores[(scores.class_.cat.codes==0) & (scores.metric=="recall")].value.agg(['mean', 'std']).round(3)

### Select the optimal probability threshold

In [None]:
# Convert probabilities to crisp class labels
if len(y.cat.categories)==2:
     # Find an optimal threshold for each CV fold
    cv_generator = cv.split(X, y.cat.codes, groups)
    threshold = select_threshold(y.cat.codes, y_proba[:,1], cv_generator)

    # Convert probabilities to class labels
    y_pred = probability_to_label(y_proba, threshold=threshold)
else:
    # Convert probabilities to class labels
    y_pred = probability_to_label(y_proba)

# Plot confusion matrix
plot_confusion_matrix(y, y_pred)

### Re-train the final model on the full development set

In [None]:
# Re-train the final model on the full development set
best_model.fit(X, y.cat.codes)

### Export selected features

In [None]:
# Uncalibrated
# best_model[1].get_support().sum()
# Calibrated ensemble=False
# best_model.calibrated_classifiers_[0].estimator[1].get_support().sum()
# Calibrated ensemble=True
# best_model.calibrated_classifiers_[0].estimator[1].get_support().sum(), best_model.calibrated_classifiers_[1].estimator[1].get_support().sum(), best_model.calibrated_classifiers_[2].estimator[1].get_support().sum()

In [None]:
all_selected_fts = []
for i in range(3):
    input_fts = best_model.calibrated_classifiers_[i].estimator[0].get_feature_names_out()
    masked_fts = best_model.calibrated_classifiers_[i].estimator[1].get_support()
    selected_fts = input_fts[masked_fts]
    print(f"Selected features for fold {i+1}: {len(selected_fts)}")
    # with open(models_dir / (dev_data_filename + "_" + label + "_REFIT_ensemble_True_selected_fts" + str(i) + ".txt"), 'w') as f:
    #     f.write("\n".join(selected_fts)) 
    all_selected_fts.extend(input_fts[masked_fts])
print(f"Total unique selected features across all folds: {len(set(all_selected_fts))}")
# with open(models_dir / (dev_data_filename + "_" + label + "_REFIT_ensemble_True_selected_fts_all.txt"), 'w') as f:
#         f.write("\n".join(sorted(set(all_selected_fts)))) 

### Overfitting: feature selection vs. classifier

In [None]:
# # Fit vectorizer to the full development set
# vectorizer = get_vectorizer(vectorizer_mode, params)
# vectorizer.fit(X,y)
# print("Vectorizer selected %d features." % vectorizer.df_features.shape[0])

# # Save the vectorizer
# with open(models_dir / (dev_data_filename + "_vectorizer.pickle"), 'wb') as f:
#     pickle.dump(vectorizer, f)
 
# # List selected features
# selected_features = vectorizer.df_features.feature.tolist() #vectorizer.vectorizer.get_feature_names_out()
# print(len(selected_features))

# # Write the list of selected features
# with open(models_dir / (dev_data_filename + "_selected_fts.txt"), 'w') as f:
#     f.writelines('\n'.join(selected_features))

___
# Model evaluation

### Load unseen data

In [None]:
# Load the test set
df_test = pd.read_parquet(proc_data_dir / (test_data_filename + "_normalised.parquet"), engine="pyarrow")

# Use data from 2016 for testing
df_test = df_test[df_test.year > cut_off_year].copy()

# Rename label column
df_test.rename(columns={label: 'y'}, inplace=True)

# Define test set features and labels
X_test = df_test[features]
y_test = df_test.y

df_test.y.value_counts(normalize=True).sort_index().round(4)*100

### Make predictions

In [None]:
# Make predictions
y_proba = best_model.predict_proba(X_test)

# # Convert probabilities to crisp class labels
# if len(y.cat.categories)==2:
#     # Convert probabilities to class labels
#     y_pred = probability_to_label(y_proba, threshold=threshold)
# else:
#     # Convert probabilities to class labels
#     y_pred = probability_to_label(y_proba)

### Calculate performance metrics

In [None]:
calculate_discrimination_metrics(y_test, y_proba)

In [None]:
calculate_discrimination_metrics(y_test, y_proba, generator=True, return_ci=True)

In [None]:
# Calculate discrimination metrics
metrics_dict['Test set, full dataset'] = calculate_discrimination_metrics(y_test, y_proba)
# Calculate discrimination metrics with bootstrapping
metrics_dict['Test set, bootstrapping'] = calculate_discrimination_metrics(y_test, y_proba, generator=True, return_ci=True)
# Calculate classification metrics
metrics_dict['Test set, full dataset'] |= calculate_classification_metrics(y_test, y_pred)
# Calculate classification metrics with bootstrapping
metrics_dict['Test set, bootstrapping'] |= calculate_classification_metrics(y_test, y_pred, generator=True, return_ci=True)

### Plot calibration curve

In [None]:
plot_calibration_curve(y_test, y_proba, generator=True, return_ci=True, results_dir=results_dir, filename=test_data_filename + "_ensemble_True_")

### Plot diagnostic curves

In [None]:
# Plot ROC curves on the full test set
plot_roc_curve(y_test, y_proba, metrics_dict['Test set, full dataset']['discrimination'], palette=palette, results_dir=results_dir, filename=test_data_filename + "_ensemble_True")

# Plot ROC curves on the test set with bootstrapping
plot_roc_curve(y_test, y_proba, metrics_dict['Test set, bootstrapping']['discrimination'], generator=True, return_ci=True, palette=palette, results_dir=results_dir, filename=test_data_filename + "_ensemble_True")

# Plot PR curves on the full test set
plot_pr_curve(y_test, y_proba, metrics_dict['Test set, full dataset']['discrimination'], palette=palette, results_dir=results_dir, filename=test_data_filename + "_ensemble_True")

# Plot PR curves on the test set with bootstrapping
plot_pr_curve(y_test, y_proba, metrics_dict['Test set, bootstrapping']['discrimination'], generator=True, return_ci=True, palette=palette, results_dir=results_dir, filename=test_data_filename + "_ensemble_True")

### Plot predicted probabilities

In [None]:
plot_probabilities(y_test, y_proba, threshold=threshold, palette=palette)

### Plot confusion matrix

In [None]:
plot_confusion_matrix(y_test, y_pred, results_dir=results_dir, filename=test_data_filename + "_ensemble_True")

### Save the final model and the threshold value

In [None]:
# Save the model
with open(models_dir / (dev_data_filename + "_" + label + "_REFIT_ensemble_False_classifier.pickle"), 'wb') as f:
    pickle.dump(best_model, f)
    
if len(df.y.cat.categories)==2:
    # Save the threshold
    with open(models_dir / (dev_data_filename + "_" + label + "_REFIT_ensemble_False_threshold.txt"), 'w') as f:
        f.write(str(threshold)) 

In [None]:
with open(models_dir / (dev_data_filename + "_" + label + "_REFIT_metrics.txt"), 'w') as f:
        f.write(str(metrics_dict)) 