___
# Classification of ED triage notes for self-harm
___

In [1]:
import pandas as pd
from scipy.stats import uniform

from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.feature_extraction.text import TfidfVectorizer 

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier

from sklearn.pipeline import make_pipeline

import pickle

# Project imports
from self_harm_triage_notes.config import *
from self_harm_triage_notes.dev_utils import get_stopwords

# Toolbox imports
from ml_health_toolbox.dev_utils import *
from ml_health_toolbox.eval_utils import *

In [2]:
# Suppress warnings (including in worker processes started via spawn)
import os
# Ensure child processes inherit this setting
os.environ['PYTHONWARNINGS'] = 'ignore'

import warnings
warnings.filterwarnings("ignore")

In [3]:
# Set-up
# Development data
dev_data_filename = "rmh_2012_2017_dev"

# Test data
test_data_filename = "rmh_2012_2017_test"

# Classification label and features
label = 'SH' #'SH_SI'
features = 'triage_note'
palette = ['seagreen','tomato']
    
# Undersampling
undersample = False
n_controls = 10000

# Dimensionality reduction
reduce_dim = False
lsa_method = 'svd'
n_components = 100

# Additional features
add_length = False
add_metadata = False

___
## Data preparation

In [4]:
# Load the development set
df = pd.read_parquet(interim_data_dir / (dev_data_filename + ".parquet"), engine="pyarrow")

# Keep data up to 2017 for development
df = df[df.year <= 2017].copy()

# Rename label column
df.rename(columns={label: 'y'}, inplace=True)

# Class names
classes = df.y.cat.categories

# Define features and labels
X = df[features]
y = df.y
groups = df.uid

# Define CV strategy
cv = get_cv_strategy(groups=groups, n_splits=N_SPLITS)

# Dictionary to store metrics
metrics_dict = {}

df.y.value_counts(normalize=True).sort_index().round(4)*100

y
Not self-harm    98.65
Self-harm         1.35
Name: proportion, dtype: float64

In [None]:
# (Archived)
# from ml_health_toolbox.dataset_utils import to_categorical

# df["SH_SI"] = np.where((df.SH.cat.codes==1)|(df.SI.cat.codes==1), "Positive", "Negative")
# df.SH_SI = to_categorical(df.SH_SI, categories=["Negative", "Positive"])

# df.SH_SI.value_counts()

# # Concatenate with sex,age,arrival mode
# if add_metadata:
#     df[features] = df.apply(lambda x: " ".join(x[['concepts', 'sex', 
#                                                   'age','arrival_mode']].astype(str)), axis=1)
    
# # Perform undersampling
# if undersample:
#     df = pd.concat([df[df.SH == 0].sample(n_controls, random_state=42), 
#                     df[df.SH!=0]], axis=0)

___
# Model development

### Quick & dirty trying a few algorithms

In [None]:
vectorizer = TfidfVectorizer(stop_words=get_stopwords(), token_pattern=r'\S+')
selector = SelectPercentile(score_func=chi2, percentile=5)

clfs = (
    MultinomialNB(),
    LogisticRegression(solver='saga', max_iter=5000, class_weight='balanced', random_state=42),
#     RandomForestClassifier(class_weight="balanced", random_state=42),
#     GradientBoostingClassifier(random_state=42),
#     XGBClassifier(objective='binary:logistic', random_state=42),
    LGBMClassifier(force_row_wise=True, random_state=42, verbose=-1),
)

for clf in clfs:
    pipeline = make_pipeline(vectorizer, selector, clf)
    score_cv(pipeline, X, y.cat.codes, groups, len(y.cat.categories), cv=cv)

### Proper model comparison with nested CV 

In [None]:
pipelines = []
search_modes = []
param_grids = []

vectorizer = TfidfVectorizer(stop_words='english')
selector = SelectPercentile(score_func=chi2)

# # MNB
# clf = MultinomialNB()
# pipeline = make_pipeline(vectorizer, selector, clf)
# pipelines.append(pipeline)

# search_modes.append('random')

# param_grid = {
#     'tfidfvectorizer__ngram_range': ((1, 1), (1, 2), (1, 3), (2, 2)), 
#     'tfidfvectorizer__max_df': (0.8, 0.9, 1.0),
#     'tfidfvectorizer__min_df': (10, 20),
#     'tfidfvectorizer__sublinear_tf': (True, False),
#     'selectpercentile__percentile': (0.1, 0.5, 1, 5),
#     'multinomialnb__alpha': (1e-5, 1e-2, 1e-1, 0.25, 0.5, 1, 10, 50, 100),
#     }
# param_grids.append(param_grid)

# # Logistic Regression
# clf = LogisticRegression(solver='saga', max_iter=5000, class_weight='balanced', random_state=42)
# pipeline = make_pipeline(vectorizer, selector,clf)
# pipelines.append(pipeline)

# search_modes.append('random')

# param_grid = {
#     'tfidfvectorizer__ngram_range': ((1, 1), (1, 2), (1, 3), (2, 2)), 
#     'tfidfvectorizer__max_df': (0.8, 0.9, 1.0),
#     'tfidfvectorizer__min_df': (10, 20),
#     'tfidfvectorizer__sublinear_tf': (True, False),
#     'selectpercentile__percentile': (0.1, 0.5, 1, 5),
#     'logisticregression__C': (1e-5, 1e-2, 1e-1, 0.25, 0.5, 1, 10, 50, 100),
#     'logisticregression__l1_ratio': (1.0, 0.75, 0.5, 0.25, 0.0),
#     }
# param_grids.append(param_grid)

# LightGBM
clf = LGBMClassifier(force_row_wise=True, random_state=42, verbose=-1)
pipeline = make_pipeline(vectorizer, selector, clf)
pipelines.append(pipeline)

search_modes.append('random')

num_leaves = [5, 10, 30, 50, 100]
max_depth = [-1, 3, 4, 5, 6, 7, 10]
learning_rate = [0.01, 0.1, 0.25]
n_estimators = [300, 350, 400, 450, 500]
min_child_samples = range(1, 50)
subsample = uniform(0.1, 1.0)
colsample_bytree = uniform(0.1, 1.0) 
reg_alpha = uniform(0.0, 1.0)
reg_lambda = uniform(0.0, 1.0)

param_grid = {
    'tfidfvectorizer__ngram_range': ((1, 1), (1, 2), (1, 3), (2, 2)), 
    'tfidfvectorizer__max_df': (0.8, 0.9, 1.0),
    'tfidfvectorizer__min_df': (10, 20),
    'tfidfvectorizer__sublinear_tf': (True, False),
    'selectpercentile__percentile': (0.1, 0.5, 1, 5),
    'lgbmclassifier__num_leaves': num_leaves,
    'lgbmclassifier__max_depth': max_depth,
    'lgbmclassifier__learning_rate': learning_rate,
    'lgbmclassifier__n_estimators': n_estimators,
    'lgbmclassifier__min_child_samples': min_child_samples,
    'lgbmclassifier__subsample': subsample,
    'lgbmclassifier__colsample_bytree': colsample_bytree,
    'lgbmclassifier__reg_alpha': reg_alpha,
    'lgbmclassifier__reg_lambda': reg_lambda,
}
param_grids.append(param_grid)

In [None]:
for pipeline, search_mode, param_grid in zip(pipelines, search_modes, param_grids):
        benchmark_nested_cv(pipeline, search_mode, param_grid, X, y.cat.codes, groups, len(y.cat.categories), cv=cv)

### Tune the final model

In [None]:
# LightGBM
vectorizer = TfidfVectorizer(stop_words=get_stopwords(), token_pattern=r'\S+')
selector = SelectPercentile(score_func=chi2)
clf = LGBMClassifier(force_row_wise=True, random_state=42, verbose=-1)
pipeline = make_pipeline(vectorizer, selector, clf)

num_leaves = [5, 10, 30, 50, 100]
max_depth = [-1, 3, 4, 5, 6, 7, 10]
learning_rate = [0.1]
n_estimators = [300, 400, 500]
min_child_samples = range(1, 50)
subsample = uniform(0.5, 1.0)
colsample_bytree = uniform(0.5, 1.0) 
reg_alpha = uniform(0.0, 1.0)
reg_lambda = uniform(0.0, 1.0)

param_grid = {
    'tfidfvectorizer__ngram_range': ((1, 1), (1, 2), (1, 3), (2, 2)), 
    'tfidfvectorizer__max_df': (0.8, 0.9, 1.0),
    'tfidfvectorizer__min_df': (10, 20),
    'tfidfvectorizer__sublinear_tf': (True, False),
    'selectpercentile__percentile': uniform(0.1, 15.0),
    'lgbmclassifier__num_leaves': num_leaves,
    'lgbmclassifier__max_depth': max_depth,
    'lgbmclassifier__learning_rate': learning_rate,
    'lgbmclassifier__n_estimators': n_estimators,
    'lgbmclassifier__min_child_samples': min_child_samples,
    'lgbmclassifier__subsample': subsample,
    'lgbmclassifier__colsample_bytree': colsample_bytree,
    'lgbmclassifier__reg_alpha': reg_alpha,
    'lgbmclassifier__reg_lambda': reg_lambda,
}

search_result = search_params(pipeline, 'random', param_grid, X, y.cat.codes, groups, cv=cv, scoring='average_precision')
tuned_model = search_result.best_estimator_

### Tuned model

In [5]:
# # Percentile up to 3
# vectorizer = TfidfVectorizer(stop_words=get_stopwords(), token_pattern=r'\S+')
# selector = SelectPercentile(score_func=chi2)
# clf = LGBMClassifier(force_row_wise=True, random_state=42, verbose=-1)
# pipeline = make_pipeline(vectorizer, selector, clf)
# params = {'lgbmclassifier__colsample_bytree': 0.6843466293889264,
#           'lgbmclassifier__learning_rate': 0.1,
#           'lgbmclassifier__max_depth': 10,
#           'lgbmclassifier__min_child_samples': 5,
#           'lgbmclassifier__n_estimators': 300,
#           'lgbmclassifier__num_leaves': 10,
#           'lgbmclassifier__reg_alpha': 0.9275021934713552,
#           'lgbmclassifier__reg_lambda': 0.9981717891374208,
#           'lgbmclassifier__subsample': 0.7878351074389227,
#           'selectpercentile__percentile': 2.9563778484704324,
#           'tfidfvectorizer__max_df': 0.9,
#           'tfidfvectorizer__min_df': 20,
#           'tfidfvectorizer__ngram_range': (1, 2),
#           'tfidfvectorizer__sublinear_tf': True}

# tuned_model = pipeline.set_params(**params)

# # Percentile up to 15
vectorizer = TfidfVectorizer(stop_words=get_stopwords(), token_pattern=r'\S+')
selector = SelectPercentile(score_func=chi2)
clf = LGBMClassifier(force_row_wise=True, random_state=42, verbose=-1)
pipeline = make_pipeline(vectorizer, selector, clf)
params = {'lgbmclassifier__colsample_bytree': 0.6294449637644206,
          'lgbmclassifier__learning_rate': 0.1,
          'lgbmclassifier__max_depth': -1,
          'lgbmclassifier__min_child_samples': 16,
          'lgbmclassifier__n_estimators': 300,
          'lgbmclassifier__num_leaves': 50,
          'lgbmclassifier__reg_alpha': 0.21746907287837236,
          'lgbmclassifier__reg_lambda': 0.44578160620959895,
          'lgbmclassifier__subsample': 0.6249596223495628,
          'selectpercentile__percentile': 8.800454977071395,
          'tfidfvectorizer__max_df': 0.8,
          'tfidfvectorizer__min_df': 10,
          'tfidfvectorizer__ngram_range': (1, 3),
          'tfidfvectorizer__sublinear_tf': True}

tuned_model = pipeline.set_params(**params)

 ### Evaluate the tuned model

In [6]:
# Calculate performance scores
score_cv(tuned_model, X, y.cat.codes, groups, len(y.cat.categories), cv=cv)

# Make predictions for each CV fold
y_proba = predict_cv(tuned_model, X, y.cat.codes, groups, cv=cv)

________________________________________________________________________________
Training with 5-fold cross-validation:
Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(max_df=0.8, min_df=10, ngram_range=(1, 3),
                                 stop_words=['a', 'about', 'above', 'across',
                                             'after', 'afterwards', 'again',
                                             'against', 'ain', 'all', 'almost',
                                             'alone', 'along', 'already',
                                             'also', 'although', 'always', 'am',
                                             'among', 'amongst', 'amoungst',
                                             'amount', 'an', 'and', 'another',
                                             'any', 'anyhow', 'anyone',
                                             'anything', 'anyway...
                ('selectpercentile',
                 SelectPercentile(percentile=

### Calibrate the tuned model

* uncalibrated -- second best
* calibrated with isotonic and ensemble=True -- best, three models ~1350 fts each (FN=1016>253, FP=727>188)
* calibrated with isotonic and ensemble=False -- worst

In [7]:
# Calibrate the model
tuned_model_calibrated, y_proba_calibrated = calibrate_cv(tuned_model, X, y, groups, cv=cv, method='isotonic', ensemble=True)

________________________________________________________________________________
Training with 5-fold cross-validation:
CalibratedClassifierCV(cv=3, ensemble=True,
                       estimator=Pipeline(steps=[('tfidfvectorizer',
                                                  TfidfVectorizer(max_df=0.8,
                                                                  min_df=10,
                                                                  ngram_range=(1,
                                                                               3),
                                                                  stop_words=['a',
                                                                              'about',
                                                                              'above',
                                                                              'across',
                                                                              'after',
               

In [None]:
# Calculate calibration curves for the uncalibrated and calibrated versions of the model
calibration_curves = prepare_calibration_curve(y, y_proba, y_proba_calibrated)
# Plot calibration curves to compare the uncalibrate and calibrated vesions of the model
plot_calibration_curve(calibration_curves, palette=['#C5D1E0', '#757F8D'])

### Select the best model

In [11]:
# Use the uncalibrated or calibrated version
best_model = tuned_model_calibrated
y_proba = y_proba_calibrated

### Re-train the final model on the full development set

In [12]:
# Re-train the final model on the full development set
best_model.fit(X, y.cat.codes)

0,1,2
,"estimator  estimator: estimator instance, default=None The classifier whose output need to be calibrated to provide more accurate `predict_proba` outputs. The default classifier is a :class:`~sklearn.svm.LinearSVC`. .. versionadded:: 1.2",Pipeline(step...verbose=-1))])
,"method  method: {'sigmoid', 'isotonic', 'temperature'}, default='sigmoid' The method to use for calibration. Can be: - 'sigmoid', which corresponds to Platt's method (i.e. a binary logistic  regression model). - 'isotonic', which is a non-parametric approach. - 'temperature', temperature scaling. Sigmoid and isotonic calibration methods natively support only binary classifiers and extend to multi-class classification using a One-vs-Rest (OvR) strategy with post-hoc renormalization, i.e., adjusting the probabilities after calibration to ensure they sum up to 1. In contrast, temperature scaling naturally supports multi-class calibration by applying `softmax(classifier_logits/T)` with a value of `T` (temperature) that optimizes the log loss. For very uncalibrated classifiers on very imbalanced datasets, sigmoid calibration might be preferred because it fits an additional intercept parameter. This helps shift decision boundaries appropriately when the classifier being calibrated is biased towards the majority class. Isotonic calibration is not recommended when the number of calibration samples is too low ``(≪1000)`` since it then tends to overfit. .. versionchanged:: 1.8  Added option 'temperature'.",'isotonic'
,"cv  cv: int, cross-validation generator, or iterable, default=None Determines the cross-validation splitting strategy. Possible inputs for cv are: - None, to use the default 5-fold cross-validation, - integer, to specify the number of folds. - :term:`CV splitter`, - An iterable yielding (train, test) splits as arrays of indices. For integer/None inputs, if ``y`` is binary or multiclass, :class:`~sklearn.model_selection.StratifiedKFold` is used. If ``y`` is neither binary nor multiclass, :class:`~sklearn.model_selection.KFold` is used. Refer to the :ref:`User Guide ` for the various cross-validation strategies that can be used here. .. versionchanged:: 0.22  ``cv`` default value if None changed from 3-fold to 5-fold.",3
,"n_jobs  n_jobs: int, default=None Number of jobs to run in parallel. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. Base estimator clones are fitted in parallel across cross-validation iterations. See :term:`Glossary ` for more details. .. versionadded:: 0.24",-1
,"ensemble  ensemble: bool, or ""auto"", default=""auto"" Determines how the calibrator is fitted. ""auto"" will use `False` if the `estimator` is a :class:`~sklearn.frozen.FrozenEstimator`, and `True` otherwise. If `True`, the `estimator` is fitted using training data, and calibrated using testing data, for each `cv` fold. The final estimator is an ensemble of `n_cv` fitted classifier and calibrator pairs, where `n_cv` is the number of cross-validation folds. The output is the average predicted probabilities of all pairs. If `False`, `cv` is used to compute unbiased predictions, via :func:`~sklearn.model_selection.cross_val_predict`, which are then used for calibration. At prediction time, the classifier used is the `estimator` trained on all the data. Note that this method is also internally implemented in :mod:`sklearn.svm` estimators with the `probabilities=True` parameter. .. versionadded:: 0.24 .. versionchanged:: 1.6  `""auto""` option is added and is the default.",True

0,1,2
,"input  input: {'filename', 'file', 'content'}, default='content' - If `'filename'`, the sequence passed as an argument to fit is  expected to be a list of filenames that need reading to fetch  the raw content to analyze. - If `'file'`, the sequence items must have a 'read' method (file-like  object) that is called to fetch the bytes in memory. - If `'content'`, the input is expected to be a sequence of items that  can be of type string or byte.",'content'
,"encoding  encoding: str, default='utf-8' If bytes or files are given to analyze, this encoding is used to decode.",'utf-8'
,"decode_error  decode_error: {'strict', 'ignore', 'replace'}, default='strict' Instruction on what to do if a byte sequence is given to analyze that contains characters not of the given `encoding`. By default, it is 'strict', meaning that a UnicodeDecodeError will be raised. Other values are 'ignore' and 'replace'.",'strict'
,"strip_accents  strip_accents: {'ascii', 'unicode'} or callable, default=None Remove accents and perform other character normalization during the preprocessing step. 'ascii' is a fast method that only works on characters that have a direct ASCII mapping. 'unicode' is a slightly slower method that works on any characters. None (default) means no character normalization is performed. Both 'ascii' and 'unicode' use NFKD normalization from :func:`unicodedata.normalize`.",
,"lowercase  lowercase: bool, default=True Convert all characters to lowercase before tokenizing.",True
,"preprocessor  preprocessor: callable, default=None Override the preprocessing (string transformation) stage while preserving the tokenizing and n-grams generation steps. Only applies if ``analyzer`` is not callable.",
,"tokenizer  tokenizer: callable, default=None Override the string tokenization step while preserving the preprocessing and n-grams generation steps. Only applies if ``analyzer == 'word'``.",
,"analyzer  analyzer: {'word', 'char', 'char_wb'} or callable, default='word' Whether the feature should be made of word or character n-grams. Option 'char_wb' creates character n-grams only from text inside word boundaries; n-grams at the edges of words are padded with space. If a callable is passed it is used to extract the sequence of features out of the raw, unprocessed input. .. versionchanged:: 0.21  Since v0.21, if ``input`` is ``'filename'`` or ``'file'``, the data  is first read from the file and then passed to the given callable  analyzer.",'word'
,"stop_words  stop_words: {'english'}, list, default=None If a string, it is passed to _check_stop_list and the appropriate stop list is returned. 'english' is currently the only supported string value. There are several known issues with 'english' and you should consider an alternative (see :ref:`stop_words`). If a list, that list is assumed to contain stop words, all of which will be removed from the resulting tokens. Only applies if ``analyzer == 'word'``. If None, no stop words will be used. In this case, setting `max_df` to a higher value, such as in the range (0.7, 1.0), can automatically detect and filter stop words based on intra corpus document frequency of terms.","['a', 'about', ...]"
,"token_pattern  token_pattern: str, default=r""(?u)\\b\\w\\w+\\b"" Regular expression denoting what constitutes a ""token"", only used if ``analyzer == 'word'``. The default regexp selects tokens of 2 or more alphanumeric characters (punctuation is completely ignored and always treated as a token separator). If there is a capturing group in token_pattern then the captured group content, not the entire match, becomes the token. At most one capturing group is permitted.",'\\S+'

0,1,2
,"score_func  score_func: callable, default=f_classif Function taking two arrays X and y, and returning a pair of arrays (scores, pvalues) or a single array with scores. Default is f_classif (see below ""See Also""). The default function only works with classification tasks. .. versionadded:: 0.18",<function chi2 at 0x347ed9300>
,"percentile  percentile: int, default=10 Percent of features to keep.",8.800454977071395

0,1,2
,boosting_type,'gbdt'
,num_leaves,50
,max_depth,-1
,learning_rate,0.1
,n_estimators,300
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


### Export selected features

In [10]:
# Uncalibrated
best_model[1].get_support().sum()
# Calibrated ensemble=False
# best_model.calibrated_classifiers_[0].estimator[1].get_support().sum()
# Calibrated ensemble=True
# best_model.calibrated_classifiers_[0].estimator[1].get_support().sum(), best_model.calibrated_classifiers_[1].estimator[1].get_support().sum(), best_model.calibrated_classifiers_[2].estimator[1].get_support().sum()

8681

In [13]:
all_selected_fts = []
for i in range(3):
    input_fts = best_model.calibrated_classifiers_[i].estimator[0].get_feature_names_out()
    masked_fts = best_model.calibrated_classifiers_[i].estimator[1].get_support()
    selected_fts = input_fts[masked_fts]
    print(f"Selected features for fold {i+1}: {len(selected_fts)}")
    # with open(models_dir / (dev_data_filename + "_" + label + "_REFIT_ensemble_True_selected_fts" + str(i) + ".txt"), 'w') as f:
    #     f.write("\n".join(selected_fts)) 
    all_selected_fts.extend(input_fts[masked_fts])
print(f"Total unique selected features across all folds: {len(set(all_selected_fts))}")
# with open(models_dir / (dev_data_filename + "_" + label + "_REFIT_ensemble_True_selected_fts_all.txt"), 'w') as f:
#         f.write("\n".join(sorted(set(all_selected_fts)))) 

Selected features for fold 1: 5876
Selected features for fold 2: 5885
Selected features for fold 3: 5785
Total unique selected features across all folds: 8986


### Overfitting: feature selection vs. classifier

In [None]:
# # Fit vectorizer to the full development set
# vectorizer = get_vectorizer(vectorizer_mode, params)
# vectorizer.fit(X,y)
# print("Vectorizer selected %d features." % vectorizer.df_features.shape[0])

# # Save the vectorizer
# with open(models_dir / (dev_data_filename + "_vectorizer.pickle"), 'wb') as f:
#     pickle.dump(vectorizer, f)
 
# # List selected features
# selected_features = vectorizer.df_features.feature.tolist() #vectorizer.vectorizer.get_feature_names_out()
# print(len(selected_features))

# # Write the list of selected features
# with open(models_dir / (dev_data_filename + "_selected_fts.txt"), 'w') as f:
#     f.writelines('\n'.join(selected_features))

___
# Model evaluation

### Load unseen data

In [None]:
# Load the test set
df_test = pd.read_parquet(proc_data_dir / (test_data_filename + "_normalised.parquet"), engine="pyarrow")

# Rename label column
df_test.rename(columns={label: 'y'}, inplace=True)

# Define test set features and labels
X_test = df_test[features]
y_test = df_test.y

df_test.y.value_counts(normalize=True).sort_index().round(4)*100

### Make predictions

In [None]:
# Make predictions
y_proba = best_model.predict_proba(X_test)

# Convert probabilities to crisp class labels
if len(y.cat.categories)==2:
    # Convert probabilities to class labels
    y_pred = probability_to_label(y_proba, threshold=threshold)
else:
    # Convert probabilities to class labels
    y_pred = probability_to_label(y_proba)

### Calculate performance metrics

In [None]:
%%timeit
metrics_dict['Test set, bootstrapping'] = calculate_discrimination_metrics(y_test, y_proba, generator=True, return_ci=True)

In [None]:
%%timeit
plot_roc_curve(y_test, y_proba, metrics_dict['Test set, bootstrapping']['discrimination'], generator=True, return_ci=True, palette=palette)

In [None]:
# Calculate discrimination metrics
metrics_dict['Test set, full dataset'] = calculate_discrimination_metrics(y_test, y_proba)
# Calculate discrimination metrics with bootstrapping
metrics_dict['Test set, bootstrapping'] = calculate_discrimination_metrics(y_test, y_proba, generator=True, return_ci=True)
# Calculate classification metrics
metrics_dict['Test set, full dataset'] |= calculate_classification_metrics(y_test, y_pred)
# Calculate classification metrics with bootstrapping
metrics_dict['Test set, bootstrapping'] |= calculate_classification_metrics(y_test, y_pred, generator=True, return_ci=True)

### Plot calibration curve

In [None]:
plot_calibration_curve(y_test, y_proba, generator=True, return_ci=True, results_dir=results_dir, filename=test_data_filename + "_ensemble_True_")

### Plot diagnostic curves

In [None]:
# Plot ROC curves on the full test set
plot_roc_curve(y_test, y_proba, metrics_dict['Test set, full dataset']['discrimination'], palette=palette, results_dir=results_dir, filename=test_data_filename + "_ensemble_True")

# Plot ROC curves on the test set with bootstrapping
plot_roc_curve(y_test, y_proba, metrics_dict['Test set, bootstrapping']['discrimination'], generator=True, return_ci=True, palette=palette, results_dir=results_dir, filename=test_data_filename + "_ensemble_True")

# Plot PR curves on the full test set
plot_pr_curve(y_test, y_proba, metrics_dict['Test set, full dataset']['discrimination'], palette=palette, results_dir=results_dir, filename=test_data_filename + "_ensemble_True")

# Plot PR curves on the test set with bootstrapping
plot_pr_curve(y_test, y_proba, metrics_dict['Test set, bootstrapping']['discrimination'], generator=True, return_ci=True, palette=palette, results_dir=results_dir, filename=test_data_filename + "_ensemble_True")

### Plot predicted probabilities

In [None]:
plot_probabilities(y_test, y_proba, threshold=threshold, palette=palette)

### Plot confusion matrix

In [None]:
plot_confusion_matrix(y_test, y_pred, results_dir=results_dir, filename=test_data_filename + "_ensemble_True")

### Save the final model and the threshold value

In [None]:
# Save the model
with open(models_dir / (dev_data_filename + "_" + label + "_REFIT_ensemble_False_classifier.pickle"), 'wb') as f:
    pickle.dump(best_model, f)
    
if len(df.y.cat.categories)==2:
    # Save the threshold
    with open(models_dir / (dev_data_filename + "_" + label + "_REFIT_ensemble_False_threshold.txt"), 'w') as f:
        f.write(str(threshold)) 

In [None]:
with open(models_dir / (dev_data_filename + "_" + label + "_REFIT_metrics.txt"), 'w') as f:
        f.write(str(metrics_dict)) 