# Leave-One-Out Cross Validation
For each hospital in turn, train the model on all other hospitals and test it on the left-out hospital. Because the model is retrained for every left-out hospital, this estimates how well it generalises to a hospital it has never seen.

In [None]:
import pandas as pd
import numpy as np
import os
from matplotlib import pyplot as plt
import seaborn as sns

from datetime import datetime
import glob
from typing import Tuple, List, Dict, Union
import sklearn

In [None]:
"""
AutoMap class
"""

# import
import os
import joblib
import pandas as pd
from datetime import datetime
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer


class AutoMap:
    """
    Map free-text parameter names onto concept labels.

    The text is tokenised and lemmatised (``preprocess_text``), vectorised with TF-IDF and
    classified by a grid-searched estimator. ``create_pipe`` trains the pipeline,
    ``save_pipe``/``load_pipe`` persist it, and ``predict_proba_transformed`` returns the
    class probabilities in long format.
    """

    def __init__(self):
        # NLTK resources are downloaded on every construction.
        nltk.download('punkt')
        nltk.download('wordnet')
        nltk.download('omw-1.4')  # hidden in Pipe creation

        # column names to use in training/predicting
        self.source = 'parameter_name'
        self.target = 'pacmed_subname'
        self.pred = 'predicted_subname'

        # if no label is given, impute with index[0] (unmapped)
        self.unlabeled = ['unmapped', 'microbiology']  # used for validation to filter out unlabeled

        # initial r'\w+' but 5% performance gain when underscores are omitted
        self.preprocess_text_regex_expression = r'[a-zA-Z0-9]+'

    def preprocess_text(self, text):
        """
        Turn a raw string into a list of lower-cased, verb-lemmatised alphanumeric tokens.

        :param text: the string to tokenise
        :return: list of lemmas; stop words are not removed
        """
        # Tokenise words while ignoring punctuation
        tokeniser = RegexpTokenizer(self.preprocess_text_regex_expression)
        tokens = tokeniser.tokenize(text)

        # Lowercase and lemmatise
        lemmatiser = WordNetLemmatizer()
        return [lemmatiser.lemmatize(token.lower(), pos='v') for token in tokens]

    def create_pipe(self,
                    X: pd.Series = None,
                    y: pd.Series = None,
                    estimator=None,
                    grid: dict = None,
                    cv: int = 10,
                    n_jobs: int = None,
                    save: bool = False,
                    prefix=None):
        """
        Create and fit the pipe object used to train and test text data.

        The pipeline consists of a TF-IDF vectoriser followed by a GridSearchCV around the
        estimator. It is fitted exactly once, on the full training data.

        :param X: pandas Series with the text to train on (required)
        :param y: pandas Series with the labels; missing labels are imputed with ``self.unlabeled[0]``
        :param estimator: classifier to tune; defaults to ``SGDClassifier(random_state=123)``
        :param grid: parameter grid for the grid search; defaults to the single best-known setting
        :param cv: number of cross-validation folds for the grid search
        :param n_jobs: number of parallel jobs for the grid search
        :param save: when True, dump ``(pipe, label_encoder)`` to ``./data/pipes/``
        :param prefix: optional file-name prefix used when saving
        :return: the fitted pipeline (also stored as ``self.pipe``)
        :raises ValueError: when X or y is not provided
        """
        if X is None or y is None:
            raise ValueError("create_pipe requires both X (text) and y (labels)")
        if estimator is None:
            # Built per call so that no estimator instance is shared between calls.
            estimator = SGDClassifier(random_state=123)

        if y.isna().sum() > 0:
            y = y.fillna(self.unlabeled[0])

        # ensure labels are encoded
        self.le = LabelEncoder()
        self.le.fit(y=y.unique())

        if grid is None:
            # Reduced to the optimal grid for rerunning code. The grid that was searched
            # originally covered:
            #   fit_intercept: [True, False], early_stopping: [True, False],
            #   loss: log, modified_huber, perceptron, huber, squared_loss,
            #         epsilon_insensitive, squared_epsilon_insensitive,
            #   penalty: l2, l1, none
            # (PM squared_loss --> squared_error in v1.2)
            grid = {'fit_intercept': [True],
                    'early_stopping': [False],
                    'loss': ['modified_huber'],
                    'penalty': ['elasticnet']}

        vectoriser = TfidfVectorizer(analyzer=self.preprocess_text)
        search = GridSearchCV(estimator=estimator, param_grid=grid, cv=cv, n_jobs=n_jobs)

        # create Pipeline with vectoriser and grid-searched classifier
        self.pipe = Pipeline([('vectoriser', vectoriser),
                              ('classifier', search)])

        # Fitting the pipeline fits the vectoriser and runs the grid search, so no separate
        # fit of either step is needed beforehand.
        self.pipe.fit(X, self.le.transform(y.values))

        # save pipe to file to prevent rerunning the same pipelines
        if prefix is None:
            prefix = ''
        if save:
            os.makedirs('./data/pipes/', exist_ok=True)
            f_name = f'./data/pipes/{prefix}__{datetime.now().strftime("%Y%m%d%H%M%S")}.pipe'
            joblib.dump((self.pipe,
                         self.le,
                         ),
                        f_name,
                        compress=('gzip', 3),
                        protocol=5)
            print(f"Pipeline saved to: {f_name}")

        return self.pipe

    def save_pipe(self, f_name):
        """
        Dump the fitted pipeline and its label encoder to ``f_name``.

        :param f_name: path of the file to write
        """
        joblib.dump((self.pipe, self.le), f_name)
        print(f"Pipeline saved to: {f_name}")

    def load_pipe(self, f_name):
        """
        Load a pipeline and label encoder previously written by ``save_pipe``/``create_pipe``.

        When the file does not exist, the pipe is reset to None and a fresh, unfitted
        LabelEncoder is installed.

        :param f_name: path of the file to read
        """
        if os.path.isfile(f_name):
            # NOTE: joblib.load unpickles arbitrary objects; only load files from trusted sources.
            self.pipe, self.le = joblib.load(f_name)
            print(f"Pipeline loaded from: {f_name}")
        else:
            self.pipe = None
            self.le = LabelEncoder()
            print(f"Pipeline file not found, nothing loaded: {f_name}")

    def predict_proba_transformed(self, X, **predict_proba_params):
        """
        Predict class probabilities and return them in long format.

        :param X: either a pandas Series of text (its name becomes the id column), or a
            DataFrame of id columns that contains the text. For a DataFrame the text is read
            from the ``self.source`` column when present, otherwise from the second column.
        :param predict_proba_params: forwarded to the pipeline's ``predict_proba``
        :return: DataFrame with the columns of X plus ``label`` and ``value``; one row per
            (input row, label) with a non-zero probability
        """
        if isinstance(X, pd.Series):
            texts = X
            X = X.to_frame()
        elif self.source in X.columns:
            texts = X[self.source]
        else:
            texts = X[X.columns[1]]
        id_vars = list(X.columns)

        probs = self.pipe.predict_proba(texts, **predict_proba_params)

        # NOTE(review): assumes the probability columns follow the order of self.le.classes_,
        # i.e. every encoded label occurred in the training data — confirm for loaded pipes.
        prob_frame = pd.DataFrame(probs, columns=list(self.le.classes_))
        # Zero probabilities carry no information; mark them missing so they are dropped below.
        prob_frame = prob_frame.replace(0, np.nan)

        combined = pd.concat([X.reset_index(drop=True), prob_frame], axis=1)
        long_format = (combined
                       .set_index(id_vars)
                       .stack()
                       .dropna()  # explicit: not every pandas version drops NaN in stack()
                       .reset_index())
        # The stacked (unnamed) column level is called "level_<number of id columns>".
        return long_format.rename(columns={f"level_{len(id_vars)}": "label", 0: "value"})

In [None]:
# Notebook-wide constants.
# NOTE(review): TRAIN_HOSPITALS, EHR_SYSTEMS and the DATA_DISTRIBUTION_* constants are not
# referenced in the visible cells of this notebook — confirm whether they are still needed.
TRAIN_HOSPITALS = ['vumc', 'amc', 'erasmus', 'olvg']
EHR_SYSTEMS = ['epic', 'mv', 'hix']
# Column holding the free-text parameter name.
SOURCE = 'parameter_name'
# Column holding the true concept label; the value 'unmapped' marks irrelevant parameters.
TARGET = 'concept_label'
# Column holding the hospital a parameter originates from.
HOSPITAL_COLUMN = 'hospital_name'
DATA_DISTRIBUTION_COLUMNS = ['amin', 'amax', 'p25', 'p50', 'p75', 'p50_over_iqr', 'iqr_over_p50', 'skewed']
DATA_DISTRIBUTION_WEIGHTS = 'num_records'

In [None]:
def transform_predictions_to_proportions(predictions: pd.DataFrame,
                                         original_data: pd.DataFrame,
                                         cumulative_score: bool = False,
                                         ) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Transform probability predictions from the AutoMap class to proportions of correct predictions per rank stratified over relevance, irrelevance and overall scores to be used for plotting.

    Ranks are zero-based here (rank 0 is the most probable label). Both printed tables are
    kept as progress output.

    :param predictions: pandas DataFrame of probability predictions as output by the AutoMap class, where each row represents a prediction for a concept for a document and its corresponding probability. Needs the columns 'id', 'label' and 'value'.
    :param original_data: pandas DataFrame of the original data as input to the AutoMap class, where each row represents a document and its corresponding true concept labels. Needs the columns 'id' and TARGET.
    :param cumulative_score: boolean indicating whether to calculate cumulative scores over the top X predictions.
    :return: tuple of pandas DataFrames where the first returns the proportion of correct predictions per rank while the second returns the number of parameters for which predictions were made per rank
    """

    ranked = predictions.sort_values(['id', 'value'], ascending=[True, False])
    ranked[TARGET] = ranked['id'].map(original_data.set_index('id')[TARGET])
    ranked['rank'] = ranked.groupby('id').cumcount()
    ranked['rank_correct'] = (ranked['label'] == ranked[TARGET]).astype(int)
    ranked['relevance'] = (ranked[TARGET] == 'unmapped').map({True: 'irrelevant',
                                                              False: 'relevant'})
    # calculate scores
    scores = ranked.groupby(['relevance', 'rank'])['rank_correct'].sum().reset_index()
    scores_plot = scores.pivot(index=['rank'], columns=['relevance'], values=['rank_correct']).fillna(0)
    scores_plot.columns = scores_plot.columns.droplevel()
    scores_plot['overall'] = scores_plot.sum(axis=1)
    if cumulative_score:
        scores_plot = scores_plot.cumsum()
    scores_plot = scores_plot[sorted(scores_plot.columns)]

    # get the number of parameters in each group of relevance
    parameters_per_group = ranked[['id', 'relevance']].groupby(['relevance'])['id'].nunique()
    parameters_per_group['overall'] = parameters_per_group.sum()
    parameters_per_group = parameters_per_group.sort_index()

    # DataFrame / Series aligns the Series index on the DataFrame columns.
    scores_plot_ratio = scores_plot / parameters_per_group
    print(scores_plot_ratio)

    # number of predictions available per rank and relevance group
    parameter_count = ranked[['id', 'relevance', 'rank']].groupby(['relevance', 'rank'])['id'].count()
    parameter_count = parameter_count.reset_index().pivot(index='rank', columns='relevance', values='id')
    # Row-wise sum (missing counts are skipped) instead of addressing 'irrelevant' and
    # 'relevant' by name, so data containing only one relevance group does not fail.
    parameter_count['overall'] = parameter_count.sum(axis=1)
    print(parameter_count)

    return scores_plot_ratio, parameter_count

In [None]:
# join predictions onto the original rows and derive the grouping columns
def merge_predictions_with_original_data(predictions: pd.DataFrame,
                                         original_data: pd.DataFrame,
                                         grouping_categories: Dict[str, Dict[str, str]] = None,
                                         ) -> pd.DataFrame:
    """
    Attach the original data (and optional grouping categories) to the predictions.

    :param predictions: pandas DataFrame of probability predictions as output by the AutoMap class; must contain the columns 'id' and 'value'.
    :param original_data: pandas DataFrame of the data that was predicted on; joined on 'id'.
    :param grouping_categories: dictionary of column name to a value mapping; for every key a column "{key}_groups" is added holding the mapped values.
    :return: pandas DataFrame with predictions, original columns and grouping categories
    """
    ordered = predictions.sort_values(['id', 'value']).copy()
    merged = ordered.merge(original_data, on='id')
    if not grouping_categories:
        return merged

    for column, value_map in grouping_categories.items():
        merged[f"{column}_groups"] = merged[column].map(value_map)
    return merged

# flag every prediction whose label equals the true label
def assign_correct_flag(predictions: pd.DataFrame,
                        target_column: str = TARGET,
                        ) -> pd.DataFrame:
    """
    Add a boolean 'correct' column to a copy of the predictions.

    :param predictions: pandas DataFrame with one predicted label per row in the column 'label' and the true label in `target_column`.
    :param target_column: column name holding the true label
    :return: copy of the predictions with the additional column 'correct'
    """
    return predictions.assign(
        correct=lambda frame: frame[target_column] == frame['label'],
    )

# number the predictions of every id from most to least probable
def assign_ranks(predictions: pd.DataFrame,
                 target_column: str = 'label',
                 ) -> pd.DataFrame:
    """
    Add a one-based 'rank' column, ordered by descending 'value' within each 'id'.

    :param predictions: pandas DataFrame of probability predictions; must contain the columns 'id' and 'value'.
    :param target_column: accepted for signature compatibility; not used by the ranking
    :return: sorted copy of the predictions with the additional column 'rank'
    """
    ranked = predictions.sort_values(by=['id', 'value'], ascending=[True, False]).copy()
    position_within_id = ranked.groupby('id').cumcount()
    ranked['rank'] = position_within_id + 1
    return ranked


def assign_relevance(predictions: pd.DataFrame,
                     target_column: str = TARGET,
                     ) -> pd.DataFrame:
    """
    Add a 'relevance' column to a copy of the predictions.

    Rows whose true label equals 'unmapped' are marked 'irrelevant', all others 'relevant'.

    :param predictions: pandas DataFrame holding the true label in `target_column`.
    :param target_column: column name holding the true label
    :return: copy of the predictions with the additional column 'relevance'
    """
    relevance_names = {True: 'irrelevant', False: 'relevant'}
    labelled = predictions.copy()
    is_unmapped = labelled[target_column] == 'unmapped'
    labelled['relevance'] = is_unmapped.map(relevance_names)
    return labelled

def get_processed_data(predictions,
                       original_data,
                       grouping_categories,
                       ):
    """
    Run the full post-processing chain on raw predictions.

    The predictions are merged with the original data, after which the 'correct' flag,
    the 'rank' and the 'relevance' columns are added, in that order.

    :param predictions: pandas DataFrame of probability predictions
    :param original_data: pandas DataFrame of the data that was predicted on
    :param grouping_categories: dictionary of column name to value mapping, passed to the merge step
    :return: pandas DataFrame with all derived columns
    """
    merged = merge_predictions_with_original_data(predictions=predictions,
                                                  original_data=original_data,
                                                  grouping_categories=grouping_categories)
    return (merged
            .pipe(assign_correct_flag)
            .pipe(assign_ranks)
            .pipe(assign_relevance))


def calculate_scores_for_groups(
        data: pd.DataFrame,
        label_true: str = TARGET,
        label_pred: str = 'label',
        ) -> pd.DataFrame:
    """
    Calculate scores for the data passed in.
    :param data: pandas DataFrame with at least columns for true labels, predicted labels and 'num_records'
    :param label_true: string with the name of the column containing the true labels
    :param label_pred: string with the name of the column containing the predicted labels
    :return: pandas DataFrame with one row per true label and the columns precision, recall, f1, support, accuracy and num_records
    """
    # Imported explicitly: a bare `import sklearn` does not guarantee that the
    # `sklearn.metrics` submodule has been loaded.
    from sklearn import metrics as sk_metrics

    true_labels = data[label_true].unique()
    result = pd.DataFrame(sk_metrics.precision_recall_fscore_support(
        y_true=data[label_true],
        y_pred=data[label_pred],
        labels=true_labels,
        average=None,  # one score per label, no averaging
        beta=1,
        zero_division=0,
    )).transpose().set_index(true_labels)
    result.columns = ['precision', 'recall', 'f1', 'support']
    result.index.name = label_true
    result = result.reset_index()

    # Accuracy within the rows of one true label is the share of correct predictions.
    # Group by `label_true` (not the global TARGET) so a non-default column is honoured.
    is_correct = data[label_true] == data[label_pred]
    result_accuracy = is_correct.groupby(data[label_true]).mean().to_dict()
    result['accuracy'] = result[label_true].map(result_accuracy)

    result_num_records = data.groupby(label_true)['num_records'].sum().to_dict()
    result['num_records'] = result[label_true].map(result_num_records)

    return result

def weighted_average(x: pd.DataFrame,
                     score_types: List[str] = None,
                     ) -> Dict[str, float]:
    """
    Compute the support-weighted average of every score column present in `x`.

    Besides the averages, the totals of 'num_records', 'support' and 'group_count' are
    returned when those columns exist; without a 'group_count' column the number of rows
    of `x` is reported as 'group_count'.
    :param x: pandas DataFrame; 'support' supplies the weights
    :param score_types: list of score column names, defaults to accuracy, precision, recall and f1
    :return: dictionary with the weighted averages followed by the totals
    """
    if score_types is None:
        score_types = ['accuracy', 'precision', 'recall', 'f1']

    summary = {}
    for name in score_types:
        if name not in x:
            continue
        summary[name] = np.average(x[name], weights=x['support'])

    totals = {column: int(np.sum(x[column]))
              for column in ('num_records', 'support', 'group_count')
              if column in x}
    # No pre-aggregated group count available: every row counts as one group.
    totals.setdefault('group_count', len(x))

    summary.update(totals)
    return summary

def calculate_scores(predictions: pd.DataFrame,
                     original_data: pd.DataFrame,
                     grouping_categories: dict = None,
                     rank: int = 1,
                     ) -> Dict[str, pd.DataFrame]:
    """
    Retrieves various grouped scores for the publication.

    Only the predictions at the requested rank are scored. Every aggregated table gets an
    additional 'zoverall' row, which is the support-weighted average of its relevance groups.

    NOTE(review): reads the module-level `concept_category_groups` mapping, which is defined
    in a later notebook cell — that cell must have been run before calling this function.

    :param predictions: pandas DataFrame of the predictions with at least the columns 'id', 'label', 'value'
    :param original_data: pandas DataFrame of the original data being predicted on. Must contain the columns 'id', TARGET concept label, ehr_name, hospital_name and num_records.
    :param grouping_categories: dictionary of column names and a corresponding dictionary to map values to. Groups will be written to {key}_groups column.
    :param rank: integer of the rank to calculate scores for, default is 1 for the first prediction
    :return: dictionary of various grouping structures and the respective table for accuracy/precision/recall/f1-scores and support
    """
    proc = get_processed_data(predictions=predictions,
                              original_data=original_data,
                              grouping_categories=grouping_categories)
    # Table with scores for each concept label --> allows for grouping over data categories and relevance
    result = calculate_scores_for_groups(data=proc.loc[proc['rank'] == rank], label_true=TARGET, label_pred='label')
    result['concept_label_group'] = result['concept_label'].map(concept_category_groups)
    result['relevance'] = (result['concept_label'] == 'unmapped').map({True: 'irrelevant', False: 'relevant'})
    result = result.loc[result['support'] > 0].copy() # remove concepts that were not available in the test set as they cannot be evaluated

    # Table with scores for each concept label group
    # weighted_average returns a dict per group; .apply(pd.Series) expands those dicts into columns
    result_per_concept_label_group = result.groupby(['concept_label_group']).apply(lambda x: weighted_average(x)
            ).apply(pd.Series).sort_values('support', ascending=False)

    # Table with scores for each relevance group
    result_per_relevance_group = result.groupby(['relevance']).apply(lambda x: weighted_average(x)).apply(pd.Series).sort_values('support', ascending=False)
    # overall row: weighted average over the relevance groups themselves
    result_prg_overall = weighted_average(result_per_relevance_group.reset_index())
    result_prg_overall = pd.DataFrame(result_prg_overall, index=['zoverall'])
    # descending index sort puts 'zoverall' first, followed by 'relevant' and 'irrelevant'
    result_per_relevance_group = pd.concat([result_prg_overall, result_per_relevance_group]).sort_index(ascending=False)

    # Table with scores for each EHR system and Relevance groups --> specifically for table in manuscript
    result_ehr = proc.loc[proc['rank'] == rank].groupby(['ehr_name']).apply(lambda x: calculate_scores_for_groups(data=x, label_true=TARGET, label_pred='label'))
    result_ehr['relevance'] = (result_ehr['concept_label'] == 'unmapped').map({True: 'irrelevant', False: 'relevant'})
    result_ehr_prg = result_ehr.reset_index().groupby(['ehr_name', 'relevance']).apply(lambda x: weighted_average(x)).apply(pd.Series).sort_values('support', ascending=False)
    # combine relevant and irrelevant into a weighted average overall score
    result_ehr_overall = result_ehr_prg.reset_index().groupby(['ehr_name']).apply(
        lambda x: weighted_average(x)
            ).apply(pd.Series).sort_values('support', ascending=False)
    result_ehr_overall = result_ehr_overall.reset_index()
    result_ehr_overall['relevance'] = 'zoverall'
    result_ehr_overall.set_index(['ehr_name', 'relevance'], inplace=True)
    result_ehr_prg = pd.concat([result_ehr_prg, result_ehr_overall]).sort_values(['ehr_name', 'relevance'], ascending=[True, False])

    # Table with scores for each EHR, Hospital Name groups, and relevance groups --> specifically for table in manuscript supplementary file
    # same steps as the EHR table above, with hospital_name as an additional grouping level
    result_ehr_hosp = proc.loc[proc['rank'] == rank].groupby(['ehr_name', 'hospital_name']).apply(lambda x: calculate_scores_for_groups(data=x, label_true=TARGET, label_pred='label'))
    result_ehr_hosp['relevance'] = (result_ehr_hosp['concept_label'] == 'unmapped').map({True: 'irrelevant', False: 'relevant'})
    result_ehr_hosp_prg = result_ehr_hosp.reset_index().groupby(['ehr_name', 'hospital_name', 'relevance']).apply(lambda x: weighted_average(x)).apply(pd.Series).sort_values('support', ascending=False)
    # combine relevant and irrelevant into a weighted average overall score
    result_ehr_hosp_overall = result_ehr_hosp_prg.reset_index().groupby(['ehr_name', 'hospital_name']).apply(
        lambda x: weighted_average(x)
            ).apply(pd.Series).sort_values('support', ascending=False)
    result_ehr_hosp_overall = result_ehr_hosp_overall.reset_index()
    result_ehr_hosp_overall['relevance'] = 'zoverall'
    result_ehr_hosp_overall.set_index(['ehr_name', 'hospital_name', 'relevance'], inplace=True)
    result_ehr_hosp_prg = pd.concat([result_ehr_hosp_prg, result_ehr_hosp_overall]).sort_values(['ehr_name', 'hospital_name', 'relevance'], ascending=[True, True, False])

    return {'label': result,
            'label_group': result_per_concept_label_group,
            'relevance': result_per_relevance_group,
            'ehr_relevance': result_ehr_prg,
            'ehr_hosp_relevance': result_ehr_hosp_prg,
            }



In [None]:
def get_plot_data(results: Dict[int, Dict[str, pd.DataFrame]],
                  dataset: str = 'relevance',
                  score_type='recall',
                  ) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Collect one score column and the support column over the ranks 1 to 10.

    :param results: dictionary keyed by rank, holding the score tables per dataset name
    :param dataset: name of the score table to read for every rank
    :param score_type: name of the score column to collect
    :return: tuple (scores, support), both with one row per rank and one column per group; support is cast to int
    """
    ranks = list(range(1, 11))

    def _collect(column: str) -> pd.DataFrame:
        # one column per rank, side by side
        frame = pd.concat([results[rank][dataset][column] for rank in ranks], axis=1)
        frame.columns = ranks
        return frame

    scores = _collect(score_type)
    support = _collect('support').astype(int)
    return scores.transpose(), support.transpose()

def plot_results(results: Dict[int, Dict[str, pd.DataFrame]],
                 dataset: str = 'relevance',
                 score_type: str = 'recall',
                 N: int = 10,
                 cumulative: bool = True,
                 plot_order=None,
                 color_palette=None,
                 save_loc=None,
                 ):
    """
    Plots the results of the evaluation: the score per rank on top, the parameter count below.

    :param results: dictionary keyed by rank holding the score tables, as expected by get_plot_data
    :param dataset: name of the score table to plot
    :param score_type: name of the score column to plot
    :param N: upper limit of the x-axis; the data itself always covers the ranks returned by get_plot_data
    :param cumulative: when True the scores are summed cumulatively over the ranks
    :param plot_order: columns to plot, in order; defaults to all columns sorted in descending order
    :param color_palette: list of colours, one per plotted column; defaults to black for all
    :param save_loc: path prefix for the csv/png/pdf output; nothing is written when None
    :return: None
    """
    score_data, count_data = get_plot_data(results=results, dataset=dataset, score_type=score_type)

    if cumulative:
        score_data = score_data.cumsum()
    # Without a save location the files would be written with a literal "None" prefix.
    if save_loc is not None:
        score_data.to_csv(f'{save_loc}score_data__{dataset}__{score_type}.csv')
        count_data.to_csv(f'{save_loc}count_data__{dataset}__{score_type}.csv')

    plot_order = sorted(score_data.columns, reverse=True) if plot_order is None else plot_order
    # 'zoverall' only exists to control the sort order; show it as 'Overall'
    plot_order_rename_dict = {x: x.replace('zoverall', 'overall').capitalize() for x in plot_order}
    score_data.rename(columns=plot_order_rename_dict, inplace=True)
    count_data.rename(columns=plot_order_rename_dict, inplace=True)
    # a real list, so it can be used safely for column selection
    plot_order = list(plot_order_rename_dict.values())
    c_palette = ['black'] * len(plot_order) if color_palette is None else color_palette[0:len(plot_order)]
    fig, (ax1, ax2) = plt.subplots(2, 1,
                                   sharex=True,
                                   figsize=(6, 6),
                                   gridspec_kw={'height_ratios': [6, 2],
                                                },
                                   )

    # Plot scores
    sns.lineplot(data=score_data[plot_order],
                 palette=c_palette,
                 legend=True,
                 ax=ax1,
                 )
    ax1.set_xlabel('Rank of predicted labels')
    ax1.set_ylabel(f"{score_type}".capitalize(), labelpad=25)
    ax1.set_ylim(0, 1.01)
    ax1.set_xlim(1, N)
    ax1.legend(
        loc='lower right',
        bbox_to_anchor=(1.0, 0.0),
        ncol=1,
    )

    # Plot parameter counts
    sns.lineplot(data=count_data[plot_order],
                 palette=c_palette,
                 legend=False,
                 ax=ax2,
                 )
    ax2.set_ylim(0, count_data.max().max()*1.05)
    ax2.set_xlim(1, N)
    ax2.set_xticks(list(range(1, N+1)))
    ax2.set_xticklabels(list(range(1, N+1)))
    ax2.set_xlabel('Number of predicted labels')
    ax2.set_ylabel('Parameter\ncount')

    plt.tight_layout()
    if save_loc is not None:
        plt.savefig(f"{save_loc}plot__{dataset}__{score_type}.png", dpi=1200)
        plt.savefig(f"{save_loc}plot__{dataset}__{score_type}.pdf", dpi=1200)
    plt.show()
    return

In [None]:
def create_concept_grouping(data, source, target) -> dict:
    """
    Build a lookup dictionary from the values of one column to the values of another.

    :param data: pandas DataFrame containing both columns
    :param source: name of the column providing the dictionary keys; must be unique
    :param target: name of the column providing the dictionary values
    :return: dictionary mapping every source value to its target value
    :raises ValueError: when the source column contains duplicates
    """
    # Raised explicitly: an assert would be skipped when Python runs with -O.
    if data[source].duplicated().any():
        raise ValueError('source column is not unique')
    return data.set_index(source)[target].to_dict()

# Concept metadata: one lookup from concept label to its category and one to its super label.
concepts = pd.read_csv('../data/input/concepts.csv')
concept_category_groups, concept_label_super_groups = (
    create_concept_grouping(concepts, 'concept_label', target_column)
    for target_column in ('category', 'concept_label_super')
)

In [None]:
# Default seaborn look for all figures in this notebook; later cells override the context scale.
sns.set_style('white')
sns.set_context('paper')

In [None]:
# All parameter mappings; 'id' is a running row number used to join predictions back.
mappings = pd.read_csv('../data/input/combined.csv')
mappings["id"] = pd.RangeIndex(len(mappings))

In [None]:
# Single AutoMap instance reused for every fold; constructing it triggers the nltk downloads.
am = AutoMap()

In [None]:
# Hospital name --> EHR system. The keys define the hospitals that are left out one by one
# in the training loop; the EHR values are not read in the visible cells (the 'ehr' column
# of the predictions is taken from mappings['ehr_name'] instead).
ehr_dict = {'stantonius': 'EPIC',
            'amc': 'EPIC',
            'vumc': 'EPIC',
            'etz': 'EPIC',
            'radboud': 'EPIC',
            'spaarnegasthuis': 'EPIC',
            'catharina': 'HIX',
            'cwz': 'HIX',
            'ikazia': 'HIX',
            'martini': 'HIX',
            'slingeland': 'HIX',
            'erasmus': 'HIX',
            'bovenij': 'HIX',
            'noordwest': 'HIX',
            'franciscus': 'HIX',
            'viecuri': 'HIX',
            'zgt': 'HIX',
            'rdgg': 'HIX',
            'laurentius': 'HIX',
            'haga': 'MV',
            'mst': 'MV',
            'umcu': 'MV',
            'albertschweitzer': 'MV',
            'maasstad': 'MV',
            'zuyderland': 'MV',
            'olvg': 'MV',
            'amphia': 'MV',
            'jeroenbosch': 'MV',
            'geldersevallei': 'MV',
            'antoniuszorggroep': 'MV'}

In [None]:
# Leave-one-hospital-out loop: either reload earlier predictions or retrain per hospital.
load = False  # Expected training duration 3 hours
results = dict()
if load:
    try:
        results = joblib.load('./results.joblib')
    except Exception as load_error:
        # Fall back to the per-hospital csv exports when the joblib dump cannot be read.
        # NOTE(review): this reads from 'all_mapped' while the training branch below writes
        # to 'all_mapped_elasticnet' — confirm which directory is intended.
        print(f"Could not load ./results.joblib ({load_error!r}), reading csv files instead")
        files = glob.glob('./output/all_mapped/results/*.csv')
        for file in files:
            # The file name without directory and extension is the hospital name; this
            # works for both '/' and '\\' path separators.
            results[os.path.splitext(os.path.basename(file))[0]] = pd.read_csv(file)
else:
    t_start = datetime.now()
    t_length = len(ehr_dict)
    for i, hospital in enumerate(ehr_dict):
        elapsed = datetime.now() - t_start
        print(f"Starting {hospital} {i+1}/{t_length} {elapsed}")
        if i > 0:
            # i folds have finished so far; extrapolate their mean duration to the remainder.
            print(f"Expected duration: {(elapsed / i) * (t_length - i)}")

        # Test on the left-out hospital; train on all others, excluding unmapped parameters.
        test_data = mappings.loc[mappings[HOSPITAL_COLUMN] == hospital]
        train_data = mappings.loc[(mappings[HOSPITAL_COLUMN] != hospital) & (mappings[TARGET] != 'unmapped')]
        print(f"Testing on {hospital} {test_data.shape} and training on {train_data[HOSPITAL_COLUMN].unique()} {train_data.shape}")

        # Create the pipe
        am.create_pipe(X=train_data[SOURCE], y=train_data[TARGET], n_jobs=-1)
        print(f"Pipe created, predicting labels")
        results[hospital] = am.predict_proba_transformed(test_data[['id', SOURCE]])
    try:
        os.makedirs('./output/all_mapped_elasticnet/results/', exist_ok=True)
        for key, value in results.items():
            print(key)
            value.to_csv(f"./output/all_mapped_elasticnet/results/{key}.csv", index=False)
        print('csvs written')
    except Exception as save_error:
        # Do not lose hours of training when the csv export fails: dump everything instead.
        print(f"Writing csvs failed ({save_error!r})")
        joblib.dump(results, './results.joblib')
        print('joblib dumped')
    print('done')

In [None]:
# Show the hospitals for which predictions are available.
results.keys()

In [None]:
# Tag every prediction table with the hospital it was made for and stack them into one frame.
for hospital_name, hospital_predictions in results.items():
    hospital_predictions['hospital'] = hospital_name
df = pd.concat(list(results.values()))

In [None]:
# Add the true label, correctness, rank (1 = most probable), EHR system and relevance.
_mappings_by_id = mappings.set_index('id')
df['concept_label_original'] = df['id'].map(_mappings_by_id['concept_label'])
df['correct'] = df['label'] == df['concept_label_original']
df = df.sort_values(by=['id', 'value'], ascending=[True, False])
df['rank'] = df.groupby('id').cumcount() + 1
df['ehr'] = df['id'].map(_mappings_by_id['ehr_name'])
df['relevance'] = (df['concept_label_original'] == 'unmapped').map(
    {True: 'irrelevant', False: 'relevant'}
)

In [None]:
# Regular performance plot and tables
save_loc='./output/all_mapped_elasticnet/loo/'
os.makedirs(save_loc, exist_ok=True)

predicted_labels = df.copy()
test_data = mappings.copy()


# NOTE: from here on `results` holds the score tables per rank, no longer the raw predictions.
results = dict()
for i in range(1, 11):
    results[i] = calculate_scores(predicted_labels, test_data[['id', 'hospital_name' , 'ehr_name', 'concept_label', 'num_records']], {TARGET: concept_category_groups}, rank=i)
    for key, value in results[i].items():
        if isinstance(value, pd.DataFrame):
            for col in value.columns:
                if value[col].dtype == 'float64':
                    # Store whole-number float columns as int. Columns with NaN/inf are
                    # skipped, as they cannot be cast to int.
                    if np.isfinite(value[col]).all() and (value[col] == value[col].astype(int).astype(float)).all():
                        value[col] = value[col].astype(int)
            value.to_csv(f'{save_loc}aprf__rank_{i}__{key}.csv')
            value.round(3).to_csv(f'{save_loc}aprf__rank_{i}__{key}__round3.csv', float_format='%.3f')

sns.set_style('white')
sns.set_context('paper', font_scale=1)
plot_results(results, score_type='precision', save_loc=save_loc)
plot_results(results, score_type='recall', save_loc=save_loc)
plot_results(results, score_type='f1', save_loc=save_loc)

In [None]:
# catplot / swarmplot
# Builds `d`: one row per (EHR, hospital, number of predicted labels, relevance group) holding
# the cumulative share of correct predictions, in the long format expected by sns.catplot.

# calculate accuracy per hospital/EHR
a = df.groupby(['ehr', 'hospital', 'rank', 'relevance'])['correct'].mean()

# calculate overall score
b = df.groupby(['ehr', 'hospital', 'rank'])['correct'].mean()
b = pd.DataFrame(b)
b['relevance'] = 'overall'
# the groupby only serves to move 'relevance' into the index, matching the index levels of `a`
b = b.groupby(['ehr', 'hospital', 'rank', 'relevance']).mean()
# NOTE(review): presumably renamed to 0 so the column lines up with the Series `a` in the
# concat below — verify against the pandas version in use
b.columns = [0]


c = pd.concat([a, b])
c.sort_values(['ehr', 'hospital', 'rank', 'relevance'], inplace=True)

# cumulative sum over the ranks within each (ehr, hospital, relevance); unstack moves relevance to columns
d = c.groupby(['ehr', 'hospital', 'relevance']).cumsum().unstack()
# NOTE(review): assumes the unstacked columns are exactly these three groups in this order — confirm
d.columns = ['irrelevant', 'overall', 'relevant']
d = d.reset_index()
d = d.melt(
    id_vars=['ehr', 'hospital', 'rank'],
    value_vars=['irrelevant', 'overall', 'relevant'],
)
# human-readable names used as axis and facet labels in the plot
d = d.rename(columns={'ehr': 'EHR', 'value': 'Recall', 'variable': 'Relevance', 'rank': 'Number of predicted labels',})
d['Relevance'] = d['Relevance'].str.capitalize()

In [None]:
# Swarm plot grid: one row per EHR system, one column per relevance group, one dot per hospital.
sns.set_style('white')
sns.set_context('paper', font_scale=2)

swarm_data = d.loc[d['Number of predicted labels'] <= 10]
sns.catplot(
    data=swarm_data,
    kind='swarm',
    x='Number of predicted labels',
    y='Recall',
    row='EHR',
    row_order=['EPIC', 'HIX', 'MV'],
    col='Relevance',
    col_order=['Overall', 'Relevant', 'Irrelevant'],
    color='black',
    height=8,
    sharex=True,
    sharey=True,
    facet_kws={'xlim': (1, 10), 'ylim': (0, 1.01)},
)

# Same figure in raster and vector format.
plt.savefig(f"{save_loc}grid_plot__ehr__relevance__recall.png", dpi=1200)
plt.savefig(f"{save_loc}grid_plot__ehr__relevance__recall.pdf", dpi=1200)