In [1]:
import numpy as np
import pandas as pd

from devutils import *

In [8]:
# --- Pre-built artefacts (file names passed to the load_* helpers) ---
# ED vocabulary
vocab_filename = "rmh_2012_2017_dev_amt5"
# Dictionary of misspellings
spell_filename = "rmh_2012_2017_dev_amt5"
# Classifier and its decision threshold
model_filename = "calibrated_lgbm_rmh_2012_2017_dev_amt5"

# --- Dataset used for analysis ---
# Options: rmh_2012_2017_test, rmh_2018_2022, lvrh_2012_2022
unseen_data_filename = "lvrh_2012_2022"

# --- Pipeline switches ---
# normalise=True  -> read "<dataset>.csv" and run every normalisation step
# correct=True    -> read "<dataset>_nospellcorr.csv" and resume at spelling correction
# both False      -> read the ready-made "<dataset>_normalised.csv"
normalise, correct = False, False

# 'eval' scores predictions against the SH column; any other value only
# prints the proportion of notes predicted positive.
mode = "eval"

In [None]:
# Everything the prediction cell needs besides the data itself is loaded here,
# using the file names chosen in the configuration cell.

# ED vocabulary (used by the re-tokenisation step)
vocab = load_vocab(vocab_filename)

# Misspelling -> correction dictionary (used by the spelling-correction step)
misspelled_dict = load_misspelled_dict(spell_filename)

# Pre-trained classifier together with its decision threshold
model, thresh = load_model(model_filename)

### Make predictions

In [None]:
%%time
if normalise:
    # All steps of triage note normalisation
    # Load the dataset
    df = pd.read_csv("../datasets/" + unseen_data_filename + ".csv", 
                     converters={'triage_note': str})
    unseen_data_filename = unseen_data_filename.replace('_cleaned', '')
    print_stats(df)
    count_tokens(df.triage_note)

    # Pre-processing
    df['preprocessed_triage_note'] = df.triage_note.apply(preprocess)
    count_tokens(df.preprocessed_triage_note)
    
    # Create tokenised text
    df['tokenized_triage_note'] = tokenize_step1(df.preprocessed_triage_note)
    count_tokens(df.tokenized_triage_note)
    
    # Re-tokenise text
    df.tokenized_triage_note = tokenize_step2(df.tokenized_triage_note, vocab)
    count_tokens(df.tokenized_triage_note)
    
    # Correct spelling mistakes
    df['corrected_triage_note'] = df.tokenized_triage_note.apply(spelling_correction, 
                                                                 misspelled_dict=misspelled_dict)
    count_tokens(df.corrected_triage_note)
    
    # Replace slang for medications
    df['normalised_triage_note'] = replace_slang(df.corrected_triage_note)
    count_tokens(df.normalised_triage_note)
    
    # Extract features
    df['entities'] = df.normalised_triage_note.apply(extract_features)
    count_tokens(df.entities)
    
elif correct:
    # Load the dataset
    df = pd.read_csv("../datasets/" + unseen_data_filename + "_nospellcorr.csv", 
                     converters={'tokenized_triage_note': str})
    count_tokens(df.tokenized_triage_note)
    
    # Correct spelling mistakes
    df['corrected_triage_note'] = df.tokenized_triage_note.apply(spelling_correction, 
                                                                 misspelled_dict=misspelled_dict)
    count_tokens(df.corrected_triage_note)
    
    # Replace slang for medications
    df['normalised_triage_note'] = replace_slang(df.corrected_triage_note)
    count_tokens(df.normalised_triage_note)
    
    # Extract features
    df['entities'] = df.normalised_triage_note.apply(extract_features)
    count_tokens(df.entities)
    
else:
    # Load the dataset
    df = pd.read_csv("../datasets/" + unseen_data_filename + "_normalised.csv", 
                     converters={'triage_note': str, 
                                 'preprocessed_triage_note': str, 
                                 'tokenized_triage_note': str, 
                                 'corrected_triage_note': str, 
                                 'normalised_triage_note': str,
                                 'entities': str})
    print_stats(df)
    
# The classifier takes the extracted entities as its only input
features = 'entities'
X = df[features]

# Keep column 1 of predict_proba (presumably the positive class - confirm
# against the model's class order)
y_proba = model.predict_proba(X)[:, 1]

# Turn probabilities into class labels using the loaded threshold
y_pred = threshold_proba(y_proba, thresh)

# Store the NLP classifier's outputs next to each note
df['probability'], df['prediction'] = y_proba, y_pred

if mode != 'eval':
    # No evaluation against labels: only report how many notes were flagged
    positive_pct = df.prediction.sum() / df.shape[0] * 100
    print(f"Proportion of labels predicted as positive: {positive_pct:.1f}%")
else:
    # The SH column is used as the reference label
    y = df.SH
    # Plot curves
    plot_curves(y, y_proba, filename=unseen_data_filename)
    # Evaluate classification on the whole dataset
    evaluate_classification(y, y_pred, filename=unseen_data_filename)

# Persist the full frame, including the probability and prediction columns
df.to_csv(f"../datasets/{unseen_data_filename}_predicted.csv", index=False)

### For review

In [None]:
# Label every row with its confusion-matrix cell. Rows start as "TN" and are
# overwritten wherever the (SH, prediction) pair matches one of the other cells.
df['prediction_class'] = "TN"
for _label, _actual, _predicted in (("TP", 1, 1), ("FN", 1, 0), ("FP", 0, 1)):
    _in_cell = (df.SH == _actual) & (df.prediction == _predicted)
    df.loc[_in_cell, 'prediction_class'] = _label

# Number of rows per class
df.prediction_class.value_counts()

In [None]:
# Maximum number of notes drawn per (quarter, error type) and the seed that
# makes the draw repeatable across kernel restarts.
REVIEW_SAMPLE_SIZE = 3
REVIEW_SEED = 42

# Calendar quarter of each arrival. The column is parsed explicitly because the
# CSV loaders in this notebook do not request date parsing; pd.to_datetime
# leaves the column unchanged if it is already datetime64.
# NOTE(review): if the dates are stored day-first, pass dayfirst=True or an
# explicit format here.
df['quarter'] = pd.to_datetime(df.arrival_date).dt.to_period('Q')

# Draw up to REVIEW_SAMPLE_SIZE false positives and false negatives per quarter.
# Shuffling once and taking the head of each group accepts groups with fewer
# rows than requested, whereas groupby(...).sample(n) raises ValueError for them.
misclassified = df[df.prediction_class.isin(['FP', 'FN'])]
idx = (
    misclassified
    .sample(frac=1, random_state=REVIEW_SEED)
    .groupby(['quarter', 'prediction_class'])
    .head(REVIEW_SAMPLE_SIZE)
    .index
)

# Flag the sampled rows; every other row stays 0
df['for_review'] = 0
df.loc[idx, 'for_review'] = 1

In [None]:
# Columns kept in the review file: when, what was written, the reference
# labels, the classifier output and the review flag.
cols = [
    'arrival_date', 'triage_note',
    'SH', 'SI', 'AOD_OD',
    'probability', 'prediction', 'prediction_class',
    'for_review',
]

# NOTE: this is the same path the prediction cell writes the full frame to,
# so that file is replaced by this column subset.
review_export = df[cols]
review_export.to_csv(f"../datasets/{unseen_data_filename}_predicted.csv", index=False)