## STEPS

* We go through the 4 steps that are required to de-identify a dataset (i.e., run the forward pass on this dataset using a trained model).

In [None]:
%load_ext autoreload
%autoreload 2

## STEP 0: LIBRARIES

In [None]:
import time
a = time.time()

In [None]:
import json
import pandas as pd
import os
from pathlib import Path

import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:512'


In [None]:
from transformers import HfArgumentParser, TrainingArguments

In [None]:
from robust_deid.ner_datasets import DatasetCreator
from robust_deid.sequence_tagging import SequenceTagger
from robust_deid.sequence_tagging.arguments import (
    ModelArguments,
    DataTrainingArguments,
    EvaluationArguments,
)
from robust_deid.deid import TextDeid

## STEP 1: INITIALIZE

In [None]:
# We require two inputs; every other file is created relative to input_file.

# Path of the dataset to de-identify (input_file).
input_file = "/home/vs428/project/Data/GPT/notes_abd_simple_trigger.csv"
# Prefix prepended to the name of every temporary file created below.
prefix = "patient"

# Intermediate artifacts go into a "temp" folder next to input_file;
# the final file is written to input_file's parent directory.
path = Path(input_file)
intermediate_path = (path.parent.absolute() / "temp")
# exist_ok=True replaces the check-then-create pattern (os.path.exists followed by
# os.makedirs), which can race with another process. It also fails loudly if "temp"
# exists but is not a directory.
intermediate_path.mkdir(parents=True, exist_ok=True)

# Where the sentencized and tokenized dataset is stored (ner_dataset_file).
ner_dataset_file = intermediate_path / f'{prefix}_ner.jsonl'

# Where the model predictions are stored (predictions_file).
# Verify this location: output_predictions_file in the json config passed to the
# sequence tagger model should have the same value as below.
predictions_file = intermediate_path / f'{prefix}_pred.jsonl'

# File that will contain the original note text and the de-identified note text.
deid_file = intermediate_path / f'{prefix}_deid.jsonl'

# File that will contain the post-processed de-identified note text.
# NOTE: this final file is written to the parent directory, not to "temp".
postprocessed_deid_file = path.parent.absolute() / f'{path.stem}_postprocessed.csv'

# Model config: this file contains the various parameters of the model.
model_config = './config/predict_i2b2.json'

In [None]:
# Column names used throughout the notebook.
# The text column has to be called 'text' (the package does not accept another name);
# the same is assumed, but not verified, for the note_id column.
id_col_name, text_col_name = 'note_id', "text"
# Columns to carry along as metadata.
# NOTE: you will get an error if these columns don't exist in the data.
meta_cols = ['note_id']


# Step 1a: Preprocess with regex

In [None]:
notes_df = pd.read_csv(input_file)

In [None]:
# Map the raw CSV column labels onto the names the rest of the notebook uses.
notes_df = notes_df.rename(columns={"Unnamed: 0": "note_id", "0": "text"})

In [None]:
# Pre-tag PHI patterns that the model tends to miss, wrapping each hit as <<LABEL:match>>.

# Times: 4 digits preceded by whitespace (e.g. " 1324"), HH:MM:SS, or H:MM / HH:MM
# optionally followed by AM/PM. The hour accepts one or two digits so that
# "12:30 PM" is tagged whole instead of being split into "1<<TIME:2:30 PM>>".
notes_df[text_col_name] = notes_df[text_col_name].str.replace(
    r"(\s[0-9]{4}\s?|[0-9]{2}:[0-9]{2}:[0-9]{2}|[0-9]{1,2}:[0-9]{2}\s?([AaPp][Mm])?)",
    r"<<TIME:\1>>", regex=True)

# Provider names are sometimes missed when they are the only thing on a line,
# ending with MD, PA, PA-C or APRN.
#  - (?m) makes ^ and $ match at every line boundary; without it ^ only matches at
#    the very start of the note, so names on later lines were never tagged.
#  - PA-C is listed before PA so the "-C" suffix stays inside the tag.
#  - The lookahead requires the credential to end the line (trailing spaces/tabs/CR
#    allowed), so words that merely contain "PA"/"MD" are not partially tagged.
notes_df[text_col_name] = notes_df[text_col_name].str.replace(
    r"(?m)^([a-zA-Z,]+?(?:MD|PA-C|PA|APRN))(?=[ \t\r]*$)",
    r"<<STAFF:\1>>", regex=True)


# Replace an address that's often missed:
# 150 Sargent Dr  New Haven CT 06511-6100
notes_df[text_col_name] = notes_df[text_col_name].str.replace(
    r"(150 Sargent\s+?Dr\s+?New Haven CT\s+?06511-6100)",
    r"<<LOCATION:\1>>", regex=True)

# Phone numbers of the form XXX.XXX.XXXX or XXX-XXX-XXXX are sometimes missed.
notes_df[text_col_name] = notes_df[text_col_name].str.replace(
    r"([0-9]{3}\.[0-9]{3}\.[0-9]{4}|[0-9]{3}\-[0-9]{3}\-[0-9]{4})",
    r"<<PHONE:\1>>", regex=True)


# Step 1b: Convert CSV to the right format

In [None]:
# Build one metadata dict per row from the meta_cols columns: to_json(orient="records")
# serializes the rows as a JSON array of objects, and json.loads turns that back into
# a plain Python list of dicts, which is assigned to the 'meta' column.
notes_df['meta'] = json.loads(notes_df[meta_cols].to_json(orient="records"))

In [None]:
notes_df.columns

In [None]:
# Every note starts with an empty list of spans. Build a fresh list per row:
# `n * [[]]` would make every row reference the same list object, so mutating
# one row's spans would silently change all of them.
notes_df['spans'] = [[] for _ in range(len(notes_df))]

In [None]:
# Write the regex-preprocessed notes (text, meta, spans) as JSON lines into the temp folder.
preprocessed_fp = f"{intermediate_path / path.stem}_preprocessed.jsonl"
jsonl_columns = [text_col_name, "meta", "spans"]
notes_df[jsonl_columns].to_json(preprocessed_fp, orient="records", lines=True)

In [None]:
preprocessed_fp

## STEP 2: NER DATASET
* Sentencize and tokenize the raw text. We used sentences of length 128, which includes an additional 32 context tokens on either side of the sentence. These 32 tokens (from the previous & next sentence) serve as additional context to the current sentence.
* We use the en_core_sci_sm sentencizer (as configured in the code below; en_core_sci_lg was used originally) and a custom tokenizer (can be found in the preprocessing module)
* The dataset stored in the ner_dataset_file will be used as input to the sequence tagger model

In [None]:
# %%time

In [None]:
# Settings for the dataset creator: which sentencizer/tokenizer to use and how many
# tokens (including previous/next-sentence context) each sequence may hold.
creator_settings = {
    'sentencizer': 'en_core_sci_sm',
    'tokenizer': 'clinical',
    'max_tokens': 128,
    'max_prev_sentence_token': 32,
    'max_next_sentence_token': 32,
    'default_chunk_size': 32,
    'ignore_label': 'NA',
}
# Create the dataset creator object
dataset_creator = DatasetCreator(**creator_settings)


In [None]:
preprocessed_fp

In [None]:
# Sentencize and tokenize the preprocessed dataset. The call returns a generator
# that iterates through the sequences.
create_options = {
    'input_file': preprocessed_fp,
    'mode': 'predict',
    'notation': 'BILOU',
    'token_text_key': text_col_name,
    'metadata_key': 'meta',
    'note_id_key': id_col_name,
    'label_key': 'label',
    'span_text_key': 'spans',
}
ner_notes = dataset_creator.create(**create_options)
# Stream the sequences into ner_dataset_file, one JSON object per line.
with open(ner_dataset_file, 'w') as ner_out:
    ner_out.writelines(json.dumps(sequence) + '\n' for sequence in ner_notes)

## STEP 3: SEQUENCE TAGGING
* Run the sequence model - specify parameters to the sequence model in the config file (model_config). The model will be run with the specified parameters. For more information on these parameters, please refer to huggingface (or use the docs provided).
* This file uses the argmax output. To use the recall threshold models (running the forward pass with a recall biased threshold for aggressively removing PHI) use the other config files.
* The config files in the i2b2 directory specify the model trained on only the i2b2 dataset. The config files in the mgb_i2b2 directory are for the model trained on both MGB and I2B2 datasets.
* You can manually pass in the parameters instead of using the config file. The config file option is recommended. In our example we are passing the parameters through a config file. If you do not want to use the config file, skip the next code block and manually enter the values in the following code blocks. You will still need to read in the training args using huggingface and change values in the training args according to your needs.

In [None]:
# The four argument groups, in the order they are unpacked below.
argument_groups = (
    ModelArguments,
    DataTrainingArguments,
    EvaluationArguments,
    TrainingArguments,
)
parser = HfArgumentParser(argument_groups)
# All arguments come from the JSON config file referenced by model_config.
parsed_groups = parser.parse_json_file(json_file=model_config)
model_args, data_args, evaluation_args, training_args = parsed_groups

In [None]:
# Collect the tagger settings from the parsed argument groups, grouped by origin.
tagger_settings = {
    # data arguments
    'task_name': data_args.task_name,
    'notation': data_args.notation,
    'ner_types': data_args.ner_types,
    'do_lower_case': data_args.do_lower_case,
    # model arguments
    'model_name_or_path': model_args.model_name_or_path,
    'config_name': model_args.config_name,
    'tokenizer_name': model_args.tokenizer_name,
    'post_process': model_args.post_process,
    'cache_dir': model_args.cache_dir,
    'model_revision': model_args.model_revision,
    'use_auth_token': model_args.use_auth_token,
    'threshold': model_args.threshold,
    # training arguments
    'fp16': training_args.fp16,
    'seed': training_args.seed,
    'local_rank': training_args.local_rank,
}
# Initialize the sequence tagger
sequence_tagger = SequenceTagger(**tagger_settings)

In [None]:
# Load the required functions of the sequence tagger
sequence_tagger.load()

In [None]:
import torch
torch.cuda.empty_cache()

In [None]:
# Point the sequence tagger at the data to predict on.
# data_args.test_file can be used instead of ner_dataset_file
# (make sure it matches ner_dataset_file).
predict_settings = {
    'test_file': str(ner_dataset_file),
    'max_test_samples': data_args.max_predict_samples,
    'preprocessing_num_workers': data_args.preprocessing_num_workers,
    'overwrite_cache': data_args.overwrite_cache,
}
sequence_tagger.set_predict(**predict_settings)

In [None]:
# IMPORTANT NOTE: `eval_accumulation_steps` is set in the config file so that the entire
# evaluation dataset is not gathered on the GPU after evaluation. This helps avoid
# GPU CUDA errors. Instead, only play with the batch size.

# Override the half-precision and progress-bar settings read from the config.
# NOTE(review): the SequenceTagger was constructed earlier with training_args.fp16 as
# read from the config, i.e. before this override - confirm that is intended.
for option_name, option_value in (
    ('fp16', True),
    ('disable_tqdm', False),
    ('fp16_full_eval', True),
):
    setattr(training_args, option_name, option_value)

# Release cached GPU memory before setting up the trainer.
import torch
torch.cuda.empty_cache()

In [None]:
# Initialize the huggingface trainer
sequence_tagger.setup_trainer(training_args=training_args)

In [None]:
# Run the model over the prediction dataset.
predictions = sequence_tagger.predict()
# Persist the predictions to predictions_file, one JSON object per line.
with open(predictions_file, 'w') as pred_out:
    pred_out.writelines(json.dumps(item) + '\n' for item in predictions)

## STEP 4: DE-IDENTIFY TEXT

* This step uses the predictions from the previous step to de-id the text. We pass the original input file where the original text is present. We look at this text and the predictions and use both of these to de-id the text.

In [None]:
# Initialize the text deid object
text_deid = TextDeid(notation='BILOU', span_constraint='super_strict')

In [None]:
# De-identify the text. With deid_strategy=replace_informative the PHI is not dropped
# from the text but labelled, so you can drop it or process it further yourself.
# To drop the PHI automatically, use deid_strategy=remove instead.
deid_options = {
    'input_file': preprocessed_fp,
    'predictions_file': str(predictions_file),
    'deid_strategy': 'replace_informative',
    'keep_age': False,
    'metadata_key': 'meta',
    'note_id_key': id_col_name,
    'tokens_key': 'tokens',
    'predictions_key': 'predictions',
    'text_key': text_col_name,
}
deid_notes = text_deid.run_deid(**deid_options)

In [None]:
# Persist the de-identified notes to deid_file, one JSON object per line.
with open(deid_file, 'w') as deid_out:
    deid_out.writelines(json.dumps(note) + '\n' for note in deid_notes)

In [None]:
b = time.time()
print(b-a)

# Postprocess

In [None]:
import jsonlines
import pandas as pd
import re

In [None]:
deid_file

In [None]:
# Load every de-identified note back from the JSON-lines file into a list.
with jsonlines.open(deid_file) as reader:
    deid_notes = list(reader)

In [None]:
deid_df = pd.DataFrame.from_records(deid_notes)


In [None]:
# Regex clean-up rules, applied in order as (pattern, replacement) pairs.
cleanup_rules = [
    # The pipeline accidentally de-identifies parts of the HEART score template;
    # put the original strings back.
    (r"<<AGE:45 - 64>>", "45 - 64"),
    (r"- < <<AGE:45>> 0", "- < 45 0"),
    # Complete partially tagged hospital name acronyms:
    # <<HOSPITAL:BH>> GH LMW Q YH -> <<HOSPITAL:BH GH LMW Q YH>>
    (r"<<HOSPITAL:BH>> GH LMW Q YH", "<<HOSPITAL:BH GH LMW Q YH>>"),
    # Complete the partially tagged Yale name:
    # <<HOSPITAL:Yale>> Radiology and Biomedical Imaging -> <<HOSPITAL:Yale Radiology and Biomedical Imaging>>
    (r"<<HOSPITAL:Yale>> Radiology and Biomedical Imaging",
     r"<<HOSPITAL:Yale Radiology and Biomedical Imaging>>"),
]
for rule_pattern, rule_replacement in cleanup_rules:
    deid_df['deid_text'] = deid_df['deid_text'].str.replace(rule_pattern, rule_replacement, regex=True)
                                                                                            
#############################################
#############################################
#############################################


In [None]:
deid_df.to_csv(postprocessed_deid_file, index=False)

# Drop all Tags

In [None]:
deid_df = pd.read_csv(postprocessed_deid_file)

In [None]:
import re
def drop_deid_tags(df, col):
    """Add a ``<col>_replaced`` column in which tag contents are stripped.

    Every ``<<LABEL:anything>>`` tag in ``df[col]`` (LABEL being uppercase
    letters; the content may span several lines) is reduced to ``<<LABEL>>``.
    The frame is modified in place and the same object is returned.
    """
    # DOTALL lets the tag content run across newlines.
    tag_with_content = re.compile(r"<<([A-Z]+?):.*?>>", flags=re.DOTALL)
    target_col = col + "_replaced"
    df[target_col] = df[col].str.replace(tag_with_content, r'<<\1>>', regex=True)
    return df

In [None]:
replaced = drop_deid_tags(deid_df, "deid_text")


In [None]:
import ast
# After the CSV round trip the 'meta' column holds the string form of each dict;
# parse it back into dicts, then spread the dict keys out into their own columns.
replaced['meta'] = replaced['meta'].apply(ast.literal_eval)
meta_columns = replaced['meta'].apply(pd.Series)
replaced = pd.concat([replaced.drop(columns=['meta']), meta_columns], axis=1)

In [None]:
replaced

In [None]:
replaced.to_csv(postprocessed_deid_file, index=False)