In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import transformers
import glob
import csv
import re

import sys
import pprint
import pandas as pd
import numpy as np
from scipy.special import softmax
import scipy.stats
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import auc, precision_recall_curve, roc_curve, RocCurveDisplay
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, precision_score, f1_score


from sklearn.model_selection import ParameterGrid
import sklearn
import os
import shutil
from os import listdir
from os.path import isfile, join
import time
import torch

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from transformers import Trainer, TrainingArguments
from transformers import EarlyStoppingCallback
from datasets import load_dataset

In [None]:
# Widen pandas' display limits so long note text and wide frames are shown
# without truncation when inspected in the notebook.
for _option, _value in (
    ('display.max_rows', 100),
    ('max_colwidth', 300),
    ('display.max_columns', 100),
):
    pd.set_option(_option, _value)

In [None]:
# Silence every warning of category UserWarning for the rest of the session.
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [None]:
# Make the local "data_processing" directory importable so the helper modules
# imported right after this (process_text, transform_textfiles) can be found.
# The membership check keeps the cell idempotent: re-running it does not add
# duplicate entries to sys.path.
if "data_processing" not in sys.path:
    sys.path.append("data_processing")

# Functions required for data processing

import process_text
import transform_textfiles

# Functions

### Data processing

In [None]:
def extractOriginalText2(input_path):
    """Load the note text of every progress-note ``.txt`` file in a directory.

    Each file is parsed as a tab-separated table with quoting disabled, and
    only its seventh column (position 6) is kept as the note text. These are
    the same txt files that were imported to CLAMP for annotation.

    Parameters
    ----------
    input_path : str
        Directory that contains the ``*.txt`` progress-note files.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``note_des`` (the note text) and ``file`` (the file's
        base name up to its first ``.``). Rows from all files are
        concatenated in sorted path order; the per-file row index is kept.

    Raises
    ------
    ValueError
        If the directory contains no ``*.txt`` file.
    """
    # glob returns paths in filesystem order, which varies between machines;
    # sorting makes the row order (and anything derived from it) reproducible.
    paths = sorted(glob.glob(f"{input_path}/*.txt"))
    if not paths:
        raise ValueError(f"No .txt files found in {input_path!r}")

    frames = []
    for path in paths:
        # .copy() so the rename/assignment below act on an independent frame
        # rather than on a slice of the parsed table.
        notes = pd.read_csv(path, sep="\t", quoting=csv.QUOTE_NONE).iloc[:, [6]].copy()
        notes.columns = ['note_des']
        notes['file'] = os.path.basename(path).split(".")[0]
        frames.append(notes)
    return pd.concat(frames)

def extractXMIAnnotation2(input_path):
    """Turn a directory of CLAMP-exported XMI annotation files into a table.

    For every ``*.xmi`` file, the first line is split on ``><`` and every
    element containing ``semanticTag`` contributes its tag value. Each file
    becomes one row with a 1 in the column of every tag it contains; tags a
    file lacks are filled with 0.

    Parameters
    ----------
    input_path : str
        Directory that contains the ``*.xmi`` annotation files. File base
        names are expected to look like ``<prefix>-<anon_id>-<encounter_id>``.

    Returns
    -------
    pandas.DataFrame
        One row per XMI file (in sorted path order) with the columns ``xmi``
        (file name), ``file`` (base name up to the first ``.``), ``anon_id``,
        ``encounter_id``, one indicator column per semantic tag, plus:

        * ``weak_bt``   - 1 if ``Counsel_Handout_BT`` or ``Counsel_Parent_BT``
          is present (weak evidence of PTBM, parent training in behavioral
          management), else 0
        * ``strong_bt`` - 1 if ``Refer_Parent_BT`` or ``Refer_School_BT`` is
          present (strong evidence of PTBM), else 0

        The combined any-evidence label is not computed here.

    Raises
    ------
    ValueError
        If the directory contains no ``*.xmi`` file.
    IndexError
        If a file's base name has fewer than three ``-``-separated parts.
    """
    # Raw string so the regex escapes are not treated as (invalid) string
    # escapes; captures the value of a semanticTag="..." attribute.
    tag_match = re.compile(r'(?<=semanticTag=")\w*(?=")')

    # Tag columns the derived labels below depend on.
    required_tags = ('Counsel_Handout_BT', 'Counsel_Parent_BT',
                     'Refer_Parent_BT', 'Refer_School_BT')

    # Sorted so the row order does not depend on filesystem listing order.
    paths = sorted(glob.glob(f"{input_path}/*.xmi"))
    if not paths:
        raise ValueError(f"No .xmi files found in {input_path!r}")

    rows = []
    for path in paths:
        # Read-only access inside a context manager so the handle is always
        # closed; only the first line of the file is parsed.
        with open(path, "r") as handle:
            first_line = handle.readline()

        elements = first_line.split('><')
        tagged = [element for element in elements if "semanticTag" in element]
        unique_tags = sorted({tag_match.findall(element)[0] for element in tagged})

        base_name = os.path.basename(path)
        file_name = base_name.split(".")[0]
        name_parts = file_name.split("-")

        columns = {'xmi': [base_name],
                   'file': [file_name],
                   'anon_id': [name_parts[1]],
                   'encounter_id': [name_parts[2]]}
        # one indicator column per tag found in this file
        columns.update({tag: [1] for tag in unique_tags})
        rows.append(pd.DataFrame(columns))

    # tags missing from a file become NaN during concat -> fill with 0
    full = pd.concat(rows).fillna(0)

    # A tag that never occurs in the whole directory has no column yet; add it
    # as all-zero so the derived labels do not fail with a KeyError.
    for tag in required_tags:
        if tag not in full.columns:
            full[tag] = 0

    # categorize ground truth annotations
    full['weak_bt'] = np.where((full['Counsel_Handout_BT'] == 1) |
                               (full['Counsel_Parent_BT'] == 1), 1, 0)

    full['strong_bt'] = np.where((full['Refer_Parent_BT'] == 1) |
                                 (full['Refer_School_BT'] == 1), 1, 0)

    return full

In [None]:
# get required structured data for model in dict
## Structure
## tabular_data = pd.DataFrame({
##     'dis_symp': [0, 1, 1], # 0 = disorder-level code; 1 = symptom-level code
##     'age_35_6': [1, 0, 1], # 0 = 3-5 years old; 1 = 6 years old
## })
# the required variables are hard-coded into the function - make sure to change if needed
# patient ID is set as ANON_ID

def get_tabular_data(sdata, X_set):
    """Build the per-sample structured features for one data split.

    ``sdata`` is right-merged onto ``X_set`` (``sdata['ANON_ID']`` against the
    index of ``X_set``), so the rows follow the split. Two features are
    derived from hard-coded columns:

    * ``dis_symp``  - copy of ``only.symp``
    * ``age_35_6``  - 0 when ``ADHD.age`` is below 6, otherwise 1

    Returns a list with one ``{'dis_symp': ..., 'age_35_6': ...}`` dict per
    merged row.
    """
    def _age_flag(age):
        # below 6 -> 0; anything else -> 1
        return 0 if age < 6 else 1

    merged = pd.merge(sdata, X_set, left_on='ANON_ID', right_on=X_set.index, how='right')
    merged['dis_symp'] = merged['only.symp']
    merged['age_35_6'] = merged['ADHD.age'].apply(_age_flag)

    by_patient = merged.set_index('ANON_ID')
    feature_columns = ['dis_symp', 'age_35_6']
    return by_patient[feature_columns].to_dict('records')

In [None]:
# construct dataset with note text, structured data, and note labels
def get_dataset(text_data, tabular_data, labels):
    """Assemble one sample dict per entry of ``tabular_data``.

    For position ``i`` the sample holds ``text_data['input_ids'][i]``,
    ``text_data['attention_mask'][i]``, the values of ``tabular_data[i]`` as a
    float32 tensor (``tabular_data``) and ``labels[i]`` as a long tensor
    (``labels``). Returns the samples as a list.
    """
    samples = []
    for position, features in enumerate(tabular_data):
        sample = {}
        sample['input_ids'] = text_data['input_ids'][position]
        sample['attention_mask'] = text_data['attention_mask'][position]
        # feature values are taken in the dict's own key order
        sample['tabular_data'] = torch.tensor(list(features.values()), dtype=torch.float32)
        sample['labels'] = torch.tensor(labels[position], dtype=torch.long)
        samples.append(sample)
    return samples

### Evaluation metrics

In [None]:
# set metrics for comparison
def compute_metrics(p):
    """Compute classification metrics for a two-class evaluation prediction.

    Parameters
    ----------
    p : object with ``predictions`` and ``label_ids`` attributes
        ``predictions`` is indexed as a 2-D array of per-class scores (one
        column per class); ``label_ids`` holds the true labels.

    Returns
    -------
    dict
        ``accuracy``, ``precision``, ``recall``, ``f1`` (all from the argmax
        class) and ``roc_auc`` (from the positive-class probability).
    """
    preds = p.predictions.argmax(axis=1)
    # Softmax across the class axis gives the probability of class 1 relative
    # to class 0. An element-wise sigmoid would score each sample by its
    # class-1 logit alone, which ranks samples differently from the argmax
    # decision used for the other metrics.
    probabilities = torch.softmax(torch.tensor(p.predictions), dim=1).cpu().numpy()
    labels = p.label_ids
    accuracy = accuracy_score(labels, preds)
    precision = precision_score(labels, preds)
    recall = recall_score(labels, preds)
    f1 = f1_score(labels, preds)
    roc_auc = roc_auc_score(labels, probabilities[:, 1])

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'roc_auc': roc_auc,
    }

# Import Data

In [None]:
# name of the annotation column used as the outcome; it is renamed to 'label'
# when the dataset columns are filtered further down
label_of_interest = "BT_yn"

In [None]:
# pull and process original text data (takes in a directory of progress note txt files)
# result has one 'note_des' text column plus the source 'file' name
originalTextData = extractOriginalText2("cohort_2to6/Text files/combined_text")
print(originalTextData.shape)

In [None]:
# pull and process annotations (takes in a directory of CLAMP outputted annotation XMI files)
# result has one row per XMI file with an indicator column per semantic tag
annotatedXMIs = extractXMIAnnotation2("cohort_2to6/XMI files/combined")
print(annotatedXMIs.shape)

In [None]:
# Binary outcome: 1 when a note carries at least one of the four BT
# annotation tags, 0 otherwise.

annotatedXMIs['BT_yn'] = np.where(
    annotatedXMIs[['Counsel_Parent_BT',
                   'Counsel_Handout_BT',
                   'Refer_Parent_BT',
                   'Refer_School_BT']].eq(1).any(axis=1),
    1,
    0,
)
annotatedXMIs['BT_yn'].value_counts()

In [None]:
# Attach the note text to every annotated file; the right join keeps one
# result row per matching text row for each annotation row, and keeps
# annotation rows that have no matching text file.
data = pd.merge(originalTextData, annotatedXMIs, on="file", how="right")

In [None]:
# using imported function sectionize() from process_text for processing notes text data
# only element [1] of sectionize()'s result is kept as the extracted text
# (presumably the section of interest - confirm against process_text)

data['extractText'] = data['note_des'].apply(lambda x: process_text.sectionize(x)[1])

In [None]:
# Run process_text.clean_text() over every extracted note text, replacing
# the column in place.

data['extractText'] = data['extractText'].apply(process_text.clean_text)

In [None]:
# read in structured data (not included in repository due to PHI)
# the columns used later in this notebook are ANON_ID, ever.dis, only.symp and ADHD.age
structured_data = pd.read_csv("bt_demographics.csv")

In [None]:
# Patient IDs were parsed out of file names as strings; cast them to int
# before the merge on ANON_ID.
data['anon_id'] = data['anon_id'].astype(int)

In [None]:
# Bring the structured variables onto the note-level rows; the right join
# keeps every note row and matches on the patient ID.
data = structured_data.merge(data, left_on='ANON_ID', right_on='anon_id', how='right')

In [None]:
# Keep only what the analysis needs - the note text and the outcome label -
# under the generic names 'text' and 'label', indexed by patient ID.
data = (
    data[['extractText', label_of_interest, 'ANON_ID']]
    .rename(columns={'extractText': 'text', label_of_interest: 'label'})
    .set_index('ANON_ID')
)

In [None]:
# Features (note text) and target (binary label) as separate Series.
X, y = data['text'], data['label']

# Split the Data

In [None]:
# split data into train, validation, and test sets (stratified)
## 70/30 split
## first call: 70% train+validation pool / 30% test
## second call: the pool is split 70/30 again into the actual training set
## (X_val_train) and the validation set (X_val_test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 117, stratify = y)
X_val_train, X_val_test, y_val_train, y_val_test = train_test_split(X_train, y_train, test_size = 0.3, random_state = 117, stratify = y_train)

In [None]:
# Side-by-side (text, label) frames for the training, validation and test splits.
val_train = pd.concat([X_val_train, y_val_train], axis="columns")
val_test = pd.concat([X_val_test, y_val_test], axis="columns")
test = pd.concat([X_test, y_test], axis="columns")

# Check the final sizes of the train, validation, and test sets

In [None]:
# "X_train" / "X_val" here are the second-level splits X_val_train / X_val_test,
# i.e. the sets actually used for training and validation
print("X_train shape: ", X_val_train.shape)
print("X_val shape: ", X_val_test.shape)
print("X_test shape: ", X_test.shape)

In [None]:
# outcome distribution in train set
# (y_val_train holds the labels of the actual training set; y_train is the
# larger pool that still contains the validation notes)
y_val_train.value_counts()

In [None]:
# outcome distribution in validation set
# (the validation labels are named y_val_test; no variable y_val is defined)
y_val_test.value_counts()

In [None]:
# outcome distribution in test set (the 30% held out by the first split)
y_test.value_counts()

# Get Dataset for Model

In [None]:
# tokenizer loaded from the same checkpoint name as the model loaded in the Train section
tokenizer = BertTokenizer.from_pretrained('emilyalsentzer/Bio_ClinicalBERT')

In [None]:
# reduce structured dataframe to only include the selected variables
# (get_tabular_data only reads ANON_ID, only.symp and ADHD.age)
red_sdata = structured_data[['ANON_ID', 'ever.dis', 'only.symp', 'ADHD.age']]

# NOTE(review): each *_tab list is later paired with text and labels by
# position, which assumes one merged row per note - confirm ANON_ID is unique
# in the structured data.

# train set tabular data
train_tab = get_tabular_data(red_sdata, X_val_train)
print(train_tab[0])

# validation set tabular data
val_tab = get_tabular_data(red_sdata, X_val_test)
print(val_tab[0])

# test set tabular data
test_tab = get_tabular_data(red_sdata, X_test)
print(test_tab[0])

In [None]:
# Tokenize and get labels in correct format for train and validation sets
# notes are padded to a common length and truncated to at most 512 tokens
train_tok_text = tokenizer(list(X_val_train.values), return_tensors="pt", padding=True, truncation=True, max_length=512)
train_labels = list(y_val_train)

val_tok_text = tokenizer(list(X_val_test.values), return_tensors="pt", padding=True, truncation=True, max_length=512)
val_labels = list(y_val_test)

In [None]:
# Pick the compute device: the GPU when CUDA is usable, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    print('%d GPU(s) available' % torch.cuda.device_count())
    print('GPU name: ', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using CPU')

In [None]:
# get train and validation datasets
# each is a list of dicts with input_ids, attention_mask, tabular_data and labels
train_dataset = get_dataset(train_tok_text, train_tab, train_labels)
valid_dataset = get_dataset(val_tok_text, val_tab, val_labels)

# Train

In [None]:
# Load pre-trained BERT model with a two-class classification head
bert_model = BertForSequenceClassification.from_pretrained('emilyalsentzer/Bio_ClinicalBERT', num_labels=2)

# Move the model to the device selected earlier in the notebook (GPU when
# available, otherwise CPU) rather than hard-coding 'cuda', which raises on
# CPU-only machines.
# NOTE(review): the datasets carry a 'tabular_data' entry per sample; confirm
# how (or whether) this stock sequence-classification model consumes it.
bert_model = bert_model.to(device)

In [None]:
# Training arguments
# Evaluation and logging run every 50 steps; checkpoints are written every
# 500 steps and at most 2 are kept.
# NOTE(review): with load_best_model_at_end=True and early stopping, confirm
# that a checkpoint of the best evaluation step actually exists when training
# stops between two saves (save_steps is 10x eval_steps).
# NOTE(review): metric_for_best_model is not set - presumably the library
# default is used for both best-model selection and early stopping; confirm.
training_args = TrainingArguments(
    report_to="none",
    overwrite_output_dir = True,
    output_dir="./bioclinicalbert_tabular_model",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=30,
    weight_decay=0,
    logging_dir="./bioclinicalbert_logs",
    save_total_limit = 2,
    save_steps = 500,
    eval_steps = 50,
    logging_steps = 50,
    evaluation_strategy = "steps",
    load_best_model_at_end= True,
    learning_rate = 7.214289287225764e-05,
    seed=117,
    lr_scheduler_type = 'cosine_with_restarts',
)

# Initialize the Trainer
# trains on the training split, evaluates on the validation split with
# compute_metrics, and uses an early-stopping callback with patience 3
trainer = Trainer(
    model=bert_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

# Train the model
trainer.train()

# Evaluate the model
# (no dataset is passed, so this evaluates on the eval_dataset given above)
results = trainer.evaluate()
print(results)

In [None]:
# save model to directory
# (a separate directory from the training output_dir used for checkpoints)
trainer.save_model('./bioclinicalbert_tabular_model_tuned')