# Use the SymptomExtraction Task to Train a Model to Extract Symptoms from MIMIC-III Notes

## Make sure all required modules are installed and imported 

In [None]:
!pip install git+https://github.com/mariellederocher/pyhealth.git@fix/dates_v2 -q
!pip install seqeval -q
!pip install transformers scispacy spacy -q
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_sm-0.5.4.tar.gz -q


In [None]:
import os
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
import spacy
from datasets import Dataset
from seqeval.metrics import classification_report

## Load MIMIC-III Dataset with Clinical Notes

In [None]:
from pyhealth.datasets import MIMIC3Dataset

# Location of the synthetic MIMIC-III files that the dataset loader reads from.
root = "https://storage.googleapis.com/pyhealth/Synthetic_MIMIC-III"

# Load the two ICD code tables together with the "noteevents" table.
dataset = MIMIC3Dataset(
    root=root,
    dataset_name="mimic3",
    tables=["diagnoses_icd", "procedures_icd", "noteevents"],
)

In [None]:
from pyhealth.tasks import MIMIC3ICD9Coding

# Apply the ICD-9 coding task to the dataset to obtain per-sample records.
# Only each sample's 'text' field is read later in this notebook (by the
# weak-labeling loop); the task's code labels are not used here.
mimic3_coding = MIMIC3ICD9Coding()
samples = dataset.set_task(mimic3_coding)

## Weak Labeling using scispaCy

In [None]:
# Importing scispacy.linking is what registers the "scispacy_linker" factory with
# spaCy; the EntityLinker class itself is not referenced directly in this notebook.
from scispacy.linking import EntityLinker  # noqa: F401

# Load the scispaCy model installed in the first cell by package name. The original
# cell called `en_core_sci_sm.load()` without ever importing `en_core_sci_sm`.
nlp = spacy.load("en_core_sci_sm")

# Attach the UMLS entity linker; candidates scoring below `threshold` are discarded.
# NOTE(review): per the scispaCy docs, `resolve_abbreviations` only has an effect
# when an AbbreviationDetector pipe is also in the pipeline — none is added here.
nlp.add_pipe(
    "scispacy_linker",
    config={"resolve_abbreviations": True, "linker_name": "umls", "threshold": 0.9},
)

In [None]:
def weak_label(text):
    """Weakly label a piece of text with BIO symptom tags.

    The text is run through the module-level ``nlp`` pipeline. An entity span is
    tagged as a symptom when at least one of its linked knowledge-base candidates
    has the substring ``'finding'`` in the string form of its aliases.

    Args:
        text: Raw text to label.

    Returns:
        A ``(tokens, labels)`` tuple of equal-length lists. ``tokens`` holds the
        pipeline's token strings; ``labels`` holds ``"O"``, ``"B-SYMPTOM"`` (first
        token of a matched span) or ``"I-SYMPTOM"`` (remaining tokens of the span).
    """
    doc = nlp(text)
    tokens = [token.text for token in doc]
    labels = ["O"] * len(tokens)

    # Look the knowledge base up once instead of once per candidate.
    linker = nlp.get_pipe("scispacy_linker")
    cui_to_entity = linker.kb.cui_to_entity

    for ent in doc.ents:
        # Each candidate's first element is the key into the linker's KB.
        # (The original loop iterated `entity._.kb_ents`, an undefined name.)
        # NOTE(review): substring match on the aliases is a heuristic and is
        # case-sensitive — confirm this is the intended symptom filter.
        is_finding = any(
            'finding' in str(cui_to_entity[candidate[0]].aliases)
            for candidate in ent._.kb_ents
        )
        if not is_finding:
            continue

        # ent.start / ent.end are token indices (end exclusive) into `tokens`.
        labels[ent.start] = "B-SYMPTOM"
        for i in range(ent.start + 1, ent.end):
            labels[i] = "I-SYMPTOM"
    return tokens, labels

# Weakly label every sample's note text, printing a progress line every 50 samples.
data = []
for samples_processed, sample in enumerate(samples):
    if samples_processed % 50 == 0:
        print("Processed %d out of %d samples" % (samples_processed, len(samples)))
    data.append(weak_label(sample['text']))
# Leave the counter holding the number of labeled samples once the loop is done.
samples_processed = len(data)

## Prepare Dataset with BERT Tokenization

In [None]:
# Tokenizer for the same Bio_ClinicalBERT checkpoint that the model cells load.
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

# Integer id for each BIO tag emitted by weak_label.
label_map = dict(zip(("O", "B-SYMPTOM", "I-SYMPTOM"), range(3)))

def tokenize_and_align(tokens, labels, max_length=512):
    """Tokenize pre-split words and align word-level labels to the sub-tokens.

    Uses the module-level ``tokenizer`` and ``label_map``.

    Args:
        tokens: List of word strings (already split).
        labels: List of label strings, one per entry in ``tokens``; every label
            must be a key of ``label_map``.
        max_length: Maximum number of sub-tokens (special tokens included) to
            keep; longer inputs are truncated. Defaults to 512, the usual
            position limit of BERT-style checkpoints, so that long notes do not
            overflow the model. NOTE(review): text past the limit is dropped —
            confirm truncation (rather than chunking) is acceptable.

    Returns:
        The tokenizer's encoding with an added ``"labels"`` entry: one id per
        sub-token, ``-100`` for positions that map to no word (special tokens).
        Every sub-token of a word receives that word's label id.
    """
    encoding = tokenizer(
        tokens,
        is_split_into_words=True,
        return_offsets_mapping=True,
        return_attention_mask=True,
        return_tensors=None,
        truncation=True,
        max_length=max_length,
    )
    word_ids = encoding.word_ids(0)  # map subwords back to original tokens

    # The original first-subword and continuation branches assigned the same
    # value, so a single expression covers both (use -100 for continuations
    # instead if subwords should be ignored by the loss).
    encoding["labels"] = [
        -100 if word_idx is None else label_map[labels[word_idx]]
        for word_idx in word_ids
    ]
    return encoding

# Tokenize every weakly labeled (tokens, labels) pair.
tokenized = [tokenize_and_align(tokens, labels) for tokens, labels in data]

# Hold out 20% for validation; a fixed seed keeps the split identical across
# kernel restarts so reported losses are comparable between runs.
train_set, val_set = train_test_split(tokenized, test_size=0.2, random_state=42)
print(len(train_set))

## Model Setup

In [None]:
# Label vocabulary and both lookup directions.
label_list = ["O", "B-SYMPTOM", "I-SYMPTOM"]
label2id = {label: idx for idx, label in enumerate(label_list)}
id2label = {idx: label for label, idx in label2id.items()}

# Token-classification head on top of Bio_ClinicalBERT; passing the label
# mappings stores the tag names in the model config instead of LABEL_0/1/2.
model = AutoModelForTokenClassification.from_pretrained(
    "emilyalsentzer/Bio_ClinicalBERT",
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)

# `AdamW` and `nn` were never imported in this notebook; reference them through
# the already-imported `torch` package instead.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
# ignore_index=-100 matches the filler label written by tokenize_and_align.
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=-100)

# NOTE(review): `SymptomExtraction` is not imported anywhere in the visible
# notebook — presumably it comes from the PyHealth fork installed in the first
# cell (e.g. `pyhealth.tasks`); confirm the module path and add the import.
task = SymptomExtraction(model=model, tokenizer=tokenizer, optimizer=optimizer, loss_fn=loss_fn, label_map=label_map)

## Train 

In [None]:
NUM_EPOCHS = 10

# Re-create the model, optimizer and task here so that re-running this cell
# restarts training from the pretrained checkpoint rather than continuing
# from whatever state a previous run left behind.
model = AutoModelForTokenClassification.from_pretrained("emilyalsentzer/Bio_ClinicalBERT", num_labels=len(label_map))
# `AdamW` and `nn` were never imported in this notebook; reference them through
# the already-imported `torch` package instead.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=-100)

# NOTE(review): `SymptomExtraction` is not imported anywhere in the visible
# notebook — confirm its module path in the installed PyHealth fork and import it.
task = SymptomExtraction(model=model, tokenizer=tokenizer, optimizer=optimizer, loss_fn=loss_fn, label_map=label_map)

for epoch in range(NUM_EPOCHS):
    total_loss = 0
    for batch in train_set:
        # Each encoded sample is passed as a one-element batch.
        # NOTE(review): assumes train_step returns a numeric loss — confirm.
        total_loss += task.train_step([batch])
    # max(..., 1) avoids a ZeroDivisionError if the training split is empty.
    print(f"Epoch {epoch + 1} - Loss: {total_loss / max(len(train_set), 1):.4f}")

