## Load libraries

In [None]:
pip install torch transformers

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [None]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import io
import os
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import BertTokenizerFast, BertForTokenClassification
from transformers import pipeline
import torch

Mounted at /content/drive


## Long Dialogue medical-NER

### Load dataset


In [None]:
# Paths to the long-dialogue test and validation splits on the mounted Drive
path_test_long = '/content/drive/MyDrive/W266_Project/Data/test_long.csv'
path_val_long = '/content/drive/MyDrive/W266_Project/Data/val_long.csv'

# Load the datasets
test_long = pd.read_csv(path_test_long)
val_long = pd.read_csv(path_val_long)

# Preview the validation split to confirm it loaded correctly.
# The label names the frame that is actually printed (val_long); it
# previously read "train", which did not match any frame loaded here.
print("Long_dialogue_val_data:")
print(val_long.head())

# Row/column counts, test split first and validation split second
print("Shape of Long_dialogue_test/val_data:")
print(test_long.shape)
print(val_long.shape)



Long_dialogue_train_data:
                                            dialogue  \
0  doctor: Good morning, patient. Thank you for c...   
1  doctor: Hello, patient, and welcome to my offi...   
2  doctor: Hello Ms. A, thank you for coming in t...   
3  doctor: Hello, Ms. patient, I see you're back ...   
4  doctor: Hello, Mrs. patient, I'm Dr. doctor. I...   

                                                note  Dialogue_Length  \
0  SUBJECTIVE CHIEF COMPLAINT Ear infections. HIS...             2745   
1  SUBJECTIVE CHIEF COMPLAINT Picky eating. HISTO...             1997   
2  SUBJECTIVE CHIEF COMPLAINT Postoperative evalu...             2295   
3  SUBJECTIVE CHIEF COMPLAINT Nausea and vomiting...             2537   
4  SUBJECTIVE CHIEF COMPLAINT Evaluation of hip p...             1749   

   Note_Length  
0         2178  
1         1294  
2         1519  
3         2236  
4          960  
Shape of Long_dialogue_test/val_data:
(180, 4)
(96, 4)


In [None]:
# Find the longest dialogue in each split so the extraction can be examined on it.
# idxmax() returns the index label of the row with the largest Dialogue_Length.
max_dialogue_test_length_index = test_long['Dialogue_Length'].idxmax()
max_dialogue_val_length_index = val_long['Dialogue_Length'].idxmax()

# Fetch the full row for each of those index labels
max_dialogue_test_length_row = test_long.loc[max_dialogue_test_length_index]
max_dialogue_val_length_row = val_long.loc[max_dialogue_val_length_index]


print("max_dialogue_test")
print(max_dialogue_test_length_row)
print("max_dialogue_val")
print(max_dialogue_val_length_row)


max_dialogue_test
dialogue           doctor eugene walker , n- date of birth 4/14/1...
note               SUBJECTIVE CHIEF COMPLAINT Annual health maint...
Dialogue_Length                                                 8694
Note_Length                                                     1690
Name: 121, dtype: object
max_dialogue_val
dialogue           doctor sophia brown . date of birth , 3/17/194...
note               SUBJECTIVE CHIEF COMPLAINT Annual health maint...
Dialogue_Length                                                 7384
Note_Length                                                     2280
Name: 7, dtype: object


### Clinical NER on long dialogue
- The tokenizer reports `model_max_length` = 1000000000000000019884624838656 (i.e. effectively no limit). Since the longest dialogue has a `Dialogue_Length` of 8694, `max_length` is set to 10000 to save memory and run time.



#### Medical-NER on long dialogue: val & test datasets

In [None]:
# Step 1: Load the tokenizer and token-classification model from the Hugging Face Hub.
# The checkpoint loaded here is "Clinical-AI-Apollo/Medical-NER"; the variable name
# still says ClinicalBERT, presumably left over from the earlier model choice that
# is kept commented out on the next line.
#model_name = "medicalai/ClinicalBERT"
model_name_medicalai_ClinicalBERT = "Clinical-AI-Apollo/Medical-NER"
tokenizer = AutoTokenizer.from_pretrained(model_name_medicalai_ClinicalBERT)
model = AutoModelForTokenClassification.from_pretrained(model_name_medicalai_ClinicalBERT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.66M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/5.14k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/736M [00:00<?, ?B/s]

In [None]:
# Function to extract entities from text
def extract_entities_from_chunks(text, max_length=10000):
    """Tag one text with the NER model and return the tokens predicted as entities.

    Uses the notebook-level ``tokenizer`` and ``model`` objects. Despite the
    name, the text is not split into chunks: it is tokenized and run through
    the model as a single sequence, truncated at ``max_length`` tokens.

    Args:
        text: The dialogue text to tag.
        max_length: Truncation limit passed to the tokenizer. Defaults to
            10000, the value that used to be hard-coded here.

    Returns:
        A list of ``{"token": ..., "label": ...}`` dicts in input order, one
        for every token whose predicted label is not ``'O'``. The ``[CLS]``
        and ``[SEP]`` special tokens are never included.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=max_length)

    # Inference only: disable autograd so activations for these long sequences
    # are not retained for a backward pass that never happens.
    with torch.no_grad():
        logits = model(**inputs).logits

    # One predicted label id per token of the single sequence in the batch.
    predicted_ids = torch.argmax(logits, dim=2)[0].tolist()
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].tolist())
    labels = [model.config.id2label[label_id] for label_id in predicted_ids]

    special_tokens = ('[CLS]', '[SEP]')
    return [
        {"token": token, "label": label}
        for token, label in zip(tokens, labels)
        if label != 'O' and token not in special_tokens
    ]

# Function to process the dataframe in batches
def process_in_batches(df, batch_size):
    """Extract entities for every row of ``df['dialogue']``, ``batch_size`` rows at a time.

    Args:
        df: DataFrame with a ``'dialogue'`` column.
        batch_size: Number of consecutive rows handled per slice.

    Returns:
        A flat list with one ``extract_entities_from_chunks`` result per row,
        in row order.
    """
    collected = []
    for start in range(0, len(df), batch_size):
        stop = start + batch_size
        rows = df.iloc[start:stop]
        # Each row is still tagged individually; the slicing only bounds how
        # many rows are handled per step.
        per_row_entities = rows['dialogue'].apply(extract_entities_from_chunks)
        collected.extend(per_row_entities)
    return collected

# Run entity extraction over the long-dialogue validation split, two rows per slice.
# batch_size is also reused by the later cell that processes test_long.
batch_size = 2
val_long['dialogue_entities'] = process_in_batches(val_long, batch_size)

# Display a few rows, including the new dialogue_entities column
val_long.head(3)

Unnamed: 0,dialogue,note,Dialogue_Length,Note_Length,dialogue_entities
0,"doctor: Good morning, patient. Thank you for c...",SUBJECTIVE CHIEF COMPLAINT Ear infections. HIS...,2745,2178,"[{'token': '▁infections', 'label': 'I-DISEASE_..."
1,"doctor: Hello, patient, and welcome to my offi...",SUBJECTIVE CHIEF COMPLAINT Picky eating. HISTO...,1997,1294,"[{'token': '▁vomiting', 'label': 'B-SIGN_SYMPT..."
2,"doctor: Hello Ms. A, thank you for coming in t...",SUBJECTIVE CHIEF COMPLAINT Postoperative evalu...,2295,1519,"[{'token': '▁pain', 'label': 'B-SIGN_SYMPTOM'}..."


In [None]:
# Same extraction for the long-dialogue test split; batch_size comes from the previous cell.
test_long['dialogue_entities'] = process_in_batches(test_long, batch_size)

# Display a few rows, including the new dialogue_entities column
test_long.head(3)

Unnamed: 0,dialogue,note,Dialogue_Length,Note_Length,dialogue_entities
0,"doctor: Good morning, Mr. A. Thank you for com...",SUBJECTIVE CHIEF COMPLAINT New patient evaluat...,2483,1895,"[{'token': '▁July', 'label': 'B-DATE'}, {'toke..."
1,"doctor: Hello, I'm Dr. doctor's name. I unders...","SUBJECTIVE CHIEF COMPLAINT Fever, fussiness, a...",1943,1425,"[{'token': '▁irritable', 'label': 'B-SIGN_SYMP..."
2,"doctor: Good morning, Mrs. patient, thank you ...",SUBJECTIVE CHIEF COMPLAINT Genetic counseling....,2676,2407,"[{'token': '▁polyps', 'label': 'B-SIGN_SYMPTOM..."


In [None]:
# Display the extracted token/label pairs (dialogue_entities) for the first 3 validation rows
for index, row in val_long.head(3).iterrows():
  print("dialogue", index)
  for entity in row['dialogue_entities']:
    print(entity)

dialogue 0
{'token': '▁infections', 'label': 'I-DISEASE_DISORDER'}
{'token': '▁sore', 'label': 'B-SIGN_SYMPTOM'}
{'token': '▁throat', 'label': 'I-SIGN_SYMPTOM'}
{'token': '▁cough', 'label': 'B-SIGN_SYMPTOM'}
{'token': '▁ears', 'label': 'B-BIOLOGICAL_STRUCTURE'}
{'token': '▁hurt', 'label': 'B-SIGN_SYMPTOM'}
{'token': '▁fever', 'label': 'B-SIGN_SYMPTOM'}
{'token': '▁antibiotics', 'label': 'B-MEDICATION'}
{'token': '▁ear', 'label': 'B-DISEASE_DISORDER'}
{'token': '▁infections', 'label': 'I-DISEASE_DISORDER'}
{'token': '▁a', 'label': 'I-DURATION'}
{'token': '▁year', 'label': 'I-DURATION'}
{'token': '▁antibiotics', 'label': 'B-MEDICATION'}
{'token': '▁hearing', 'label': 'B-DIAGNOSTIC_PROCEDURE'}
{'token': '▁ear', 'label': 'B-DISEASE_DISORDER'}
{'token': '▁infections', 'label': 'I-DISEASE_DISORDER'}
{'token': '▁hearing', 'label': 'B-SIGN_SYMPTOM'}
{'token': '▁ears', 'label': 'B-BIOLOGICAL_STRUCTURE'}
{'token': '▁ears', 'label': 'B-BIOLOGICAL_STRUCTURE'}
{'token': '▁ot', 'label': 'B-DIAGNOSTI

- From the output, we can tell that this model uses **SentencePiece** tokenization: a token that starts with "▁" marks the start of a new word, while tokens without the "▁" prefix are continuation pieces of the preceding word.
This is different from the tokenization of **Bio-BERT disease**, which uses **WordPiece**.

#### Merge the tokens into words and combine B and I labels of the same entity

In [None]:
def merge_tokens_and_labels(token_label_pairs):
    """Merge SentencePiece tokens and their B-/I- tags into entity phrases.

    A ``B-`` tag (or an ``I-`` tag whose type differs from the phrase being
    built) starts a new phrase; an ``I-`` tag of the same type extends it.
    Tokens that start with "▁" begin a new word and are joined with a space;
    tokens without it are sub-word continuations and are glued directly onto
    the phrase (``relaxant`` + ``s`` -> ``relaxants``).

    The input carries no token positions, so consecutive entries are assumed
    to be adjacent in the original text.

    Args:
        token_label_pairs: List of ``{'token': str, 'label': str}`` dicts.

    Returns:
        List of ``{'phrase': str, 'label': str}`` dicts with the ``B-``/``I-``
        prefix removed from the label. An empty list is returned when the
        input is not a list of dicts. Entries missing either key are skipped
        with a printed warning.
    """
    merged_results = []

    # Anything other than a list of dicts is treated as "no entities".
    if not isinstance(token_label_pairs, list) or not all(isinstance(item, dict) for item in token_label_pairs):
        return merged_results

    current_phrase = ""
    current_label = ""

    for pair in token_label_pairs:
        if 'token' not in pair or 'label' not in pair:
            print(f"Warning: Skipping pair due to missing keys: {pair}")  # Alert the user about potential issues in the data
            continue

        token = pair['token']
        label = pair['label']

        # "▁" is the SentencePiece word-start marker; remember it before stripping.
        starts_new_word = token.startswith('▁')
        if starts_new_word:
            token = token[1:]

        # Strip the tag prefix only when there is one, so an unprefixed label
        # is kept whole instead of losing its first two characters.
        if label.startswith(('B-', 'I-')):
            entity_type = label[2:]
        else:
            entity_type = label

        if label.startswith('I-') and entity_type == current_label:
            # Same entity continues: a space goes only between whole words,
            # never before a sub-word piece or onto an empty phrase.
            if starts_new_word and current_phrase and token:
                current_phrase += " " + token
            else:
                current_phrase += token
        else:
            # B- tag, or an I- tag of a different type: close the phrase in
            # progress (if any) and start a new one.
            if current_phrase:
                merged_results.append({'phrase': current_phrase, 'label': current_label})
            current_phrase = token
            current_label = entity_type

    # Flush the final phrase.
    if current_phrase:
        merged_results.append({'phrase': current_phrase, 'label': current_label})

    return merged_results


# Merge the token-level predictions of every validation dialogue into entity phrases
val_long['dialogue_entities_merged'] = val_long['dialogue_entities'].apply(merge_tokens_and_labels)

# Check the result
val_long.head(3)

Unnamed: 0,dialogue,note,Dialogue_Length,Note_Length,dialogue_entities,dialogue_entities_merged
0,"doctor: Good morning, patient. Thank you for c...",SUBJECTIVE CHIEF COMPLAINT Ear infections. HIS...,2745,2178,"[{'token': '▁infections', 'label': 'I-DISEASE_...","[{'phrase': 'infections', 'label': 'DISEASE_DI..."
1,"doctor: Hello, patient, and welcome to my offi...",SUBJECTIVE CHIEF COMPLAINT Picky eating. HISTO...,1997,1294,"[{'token': '▁vomiting', 'label': 'B-SIGN_SYMPT...","[{'phrase': 'vomiting', 'label': 'SIGN_SYMPTOM..."
2,"doctor: Hello Ms. A, thank you for coming in t...",SUBJECTIVE CHIEF COMPLAINT Postoperative evalu...,2295,1519,"[{'token': '▁pain', 'label': 'B-SIGN_SYMPTOM'}...","[{'phrase': 'pain', 'label': 'SIGN_SYMPTOM'}, ..."


In [None]:
# Same merge step for the long-dialogue test split:
# turn the token-level predictions of every dialogue into entity phrases
test_long['dialogue_entities_merged'] = test_long['dialogue_entities'].apply(merge_tokens_and_labels)

# Check the result
test_long.head(3)

Unnamed: 0,dialogue,note,Dialogue_Length,Note_Length,dialogue_entities,dialogue_entities_merged
0,"doctor: Good morning, Mr. A. Thank you for com...",SUBJECTIVE CHIEF COMPLAINT New patient evaluat...,2483,1895,"[{'token': '▁July', 'label': 'B-DATE'}, {'toke...","[{'phrase': 'July of 2006', 'label': 'DATE'}, ..."
1,"doctor: Hello, I'm Dr. doctor's name. I unders...","SUBJECTIVE CHIEF COMPLAINT Fever, fussiness, a...",1943,1425,"[{'token': '▁irritable', 'label': 'B-SIGN_SYMP...","[{'phrase': 'irritable', 'label': 'SIGN_SYMPTO..."
2,"doctor: Good morning, Mrs. patient, thank you ...",SUBJECTIVE CHIEF COMPLAINT Genetic counseling....,2676,2407,"[{'token': '▁polyps', 'label': 'B-SIGN_SYMPTOM...","[{'phrase': 'polyps', 'label': 'SIGN_SYMPTOM'}..."


### Save extracted NER to new file

In [None]:
# Directory on the mounted Drive that receives the long-dialogue NER results
output_dir = '/content/drive/MyDrive/W266_Project/Data/long_dialogue_NER_extraction'

# Create the directory if it is missing; exist_ok avoids an error on re-runs
os.makedirs(output_dir, exist_ok=True)

# Write both splits, with their entity columns, to CSV without the index column
val_long.to_csv(os.path.join(output_dir, 'val_long_clinical_NER.csv'), index=False)
test_long.to_csv(os.path.join(output_dir, 'test_long_clinical_NER.csv'), index=False)

## Short Dialogue medical NER

### Load the data

In [None]:
# Paths to the short-dialogue test and validation splits on the mounted Drive
path_test_short = '/content/drive/MyDrive/W266_Project/Data/test_short.csv'
path_val_short = '/content/drive/MyDrive/W266_Project/Data/val_short.csv'

# Load the datasets
test_short = pd.read_csv(path_test_short)
val_short = pd.read_csv(path_val_short)

# Check the first few rows to ensure they're loaded correctly
print("Short_dialogue_test_data:")
print(test_short.head())

# Check the shape
print("Shape of Short_dialogue_test_data:")
print(test_short.shape)

# Check the maximum Dialogue_Length of each split for future use (e.g. choosing max_length)

print("Max word length of short dialogue val:")
print(val_short['Dialogue_Length'].max())
print("Max word length of short dialogue test:")
print(test_short['Dialogue_Length'].max())

Short_dialogue_test_data:
  section_header                                       section_text  \
0      FAM/SOCHX  The patient lives with her husband of 48 years...   
1            ROS  ONCOLOGIC: No history of any cancer, change in...   
2  PASTMEDICALHX  The patient denies any previous past medical h...   
3          GENHX  This is a 6-year-old male who comes in recheck...   
4          GYNHX  Her last menstrual period was 6/3/2009. The pa...   

                                            dialogue  Dialogue_Length  \
0  Doctor: Hi, there. Patient: Hi. Guest_family: ...             1756   
1  Doctor: Were you ever diagnosed with any kind ...              336   
2  Doctor: Hi there! Welcome in, sir. Patient: Hi...              255   
3  Doctor: Good afternoon, young man. Is this you...             1386   
4  Doctor: When was your last menstrual period? P...              399   

   Summary_Length  
0             602  
1             129  
2             126  
3             898  
4       

### Medical_NER on short dialogue

- The tokenizer reports `model_max_length` = 1000000000000000019884624838656 (i.e. effectively no limit). Since the longest dialogue has a length of 3933, `max_length` is set to 9000 to save memory and run time.

In [None]:
# Step 2: Define Entity Extraction Function
def extract_entities_from_chunks(text, max_length=9000):
    """Tag one text with the NER model and return the tokens predicted as entities.

    This redefines the function of the same name from the long-dialogue
    section; the only difference is the smaller default ``max_length``. It
    uses the notebook-level ``tokenizer`` and ``model`` objects. Despite the
    name, the text is not split into chunks: it is tokenized and run through
    the model as a single sequence, truncated at ``max_length`` tokens.

    Args:
        text: The dialogue text to tag.
        max_length: Truncation limit passed to the tokenizer. Defaults to
            9000, the value that used to be hard-coded here.

    Returns:
        A list of ``{"token": ..., "label": ...}`` dicts in input order, one
        for every token whose predicted label is not ``'O'``. The ``[CLS]``
        and ``[SEP]`` special tokens are never included.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=max_length)

    # Inference only: disable autograd so activations are not retained for a
    # backward pass that never happens.
    with torch.no_grad():
        logits = model(**inputs).logits

    # One predicted label id per token of the single sequence in the batch.
    predicted_ids = torch.argmax(logits, dim=2)[0].tolist()
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].tolist())
    labels = [model.config.id2label[label_id] for label_id in predicted_ids]

    special_tokens = ('[CLS]', '[SEP]')
    return [
        {"token": token, "label": label}
        for token, label in zip(tokens, labels)
        if label != 'O' and token not in special_tokens
    ]

# Step 3: Tag every dialogue of the short-dialogue test split
test_short['dialogue_entities'] = test_short['dialogue'].apply(extract_entities_from_chunks)

# Display a few rows, including the new dialogue_entities column
test_short.head(3)


Unnamed: 0,section_header,section_text,dialogue,Dialogue_Length,Summary_Length,dialogue_entities
0,FAM/SOCHX,The patient lives with her husband of 48 years...,"Doctor: Hi, there. Patient: Hi. Guest_family: ...",1756,602,"[{'token': '▁mental', 'label': 'B-DISEASE_DISO..."
1,ROS,"ONCOLOGIC: No history of any cancer, change in...",Doctor: Were you ever diagnosed with any kind ...,336,129,"[{'token': '▁diagnosed', 'label': 'I-HISTORY'}..."
2,PASTMEDICALHX,The patient denies any previous past medical h...,"Doctor: Hi there! Welcome in, sir. Patient: Hi...",255,126,"[{'token': '▁health', 'label': 'I-HISTORY'}, {..."


In [None]:
# Display the extracted token/label pairs (dialogue_entities) for the first 3 test rows
for index, row in test_short.head(3).iterrows():
  print("dialogue", index)
  for entity in row['dialogue_entities']:
    print(entity)

dialogue 0
{'token': '▁mental', 'label': 'B-DISEASE_DISORDER'}
{'token': '▁health', 'label': 'I-DISEASE_DISORDER'}
{'token': '▁registered', 'label': 'B-DETAILED_DESCRIPTION'}
{'token': '▁nurse', 'label': 'I-DISEASE_DISORDER'}
{'token': '▁drink', 'label': 'I-HISTORY'}
{'token': '▁alcohol', 'label': 'I-HISTORY'}
{'token': '▁use', 'label': 'I-HISTORY'}
{'token': '▁recreational', 'label': 'I-HISTORY'}
{'token': '▁drugs', 'label': 'I-HISTORY'}
{'token': '▁drugs', 'label': 'I-HISTORY'}
{'token': '▁drugs', 'label': 'I-DETAILED_DESCRIPTION'}
{'token': '▁muscle', 'label': 'B-MEDICATION'}
{'token': '▁relaxant', 'label': 'I-DIAGNOSTIC_PROCEDURE'}
{'token': 's', 'label': 'I-DIAGNOSTIC_PROCEDURE'}
{'token': '▁sedative', 'label': 'B-DIAGNOSTIC_PROCEDURE'}
{'token': '▁medications', 'label': 'I-DIAGNOSTIC_PROCEDURE'}
{'token': '▁muscle', 'label': 'B-MEDICATION'}
{'token': '▁relaxant', 'label': 'I-DIAGNOSTIC_PROCEDURE'}
{'token': 's', 'label': 'I-DIAGNOSTIC_PROCEDURE'}
{'token': '▁sedative', 'label': '

In [None]:
# Merge the token-level predictions of every short test dialogue into entity phrases
test_short['dialogue_entities_merged'] = test_short['dialogue_entities'].apply(merge_tokens_and_labels)

# Check the result
test_short.head(3)

Unnamed: 0,section_header,section_text,dialogue,Dialogue_Length,Summary_Length,dialogue_entities,dialogue_entities_merged
0,FAM/SOCHX,The patient lives with her husband of 48 years...,"Doctor: Hi, there. Patient: Hi. Guest_family: ...",1756,602,"[{'token': '▁mental', 'label': 'B-DISEASE_DISO...","[{'phrase': 'mental health', 'label': 'DISEASE..."
1,ROS,"ONCOLOGIC: No history of any cancer, change in...",Doctor: Were you ever diagnosed with any kind ...,336,129,"[{'token': '▁diagnosed', 'label': 'I-HISTORY'}...",[{'phrase': 'diagnosed with any kind of cancer...
2,PASTMEDICALHX,The patient denies any previous past medical h...,"Doctor: Hi there! Welcome in, sir. Patient: Hi...",255,126,"[{'token': '▁health', 'label': 'I-HISTORY'}, {...",[{'phrase': 'health problems primary care doct...


In [None]:
# Validation split: tag each dialogue, then merge the tagged tokens into entity phrases
val_short['dialogue_entities'] = val_short['dialogue'].apply(extract_entities_from_chunks)
val_short['dialogue_entities_merged'] = val_short['dialogue_entities'].apply(merge_tokens_and_labels)

# Check the result
val_short.head(3)


Unnamed: 0,section_header,section_text,dialogue,Dialogue_Length,Summary_Length,dialogue_entities,dialogue_entities_merged
0,ASSESSMENT,Upper respiratory infection.,Doctor: What brings you in today? Patient: I h...,595,28,"[{'token': '▁cough', 'label': 'B-SIGN_SYMPTOM'...","[{'phrase': 'cough', 'label': 'SIGN_SYMPTOM'},..."
1,MEDICATIONS,Ibuprofen.,Doctor: Are you taking any medications current...,89,10,"[{'token': '▁medications', 'label': 'I-HISTORY...","[{'phrase': 'medications', 'label': 'HISTORY'}..."
2,DISPOSITION,The patient will be going home.,Doctor: Ready to go home? Patient: I just can'...,137,31,[],[]


### Save extracted NER to new file

In [None]:
# Directory on the mounted Drive that receives the short-dialogue NER results
output_dir = '/content/drive/MyDrive/W266_Project/Data/short_dialogue_NER_extraction'

# Create the directory if it is missing; exist_ok avoids an error on re-runs
os.makedirs(output_dir, exist_ok=True)

# Write both splits, with their entity columns, to CSV without the index column.
# NOTE(review): these file names spell "clincal", unlike the long-dialogue outputs
# ("clinical"); left as-is here because other code may read these exact paths.
test_short.to_csv(os.path.join(output_dir, 'test_short_clincal_NER.csv'), index=False)
val_short.to_csv(os.path.join(output_dir, 'val_short_clincal_NER.csv'), index=False)