## Load libraries

In [None]:
pip install torch transformers

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [None]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import io
import os
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import BertTokenizerFast, BertForTokenClassification
from transformers import pipeline
import torch

Mounted at /content/drive


## Long Dialogue BIO-NER

### Load dataset


In [None]:
# Where each long-dialogue split lives on the mounted Drive
path_train_long = '/content/drive/MyDrive/W266_Project/Data/train_long.csv'
path_test_long = '/content/drive/MyDrive/W266_Project/Data/test_long.csv'
path_val_long = '/content/drive/MyDrive/W266_Project/Data/val_long.csv'

# Read the three splits in train / test / val order
train_long, test_long, val_long = (
    pd.read_csv(csv_path)
    for csv_path in (path_train_long, path_test_long, path_val_long)
)

# Peek at the training split to confirm it parsed as expected
print("Long_dialogue_train_data:")
print(train_long.head())

# Report (rows, columns) for train, test and val
print("Shape of Long_dialogue_train/test/val_data:")
for split_frame in (train_long, test_long, val_long):
    print(split_frame.shape)



Long_dialogue_train_data:
                                            dialogue  \
0  doctor donna torres , date of birth , 08/01/19...   
1  doctor: Good morning, Mr. patient. I'm Dr. doc...   
2  doctor: Hello Mrs. patient, thank you for comi...   
3  doctor hi virginia how're you today patient i'...   
4  doctor: Hello, Mrs. patient, welcome back. How...   

                                                note  Dialogue_Length  \
0  SUBJECTIVE CHIEF COMPLAINT Annual health maint...             8595   
1  SUBJECTIVE CHIEF COMPLAINT Patient reports fru...             1760   
2  SUBJECTIVE CHIEF COMPLAINT Left arm pain after...             4074   
3  SUBJECTIVE CHIEF COMPLAINT Right knee pain. HI...             6728   
4  SUBJECTIVE CHIEF COMPLAINT Recurrent low back ...             1841   

   Note_Length  
0         2794  
1         1536  
2         2971  
3         2207  
4         1595  
Shape of Long_dialogue_train/test/val_data:
(1102, 4)
(180, 4)
(96, 4)


In [None]:
# For each split, locate the row with the largest Dialogue_Length so the
# extraction can be examined on the longest dialogue.
(
    max_dialogue_train_length_index,
    max_dialogue_test_length_index,
    max_dialogue_val_length_index,
) = (
    split_frame['Dialogue_Length'].idxmax()
    for split_frame in (train_long, test_long, val_long)
)

# Fetch the full row behind each of those index labels
max_dialogue_train_length_row = train_long.loc[max_dialogue_train_length_index]
max_dialogue_test_length_row = test_long.loc[max_dialogue_test_length_index]
max_dialogue_val_length_row = val_long.loc[max_dialogue_val_length_index]

for heading, longest_row in (
    ("max_dialogue_train", max_dialogue_train_length_row),
    ("max_dialogue_test", max_dialogue_test_length_row),
    ("max_dialogue_val", max_dialogue_val_length_row),
):
    print(heading)
    print(longest_row)


max_dialogue_train
dialogue           doctor next patient is christine hernandez , u...
note               SUBJECTIVE CHIEF COMPLAINT Annual health maint...
Dialogue_Length                                                13924
Note_Length                                                     2545
Name: 548, dtype: object
max_dialogue_test
dialogue           doctor eugene walker , n- date of birth 4/14/1...
note               SUBJECTIVE CHIEF COMPLAINT Annual health maint...
Dialogue_Length                                                 8694
Note_Length                                                     1690
Name: 121, dtype: object
max_dialogue_val
dialogue           doctor sophia brown . date of birth , 3/17/194...
note               SUBJECTIVE CHIEF COMPLAINT Annual health maint...
Dialogue_Length                                                 7384
Note_Length                                                     2280
Name: 7, dtype: object


### Bio_Bert_disease



#### NER from model

In [None]:
# Step 1: Fetch the disease NER checkpoint from the Hugging Face Hub.
# `tokenizer` and `model` are notebook-level names read by the helper functions below.
model_name = "alvaroalon2/biobert_diseases_ner"
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_name),
    AutoModelForTokenClassification.from_pretrained(model_name),
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/431M [00:00<?, ?B/s]

In [None]:
# Step 2: Define function to extract entities
def extract_entities_from_chunks(text):
    """Tag every chunk with the loaded NER model and collect the entity tokens.

    Uses the notebook-level ``tokenizer`` and ``model`` that are currently
    loaded.

    Args:
        text: Either a single string or a list of strings (the output of
            ``split_into_chunks``). Empty chunks are skipped.

    Returns:
        list[dict]: One ``{"token": wordpiece, "label": predicted_label}``
        entry for every wordpiece predicted as part of an entity, in reading
        order across ALL chunks. Empty list when there is nothing to tag.
    """
    chunks = [text] if isinstance(text, str) else list(text)
    chunks = [chunk for chunk in chunks if chunk]
    if not chunks:
        return []

    # All chunks go through the model as one padded batch.
    inputs = tokenizer(chunks, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # Inference only: no gradients needed.
    with torch.no_grad():
        logits = model(**inputs).logits
    predictions = torch.argmax(logits, dim=2)

    # Treat both the letter 'O' and the digit '0' as the outside label so the
    # same function works whichever NER checkpoint is loaded.
    outside_labels = {'O', '0'}
    # Besides [CLS]/[SEP], padding added for batching must not become an entity.
    skipped_tokens = {tokenizer.cls_token, tokenizer.sep_token, tokenizer.pad_token}

    entities = []
    # Decode every row of the batch, not just row 0, so text beyond the first
    # chunk is not lost.
    for chunk_ids, chunk_predictions in zip(inputs["input_ids"], predictions):
        tokens = tokenizer.convert_ids_to_tokens(chunk_ids.tolist())
        labels = [model.config.id2label[label_id] for label_id in chunk_predictions.tolist()]
        for token, label in zip(tokens, labels):
            if label in outside_labels or token in skipped_tokens:
                continue
            entities.append({"token": token, "label": label})
    return entities

# Step 3: Define Split function
def split_into_chunks(text, max_length=510):
    """Split ``text`` on whitespace into chunks that stay under a token budget.

    Uses the notebook-level ``tokenizer``. A chunk is closed as soon as adding
    the next word would bring its encoded length (wordpieces plus the special
    tokens the tokenizer adds) to ``max_length`` or more.

    Args:
        text: The text to split. Non-string values (e.g. NaN from a CSV)
            yield no chunks.
        max_length: Upper bound (exclusive) on the encoded length of a chunk.

    Returns:
        list[str]: Non-empty chunks in original order. A single word that
        alone exceeds the budget becomes its own chunk.
    """
    if not isinstance(text, str):
        return []

    # Number of special tokens the tokenizer wraps around each encoded chunk.
    overhead = tokenizer.num_special_tokens_to_add()

    chunks = []
    current_chunk = []
    current_length = overhead

    for word in text.split():
        # Count wordpieces per word instead of re-encoding the whole growing
        # chunk on every step (which was quadratic and triggered the
        # "sequence length is longer than ... maximum" warning).
        # Assumes whitespace-delimited pre-tokenization, as in BERT WordPiece,
        # so per-word counts add up to the count for the joined text.
        word_length = len(tokenizer.tokenize(word))

        if current_chunk and current_length + word_length >= max_length:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = overhead

        current_chunk.append(word)
        current_length += word_length

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

# Step 4: Chunk every validation dialogue, then run the disease NER model over the chunks
val_long['dialogue_diease_entities'] = (
    val_long['dialogue']
    .map(split_into_chunks)
    .map(extract_entities_from_chunks)
)

# Preview the first rows
val_long.head(3)


Unnamed: 0,dialogue,note,Dialogue_Length,Note_Length,dialogue_diease_entities
0,"doctor: Good morning, patient. Thank you for c...",SUBJECTIVE CHIEF COMPLAINT Ear infections. HIS...,2745,2178,"[{'token': 'ear', 'label': 'B-DISEASE'}, {'tok..."
1,"doctor: Hello, patient, and welcome to my offi...",SUBJECTIVE CHIEF COMPLAINT Picky eating. HISTO...,1997,1294,"[{'token': 'vomit', 'label': 'B-DISEASE'}, {'t..."
2,"doctor: Hello Ms. A, thank you for coming in t...",SUBJECTIVE CHIEF COMPLAINT Postoperative evalu...,2295,1519,"[{'token': 'pain', 'label': 'B-DISEASE'}, {'to..."


In [None]:
# Print the extracted disease tokens for the first 3 validation dialogues
for index, dialogue_entities in val_long['dialogue_diease_entities'].head(3).items():
  print("dialogue", index)
  for entity in dialogue_entities:
    print(entity)

dialogue 0
{'token': 'ear', 'label': 'B-DISEASE'}
{'token': 'infections', 'label': 'I-DISEASE'}
{'token': 'sore', 'label': 'B-DISEASE'}
{'token': 'throat', 'label': 'I-DISEASE'}
{'token': 'cough', 'label': 'B-DISEASE'}
{'token': 'fever', 'label': 'B-DISEASE'}
{'token': 'ear', 'label': 'B-DISEASE'}
{'token': 'infections', 'label': 'I-DISEASE'}
{'token': 'ear', 'label': 'B-DISEASE'}
{'token': 'infections', 'label': 'I-DISEASE'}
{'token': 'o', 'label': 'B-DISEASE'}
{'token': '##titis', 'label': 'I-DISEASE'}
{'token': 'media', 'label': 'I-DISEASE'}
{'token': 'infection', 'label': 'B-DISEASE'}
{'token': 'hearing', 'label': 'B-DISEASE'}
{'token': 'loss', 'label': 'I-DISEASE'}
{'token': 'o', 'label': 'B-DISEASE'}
{'token': '##titis', 'label': 'I-DISEASE'}
{'token': 'media', 'label': 'I-DISEASE'}
{'token': 'ad', 'label': 'B-DISEASE'}
{'token': '##eno', 'label': 'I-DISEASE'}
{'token': '##id', 'label': 'I-DISEASE'}
{'token': 'h', 'label': 'I-DISEASE'}
{'token': '##yper', 'label': 'I-DISEASE'}
{'

- From the output, we can tell this model uses **WordPiece** tokenization: a token that starts with "##" is a continuation piece of the preceding word, and a token without that prefix starts a new word.
This is different from **SentencePiece** tokenization, where a token that starts with "▁" marks the start of a new word and tokens without it are continuation pieces.

#### Merge the tokens into words and combine B and I labels of the same entity

In [None]:
# step 5: merge wordpices/tokens to original word
def merge_wordpieces(tokens_labels):
    """Glue WordPiece continuation tokens ("##...") back onto the preceding token.

    Args:
        tokens_labels: Iterable of ``{'token': str, 'label': str}`` dicts, in
            the order they were produced by the tagger.

    Returns:
        list[dict]: One ``{'token', 'label'}`` dict per rebuilt word. Each word
        carries the label of its first piece.
    """
    merged_entities = []
    current_entity = ""
    current_label = ""

    for token_dict in tokens_labels:
        token = token_dict['token']
        label = token_dict['label']

        if token.startswith("##"):
            # A continuation piece with nothing in front of it would otherwise
            # end up with an empty label; let it keep its own.
            if not current_entity:
                current_label = label
            # Note: the piece is attached to whatever token precedes it in the list.
            current_entity += token[2:]
        else:
            # A new word starts: flush the one built so far.
            if current_entity:
                merged_entities.append({'token': current_entity, 'label': current_label})
            current_entity = token
            current_label = label

    # Flush the last word
    if current_entity:
        merged_entities.append({'token': current_entity, 'label': current_label})

    return merged_entities

#step 6: merge word to orginal phrases
# Function to combine B- and I- labels into full entities
def combine_entities(merged_entities):
    """Join consecutive B-/I- labelled words into whole entity phrases.

    A word extends the phrase in progress only when its label starts with
    "I-" and names the same entity type as the phrase's label; any other
    word closes the phrase in progress and opens a new one.

    Args:
        merged_entities: List of ``{'token': word, 'label': label}`` dicts.

    Returns:
        list[dict]: ``{'token': phrase, 'label': label_of_first_word}`` dicts.
    """
    phrases = []
    phrase_text = ""
    phrase_label = ""

    for item in merged_entities:
        word, label = item['token'], item['label']

        extends_phrase = (
            label.startswith("I-")
            and bool(phrase_label)
            and phrase_label[2:] == label[2:]
        )
        if extends_phrase:
            phrase_text = phrase_text + " " + word
            continue

        # Any other label closes the phrase in progress and opens a new one.
        if phrase_text:
            phrases.append({'token': phrase_text.strip(), 'label': phrase_label})
        phrase_text, phrase_label = word, label

    if phrase_text:
        phrases.append({'token': phrase_text.strip(), 'label': phrase_label})

    return phrases


# Step 7: Rebuild words from wordpieces, then join B-/I- words into disease phrases
val_long['dialogue_diease_entities_merged'] = (
    val_long['dialogue_diease_entities']
    .map(merge_wordpieces)
    .map(combine_entities)
)

# Inspect the outcome
val_long.head(3)


Unnamed: 0,dialogue,note,Dialogue_Length,Note_Length,dialogue_diease_entities,dialogue_diease_entities_merged
0,"doctor: Good morning, patient. Thank you for c...",SUBJECTIVE CHIEF COMPLAINT Ear infections. HIS...,2745,2178,"[{'token': 'ear', 'label': 'B-DISEASE'}, {'tok...","[{'token': 'ear infections', 'label': 'B-DISEA..."
1,"doctor: Hello, patient, and welcome to my offi...",SUBJECTIVE CHIEF COMPLAINT Picky eating. HISTO...,1997,1294,"[{'token': 'vomit', 'label': 'B-DISEASE'}, {'t...","[{'token': 'vomiting', 'label': 'B-DISEASE'}, ..."
2,"doctor: Hello Ms. A, thank you for coming in t...",SUBJECTIVE CHIEF COMPLAINT Postoperative evalu...,2295,1519,"[{'token': 'pain', 'label': 'B-DISEASE'}, {'to...","[{'token': 'pain', 'label': 'B-DISEASE'}, {'to..."


In [None]:
## Run the same disease extraction and merging on the test split
test_long['dialogue_diease_entities'] = (
    test_long['dialogue']
    .map(split_into_chunks)
    .map(extract_entities_from_chunks)
)
test_long['dialogue_diease_entities_merged'] = (
    test_long['dialogue_diease_entities']
    .map(merge_wordpieces)
    .map(combine_entities)
)
test_long.head(3)


Token indices sequence length is longer than the specified maximum sequence length for this model (513 > 512). Running this sequence through the model will result in indexing errors


Unnamed: 0,dialogue,note,Dialogue_Length,Note_Length,dialogue_diease_entities,dialogue_diease_entities_merged
0,"doctor: Good morning, Mr. A. Thank you for com...",SUBJECTIVE CHIEF COMPLAINT New patient evaluat...,2483,1895,"[{'token': 'seizure', 'label': 'B-DISEASE'}, {...","[{'token': 'seizures', 'label': 'B-DISEASE'}, ..."
1,"doctor: Hello, I'm Dr. doctor's name. I unders...","SUBJECTIVE CHIEF COMPLAINT Fever, fussiness, a...",1943,1425,"[{'token': 'i', 'label': 'B-DISEASE'}, {'token...","[{'token': 'irritable', 'label': 'B-DISEASE'},..."
2,"doctor: Good morning, Mrs. patient, thank you ...",SUBJECTIVE CHIEF COMPLAINT Genetic counseling....,2676,2407,"[{'token': 'co', 'label': 'B-DISEASE'}, {'toke...","[{'token': 'colon polyps', 'label': 'B-DISEASE..."


In [None]:
# Run the same disease extraction and merging on the training split
train_long['dialogue_diease_entities'] = (
    train_long['dialogue']
    .map(split_into_chunks)
    .map(extract_entities_from_chunks)
)
train_long['dialogue_diease_entities_merged'] = (
    train_long['dialogue_diease_entities']
    .map(merge_wordpieces)
    .map(combine_entities)
)
train_long.head(3)

Unnamed: 0,dialogue,note,Dialogue_Length,Note_Length,dialogue_diease_entities,dialogue_diease_entities_merged
0,"doctor donna torres , date of birth , 08/01/19...",SUBJECTIVE CHIEF COMPLAINT Annual health maint...,8595,2794,"[{'token': 'anxiety', 'label': 'B-DISEASE'}, {...","[{'token': 'anxiety', 'label': 'B-DISEASE'}, {..."
1,"doctor: Good morning, Mr. patient. I'm Dr. doc...",SUBJECTIVE CHIEF COMPLAINT Patient reports fru...,1760,1536,"[{'token': 'anxiety', 'label': 'B-DISEASE'}, {...","[{'token': 'anxiety', 'label': 'B-DISEASE'}, {..."
2,"doctor: Hello Mrs. patient, thank you for comi...",SUBJECTIVE CHIEF COMPLAINT Left arm pain after...,4074,2971,"[{'token': 'left', 'label': 'B-DISEASE'}, {'to...","[{'token': 'left arm pain', 'label': 'B-DISEAS..."


### Bio_bert_Chemical

In [None]:
# Step 1: Fetch the chemical NER checkpoint from the Hugging Face Hub.
# This rebinds the notebook-level `tokenizer` and `model` read by the helper functions.
model_name = "alvaroalon2/biobert_chemical_ner"
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_name),
    AutoModelForTokenClassification.from_pretrained(model_name),
)

tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/750 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/431M [00:00<?, ?B/s]

In [None]:
# Step 2: Define function to extract entities
def extract_entities_from_chunks(text):
    """Tag every chunk with the loaded NER model and collect the entity tokens.

    Uses the notebook-level ``tokenizer`` and ``model`` that are currently
    loaded.

    Args:
        text: Either a single string or a list of strings (the output of
            ``split_into_chunks``). Empty chunks are skipped.

    Returns:
        list[dict]: One ``{"token": wordpiece, "label": predicted_label}``
        entry for every wordpiece predicted as part of an entity, in reading
        order across ALL chunks. Empty list when there is nothing to tag.
    """
    chunks = [text] if isinstance(text, str) else list(text)
    chunks = [chunk for chunk in chunks if chunk]
    if not chunks:
        return []

    # All chunks go through the model as one padded batch.
    inputs = tokenizer(chunks, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # Inference only: no gradients needed.
    with torch.no_grad():
        logits = model(**inputs).logits
    predictions = torch.argmax(logits, dim=2)

    # Treat both the letter 'O' and the digit '0' as the outside label so the
    # same function works whichever NER checkpoint is loaded.
    outside_labels = {'O', '0'}
    # Besides [CLS]/[SEP], padding added for batching must not become an entity.
    skipped_tokens = {tokenizer.cls_token, tokenizer.sep_token, tokenizer.pad_token}

    entities = []
    # Decode every row of the batch, not just row 0, so text beyond the first
    # chunk is not lost.
    for chunk_ids, chunk_predictions in zip(inputs["input_ids"], predictions):
        tokens = tokenizer.convert_ids_to_tokens(chunk_ids.tolist())
        labels = [model.config.id2label[label_id] for label_id in chunk_predictions.tolist()]
        for token, label in zip(tokens, labels):
            if label in outside_labels or token in skipped_tokens:
                continue
            entities.append({"token": token, "label": label})
    return entities

# Step 3: Define Split function
def split_into_chunks(text, max_length=510):
    """Split ``text`` on whitespace into chunks that stay under a token budget.

    Uses the notebook-level ``tokenizer``. A chunk is closed as soon as adding
    the next word would bring its encoded length (wordpieces plus the special
    tokens the tokenizer adds) to ``max_length`` or more.

    Args:
        text: The text to split. Non-string values (e.g. NaN from a CSV)
            yield no chunks.
        max_length: Upper bound (exclusive) on the encoded length of a chunk.

    Returns:
        list[str]: Non-empty chunks in original order. A single word that
        alone exceeds the budget becomes its own chunk.
    """
    if not isinstance(text, str):
        return []

    # Number of special tokens the tokenizer wraps around each encoded chunk.
    overhead = tokenizer.num_special_tokens_to_add()

    chunks = []
    current_chunk = []
    current_length = overhead

    for word in text.split():
        # Count wordpieces per word instead of re-encoding the whole growing
        # chunk on every step (which was quadratic and triggered the
        # "sequence length is longer than ... maximum" warning).
        # Assumes whitespace-delimited pre-tokenization, as in BERT WordPiece,
        # so per-word counts add up to the count for the joined text.
        word_length = len(tokenizer.tokenize(word))

        if current_chunk and current_length + word_length >= max_length:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = overhead

        current_chunk.append(word)
        current_length += word_length

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

In [None]:
# Step 4: Chunk every validation dialogue, then run the chemical NER model over the chunks
val_long['dialogue_chemical_entities'] = (
    val_long['dialogue']
    .map(split_into_chunks)
    .map(extract_entities_from_chunks)
)

# Preview the first rows
val_long.head(3)

Unnamed: 0,dialogue,note,Dialogue_Length,Note_Length,dialogue_diease_entities,dialogue_diease_entities_merged,dialogue_chemical_entities
0,"doctor: Good morning, patient. Thank you for c...",SUBJECTIVE CHIEF COMPLAINT Ear infections. HIS...,2745,2178,"[{'token': 'ear', 'label': 'B-DISEASE'}, {'tok...","[{'token': 'ear infections', 'label': 'B-DISEA...",[]
1,"doctor: Hello, patient, and welcome to my offi...",SUBJECTIVE CHIEF COMPLAINT Picky eating. HISTO...,1997,1294,"[{'token': 'vomit', 'label': 'B-DISEASE'}, {'t...","[{'token': 'vomiting', 'label': 'B-DISEASE'}, ...",[]
2,"doctor: Hello Ms. A, thank you for coming in t...",SUBJECTIVE CHIEF COMPLAINT Postoperative evalu...,2295,1519,"[{'token': 'pain', 'label': 'B-DISEASE'}, {'to...","[{'token': 'pain', 'label': 'B-DISEASE'}, {'to...",[]


In [None]:
# Step 5: Rebuild words from wordpieces, then join B-/I- words into chemical phrases
val_long['dialogue_chemical_entities_merged'] = (
    val_long['dialogue_chemical_entities']
    .map(merge_wordpieces)
    .map(combine_entities)
)

# Inspect the outcome
val_long.head(3)

Unnamed: 0,dialogue,note,Dialogue_Length,Note_Length,dialogue_diease_entities,dialogue_diease_entities_merged,dialogue_chemical_entities,dialogue_chemical_entities_merged
0,"doctor: Good morning, patient. Thank you for c...",SUBJECTIVE CHIEF COMPLAINT Ear infections. HIS...,2745,2178,"[{'token': 'ear', 'label': 'B-DISEASE'}, {'tok...","[{'token': 'ear infections', 'label': 'B-DISEA...",[],[]
1,"doctor: Hello, patient, and welcome to my offi...",SUBJECTIVE CHIEF COMPLAINT Picky eating. HISTO...,1997,1294,"[{'token': 'vomit', 'label': 'B-DISEASE'}, {'t...","[{'token': 'vomiting', 'label': 'B-DISEASE'}, ...",[],[]
2,"doctor: Hello Ms. A, thank you for coming in t...",SUBJECTIVE CHIEF COMPLAINT Postoperative evalu...,2295,1519,"[{'token': 'pain', 'label': 'B-DISEASE'}, {'to...","[{'token': 'pain', 'label': 'B-DISEASE'}, {'to...",[],[]


In [None]:
# Run the same chemical extraction and merging on the test split
test_long['dialogue_chemical_entities'] = (
    test_long['dialogue']
    .map(split_into_chunks)
    .map(extract_entities_from_chunks)
)
test_long['dialogue_chemical_entities_merged'] = (
    test_long['dialogue_chemical_entities']
    .map(merge_wordpieces)
    .map(combine_entities)
)
test_long.head(3)

Token indices sequence length is longer than the specified maximum sequence length for this model (513 > 512). Running this sequence through the model will result in indexing errors


Unnamed: 0,dialogue,note,Dialogue_Length,Note_Length,dialogue_diease_entities,dialogue_diease_entities_merged,dialogue_chemical_entities,dialogue_chemical_entities_merged
0,"doctor: Good morning, Mr. A. Thank you for com...",SUBJECTIVE CHIEF COMPLAINT New patient evaluat...,2483,1895,"[{'token': 'seizure', 'label': 'B-DISEASE'}, {...","[{'token': 'seizures', 'label': 'B-DISEASE'}, ...","[{'token': 'Ke', 'label': 'B-CHEMICAL'}, {'tok...","[{'token': 'Keppra', 'label': 'B-CHEMICAL'}, {..."
1,"doctor: Hello, I'm Dr. doctor's name. I unders...","SUBJECTIVE CHIEF COMPLAINT Fever, fussiness, a...",1943,1425,"[{'token': 'i', 'label': 'B-DISEASE'}, {'token...","[{'token': 'irritable', 'label': 'B-DISEASE'},...","[{'token': 'Ty', 'label': 'B-CHEMICAL'}, {'tok...","[{'token': 'Tylenol', 'label': 'B-CHEMICAL'}]"
2,"doctor: Good morning, Mrs. patient, thank you ...",SUBJECTIVE CHIEF COMPLAINT Genetic counseling....,2676,2407,"[{'token': 'co', 'label': 'B-DISEASE'}, {'toke...","[{'token': 'colon polyps', 'label': 'B-DISEASE...","[{'token': 'alcohol', 'label': 'B-CHEMICAL'}]","[{'token': 'alcohol', 'label': 'B-CHEMICAL'}]"


In [None]:
# Run the same chemical extraction and merging on the training split
train_long['dialogue_chemical_entities'] = (
    train_long['dialogue']
    .map(split_into_chunks)
    .map(extract_entities_from_chunks)
)
train_long['dialogue_chemical_entities_merged'] = (
    train_long['dialogue_chemical_entities']
    .map(merge_wordpieces)
    .map(combine_entities)
)
train_long.head(3)

Unnamed: 0,dialogue,note,Dialogue_Length,Note_Length,dialogue_diease_entities,dialogue_diease_entities_merged,dialogue_chemical_entities,dialogue_chemical_entities_merged
0,"doctor donna torres , date of birth , 08/01/19...",SUBJECTIVE CHIEF COMPLAINT Annual health maint...,8595,2794,"[{'token': 'anxiety', 'label': 'B-DISEASE'}, {...","[{'token': 'anxiety', 'label': 'B-DISEASE'}, {...","[{'token': 'pro', 'label': 'B-CHEMICAL'}, {'to...","[{'token': 'progesterone', 'label': 'B-CHEMICA..."
1,"doctor: Good morning, Mr. patient. I'm Dr. doc...",SUBJECTIVE CHIEF COMPLAINT Patient reports fru...,1760,1536,"[{'token': 'anxiety', 'label': 'B-DISEASE'}, {...","[{'token': 'anxiety', 'label': 'B-DISEASE'}, {...",[],[]
2,"doctor: Hello Mrs. patient, thank you for comi...",SUBJECTIVE CHIEF COMPLAINT Left arm pain after...,4074,2971,"[{'token': 'left', 'label': 'B-DISEASE'}, {'to...","[{'token': 'left arm pain', 'label': 'B-DISEAS...","[{'token': 'ni', 'label': 'B-CHEMICAL'}, {'tok...","[{'token': 'nitroglycerin', 'label': 'B-CHEMIC..."


In [None]:
## misspll disease above, change column name
def rename_columns(df):
    """Return a copy of ``df`` with the misspelled "diease" columns renamed to "disease".

    Columns other than the two listed below are left as they are.
    """
    corrected_names = {
        'dialogue_diease_entities': 'dialogue_disease_entities',
        'dialogue_diease_entities_merged': 'dialogue_disease_entities_merged',
    }
    return df.rename(columns=corrected_names)

# Correct the column spelling on all three long-dialogue splits
train_long, test_long, val_long = map(
    rename_columns, (train_long, test_long, val_long)
)

# Confirm the new column names
train_long.head(3)

Unnamed: 0,dialogue,note,Dialogue_Length,Note_Length,dialogue_disease_entities,dialogue_disease_entities_merged,dialogue_chemical_entities,dialogue_chemical_entities_merged
0,"doctor donna torres , date of birth , 08/01/19...",SUBJECTIVE CHIEF COMPLAINT Annual health maint...,8595,2794,"[{'token': 'anxiety', 'label': 'B-DISEASE'}, {...","[{'token': 'anxiety', 'label': 'B-DISEASE'}, {...","[{'token': 'pro', 'label': 'B-CHEMICAL'}, {'to...","[{'token': 'progesterone', 'label': 'B-CHEMICA..."
1,"doctor: Good morning, Mr. patient. I'm Dr. doc...",SUBJECTIVE CHIEF COMPLAINT Patient reports fru...,1760,1536,"[{'token': 'anxiety', 'label': 'B-DISEASE'}, {...","[{'token': 'anxiety', 'label': 'B-DISEASE'}, {...",[],[]
2,"doctor: Hello Mrs. patient, thank you for comi...",SUBJECTIVE CHIEF COMPLAINT Left arm pain after...,4074,2971,"[{'token': 'left', 'label': 'B-DISEASE'}, {'to...","[{'token': 'left arm pain', 'label': 'B-DISEAS...","[{'token': 'ni', 'label': 'B-CHEMICAL'}, {'tok...","[{'token': 'nitroglycerin', 'label': 'B-CHEMIC..."


### Save as new CSV file

In [None]:
# Folder on the mounted Drive that receives the annotated splits
output_dir = '/content/drive/MyDrive/W266_Project/Data/long_dialogue_NER_extraction'

# Create the folder if it is not there yet
os.makedirs(output_dir, exist_ok=True)

# Write each split to its own CSV, without the index column
for file_name, annotated_frame in (
    ('train_Long_BIO_NER.csv', train_long),
    ('test_Long_BIO_NER.csv', test_long),
    ('val_Long_BIO_NER.csv', val_long),
):
    annotated_frame.to_csv(os.path.join(output_dir, file_name), index=False)

## Short Dialogue BIO NER

### Load the data

In [None]:
# Where each short-dialogue split lives on the mounted Drive
path_train_short = '/content/drive/MyDrive/W266_Project/Data/train_short.csv'
path_test_short = '/content/drive/MyDrive/W266_Project/Data/test_short.csv'
path_val_short = '/content/drive/MyDrive/W266_Project/Data/val_short.csv'

# Read the three splits in train / test / val order
train_short, test_short, val_short = map(
    pd.read_csv, (path_train_short, path_test_short, path_val_short)
)

# Peek at the training split to confirm it parsed as expected
print("Short_dialogue_train_data:")
print(train_short.head())

# Report (rows, columns) of the training split
print("Shape of Short_dialogue_train_data:")
print(train_short.shape)

# Report the largest Dialogue_Length per split for future use
for caption, split_frame in (
    ("Max word length of short dialogue trian:", train_short),
    ("Max word length of short dialogue val:", val_short),
    ("Max word length of short dialogue test:", test_short),
):
    print(caption)
    print(split_frame['Dialogue_Length'].max())


Short_dialogue_train_data:
  section_header                                       section_text  \
0          GENHX  The patient is a 75-year-old female who comes ...   
1      FAM/SOCHX         Significant for diabetes and hypertension.   
2  PASTMEDICALHX                  Significant for anxiety disorder.   
3          GENHX  The patient is a 77-year-old female who is una...   
4      FAM/SOCHX                                   Noncontributory.   

                                            dialogue  Dialogue_Length  \
0  Doctor: Welcome to the clinic. I am Doctor Fra...             1396   
1  Doctor: Does anyone else in your family suffer...              175   
2  Doctor: Have we gone over your survey results ...              256   
3  Guest_clinician: How old is the patient? Docto...              438   
4  Doctor: Do you have a known- Patient: Drug all...              105   

   Summary_Length  
0             677  
1              42  
2              33  
3             325  
4      

### BIO_BERT_DISEASE

In [None]:
# Step 1: Fetch the disease NER checkpoint from the Hugging Face Hub.
# This rebinds the notebook-level `tokenizer` and `model` read by the helper functions.
model_name = "alvaroalon2/biobert_diseases_ner"
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_name),
    AutoModelForTokenClassification.from_pretrained(model_name),
)


In [None]:
# Step 3: Define function to extract entities
def extract_entities_from_chunks(text):
    """Tag every chunk with the loaded NER model and collect the entity tokens.

    Uses the notebook-level ``tokenizer`` and ``model`` that are currently
    loaded.

    Args:
        text: Either a single string or a list of strings (the output of
            ``split_into_chunks``). Empty chunks are skipped.

    Returns:
        list[dict]: One ``{"token": wordpiece, "label": predicted_label}``
        entry for every wordpiece predicted as part of an entity, in reading
        order across ALL chunks. Empty list when there is nothing to tag.
    """
    chunks = [text] if isinstance(text, str) else list(text)
    chunks = [chunk for chunk in chunks if chunk]
    if not chunks:
        return []

    # All chunks go through the model as one padded batch.
    inputs = tokenizer(chunks, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # Inference only: no gradients needed.
    with torch.no_grad():
        logits = model(**inputs).logits
    predictions = torch.argmax(logits, dim=2)

    # Treat both the letter 'O' and the digit '0' as the outside label so the
    # same function works whichever NER checkpoint is loaded.
    outside_labels = {'O', '0'}
    # Besides [CLS]/[SEP], padding added for batching must not become an entity.
    skipped_tokens = {tokenizer.cls_token, tokenizer.sep_token, tokenizer.pad_token}

    entities = []
    # Decode every row of the batch, not just row 0, so text beyond the first
    # chunk is not lost.
    for chunk_ids, chunk_predictions in zip(inputs["input_ids"], predictions):
        tokens = tokenizer.convert_ids_to_tokens(chunk_ids.tolist())
        labels = [model.config.id2label[label_id] for label_id in chunk_predictions.tolist()]
        for token, label in zip(tokens, labels):
            if label in outside_labels or token in skipped_tokens:
                continue
            entities.append({"token": token, "label": label})
    return entities

# Step 4: Define Split function
def split_into_chunks(text, max_length=510):
    """Split ``text`` on whitespace into chunks that stay under a token budget.

    Uses the notebook-level ``tokenizer``. A chunk is closed as soon as adding
    the next word would bring its encoded length (wordpieces plus the special
    tokens the tokenizer adds) to ``max_length`` or more.

    Args:
        text: The text to split. Non-string values (e.g. NaN from a CSV)
            yield no chunks.
        max_length: Upper bound (exclusive) on the encoded length of a chunk.

    Returns:
        list[str]: Non-empty chunks in original order. A single word that
        alone exceeds the budget becomes its own chunk.
    """
    if not isinstance(text, str):
        return []

    # Number of special tokens the tokenizer wraps around each encoded chunk.
    overhead = tokenizer.num_special_tokens_to_add()

    chunks = []
    current_chunk = []
    current_length = overhead

    for word in text.split():
        # Count wordpieces per word instead of re-encoding the whole growing
        # chunk on every step (which was quadratic and triggered the
        # "sequence length is longer than ... maximum" warning).
        # Assumes whitespace-delimited pre-tokenization, as in BERT WordPiece,
        # so per-word counts add up to the count for the joined text.
        word_length = len(tokenizer.tokenize(word))

        if current_chunk and current_length + word_length >= max_length:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = overhead

        current_chunk.append(word)
        current_length += word_length

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

# Step 5: Chunk every validation dialogue, then run the disease NER model over the chunks
val_short['dialogue_disease_entities'] = (
    val_short['dialogue']
    .map(split_into_chunks)
    .map(extract_entities_from_chunks)
)

# Preview the first rows
val_short.head(3)

Unnamed: 0,section_header,section_text,dialogue,Dialogue_Length,Summary_Length,dialogue_disease_entities
0,ASSESSMENT,Upper respiratory infection.,Doctor: What brings you in today? Patient: I h...,595,28,"[{'token': 'cough', 'label': 'B-DISEASE'}, {'t..."
1,MEDICATIONS,Ibuprofen.,Doctor: Are you taking any medications current...,89,10,"[{'token': 'pain', 'label': 'B-DISEASE'}]"
2,DISPOSITION,The patient will be going home.,Doctor: Ready to go home? Patient: I just can'...,137,31,[]


In [None]:
# Rebuild words from wordpieces, then join B-/I- words into disease phrases
val_short['dialogue_disease_entities_merged'] = (
    val_short['dialogue_disease_entities']
    .map(merge_wordpieces)
    .map(combine_entities)
)

# Inspect the outcome
val_short.head(3)

Unnamed: 0,section_header,section_text,dialogue,Dialogue_Length,Summary_Length,dialogue_disease_entities,dialogue_disease_entities_merged
0,ASSESSMENT,Upper respiratory infection.,Doctor: What brings you in today? Patient: I h...,595,28,"[{'token': 'cough', 'label': 'B-DISEASE'}, {'t...","[{'token': 'cough', 'label': 'B-DISEASE'}, {'t..."
1,MEDICATIONS,Ibuprofen.,Doctor: Are you taking any medications current...,89,10,"[{'token': 'pain', 'label': 'B-DISEASE'}]","[{'token': 'pain', 'label': 'B-DISEASE'}]"
2,DISPOSITION,The patient will be going home.,Doctor: Ready to go home? Patient: I just can'...,137,31,[],[]


In [None]:
# Run the same disease extraction and merging on the test split
test_short['dialogue_disease_entities'] = (
    test_short['dialogue']
    .map(split_into_chunks)
    .map(extract_entities_from_chunks)
)

test_short['dialogue_disease_entities_merged'] = (
    test_short['dialogue_disease_entities']
    .map(merge_wordpieces)
    .map(combine_entities)
)

# Inspect the outcome
test_short.head(3)

Unnamed: 0,section_header,section_text,dialogue,Dialogue_Length,Summary_Length,dialogue_disease_entities,dialogue_disease_entities_merged
0,FAM/SOCHX,The patient lives with her husband of 48 years...,"Doctor: Hi, there. Patient: Hi. Guest_family: ...",1756,602,"[{'token': 'mental', 'label': 'B-DISEASE'}, {'...","[{'token': 'mental health', 'label': 'B-DISEAS..."
1,ROS,"ONCOLOGIC: No history of any cancer, change in...",Doctor: Were you ever diagnosed with any kind ...,336,129,"[{'token': 'cancer', 'label': 'B-DISEASE'}, {'...","[{'token': 'cancer', 'label': 'B-DISEASE'}, {'..."
2,PASTMEDICALHX,The patient denies any previous past medical h...,"Doctor: Hi there! Welcome in, sir. Patient: Hi...",255,126,[],[]


In [None]:
# Run the same disease extraction and merging on the training split
train_short['dialogue_disease_entities'] = (
    train_short['dialogue']
    .map(split_into_chunks)
    .map(extract_entities_from_chunks)
)

train_short['dialogue_disease_entities_merged'] = (
    train_short['dialogue_disease_entities']
    .map(merge_wordpieces)
    .map(combine_entities)
)

# Inspect the outcome
train_short.head(3)

Unnamed: 0,section_header,section_text,dialogue,Dialogue_Length,Summary_Length,dialogue_disease_entities,dialogue_disease_entities_merged
0,GENHX,The patient is a 75-year-old female who comes ...,Doctor: Welcome to the clinic. I am Doctor Fra...,1396,677,"[{'token': 'stroke', 'label': 'B-DISEASE'}, {'...","[{'token': 'stroke', 'label': 'B-DISEASE'}, {'..."
1,FAM/SOCHX,Significant for diabetes and hypertension.,Doctor: Does anyone else in your family suffer...,175,42,"[{'token': 'high', 'label': 'B-DISEASE'}, {'to...","[{'token': 'high blood pressure', 'label': 'B-..."
2,PASTMEDICALHX,Significant for anxiety disorder.,Doctor: Have we gone over your survey results ...,256,33,"[{'token': 'anxiety', 'label': 'B-DISEASE'}, {...","[{'token': 'anxiety disorder', 'label': 'B-DIS..."


### BIO_BERT_CHEMICAL

In [None]:
# Step 1: Fetch the chemical NER checkpoint from the Hugging Face Hub.
# This rebinds the notebook-level `tokenizer` and `model` read by the helper functions.
model_name = "alvaroalon2/biobert_chemical_ner"
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_name),
    AutoModelForTokenClassification.from_pretrained(model_name),
)

In [None]:
# Step 2: Define function to extract entities
def extract_entities_from_chunks(text):
    """Tag every chunk with the loaded NER model and collect the entity tokens.

    Uses the notebook-level ``tokenizer`` and ``model`` that are currently
    loaded.

    Args:
        text: Either a single string or a list of strings (the output of
            ``split_into_chunks``). Empty chunks are skipped.

    Returns:
        list[dict]: One ``{"token": wordpiece, "label": predicted_label}``
        entry for every wordpiece predicted as part of an entity, in reading
        order across ALL chunks. Empty list when there is nothing to tag.
    """
    chunks = [text] if isinstance(text, str) else list(text)
    chunks = [chunk for chunk in chunks if chunk]
    if not chunks:
        return []

    # All chunks go through the model as one padded batch.
    inputs = tokenizer(chunks, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # Inference only: no gradients needed.
    with torch.no_grad():
        logits = model(**inputs).logits
    predictions = torch.argmax(logits, dim=2)

    # Treat both the letter 'O' and the digit '0' as the outside label so the
    # same function works whichever NER checkpoint is loaded.
    outside_labels = {'O', '0'}
    # Besides [CLS]/[SEP], padding added for batching must not become an entity.
    skipped_tokens = {tokenizer.cls_token, tokenizer.sep_token, tokenizer.pad_token}

    entities = []
    # Decode every row of the batch, not just row 0, so text beyond the first
    # chunk is not lost.
    for chunk_ids, chunk_predictions in zip(inputs["input_ids"], predictions):
        tokens = tokenizer.convert_ids_to_tokens(chunk_ids.tolist())
        labels = [model.config.id2label[label_id] for label_id in chunk_predictions.tolist()]
        for token, label in zip(tokens, labels):
            if label in outside_labels or token in skipped_tokens:
                continue
            entities.append({"token": token, "label": label})
    return entities

# Step 3: Define Split function
def split_into_chunks(text, max_length=510, count_tokens=None):
    """Split text on whitespace into consecutive chunks that stay under a token budget.

    Words are added to the current chunk one at a time; as soon as the chunk's
    measured length reaches ``max_length`` the chunk is closed *without* the
    word that pushed it over, and that word starts the next chunk.

    Parameters
    ----------
    text : str
        Text to split. Anything that is not a string (e.g. ``None`` or the
        ``NaN`` pandas uses for a missing cell) yields an empty list.
    max_length : int, default 510
        A chunk is closed once its measured length is ``>= max_length``.
    count_tokens : callable, optional
        ``count_tokens(chunk_text) -> int`` used to measure a candidate chunk.
        Defaults to the number of input ids the notebook-level ``tokenizer``
        produces for the chunk (special tokens included).

    Returns
    -------
    list of str
        The chunks in original order; never contains an empty string. A single
        word that reaches the budget on its own becomes its own chunk.
    """
    if not isinstance(text, str):
        return []

    if count_tokens is None:
        count_tokens = lambda chunk_text: len(tokenizer(chunk_text)['input_ids'])

    chunks = []
    current_chunk = []

    for word in text.split():
        current_chunk.append(word)
        if count_tokens(" ".join(current_chunk)) >= max_length:
            finished = current_chunk[:-1]
            # `finished` is empty when the very first word of a chunk already
            # reaches the budget; emitting it would add a "" chunk.
            if finished:
                chunks.append(" ".join(finished))
            current_chunk = [word]

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

In [None]:
# Step 4: NER
# For every validation dialogue: split it into chunks, run the currently loaded
# NER model on them, and store the resulting list of {"token", "label"} dicts
# in a new column.
val_short['dialogue_chemical_entities'] = val_short['dialogue'].apply(lambda x: extract_entities_from_chunks(split_into_chunks(x)))


In [None]:
# Step 5: apply the merge helpers to get merged entities
# (merge_wordpieces and combine_entities are defined earlier in the notebook;
# the output below shows sub-word pieces such as 'I', ... ending up as the
# single entity 'Ibuprofen').
val_short['dialogue_chemical_entities_merged'] = val_short['dialogue_chemical_entities'].apply(lambda x: combine_entities(merge_wordpieces(x)))

# Preview the first three rows to check the raw and merged chemical columns
val_short.head(3)

Unnamed: 0,section_header,section_text,dialogue,Dialogue_Length,Summary_Length,dialogue_disease_entities,dialogue_disease_entities_merged,dialogue_chemical_entities,dialogue_chemical_entities_merged
0,ASSESSMENT,Upper respiratory infection.,Doctor: What brings you in today? Patient: I h...,595,28,"[{'token': 'cough', 'label': 'B-DISEASE'}, {'t...","[{'token': 'cough', 'label': 'B-DISEASE'}, {'t...",[],[]
1,MEDICATIONS,Ibuprofen.,Doctor: Are you taking any medications current...,89,10,"[{'token': 'pain', 'label': 'B-DISEASE'}]","[{'token': 'pain', 'label': 'B-DISEASE'}]","[{'token': 'I', 'label': 'B-CHEMICAL'}, {'toke...","[{'token': 'Ibuprofen', 'label': 'B-CHEMICAL'}]"
2,DISPOSITION,The patient will be going home.,Doctor: Ready to go home? Patient: I just can'...,137,31,[],[],[],[]


In [None]:
# Apply the same chemical NER + merge steps to the test split
# (the train split is handled in the next cell).
test_short['dialogue_chemical_entities'] = test_short['dialogue'].apply(lambda x: extract_entities_from_chunks(split_into_chunks(x)))
test_short['dialogue_chemical_entities_merged'] = test_short['dialogue_chemical_entities'].apply(lambda x: combine_entities(merge_wordpieces(x)))

# Preview the first three rows to check the raw and merged chemical columns
test_short.head(3)

Unnamed: 0,section_header,section_text,dialogue,Dialogue_Length,Summary_Length,dialogue_disease_entities,dialogue_disease_entities_merged,dialogue_chemical_entities,dialogue_chemical_entities_merged
0,FAM/SOCHX,The patient lives with her husband of 48 years...,"Doctor: Hi, there. Patient: Hi. Guest_family: ...",1756,602,"[{'token': 'mental', 'label': 'B-DISEASE'}, {'...","[{'token': 'mental health', 'label': 'B-DISEAS...","[{'token': 'alcohol', 'label': 'B-CHEMICAL'}]","[{'token': 'alcohol', 'label': 'B-CHEMICAL'}]"
1,ROS,"ONCOLOGIC: No history of any cancer, change in...",Doctor: Were you ever diagnosed with any kind ...,336,129,"[{'token': 'cancer', 'label': 'B-DISEASE'}, {'...","[{'token': 'cancer', 'label': 'B-DISEASE'}, {'...",[],[]
2,PASTMEDICALHX,The patient denies any previous past medical h...,"Doctor: Hi there! Welcome in, sir. Patient: Hi...",255,126,[],[],[],[]


Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.


In [None]:
# Apply the same chemical NER + merge steps to the training split:
# first the raw per-token predictions, then the merged entities.
train_short['dialogue_chemical_entities'] = train_short['dialogue'].apply(lambda x: extract_entities_from_chunks(split_into_chunks(x)))
train_short['dialogue_chemical_entities_merged'] = train_short['dialogue_chemical_entities'].apply(lambda x: combine_entities(merge_wordpieces(x)))

# Preview the first three rows to check the raw and merged chemical columns
train_short.head(3)

Unnamed: 0,section_header,section_text,dialogue,Dialogue_Length,Summary_Length,dialogue_disease_entities,dialogue_disease_entities_merged,dialogue_chemical_entities,dialogue_chemical_entities_merged
0,GENHX,The patient is a 75-year-old female who comes ...,Doctor: Welcome to the clinic. I am Doctor Fra...,1396,677,"[{'token': 'stroke', 'label': 'B-DISEASE'}, {'...","[{'token': 'stroke', 'label': 'B-DISEASE'}, {'...",[],[]
1,FAM/SOCHX,Significant for diabetes and hypertension.,Doctor: Does anyone else in your family suffer...,175,42,"[{'token': 'high', 'label': 'B-DISEASE'}, {'to...","[{'token': 'high blood pressure', 'label': 'B-...",[],[]
2,PASTMEDICALHX,Significant for anxiety disorder.,Doctor: Have we gone over your survey results ...,256,33,"[{'token': 'anxiety', 'label': 'B-DISEASE'}, {...","[{'token': 'anxiety disorder', 'label': 'B-DIS...",[],[]


### Save as new CSV file

In [None]:
# Destination folder on the mounted Drive for the NER-augmented short-dialogue splits
output_dir = '/content/drive/MyDrive/W266_Project/Data/short_dialogue_NER_extraction'

# Create the folder if it is missing; an already existing folder is not an error
os.makedirs(output_dir, exist_ok=True)

# Write each split (train, test, val, in that order) to its own CSV without the index
for csv_name, split_frame in (
    ('train_Short_BIO_NER.csv', train_short),
    ('test_Short_BIO_NER.csv', test_short),
    ('val_Short_BIO_NER.csv', val_short),
):
    split_frame.to_csv(os.path.join(output_dir, csv_name), index=False)