## Load libraries

In [None]:
pip install torch transformers

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [None]:
!pip install keybert

Collecting keybert
  Downloading keybert-0.8.5-py3-none-any.whl (37 kB)
Collecting sentence-transformers>=0.3.8 (from keybert)
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentence-transformers, keybert
Successfully installed keybert-0.8.5 sentence-transformers-3.0.1


In [None]:
# Mount Google Drive at /content/drive; the dataset paths and output folders
# used further down in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import io
import os
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import BertTokenizerFast, BertForTokenClassification
from transformers import pipeline
from keybert import KeyBERT
from sklearn.feature_extraction.text import CountVectorizer
import torch

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Long Dialogue KEYBERT


### Load dataset


In [None]:
# Locations of the long-dialogue train / test / validation CSV files
path_train_long = '/content/drive/MyDrive/W266_Project/Data/train_long.csv'
path_test_long = '/content/drive/MyDrive/W266_Project/Data/test_long.csv'
path_val_long = '/content/drive/MyDrive/W266_Project/Data/val_long.csv'

# Read the three splits, in the same order as the paths above
train_long, test_long, val_long = (
    pd.read_csv(csv_path)
    for csv_path in (path_train_long, path_test_long, path_val_long)
)

# Preview the training split to confirm it loaded as expected
print("Long_dialogue_train_data:", train_long.head(), sep='\n')

# Report (rows, columns) for train, test and val, one per line
print("Shape of Long_dialogue_train/test/val_data:")
print(train_long.shape, test_long.shape, val_long.shape, sep='\n')



Long_dialogue_train_data:
                                            dialogue  \
0  doctor donna torres , date of birth , 08/01/19...   
1  doctor: Good morning, Mr. patient. I'm Dr. doc...   
2  doctor: Hello Mrs. patient, thank you for comi...   
3  doctor hi virginia how're you today patient i'...   
4  doctor: Hello, Mrs. patient, welcome back. How...   

                                                note  Dialogue_Length  \
0  SUBJECTIVE CHIEF COMPLAINT Annual health maint...             8595   
1  SUBJECTIVE CHIEF COMPLAINT Patient reports fru...             1760   
2  SUBJECTIVE CHIEF COMPLAINT Left arm pain after...             4074   
3  SUBJECTIVE CHIEF COMPLAINT Right knee pain. HI...             6728   
4  SUBJECTIVE CHIEF COMPLAINT Recurrent low back ...             1841   

   Note_Length  
0         2794  
1         1536  
2         2971  
3         2207  
4         1595  
Shape of Long_dialogue_train/test/val_data:
(1102, 4)
(180, 4)
(96, 4)


In [None]:
# Inspect the longest dialogue of each split to examine the extraction.
# Index label of the row with the largest Dialogue_Length in train, test and val
(
    max_dialogue_train_length_index,
    max_dialogue_test_length_index,
    max_dialogue_val_length_index,
) = (split['Dialogue_Length'].idxmax() for split in (train_long, test_long, val_long))

# Full row stored under each of those index labels
max_dialogue_train_length_row = train_long.loc[max_dialogue_train_length_index]
max_dialogue_test_length_row = test_long.loc[max_dialogue_test_length_index]
max_dialogue_val_length_row = val_long.loc[max_dialogue_val_length_index]

# Print every row beneath a label naming the split it came from
print("max_dialogue_train", max_dialogue_train_length_row, sep='\n')
print("max_dialogue_test", max_dialogue_test_length_row, sep='\n')
print("max_dialogue_val", max_dialogue_val_length_row, sep='\n')


max_dialogue_train
dialogue           doctor next patient is christine hernandez , u...
note               SUBJECTIVE CHIEF COMPLAINT Annual health maint...
Dialogue_Length                                                13924
Note_Length                                                     2545
Name: 548, dtype: object
max_dialogue_test
dialogue           doctor eugene walker , n- date of birth 4/14/1...
note               SUBJECTIVE CHIEF COMPLAINT Annual health maint...
Dialogue_Length                                                 8694
Note_Length                                                     1690
Name: 121, dtype: object
max_dialogue_val
dialogue           doctor sophia brown . date of birth , 3/17/194...
note               SUBJECTIVE CHIEF COMPLAINT Annual health maint...
Dialogue_Length                                                 7384
Note_Length                                                     2280
Name: 7, dtype: object


### Keybert extraction


In [None]:
# Initialize KeyBERT model
# No model name is passed, so KeyBERT falls back to its own default embedding model.
# A single module-level instance is created here; extract_keywords reads it on every call.
kw_model = KeyBERT()

def extract_keywords(text, top_n=5, keyphrase_ngram_range=(1, 3), diversity=0.5):
    """Return the KeyBERT keyphrases for one document, without their scores.

    Args:
        text: Document to extract keyphrases from. Anything that is not a
            non-blank string (for example ``None`` or the NaN pandas uses for
            a missing cell) yields an empty list without calling the model.
        top_n: Number of keyphrases to request from KeyBERT.
        keyphrase_ngram_range: ``(min_words, max_words)`` allowed per phrase.
        diversity: Diversity setting used by the MMR re-ranking step.

    Returns:
        list: The extracted keyphrases, or ``[]`` when the input is unusable
        or the extraction raises.
    """
    # Guard first: there is nothing to extract from missing or blank input,
    # and skipping the model call avoids one error message per such row.
    if not isinstance(text, str) or not text.strip():
        return []

    try:
        keywords = kw_model.extract_keywords(
            text,
            keyphrase_ngram_range=keyphrase_ngram_range,
            stop_words='english',
            use_mmr=True,
            diversity=diversity,
            top_n=top_n,
        )
        # Each result is indexed at 0 to keep only the phrase and drop its score
        return [keyword[0] for keyword in keywords]
    except Exception as e:
        # Deliberately best-effort: one failing document must not abort a
        # whole-column .apply(); report it and fall back to "no keywords".
        print(f"Error processing text: {e}")
        return []

# Run the keyword extractor over every validation dialogue and keep the
# resulting list of phrases in a new 'dialogue_keywords' column
val_long['dialogue_keywords'] = val_long['dialogue'].map(extract_keywords)

# Preview the first rows, including the new column
val_long.head(3)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Unnamed: 0,dialogue,note,Dialogue_Length,Note_Length,dialogue_keywords
0,"doctor: Good morning, patient. Thank you for c...",SUBJECTIVE CHIEF COMPLAINT Ear infections. HIS...,2745,2178,"[having ear infections, sore throat cough, rou..."
1,"doctor: Hello, patient, and welcome to my offi...",SUBJECTIVE CHIEF COMPLAINT Picky eating. HISTO...,1997,1294,"[problems eating patient, mother started vomit..."
2,"doctor: Hello Ms. A, thank you for coming in t...",SUBJECTIVE CHIEF COMPLAINT Postoperative evalu...,2295,1519,"[decided gastric bypass, struggling morbid obe..."


In [None]:
# Run the keyword extractor over every test dialogue and keep the
# resulting list of phrases in a new 'dialogue_keywords' column
test_long['dialogue_keywords'] = test_long['dialogue'].map(extract_keywords)

# Preview the first rows, including the new column
test_long.head(3)

Unnamed: 0,dialogue,note,Dialogue_Length,Note_Length,dialogue_keywords
0,"doctor: Good morning, Mr. A. Thank you for com...",SUBJECTIVE CHIEF COMPLAINT New patient evaluat...,2483,1895,"[seizures hemiparesis confusion, significant b..."
1,"doctor: Hello, I'm Dr. doctor's name. I unders...","SUBJECTIVE CHIEF COMPLAINT Fever, fussiness, a...",1943,1425,"[baby irritable fever, symptoms doctor examine..."
2,"doctor: Good morning, Mrs. patient, thank you ...",SUBJECTIVE CHIEF COMPLAINT Genetic counseling....,2676,2407,"[colon polyps worried, cancer history family, ..."


In [None]:
# Run the keyword extractor over every training dialogue and keep the
# resulting list of phrases in a new 'dialogue_keywords' column
train_long['dialogue_keywords'] = train_long['dialogue'].map(extract_keywords)

# Preview the first rows, including the new column
train_long.head(3)

Unnamed: 0,dialogue,note,Dialogue_Length,Note_Length,dialogue_keywords
0,"doctor donna torres , date of birth , 08/01/19...",SUBJECTIVE CHIEF COMPLAINT Annual health maint...,8595,2794,"[patient anxiety going, doctor donna torres, b..."
1,"doctor: Good morning, Mr. patient. I'm Dr. doc...",SUBJECTIVE CHIEF COMPLAINT Patient reports fru...,1760,1536,"[wife doctor understand, difficulty treatment ..."
2,"doctor: Hello Mrs. patient, thank you for comi...",SUBJECTIVE CHIEF COMPLAINT Left arm pain after...,4074,2971,"[nitroglycerin usual angina, angina chest pain..."


### Save as new CSV file

In [None]:
# Destination folder for the keyword-augmented long-dialogue splits
output_dir = '/content/drive/MyDrive/W266_Project/Data/long_dialogue_NER_extraction'

# Create the folder if it is missing; exist_ok makes reruns harmless
os.makedirs(output_dir, exist_ok=True)

# Write every split to its own CSV, leaving the DataFrame index out of the file.
# NOTE(review): 'dialogue_keywords' holds Python lists, which a CSV can only store
# as text — confirm that downstream readers parse the column back into lists.
for _csv_name, _split_frame in (
    ('train_Long_Keybert.csv', train_long),
    ('test_Long_Keybert.csv', test_long),
    ('val_Long_Keybert.csv', val_long),
):
    _split_frame.to_csv(os.path.join(output_dir, _csv_name), index=False)

## Short Dialogue KEYBERT

### Load the data

In [None]:
# Locations of the short-dialogue train / test / validation CSV files
path_train_short = '/content/drive/MyDrive/W266_Project/Data/train_short.csv'
path_test_short = '/content/drive/MyDrive/W266_Project/Data/test_short.csv'
path_val_short = '/content/drive/MyDrive/W266_Project/Data/val_short.csv'

# Read the three splits, in the same order as the paths above
train_short, test_short, val_short = (
    pd.read_csv(csv_path)
    for csv_path in (path_train_short, path_test_short, path_val_short)
)

# Preview the training split to confirm it loaded as expected
print("Short_dialogue_train_data:", train_short.head(), sep='\n')

# Report the training split's (rows, columns)
print("Shape of Short_dialogue_train_data:", train_short.shape, sep='\n')

# Report the largest Dialogue_Length of each split for later reference.
# NOTE(review): the labels say "word length", but the value printed is simply
# the Dialogue_Length column's maximum — confirm which unit that column uses.
print("Max word length of short dialogue trian:", train_short['Dialogue_Length'].max(), sep='\n')
print("Max word length of short dialogue val:", val_short['Dialogue_Length'].max(), sep='\n')
print("Max word length of short dialogue test:", test_short['Dialogue_Length'].max(), sep='\n')


Short_dialogue_train_data:
  section_header                                       section_text  \
0          GENHX  The patient is a 75-year-old female who comes ...   
1      FAM/SOCHX         Significant for diabetes and hypertension.   
2  PASTMEDICALHX                  Significant for anxiety disorder.   
3          GENHX  The patient is a 77-year-old female who is una...   
4      FAM/SOCHX                                   Noncontributory.   

                                            dialogue  Dialogue_Length  \
0  Doctor: Welcome to the clinic. I am Doctor Fra...             1396   
1  Doctor: Does anyone else in your family suffer...              175   
2  Doctor: Have we gone over your survey results ...              256   
3  Guest_clinician: How old is the patient? Docto...              438   
4  Doctor: Do you have a known- Patient: Drug all...              105   

   Summary_Length  
0             677  
1              42  
2              33  
3             325  
4      

### Keybert Extraction

In [None]:
# Run the keyword extractor over every validation dialogue and keep the
# resulting list of phrases in a new 'dialogue_keywords' column
val_short['dialogue_keywords'] = val_short['dialogue'].map(extract_keywords)
# Preview the first rows, including the new column
val_short.head(3)


Unnamed: 0,section_header,section_text,dialogue,Dialogue_Length,Summary_Length,dialogue_keywords
0,ASSESSMENT,Upper respiratory infection.,Doctor: What brings you in today? Patient: I h...,595,28,"[taking cough stuffy, doctor having symptoms, ..."
1,MEDICATIONS,Ibuprofen.,Doctor: Are you taking any medications current...,89,10,"[just ibuprofen pain, ibuprofen, medications c..."
2,DISPOSITION,The patient will be going home.,Doctor: Ready to go home? Patient: I just can'...,137,31,"[doctor ready home, understand patient bye, pa..."


In [None]:
# Run the keyword extractor over every test dialogue and keep the
# resulting list of phrases in a new 'dialogue_keywords' column
test_short['dialogue_keywords'] = test_short['dialogue'].map(extract_keywords)
# Preview the first rows, including the new column
test_short.head(3)

Unnamed: 0,section_header,section_text,dialogue,Dialogue_Length,Summary_Length,dialogue_keywords
0,FAM/SOCHX,The patient lives with her husband of 48 years...,"Doctor: Hi, there. Patient: Hi. Guest_family: ...",1756,602,"[patient hi guest_family, meet husband, lookin..."
1,ROS,"ONCOLOGIC: No history of any cancer, change in...",Doctor: Were you ever diagnosed with any kind ...,336,129,"[patient doctor energy, cancer past, sudden we..."
2,PASTMEDICALHX,The patient denies any previous past medical h...,"Doctor: Hi there! Welcome in, sir. Patient: Hi...",255,126,"[doctor hi welcome, history patient, doctor pr..."


In [None]:
# Run the keyword extractor over every training dialogue and keep the
# resulting list of phrases in a new 'dialogue_keywords' column
train_short['dialogue_keywords'] = train_short['dialogue'].map(extract_keywords)
# Preview the first rows, including the new column
train_short.head(3)

Unnamed: 0,section_header,section_text,dialogue,Dialogue_Length,Summary_Length,dialogue_keywords
0,GENHX,The patient is a 75-year-old female who comes ...,Doctor: Welcome to the clinic. I am Doctor Fra...,1396,677,"[doctor did dizzy, stroke patient morning, num..."
1,FAM/SOCHX,Significant for diabetes and hypertension.,Doctor: Does anyone else in your family suffer...,175,42,"[relatives actually doctor, family suffer high..."
2,PASTMEDICALHX,Significant for anxiety disorder.,Doctor: Have we gone over your survey results ...,256,33,"[doctor yes anxiety, survey results visit, doc..."


### Save as new CSV file

In [None]:
# Destination folder for the keyword-augmented short-dialogue splits
output_dir = '/content/drive/MyDrive/W266_Project/Data/short_dialogue_NER_extraction'

# Create the folder if it is missing; exist_ok makes reruns harmless
os.makedirs(output_dir, exist_ok=True)

# Write every split to its own CSV, leaving the DataFrame index out of the file.
# NOTE(review): 'dialogue_keywords' holds Python lists, which a CSV can only store
# as text — confirm that downstream readers parse the column back into lists.
for _csv_name, _split_frame in (
    ('train_Short_Keybert.csv', train_short),
    ('test_Short_Keybert.csv', test_short),
    ('val_Short_Keybert.csv', val_short),
):
    _split_frame.to_csv(os.path.join(output_dir, _csv_name), index=False)