# Load libraries

# Clinical NER

In [None]:
# Mount Google Drive at /content/drive; the CSV paths used below live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import io
import os
import json
import ast
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import BertTokenizerFast, BertForTokenClassification
from transformers import pipeline
import torch

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Long Dialogue

### Load dataset


In [None]:
# Drive locations of the long-dialogue NER extraction splits
path_train_long = '/content/drive/MyDrive/W266_Project/Data/long_dialogue_NER_extraction/train_Long_clinical_NER.csv'
path_test_long = '/content/drive/MyDrive/W266_Project/Data/long_dialogue_NER_extraction/test_long_clinical_NER.csv'
path_val_long = '/content/drive/MyDrive/W266_Project/Data/long_dialogue_NER_extraction/val_long_clinical_NER.csv'

# Read the three splits in one pass (order: train, test, val)
train_long_clinical, test_long_clinical, val_long_clinical = [
    pd.read_csv(csv_path)
    for csv_path in (path_train_long, path_test_long, path_val_long)
]

# Sanity check: peek at the first training rows
print("Long_dialogue_train_data:")
print(train_long_clinical.head())

# Report (rows, columns) for train, test and val, in that order
print("Shape of Long_dialogue_train/test/val_data:")
for loaded_split in (train_long_clinical, test_long_clinical, val_long_clinical):
    print(loaded_split.shape)



Long_dialogue_train_data:
                                            dialogue  \
0  doctor donna torres , date of birth , 08/01/19...   
1  doctor: Good morning, Mr. patient. I'm Dr. doc...   
2  doctor: Hello Mrs. patient, thank you for comi...   
3  doctor hi virginia how're you today patient i'...   
4  doctor: Hello, Mrs. patient, welcome back. How...   

                                                note  Dialogue_Length  \
0  SUBJECTIVE CHIEF COMPLAINT Annual health maint...             8595   
1  SUBJECTIVE CHIEF COMPLAINT Patient reports fru...             1760   
2  SUBJECTIVE CHIEF COMPLAINT Left arm pain after...             4074   
3  SUBJECTIVE CHIEF COMPLAINT Right knee pain. HI...             6728   
4  SUBJECTIVE CHIEF COMPLAINT Recurrent low back ...             1841   

   Note_Length                                  dialogue_entities  \
0         2794  [{'token': '▁fibro', 'label': 'B-DISEASE_DISOR...   
1         1536  [{'token': '▁treated', 'label': 'I-HISTORY'}

### Get unique NER for each row



In [None]:
# Function to process each entry
def keep_unique_phrases(entry):
    """Collapse repeated phrases in a stringified entity list.

    ``entry`` is parsed with ``ast.literal_eval`` and is expected to yield a
    list of ``{'phrase': ..., 'label': ...}`` dicts. Every phrase is kept once,
    at the position of its first occurrence; if a phrase occurs with several
    labels, the label of its last occurrence is the one kept.

    Returns:
        The de-duplicated list rendered with ``str()``. When ``entry`` cannot
        be parsed, a message is printed and ``entry`` is returned unchanged.
    """
    try:
        parsed = ast.literal_eval(entry)
    except (ValueError, SyntaxError) as e:
        print(f"Error decoding: {e}")
        return entry

    # A dict keeps keys in first-insertion order; seeing a phrase again only
    # overwrites its label.
    label_by_phrase = {item['phrase']: item['label'] for item in parsed}

    deduped = []
    for phrase, label in label_by_phrase.items():
        deduped.append({'phrase': phrase, 'label': label})

    return str(deduped)

In [None]:
# Add a de-duplicated entity string next to the merged entities in every long-dialogue split
for df in (train_long_clinical, test_long_clinical, val_long_clinical):
    df['clinical_ner_unique_label'] = df['dialogue_entities_merged'].apply(keep_unique_phrases)

In [None]:
# Preview the first three rows of the long-dialogue training split.
train_long_clinical.head(3)

Unnamed: 0,dialogue,note,Dialogue_Length,Note_Length,dialogue_entities,dialogue_entities_merged,clinical_ner_unique_label
0,"doctor donna torres , date of birth , 08/01/19...",SUBJECTIVE CHIEF COMPLAINT Annual health maint...,8595,2794,"[{'token': '▁fibro', 'label': 'B-DISEASE_DISOR...","[{'phrase': 'fibro aden omas', 'label': 'DISEA...","[{'phrase': 'fibro aden omas', 'label': 'DISEA..."
1,"doctor: Good morning, Mr. patient. I'm Dr. doc...",SUBJECTIVE CHIEF COMPLAINT Patient reports fru...,1760,1536,"[{'token': '▁treated', 'label': 'I-HISTORY'}, ...","[{'phrase': 'treated', 'label': 'HISTORY'}, {'...","[{'phrase': 'treated', 'label': 'HISTORY'}, {'..."
2,"doctor: Hello Mrs. patient, thank you for comi...",SUBJECTIVE CHIEF COMPLAINT Left arm pain after...,4074,2971,"[{'token': 'lipid', 'label': 'I-DISEASE_DISORD...","[{'phrase': 'lipid', 'label': 'DISEASE_DISORDE...","[{'phrase': 'lipid', 'label': 'DISEASE_DISORDE..."


In [None]:
# Preview the first three rows of the long-dialogue test split.
test_long_clinical.head(3)

Unnamed: 0,dialogue,note,Dialogue_Length,Note_Length,dialogue_entities,dialogue_entities_merged,clinical_ner_unique_label
0,"doctor: Good morning, Mr. A. Thank you for com...",SUBJECTIVE CHIEF COMPLAINT New patient evaluat...,2483,1895,"[{'token': '▁July', 'label': 'B-DATE'}, {'toke...","[{'phrase': 'July of 2006', 'label': 'DATE'}, ...","[{'phrase': 'July of 2006', 'label': 'DATE'}, ..."
1,"doctor: Hello, I'm Dr. doctor's name. I unders...","SUBJECTIVE CHIEF COMPLAINT Fever, fussiness, a...",1943,1425,"[{'token': '▁irritable', 'label': 'B-SIGN_SYMP...","[{'phrase': 'irritable', 'label': 'SIGN_SYMPTO...","[{'phrase': 'irritable', 'label': 'SIGN_SYMPTO..."
2,"doctor: Good morning, Mrs. patient, thank you ...",SUBJECTIVE CHIEF COMPLAINT Genetic counseling....,2676,2407,"[{'token': '▁polyps', 'label': 'B-SIGN_SYMPTOM...","[{'phrase': 'polyps', 'label': 'SIGN_SYMPTOM'}...","[{'phrase': 'polyps', 'label': 'SIGN_SYMPTOM'}..."


In [None]:
# Preview the first three rows of the long-dialogue validation split.
val_long_clinical.head(3)

Unnamed: 0,dialogue,note,Dialogue_Length,Note_Length,dialogue_entities,dialogue_entities_merged,clinical_ner_unique_label
0,"doctor: Good morning, patient. Thank you for c...",SUBJECTIVE CHIEF COMPLAINT Ear infections. HIS...,2745,2178,"[{'token': '▁infections', 'label': 'I-DISEASE_...","[{'phrase': 'infections', 'label': 'DISEASE_DI...","[{'phrase': 'infections', 'label': 'DISEASE_DI..."
1,"doctor: Hello, patient, and welcome to my offi...",SUBJECTIVE CHIEF COMPLAINT Picky eating. HISTO...,1997,1294,"[{'token': '▁vomiting', 'label': 'B-SIGN_SYMPT...","[{'phrase': 'vomiting', 'label': 'SIGN_SYMPTOM...","[{'phrase': 'vomiting', 'label': 'SIGN_SYMPTOM..."
2,"doctor: Hello Ms. A, thank you for coming in t...",SUBJECTIVE CHIEF COMPLAINT Postoperative evalu...,2295,1519,"[{'token': '▁pain', 'label': 'B-SIGN_SYMPTOM'}...","[{'phrase': 'pain', 'label': 'SIGN_SYMPTOM'}, ...","[{'phrase': 'pain', 'label': 'SIGN_SYMPTOM'}, ..."


### keep entity only

In [None]:
# Function to extract phrases
def extract_phrases(entry):
    """Return the ``'phrase'`` value of every entity in a stringified entity list.

    ``entry`` is parsed with ``ast.literal_eval``; parse errors are not caught
    here. Order and duplicates are preserved.
    """
    phrases = []
    for item in ast.literal_eval(entry):
        phrases.append(item['phrase'])
    return phrases

In [None]:
# Phrase-only view of the merged entities (duplicates kept) for every long-dialogue split
for df in (train_long_clinical, test_long_clinical, val_long_clinical):
    df['clinical_ner_no_label'] = df['dialogue_entities_merged'].apply(extract_phrases)

In [None]:
# Phrase-only view of the de-duplicated entities for every long-dialogue split
for df in (train_long_clinical, test_long_clinical, val_long_clinical):
    df['clinical_ner_unique_no_label'] = df['clinical_ner_unique_label'].apply(extract_phrases)

In [None]:
# Preview the long-dialogue training split with the two phrase-only columns added.
train_long_clinical.head(3)

Unnamed: 0,dialogue,note,Dialogue_Length,Note_Length,dialogue_entities,dialogue_entities_merged,clinical_ner_unique_label,clinical_ner_no_label,clinical_ner_unique_no_label
0,"doctor donna torres , date of birth , 08/01/19...",SUBJECTIVE CHIEF COMPLAINT Annual health maint...,8595,2794,"[{'token': '▁fibro', 'label': 'B-DISEASE_DISOR...","[{'phrase': 'fibro aden omas', 'label': 'DISEA...","[{'phrase': 'fibro aden omas', 'label': 'DISEA...","[fibro aden omas, 40, 45, breast, cancer, canc...","[fibro aden omas, 40, 45, breast, cancer, ultr..."
1,"doctor: Good morning, Mr. patient. I'm Dr. doc...",SUBJECTIVE CHIEF COMPLAINT Patient reports fru...,1760,1536,"[{'token': '▁treated', 'label': 'I-HISTORY'}, ...","[{'phrase': 'treated', 'label': 'HISTORY'}, {'...","[{'phrase': 'treated', 'label': 'HISTORY'}, {'...","[treated, Seroquel, anxiety, random, urine dru...","[treated, Seroquel, anxiety, random, urine dru..."
2,"doctor: Hello Mrs. patient, thank you for comi...",SUBJECTIVE CHIEF COMPLAINT Left arm pain after...,4074,2971,"[{'token': 'lipid', 'label': 'I-DISEASE_DISORD...","[{'phrase': 'lipid', 'label': 'DISEASE_DISORDE...","[{'phrase': 'lipid', 'label': 'DISEASE_DISORDE...","[lipid, De trol LA, Pri lose c, glucosamine, m...","[lipid, De trol LA, Pri lose c, glucosamine, m..."


In [None]:
# Preview the long-dialogue validation split with the two phrase-only columns added.
val_long_clinical.head(3)

Unnamed: 0,dialogue,note,Dialogue_Length,Note_Length,dialogue_entities,dialogue_entities_merged,clinical_ner_unique_label,clinical_ner_no_label,clinical_ner_unique_no_label
0,"doctor: Good morning, patient. Thank you for c...",SUBJECTIVE CHIEF COMPLAINT Ear infections. HIS...,2745,2178,"[{'token': '▁infections', 'label': 'I-DISEASE_...","[{'phrase': 'infections', 'label': 'DISEASE_DI...","[{'phrase': 'infections', 'label': 'DISEASE_DI...","[infections, sore throat, cough, ears, hurt, f...","[infections, sore throat, cough, ears, hurt, f..."
1,"doctor: Hello, patient, and welcome to my offi...",SUBJECTIVE CHIEF COMPLAINT Picky eating. HISTO...,1997,1294,"[{'token': '▁vomiting', 'label': 'B-SIGN_SYMPT...","[{'phrase': 'vomiting', 'label': 'SIGN_SYMPTOM...","[{'phrase': 'vomiting', 'label': 'SIGN_SYMPTOM...","[vomiting, raw, carrots, few weeks ago, within...","[vomiting, raw, carrots, few weeks ago, within..."
2,"doctor: Hello Ms. A, thank you for coming in t...",SUBJECTIVE CHIEF COMPLAINT Postoperative evalu...,2295,1519,"[{'token': '▁pain', 'label': 'B-SIGN_SYMPTOM'}...","[{'phrase': 'pain', 'label': 'SIGN_SYMPTOM'}, ...","[{'phrase': 'pain', 'label': 'SIGN_SYMPTOM'}, ...","[pain, pain medication, walk, morbid, obesity,...","[pain, pain medication, walk, morbid, obesity,..."


In [None]:
# Preview the long-dialogue test split with the two phrase-only columns added.
test_long_clinical.head(3)

Unnamed: 0,dialogue,note,Dialogue_Length,Note_Length,dialogue_entities,dialogue_entities_merged,clinical_ner_unique_label,clinical_ner_no_label,clinical_ner_unique_no_label
0,"doctor: Good morning, Mr. A. Thank you for com...",SUBJECTIVE CHIEF COMPLAINT New patient evaluat...,2483,1895,"[{'token': '▁July', 'label': 'B-DATE'}, {'toke...","[{'phrase': 'July of 2006', 'label': 'DATE'}, ...","[{'phrase': 'July of 2006', 'label': 'DATE'}, ...","[July of 2006, August of 2007, seizures, hemi ...","[July of 2006, August of 2007, seizures, hemi ..."
1,"doctor: Hello, I'm Dr. doctor's name. I unders...","SUBJECTIVE CHIEF COMPLAINT Fever, fussiness, a...",1943,1425,"[{'token': '▁irritable', 'label': 'B-SIGN_SYMP...","[{'phrase': 'irritable', 'label': 'SIGN_SYMPTO...","[{'phrase': 'irritable', 'label': 'SIGN_SYMPTO...","[irritable, fever, about 24 hours, fussy, cryi...","[irritable, fever, about 24 hours, fussy, cryi..."
2,"doctor: Good morning, Mrs. patient, thank you ...",SUBJECTIVE CHIEF COMPLAINT Genetic counseling....,2676,2407,"[{'token': '▁polyps', 'label': 'B-SIGN_SYMPTOM...","[{'phrase': 'polyps', 'label': 'SIGN_SYMPTOM'}...","[{'phrase': 'polyps', 'label': 'SIGN_SYMPTOM'}...","[polyps, was 50, colon os copies, five years, ...","[polyps, was 50, colon os copies, five years, ..."


### change the column to be more readable

In [None]:
def drop_columns(df, columns_to_drop):
    """Return a new DataFrame equal to ``df`` minus the columns in ``columns_to_drop``.

    ``df`` itself is not modified. A column name that is not present makes
    ``DataFrame.drop`` raise ``KeyError``.
    """
    return df.drop(columns=columns_to_drop)

def change_column_name(df, old_name, new_name):
    """Return a new DataFrame in which column ``old_name`` is called ``new_name``.

    ``df`` itself is not modified; all other columns keep their names and order.
    """
    renamed = df.rename(columns={old_name: new_name})
    return renamed



In [None]:
# Columns removed from every long-dialogue split
columns_to_drop = ["dialogue_entities", "Note_Length", "Dialogue_Length"]

# Drop them, then expose the merged entities under a clearer name
train_long_clinical = change_column_name(
    drop_columns(train_long_clinical, columns_to_drop),
    old_name='dialogue_entities_merged',
    new_name='clinical_ner_label',
)
train_long_clinical.head(3)

Unnamed: 0,dialogue,note,clinical_ner_label,clinical_ner_unique_label,clinical_ner_no_label,clinical_ner_unique_no_label
0,"doctor donna torres , date of birth , 08/01/19...",SUBJECTIVE CHIEF COMPLAINT Annual health maint...,"[{'phrase': 'fibro aden omas', 'label': 'DISEA...","[{'phrase': 'fibro aden omas', 'label': 'DISEA...","[fibro aden omas, 40, 45, breast, cancer, canc...","[fibro aden omas, 40, 45, breast, cancer, ultr..."
1,"doctor: Good morning, Mr. patient. I'm Dr. doc...",SUBJECTIVE CHIEF COMPLAINT Patient reports fru...,"[{'phrase': 'treated', 'label': 'HISTORY'}, {'...","[{'phrase': 'treated', 'label': 'HISTORY'}, {'...","[treated, Seroquel, anxiety, random, urine dru...","[treated, Seroquel, anxiety, random, urine dru..."
2,"doctor: Hello Mrs. patient, thank you for comi...",SUBJECTIVE CHIEF COMPLAINT Left arm pain after...,"[{'phrase': 'lipid', 'label': 'DISEASE_DISORDE...","[{'phrase': 'lipid', 'label': 'DISEASE_DISORDE...","[lipid, De trol LA, Pri lose c, glucosamine, m...","[lipid, De trol LA, Pri lose c, glucosamine, m..."


In [None]:
# Same clean-up for the test split: drop the unused columns, rename the merged entities
test_long_clinical = change_column_name(
    drop_columns(test_long_clinical, columns_to_drop),
    old_name='dialogue_entities_merged',
    new_name='clinical_ner_label',
)
test_long_clinical.head(3)

Unnamed: 0,dialogue,note,clinical_ner_label,clinical_ner_unique_label,clinical_ner_no_label,clinical_ner_unique_no_label
0,"doctor: Good morning, Mr. A. Thank you for com...",SUBJECTIVE CHIEF COMPLAINT New patient evaluat...,"[{'phrase': 'July of 2006', 'label': 'DATE'}, ...","[{'phrase': 'July of 2006', 'label': 'DATE'}, ...","[July of 2006, August of 2007, seizures, hemi ...","[July of 2006, August of 2007, seizures, hemi ..."
1,"doctor: Hello, I'm Dr. doctor's name. I unders...","SUBJECTIVE CHIEF COMPLAINT Fever, fussiness, a...","[{'phrase': 'irritable', 'label': 'SIGN_SYMPTO...","[{'phrase': 'irritable', 'label': 'SIGN_SYMPTO...","[irritable, fever, about 24 hours, fussy, cryi...","[irritable, fever, about 24 hours, fussy, cryi..."
2,"doctor: Good morning, Mrs. patient, thank you ...",SUBJECTIVE CHIEF COMPLAINT Genetic counseling....,"[{'phrase': 'polyps', 'label': 'SIGN_SYMPTOM'}...","[{'phrase': 'polyps', 'label': 'SIGN_SYMPTOM'}...","[polyps, was 50, colon os copies, five years, ...","[polyps, was 50, colon os copies, five years, ..."


In [None]:
# Same clean-up for the validation split: drop the unused columns, rename the merged entities
val_long_clinical = change_column_name(
    drop_columns(val_long_clinical, columns_to_drop),
    old_name='dialogue_entities_merged',
    new_name='clinical_ner_label',
)
val_long_clinical.head(3)

Unnamed: 0,dialogue,note,clinical_ner_label,clinical_ner_unique_label,clinical_ner_no_label,clinical_ner_unique_no_label
0,"doctor: Good morning, patient. Thank you for c...",SUBJECTIVE CHIEF COMPLAINT Ear infections. HIS...,"[{'phrase': 'infections', 'label': 'DISEASE_DI...","[{'phrase': 'infections', 'label': 'DISEASE_DI...","[infections, sore throat, cough, ears, hurt, f...","[infections, sore throat, cough, ears, hurt, f..."
1,"doctor: Hello, patient, and welcome to my offi...",SUBJECTIVE CHIEF COMPLAINT Picky eating. HISTO...,"[{'phrase': 'vomiting', 'label': 'SIGN_SYMPTOM...","[{'phrase': 'vomiting', 'label': 'SIGN_SYMPTOM...","[vomiting, raw, carrots, few weeks ago, within...","[vomiting, raw, carrots, few weeks ago, within..."
2,"doctor: Hello Ms. A, thank you for coming in t...",SUBJECTIVE CHIEF COMPLAINT Postoperative evalu...,"[{'phrase': 'pain', 'label': 'SIGN_SYMPTOM'}, ...","[{'phrase': 'pain', 'label': 'SIGN_SYMPTOM'}, ...","[pain, pain medication, walk, morbid, obesity,...","[pain, pain medication, walk, morbid, obesity,..."


### Save to new CSV file

In [None]:
# Destination folder on Drive for the cleaned long-dialogue splits
output_dir = '/content/drive/MyDrive/W266_Project/Data/long_dialogue_NER_cleaning'

# Create the folder when missing; an existing folder is accepted as is
os.makedirs(output_dir, exist_ok=True)

# Write each split without the index column (order: train, val, test)
for csv_name, split_frame in (
    ('train_long_clinical_NER.csv', train_long_clinical),
    ('val_long_clinical_NER.csv', val_long_clinical),
    ('test_long_clinical_NER.csv', test_long_clinical),
):
    split_frame.to_csv(os.path.join(output_dir, csv_name), index=False)

### get word count percentage for NER

In [None]:
# Display name -> frame for the three long-dialogue splits (iteration order: train, test, val)
dfs = dict([
    ('train_long_clinical', train_long_clinical),
    ('test_long_clinical', test_long_clinical),
    ('val_long_clinical', val_long_clinical),
])

def word_count(text):
    """Return the number of whitespace-separated tokens in the string ``text``."""
    tokens = text.split()
    return len(tokens)

def word_count_list(text_list):
    """Return the total number of whitespace-separated tokens over all strings in ``text_list``."""
    total = 0
    for phrase in text_list:
        total += len(phrase.split())
    return total

# Columns summarised with describe() for every split
percentage_columns = ['clinical_ner_no_label_percentage', 'clinical_ner_unique_no_label_percentage']

# Add word counts and entity-coverage percentages to each split, then summarise them
for name, df in dfs.items():
    # Token counts for the dialogue text and for both phrase lists
    df['dialogue_word_count'] = df['dialogue'].apply(word_count)
    df['clinical_ner_no_label_word_count'] = df['clinical_ner_no_label'].apply(word_count_list)
    df['clinical_ner_unique_no_label_word_count'] = df['clinical_ner_unique_no_label'].apply(word_count_list)

    # Phrase words relative to dialogue words, as a percentage rounded to 2 decimals
    dialogue_words = df['dialogue_word_count']
    df['clinical_ner_no_label_percentage'] = (
        df['clinical_ner_no_label_word_count'].div(dialogue_words).mul(100).round(2)
    )
    df['clinical_ner_unique_no_label_percentage'] = (
        df['clinical_ner_unique_no_label_word_count'].div(dialogue_words).mul(100).round(2)
    )

    # Descriptive statistics of the two percentage columns
    overall_stats = df[percentage_columns].describe()

    print(f"\n{name} Overall Statistics:")
    print(overall_stats)



train_long_clinical Overall Statistics:
       clinical_ner_no_label_percentage  \
count                       1102.000000   
mean                          19.055336   
std                            7.910466   
min                            0.640000   
25%                           13.610000   
50%                           18.520000   
75%                           23.912500   
max                           50.840000   

       clinical_ner_unique_no_label_percentage  
count                              1102.000000  
mean                                 16.727559  
std                                   7.184536  
min                                   0.540000  
25%                                  11.825000  
50%                                  16.035000  
75%                                  20.882500  
max                                  46.970000  

test_long_clinical Overall Statistics:
       clinical_ner_no_label_percentage  \
count                        180.000000   
mean

In [None]:
# Preview the long-dialogue training split with the word-count and percentage columns added.
train_long_clinical.head(5)

Unnamed: 0,dialogue,note,clinical_ner_label,clinical_ner_unique_label,clinical_ner_no_label,clinical_ner_unique_no_label,dialogue_word_count,clinical_ner_no_label_word_count,clinical_ner_unique_no_label_word_count,clinical_ner_no_label_percentage,clinical_ner_unique_no_label_percentage
0,"doctor donna torres , date of birth , 08/01/19...",SUBJECTIVE CHIEF COMPLAINT Annual health maint...,"[{'phrase': 'fibro aden omas', 'label': 'DISEA...","[{'phrase': 'fibro aden omas', 'label': 'DISEA...","[fibro aden omas, 40, 45, breast, cancer, canc...","[fibro aden omas, 40, 45, breast, cancer, ultr...",1867,12,10,0.64,0.54
1,"doctor: Good morning, Mr. patient. I'm Dr. doc...",SUBJECTIVE CHIEF COMPLAINT Patient reports fru...,"[{'phrase': 'treated', 'label': 'HISTORY'}, {'...","[{'phrase': 'treated', 'label': 'HISTORY'}, {'...","[treated, Seroquel, anxiety, random, urine dru...","[treated, Seroquel, anxiety, random, urine dru...",330,8,8,2.42,2.42
2,"doctor: Hello Mrs. patient, thank you for comi...",SUBJECTIVE CHIEF COMPLAINT Left arm pain after...,"[{'phrase': 'lipid', 'label': 'DISEASE_DISORDE...","[{'phrase': 'lipid', 'label': 'DISEASE_DISORDE...","[lipid, De trol LA, Pri lose c, glucosamine, m...","[lipid, De trol LA, Pri lose c, glucosamine, m...",654,197,162,30.12,24.77
3,doctor hi virginia how're you today patient i'...,SUBJECTIVE CHIEF COMPLAINT Right knee pain. HI...,"[{'phrase': 'days', 'label': 'DURATION'}, {'ph...","[{'phrase': 'days', 'label': 'DURATION'}, {'ph...","[days, therapy, knee, bony, fractures, effusio...","[days, therapy, knee, bony, fractures, effusio...",1367,55,50,4.02,3.66
4,"doctor: Hello, Mrs. patient, welcome back. How...",SUBJECTIVE CHIEF COMPLAINT Recurrent low back ...,"[{'phrase': 'blood pressure', 'label': 'DIAGNO...","[{'phrase': 'blood pressure', 'label': 'DIAGNO...","[blood pressure, 144 / 72, blood, pressure, me...","[blood pressure, 144 / 72, blood, pressure, me...",320,75,66,23.44,20.62


## Short Dialogue

### Load the data

In [None]:
# Drive locations of the short-dialogue NER extraction splits
path_train_short = '/content/drive/MyDrive/W266_Project/Data/short_dialogue_NER_extraction/train_short_clinical_NER.csv'
path_test_short = '/content/drive/MyDrive/W266_Project/Data/short_dialogue_NER_extraction/test_short_clincal_NER.csv'
path_val_short = '/content/drive/MyDrive/W266_Project/Data/short_dialogue_NER_extraction/val_short_clincal_NER.csv'

# Read the three splits in one pass (order: train, test, val)
train_short_clinical, test_short_clinical, val_short_clinical = [
    pd.read_csv(csv_path)
    for csv_path in (path_train_short, path_test_short, path_val_short)
]

# Sanity check: peek at the first training rows
print("Short_dialogue_train_data:")
print(train_short_clinical.head())

# Report (rows, columns) of the training split
print("Shape of Short_dialogue_train_data:")
print(train_short_clinical.shape)

# Report the largest 'Dialogue_Length' value in each short-dialogue split, for future use.
# Each label is paired with the split it names (the "val" and "test" labels previously
# printed each other's values).
# NOTE(review): the labels say "word length", but 'Dialogue_Length' looks like a character
# count (in the long-dialogue data it is far larger than the word count) — confirm.
print("Max word length of short dialogue trian:")
print(train_short_clinical['Dialogue_Length'].max())
print("Max word length of short dialogue val:")
print(val_short_clinical['Dialogue_Length'].max())
print("Max word length of short dialogue test:")
print(test_short_clinical['Dialogue_Length'].max())


Short_dialogue_train_data:
  section_header                                       section_text  \
0          GENHX  The patient is a 75-year-old female who comes ...   
1      FAM/SOCHX         Significant for diabetes and hypertension.   
2  PASTMEDICALHX                  Significant for anxiety disorder.   
3          GENHX  The patient is a 77-year-old female who is una...   
4      FAM/SOCHX                                   Noncontributory.   

                                            dialogue  Dialogue_Length  \
0  Doctor: Welcome to the clinic. I am Doctor Fra...             1396   
1  Doctor: Does anyone else in your family suffer...              175   
2  Doctor: Have we gone over your survey results ...              256   
3  Guest_clinician: How old is the patient? Docto...              438   
4  Doctor: Do you have a known- Patient: Drug all...              105   

   Summary_Length                                  dialogue_entities  \
0             677  [{'token': '▁str

### Get unique NER for each row

In [None]:
# Add a de-duplicated entity string next to the merged entities in every short-dialogue split
for df in (train_short_clinical, test_short_clinical, val_short_clinical):
    df['clinical_ner_unique_label'] = df['dialogue_entities_merged'].apply(keep_unique_phrases)

### Keep entity only

In [None]:
# NOTE: repeats the extract_phrases definition from the long-dialogue section of this
# notebook; keeping it here lets this section's cell define the helper on its own.
def extract_phrases(entry):
    """Return the ``'phrase'`` value of every entity in a stringified entity list.

    ``entry`` is parsed with ``ast.literal_eval``; parse errors are not caught
    here. Order and duplicates are preserved.
    """
    parsed = ast.literal_eval(entry)
    phrases = []
    for item in parsed:
        phrases.append(item['phrase'])
    return phrases

In [None]:
# Phrase-only view of the merged entities (duplicates kept) for every short-dialogue split
for df in (train_short_clinical, test_short_clinical, val_short_clinical):
    df['clinical_ner_no_label'] = df['dialogue_entities_merged'].apply(extract_phrases)

In [None]:
# Phrase-only view of the de-duplicated entities for every short-dialogue split
for df in (train_short_clinical, test_short_clinical, val_short_clinical):
    df['clinical_ner_unique_no_label'] = df['clinical_ner_unique_label'].apply(extract_phrases)

In [None]:
# Preview the first row of the short-dialogue training split with the new entity columns.
train_short_clinical.head(1)

Unnamed: 0,section_header,section_text,dialogue,Dialogue_Length,Summary_Length,dialogue_entities,dialogue_entities_merged,clinical_ner_unique_label,clinical_ner_no_label,clinical_ner_unique_no_label
0,GENHX,The patient is a 75-year-old female who comes ...,Doctor: Welcome to the clinic. I am Doctor Fra...,1396,677,"[{'token': '▁stroke', 'label': 'B-DISEASE_DISO...","[{'phrase': 'stroke', 'label': 'DISEASE_DISORD...","[{'phrase': 'stroke', 'label': 'DISEASE_DISORD...","[stroke, This morning, something, throat, dizz...","[stroke, This morning, something, throat, dizz..."


### change column name to be more readable

In [None]:
# Columns removed from every short-dialogue split
columns_to_drop = ["dialogue_entities", "Summary_Length", "Dialogue_Length"]

# Drop them, then expose the merged entities under a clearer name
train_short_clinical = change_column_name(
    drop_columns(train_short_clinical, columns_to_drop),
    old_name='dialogue_entities_merged',
    new_name='clinical_ner_label',
)
train_short_clinical.head(3)

Unnamed: 0,section_header,section_text,dialogue,clinical_ner_label,clinical_ner_unique_label,clinical_ner_no_label,clinical_ner_unique_no_label
0,GENHX,The patient is a 75-year-old female who comes ...,Doctor: Welcome to the clinic. I am Doctor Fra...,"[{'phrase': 'stroke', 'label': 'DISEASE_DISORD...","[{'phrase': 'stroke', 'label': 'DISEASE_DISORD...","[stroke, This morning, something, throat, dizz...","[stroke, This morning, something, throat, dizz..."
1,FAM/SOCHX,Significant for diabetes and hypertension.,Doctor: Does anyone else in your family suffer...,"[{'phrase': 'family', 'label': 'HISTORY'}, {'p...","[{'phrase': 'family', 'label': 'HISTORY'}, {'p...","[family, high, blood, pressure, diabetes]","[family, high, blood, pressure, diabetes]"
2,PASTMEDICALHX,Significant for anxiety disorder.,Doctor: Have we gone over your survey results ...,"[{'phrase': 'survey', 'label': 'DIAGNOSTIC_PRO...","[{'phrase': 'survey', 'label': 'DIAGNOSTIC_PRO...","[survey, anxiety disorder]","[survey, anxiety disorder]"


In [None]:
# Same clean-up for the test split: drop the unused columns, rename the merged entities
test_short_clinical = change_column_name(
    drop_columns(test_short_clinical, columns_to_drop),
    old_name='dialogue_entities_merged',
    new_name='clinical_ner_label',
)
test_short_clinical.head(3)

Unnamed: 0,section_header,section_text,dialogue,clinical_ner_label,clinical_ner_unique_label,clinical_ner_no_label,clinical_ner_unique_no_label
0,FAM/SOCHX,The patient lives with her husband of 48 years...,"Doctor: Hi, there. Patient: Hi. Guest_family: ...","[{'phrase': 'mental health', 'label': 'DISEASE...","[{'phrase': 'mental health', 'label': 'DISEASE...","[mental health, registered, nurse, drink alcoh...","[mental health, registered, nurse, drink alcoh..."
1,ROS,"ONCOLOGIC: No history of any cancer, change in...",Doctor: Were you ever diagnosed with any kind ...,[{'phrase': 'diagnosed with any kind of cancer...,[{'phrase': 'diagnosed with any kind of cancer...,[diagnosed with any kind of cancer in the past...,[diagnosed with any kind of cancer in the past...
2,PASTMEDICALHX,The patient denies any previous past medical h...,"Doctor: Hi there! Welcome in, sir. Patient: Hi...",[{'phrase': 'health problems primary care doct...,[{'phrase': 'health problems primary care doct...,[health problems primary care doctor health in...,[health problems primary care doctor health in...


In [None]:
# Same clean-up for the validation split: drop the unused columns, rename the merged entities
val_short_clinical = change_column_name(
    drop_columns(val_short_clinical, columns_to_drop),
    old_name='dialogue_entities_merged',
    new_name='clinical_ner_label',
)
val_short_clinical.head(3)

Unnamed: 0,section_header,section_text,dialogue,clinical_ner_label,clinical_ner_unique_label,clinical_ner_no_label,clinical_ner_unique_no_label
0,ASSESSMENT,Upper respiratory infection.,Doctor: What brings you in today? Patient: I h...,"[{'phrase': 'cough', 'label': 'SIGN_SYMPTOM'},...","[{'phrase': 'cough', 'label': 'SIGN_SYMPTOM'},...","[cough, nose, stuffy, up, chest, pain, headach...","[cough, nose, stuffy, up, chest, pain, headach..."
1,MEDICATIONS,Ibuprofen.,Doctor: Are you taking any medications current...,"[{'phrase': 'medications', 'label': 'HISTORY'}...","[{'phrase': 'medications', 'label': 'HISTORY'}...","[medications, Ibuprofen]","[medications, Ibuprofen]"
2,DISPOSITION,The patient will be going home.,Doctor: Ready to go home? Patient: I just can'...,[],[],[],[]


### Save to csv file

In [None]:
# Destination folder on Drive for the cleaned short-dialogue splits
output_dir = '/content/drive/MyDrive/W266_Project/Data/short_dialogue_NER_cleaning'

# Create the folder when missing; an existing folder is accepted as is
os.makedirs(output_dir, exist_ok=True)

# Write each split without the index column (order: train, val, test)
for csv_name, split_frame in (
    ('train_short_clinical_NER.csv', train_short_clinical),
    ('val_short_clinical_NER.csv', val_short_clinical),
    ('test_short_clinical_NER.csv', test_short_clinical),
):
    split_frame.to_csv(os.path.join(output_dir, csv_name), index=False)

### get word count percentage for NER

In [None]:
# Display name -> frame for the three short-dialogue splits (iteration order: train, test, val)
dfs = dict([
    ('train_short_clinical', train_short_clinical),
    ('test_short_clinical', test_short_clinical),
    ('val_short_clinical', val_short_clinical),
])

# NOTE: repeats the word_count definition from the long-dialogue section of this notebook.
def word_count(text):
    """Return the number of whitespace-separated tokens in the string ``text``."""
    pieces = text.split()
    return len(pieces)

# NOTE: repeats the word_count_list definition from the long-dialogue section of this notebook.
def word_count_list(text_list):
    """Return the total number of whitespace-separated tokens over all strings in ``text_list``."""
    running_total = 0
    for phrase in text_list:
        running_total += len(phrase.split())
    return running_total

# Add word-count and percentage columns to every split (in place), then summarise them
for name, df in dfs.items():
    # Words in the dialogue, and words in the extracted entity phrases
    # (all mentions, and the de-duplicated mentions)
    df['dialogue_word_count'] = df['dialogue'].apply(word_count)
    df['clinical_ner_no_label_word_count'] = df['clinical_ner_no_label'].apply(word_count_list)
    df['clinical_ner_unique_no_label_word_count'] = df['clinical_ner_unique_no_label'].apply(word_count_list)

    # Entity words as a share of dialogue words, expressed as a percentage with 2 decimals
    all_mentions_share = df['clinical_ner_no_label_word_count'] / df['dialogue_word_count']
    unique_mentions_share = df['clinical_ner_unique_no_label_word_count'] / df['dialogue_word_count']
    df['clinical_ner_no_label_percentage'] = (all_mentions_share * 100).round(2)
    df['clinical_ner_unique_no_label_percentage'] = (unique_mentions_share * 100).round(2)

    # Descriptive statistics of the two percentage columns for this split
    percentage_columns = ['clinical_ner_no_label_percentage', 'clinical_ner_unique_no_label_percentage']
    overall_stats = df[percentage_columns].describe()

    # Report under the split's name
    print(f"\n{name} Overall Statistics:")
    print(overall_stats)


train_short_clinical Overall Statistics:
       clinical_ner_no_label_percentage  \
count                       1360.000000   
mean                          17.881522   
std                           10.421508   
min                            0.000000   
25%                           11.110000   
50%                           16.450000   
75%                           22.860000   
max                           76.470000   

       clinical_ner_unique_no_label_percentage  
count                              1360.000000  
mean                                 17.190213  
std                                  10.293515  
min                                   0.000000  
25%                                  10.695000  
50%                                  15.620000  
75%                                  21.832500  
max                                  76.470000  

test_short_clinical Overall Statistics:
       clinical_ner_no_label_percentage  \
count                        222.000000   
me

In [None]:
# Preview the first five rows of the validation split
val_short_clinical.head(5)

Unnamed: 0,section_header,section_text,dialogue,clinical_ner_label,clinical_ner_unique_label,clinical_ner_no_label,clinical_ner_unique_no_label,dialogue_word_count,clinical_ner_no_label_word_count,clinical_ner_unique_no_label_word_count,clinical_ner_no_label_percentage,clinical_ner_unique_no_label_percentage
0,ASSESSMENT,Upper respiratory infection.,Doctor: What brings you in today? Patient: I h...,"[{'phrase': 'cough', 'label': 'SIGN_SYMPTOM'},...","[{'phrase': 'cough', 'label': 'SIGN_SYMPTOM'},...","[cough, nose, stuffy, up, chest, pain, headach...","[cough, nose, stuffy, up, chest, pain, headach...",106,19,18,17.92,16.98
1,MEDICATIONS,Ibuprofen.,Doctor: Are you taking any medications current...,"[{'phrase': 'medications', 'label': 'HISTORY'}...","[{'phrase': 'medications', 'label': 'HISTORY'}...","[medications, Ibuprofen]","[medications, Ibuprofen]",14,2,2,14.29,14.29
2,DISPOSITION,The patient will be going home.,Doctor: Ready to go home? Patient: I just can'...,[],[],[],[],23,0,0,0.0,0.0
3,GENHX,The patient states that pain is constant in na...,"Doctor: Out of ten, ten being the worst pain i...","[{'phrase': 'pain', 'label': 'DIAGNOSTIC_PROCE...","[{'phrase': 'pain', 'label': 'SIGN_SYMPTOM'}, ...","[pain, six or seven, pain, ten out of ten, pai...","[pain, six or seven, ten out of ten, day, than...",194,22,16,11.34,8.25
4,FAM/SOCHX,Retired employee of Champion Automotive Co. De...,Doctor: Are you still working? Patient: Retire...,"[{'phrase': 'car', 'label': 'DETAILED_DESCRIPT...","[{'phrase': 'car', 'label': 'BIOLOGICAL_STRUCT...","[car, repair shop, Automotive, Company, car, r...","[car, repair shop, Automotive, Company, repair...",108,12,11,11.11,10.19


# BIO_BERT disease & chemical NER

## Load dataset

In [None]:
# Locations of the BIO NER extraction CSVs (long and short dialogues; train/test/val splits)
path_train_long = '/content/drive/MyDrive/W266_Project/Data/long_dialogue_NER_extraction/train_Long_BIO_NER.csv'
path_test_long = '/content/drive/MyDrive/W266_Project/Data/long_dialogue_NER_extraction/test_Long_BIO_NER.csv'
path_val_long = '/content/drive/MyDrive/W266_Project/Data/long_dialogue_NER_extraction/val_Long_BIO_NER.csv'
path_train_short = '/content/drive/MyDrive/W266_Project/Data/short_dialogue_NER_extraction/train_Short_BIO_NER.csv'
path_test_short = '/content/drive/MyDrive/W266_Project/Data/short_dialogue_NER_extraction/test_Short_BIO_NER.csv'
path_val_short = '/content/drive/MyDrive/W266_Project/Data/short_dialogue_NER_extraction/val_Short_BIO_NER.csv'

# Read every split into its own DataFrame (long dialogues first, then short)
train_long_bio, test_long_bio, val_long_bio = [
    pd.read_csv(csv_path) for csv_path in (path_train_long, path_test_long, path_val_long)
]
train_short_bio, test_short_bio, val_short_bio = [
    pd.read_csv(csv_path) for csv_path in (path_train_short, path_test_short, path_val_short)
]


# Preview the first rows of one long and one short split to confirm the files loaded
print("Long_dialogue_train_data:")
print(train_long_bio.head())
# Heading previously read "hort_dialogue_train_data:" (missing leading "S")
print("Short_dialogue_train_data:")
print(train_short_bio.head())

# Report the (rows, columns) shape of every long-dialogue split
print("Shape of Long_dialogue_train/test/val_data:")
print(train_long_bio.shape)
print(test_long_bio.shape)
print(val_long_bio.shape)

# Report the (rows, columns) shape of every short-dialogue split
print("Shape of short_dialogue_train/test/val_data:")
print(train_short_bio.shape)
print(test_short_bio.shape)
print(val_short_bio.shape)




Long_dialogue_train_data:
                                            dialogue  \
0  doctor donna torres , date of birth , 08/01/19...   
1  doctor: Good morning, Mr. patient. I'm Dr. doc...   
2  doctor: Hello Mrs. patient, thank you for comi...   
3  doctor hi virginia how're you today patient i'...   
4  doctor: Hello, Mrs. patient, welcome back. How...   

                                                note  Dialogue_Length  \
0  SUBJECTIVE CHIEF COMPLAINT Annual health maint...             8595   
1  SUBJECTIVE CHIEF COMPLAINT Patient reports fru...             1760   
2  SUBJECTIVE CHIEF COMPLAINT Left arm pain after...             4074   
3  SUBJECTIVE CHIEF COMPLAINT Right knee pain. HI...             6728   
4  SUBJECTIVE CHIEF COMPLAINT Recurrent low back ...             1841   

   Note_Length                          dialogue_disease_entities  \
0         2794  [{'token': 'anxiety', 'label': 'B-DISEASE'}, {...   
1         1536  [{'token': 'anxiety', 'label': 'B-DISEASE'},

In [None]:
# All six BIO NER splits, collected so later cells can loop over them
dfs = [
    train_long_bio, test_long_bio, val_long_bio,
    train_short_bio, test_short_bio, val_short_bio,
]

## Merge the chemical and disease entities

In [None]:
def merge_lists(row):
    """Combine a row's disease and chemical entity lists into one list.

    Both 'dialogue_disease_entities_merged' and 'dialogue_chemical_entities_merged'
    hold string representations of Python lists, so each is parsed with
    ast.literal_eval before being concatenated (disease entries first).
    """
    diseases, chemicals = (
        ast.literal_eval(row[column])
        for column in ('dialogue_disease_entities_merged', 'dialogue_chemical_entities_merged')
    )
    return list(diseases + chemicals)

# Build 'bio_ner_label' on every split by applying merge_lists to each row
for df in dfs:
    df['bio_ner_label'] = df.apply(merge_lists, axis='columns')

In [None]:
def remove_b_prefix(row):
    """Strip a leading "B-" from the label of every entity in row['bio_ner_label'].

    The entity dicts are modified in place; the same ``row`` object is returned
    so the function can be used with ``DataFrame.apply(..., axis=1)``.
    """
    for item in row['bio_ner_label']:
        label = item['label']
        # Remove only the leading prefix: str.replace('B-', '') would also delete
        # any "B-" that occurs later inside the label text.
        if label.startswith('B-'):
            item['label'] = label[len('B-'):]
    return row

for df in dfs:
    # remove_b_prefix edits the label dicts in place, so the frame returned by
    # apply is not needed; assigning it to the loop variable `df` never changed
    # the frames held in dfs.
    df.apply(remove_b_prefix, axis=1)

## Get unique entity

In [None]:
def keep_unique(items):
    """Return ``items`` without repeated (token, label) pairs.

    The first entity seen for each pair is kept and the original order
    is preserved.
    """
    first_by_pair = {}
    for entity in items:
        # setdefault stores only the first entity for a given (token, label)
        first_by_pair.setdefault((entity['token'], entity['label']), entity)
    return list(first_by_pair.values())

In [None]:
# De-duplicate each row's entity list into a new 'bio_ner_unique_label' column
for df in dfs:
    df['bio_ner_unique_label'] = df['bio_ner_label'].map(keep_unique)

## Keep entities only

In [None]:
def extract_phrases(entities):
    """Return the 'token' value of every entity dict in ``entities``, in order."""
    tokens = []
    for entity in entities:
        tokens.append(entity['token'])
    return tokens

In [None]:
# Keep only the entity text (labels dropped) for every mention
for df in dfs:
    df['bio_ner_no_label'] = df['bio_ner_label'].map(extract_phrases)

In [None]:
# Keep only the entity text (labels dropped) for the de-duplicated mentions
for df in dfs:
    df['bio_ner_unique_no_label'] = df['bio_ner_unique_label'].map(extract_phrases)

## Drop some columns

In [None]:
# Columns that are removed from the long-dialogue frames in the following cells
columns_to_drop = [
    'Dialogue_Length',
    'Note_Length',
    'dialogue_disease_entities',
    'dialogue_disease_entities_merged',
    'dialogue_chemical_entities',
    'dialogue_chemical_entities_merged',
]

# Collect the six BIO NER splits again
dfs = [
    train_long_bio, test_long_bio, val_long_bio,
    train_short_bio, test_short_bio, val_short_bio,
]

In [None]:
# Remove the intermediate columns from the long-dialogue training split and preview it
train_long_bio = train_long_bio.drop(labels=columns_to_drop, axis='columns')
train_long_bio.head(2)

Unnamed: 0,dialogue,note,bio_ner_label,bio_ner_unique_label,bio_ner_no_label,bio_ner_unique_no_label
0,"doctor donna torres , date of birth , 08/01/19...",SUBJECTIVE CHIEF COMPLAINT Annual health maint...,"[{'token': 'anxiety', 'label': 'DISEASE'}, {'t...","[{'token': 'anxiety', 'label': 'DISEASE'}, {'t...","[anxiety, anxiety, irritability, anxiety, anxi...","[anxiety, irritability, progesterone]"
1,"doctor: Good morning, Mr. patient. I'm Dr. doc...",SUBJECTIVE CHIEF COMPLAINT Patient reports fru...,"[{'token': 'anxiety', 'label': 'DISEASE'}, {'t...","[{'token': 'anxiety', 'label': 'DISEASE'}, {'t...","[anxiety, depression]","[anxiety, depression]"


In [None]:
# Remove the intermediate columns from the long-dialogue test split and preview it
test_long_bio = test_long_bio.drop(labels=columns_to_drop, axis='columns')
test_long_bio.head(2)

Unnamed: 0,dialogue,note,bio_ner_label,bio_ner_unique_label,bio_ner_no_label,bio_ner_unique_no_label
0,"doctor: Good morning, Mr. A. Thank you for com...",SUBJECTIVE CHIEF COMPLAINT New patient evaluat...,"[{'token': 'seizures', 'label': 'DISEASE'}, {'...","[{'token': 'seizures', 'label': 'DISEASE'}, {'...","[seizures, hemiparesis, C . diff, malignant me...","[seizures, hemiparesis, C . diff, malignant me..."
1,"doctor: Hello, I'm Dr. doctor's name. I unders...","SUBJECTIVE CHIEF COMPLAINT Fever, fussiness, a...","[{'token': 'irritable', 'label': 'DISEASE'}, {...","[{'token': 'irritable', 'label': 'DISEASE'}, {...","[irritable, fever, fussy, vomiting, diarrhea, ...","[irritable, fever, fussy, vomiting, diarrhea, ..."


In [None]:
# Remove the intermediate columns from the long-dialogue validation split and preview it
val_long_bio = val_long_bio.drop(labels=columns_to_drop, axis='columns')
val_long_bio.head(2)

Unnamed: 0,dialogue,note,bio_ner_label,bio_ner_unique_label,bio_ner_no_label,bio_ner_unique_no_label
0,"doctor: Good morning, patient. Thank you for c...",SUBJECTIVE CHIEF COMPLAINT Ear infections. HIS...,"[{'token': 'ear infections', 'label': 'DISEASE...","[{'token': 'ear infections', 'label': 'DISEASE...","[ear infections, sore throat, cough, fever, ea...","[ear infections, sore throat, cough, fever, ot..."
1,"doctor: Hello, patient, and welcome to my offi...",SUBJECTIVE CHIEF COMPLAINT Picky eating. HISTO...,"[{'token': 'vomiting', 'label': 'DISEASE'}, {'...","[{'token': 'vomiting', 'label': 'DISEASE'}, {'...","[vomiting, diarrhea, constipation, allergies, ...","[vomiting, diarrhea, constipation, allergies, ..."


In [None]:
# Redefine the drop list for the short-dialogue frames: it names 'Summary_Length'
# where the long-dialogue list named 'Note_Length'
columns_to_drop = [
    'Dialogue_Length',
    'Summary_Length',
    'dialogue_disease_entities',
    'dialogue_disease_entities_merged',
    'dialogue_chemical_entities',
    'dialogue_chemical_entities_merged',
]
train_short_bio = train_short_bio.drop(labels=columns_to_drop, axis='columns')
train_short_bio.head(2)

Unnamed: 0,section_header,section_text,dialogue,bio_ner_label,bio_ner_unique_label,bio_ner_no_label,bio_ner_unique_no_label
0,GENHX,The patient is a 75-year-old female who comes ...,Doctor: Welcome to the clinic. I am Doctor Fra...,"[{'token': 'stroke', 'label': 'DISEASE'}, {'to...","[{'token': 'stroke', 'label': 'DISEASE'}, {'to...","[stroke, stroke, dizzy, numbness, dizzy, dizzy...","[stroke, dizzy, numbness, falling down, cold, ..."
1,FAM/SOCHX,Significant for diabetes and hypertension.,Doctor: Does anyone else in your family suffer...,"[{'token': 'high blood pressure', 'label': 'DI...","[{'token': 'high blood pressure', 'label': 'DI...","[high blood pressure, diabetes]","[high blood pressure, diabetes]"


In [None]:
# Remove the intermediate columns from the short-dialogue test split and preview it
test_short_bio = test_short_bio.drop(labels=columns_to_drop, axis='columns')
test_short_bio.head(2)

Unnamed: 0,section_header,section_text,dialogue,bio_ner_label,bio_ner_unique_label,bio_ner_no_label,bio_ner_unique_no_label
0,FAM/SOCHX,The patient lives with her husband of 48 years...,"Doctor: Hi, there. Patient: Hi. Guest_family: ...","[{'token': 'mental health', 'label': 'DISEASE'...","[{'token': 'mental health', 'label': 'DISEASE'...","[mental health, alcohol]","[mental health, alcohol]"
1,ROS,"ONCOLOGIC: No history of any cancer, change in...",Doctor: Were you ever diagnosed with any kind ...,"[{'token': 'cancer', 'label': 'DISEASE'}, {'to...","[{'token': 'cancer', 'label': 'DISEASE'}, {'to...","[cancer, rash, weight loss]","[cancer, rash, weight loss]"


In [None]:
# Remove the intermediate columns from the short-dialogue validation split and preview it
val_short_bio = val_short_bio.drop(labels=columns_to_drop, axis='columns')
val_short_bio.head(2)

Unnamed: 0,section_header,section_text,dialogue,bio_ner_label,bio_ner_unique_label,bio_ner_no_label,bio_ner_unique_no_label
0,ASSESSMENT,Upper respiratory infection.,Doctor: What brings you in today? Patient: I h...,"[{'token': 'cough', 'label': 'DISEASE'}, {'tok...","[{'token': 'cough', 'label': 'DISEASE'}, {'tok...","[cough, chest pain, headaches, nausea, vomitin...","[cough, chest pain, headaches, nausea, vomitin..."
1,MEDICATIONS,Ibuprofen.,Doctor: Are you taking any medications current...,"[{'token': 'pain', 'label': 'DISEASE'}, {'toke...","[{'token': 'pain', 'label': 'DISEASE'}, {'toke...","[pain, Ibuprofen]","[pain, Ibuprofen]"


## Save to new CSV files


In [None]:
# Destination folder for the cleaned short-dialogue BIO NER files
output_dir = '/content/drive/MyDrive/W266_Project/Data/short_dialogue_NER_cleaning'

# Create the folder if it is missing (no error when it already exists)
os.makedirs(output_dir, exist_ok=True)

# Write each split (train, val, test) to its own CSV, without the index column
for csv_name, split_df in (
    ('train_short_bio_NER.csv', train_short_bio),
    ('val_short_bio_NER.csv', val_short_bio),
    ('test_short_bio_NER.csv', test_short_bio),
):
    split_df.to_csv(os.path.join(output_dir, csv_name), index=False)

In [None]:
# Destination folder for the cleaned long-dialogue BIO NER files
output_dir = '/content/drive/MyDrive/W266_Project/Data/long_dialogue_NER_cleaning'

# Create the folder if it is missing (no error when it already exists)
os.makedirs(output_dir, exist_ok=True)

# Write each split (train, val, test) to its own CSV, without the index column
for csv_name, split_df in (
    ('train_long_bio_NER.csv', train_long_bio),
    ('val_long_bio_NER.csv', val_long_bio),
    ('test_long_bio_NER.csv', test_long_bio),
):
    split_df.to_csv(os.path.join(output_dir, csv_name), index=False)

## Check entity percentage

In [None]:
# Map each BIO NER split's name (used as the heading of the printed report) to its DataFrame
dfs = dict(
    train_long_bio=train_long_bio,
    val_long_bio=val_long_bio,
    test_long_bio=test_long_bio,
    train_short_bio=train_short_bio,
    val_short_bio=val_short_bio,
    test_short_bio=test_short_bio,
)

def word_count(text):
    """Return the number of whitespace-separated words in the string ``text``."""
    words = text.split()
    return len(words)

def word_count_list(text_list):
    """Return the total number of whitespace-separated words across all strings in ``text_list``."""
    total = 0
    for phrase in text_list:
        total += len(phrase.split())
    return total

# Add word-count and percentage columns to every split (in place), then summarise them
for name, df in dfs.items():
    # Words in the dialogue, and words in the extracted entity phrases
    # (all mentions, and the de-duplicated mentions)
    df['dialogue_word_count'] = df['dialogue'].apply(word_count)
    df['bio_ner_no_label_word_count'] = df['bio_ner_no_label'].apply(word_count_list)
    df['bio_ner_unique_no_label_word_count'] = df['bio_ner_unique_no_label'].apply(word_count_list)

    # Entity words as a share of dialogue words, expressed as a percentage with 2 decimals
    all_mentions_share = df['bio_ner_no_label_word_count'] / df['dialogue_word_count']
    unique_mentions_share = df['bio_ner_unique_no_label_word_count'] / df['dialogue_word_count']
    df['bio_ner_no_label_percentage'] = (all_mentions_share * 100).round(2)
    df['bio_ner_unique_no_label_percentage'] = (unique_mentions_share * 100).round(2)

    # Descriptive statistics of the two percentage columns for this split
    percentage_columns = ['bio_ner_no_label_percentage', 'bio_ner_unique_no_label_percentage']
    overall_stats = df[percentage_columns].describe()

    # Report under the split's name
    print(f"\n{name} Overall Statistics:")
    print(overall_stats)


train_long_bio Overall Statistics:
       bio_ner_no_label_percentage  bio_ner_unique_no_label_percentage
count                  1102.000000                         1102.000000
mean                      4.734891                            3.617332
std                       2.686774                            2.223814
min                       0.000000                            0.000000
25%                       2.772500                            1.950000
50%                       4.600000                            3.455000
75%                       6.397500                            4.877500
max                      15.360000                           14.290000

val_long_bio Overall Statistics:
       bio_ner_no_label_percentage  bio_ner_unique_no_label_percentage
count                    96.000000                           96.000000
mean                      4.665104                            3.561042
std                       2.463472                            2.088728
min    