# Step 1: Upload Dataframes

Use the libraries below to parse .csv and .bib files

In [105]:
import pandas as pd
import bibtexparser # Used to read .bib files
import re
pd.set_option("display.max_columns", None)

# Reading to Dataframes

In [106]:
ieee_df = pd.read_csv("./medical_dialogue_summarization/ieee_medical_dialogue_summarization.csv")
print(f"Columns: {ieee_df.columns}, \n\nLength: {len(ieee_df)}")

Columns: Index(['Document Title', 'Authors', 'Author Affiliations', 'Publication Title',
       'Date Added To Xplore', 'Publication Year', 'Volume', 'Issue',
       'Start Page', 'End Page', 'Abstract', 'ISSN', 'ISBNs', 'DOI',
       'Funding Information', 'PDF Link', 'Author Keywords', 'IEEE Terms',
       'Mesh_Terms', 'Article Citation Count', 'Patent Citation Count',
       'Reference Count', 'License', 'Online Date', 'Issue Date',
       'Meeting Date', 'Publisher', 'Document Identifier'],
      dtype='object'), 

Length: 25


### **NOTE:** Springer has NO abstracts. Get them manually or use an LLM!

In [107]:
springer_df = pd.read_csv("./medical_dialogue_summarization/springer_medical_dialogue_summarization.csv")
print(f"Columns: {springer_df.columns}, \n\nLength: {len(springer_df)}")

Columns: Index(['Item Title', 'Publication Title', 'Book Series Title',
       'Journal Volume', 'Journal Issue', 'Item DOI', 'Authors',
       'Publication Year', 'URL', 'Content Type'],
      dtype='object'), 

Length: 90


In [None]:
# springer_df.to_csv('springer_no_abstracts.csv')

### **NOTE:** Pubmed Abstracts come in a seperate .txt file. Get them manually or use an LLM!

In [109]:
pubmed_df = pd.read_csv("./medical_dialogue_summarization/pubmed_medical_dialogue_summarization.csv")
print(f"Columns: {pubmed_df.columns}, \n\nLength: {len(pubmed_df)}")

Columns: Index(['PMID', 'Title', 'Authors', 'Citation', 'First Author', 'Journal/Book',
       'Publication Year', 'Create Date', 'PMCID', 'NIHMS ID', 'DOI'],
      dtype='object'), 

Length: 53


In [110]:
pubmed_abstracts = pd.read_csv("./medical_dialogue_summarization/pubmed_abstracts_medical_dialogue_summarization_v2.csv")

# Merge on PMID
pubmed_df = pubmed_df.merge(pubmed_abstracts.rename(columns={"Abstract Text": "abstract"}), 
                             on="PMID", how="left")


In [111]:
# Read the .bib file
with open("./medical_dialogue_summarization/acm_medical_dialogue_summarization.bib", encoding="utf-8") as bibtex_file:
    bib_database = bibtexparser.load(bibtex_file)

# Convert entries to a pandas DataFrame
acm_df = pd.DataFrame(bib_database.entries)
print(f"Columns: {acm_df.columns}, \n\nLength: {len(acm_df)}")

Columns: Index(['keywords', 'numpages', 'articleno', 'month', 'journal', 'abstract',
       'doi', 'url', 'issn', 'number', 'volume', 'address', 'publisher',
       'issue_date', 'year', 'title', 'author', 'ENTRYTYPE', 'ID', 'note',
       'pages'],
      dtype='object'), 

Length: 21


In [112]:
with open("./medical_dialogue_summarization/elsevier_medical dialogue summarization_L.bib", encoding="utf-8") as bibtex_file:
    bib_database = bibtexparser.load(bibtex_file)
elsevier_df_l = pd.DataFrame(bib_database.entries)

with open("./medical_dialogue_summarization/elsevier_medical dialogue summarization_S.bib", encoding="utf-8") as bibtex_file:
    bib_database = bibtexparser.load(bibtex_file)
elsevier_df_s = pd.DataFrame(bib_database.entries)

elsevier_df = pd.concat([elsevier_df_l, elsevier_df_s])
print(f"Columns: {elsevier_df.columns}, \n\nLength: {len(elsevier_df)}")

Columns: Index(['abstract', 'keywords', 'author', 'url', 'doi', 'issn', 'year', 'pages',
       'volume', 'journal', 'title', 'ENTRYTYPE', 'ID', 'number', 'note'],
      dtype='object'), 

Length: 118


# Assessing Duplicate Values with Title

Using title because the formats of DOI are all different :/

In [113]:
# 1. Create a list of tuples: (dataframe_name, title_series)
df_sources = [
    ("ACM", acm_df["title"]),
    ("Elsevier", elsevier_df["title"]),
    ("IEEE", ieee_df["Document Title"]),
    ("PubMed", pubmed_df["Title"]),
    ("Springer", springer_df["Item Title"])
]

# 2. Create a combined DataFrame with source information
combined_data = []
for source_name, title_series in df_sources:
    # Create a temporary DataFrame with title and source
    temp_df = pd.DataFrame({
        'title': title_series.str.lower(),  # Normalize to lowercase
        'source': source_name
    })
    combined_data.append(temp_df)

# Combine all into one DataFrame
all_titles_df = pd.concat(combined_data, ignore_index=True)

# 3. Remove rows with missing titles
all_titles_df = all_titles_df.dropna(subset=['title'])

# 4. Find titles that appear more than once
title_counts = all_titles_df['title'].value_counts()
duplicate_titles = title_counts[title_counts > 1].index.tolist()

# 5. Filter to show only duplicates and group by title
if not duplicate_titles:
    print("No duplicate titles found across the DataFrames.")
else:
    duplicates_df = all_titles_df[all_titles_df['title'].isin(duplicate_titles)]
    
    print(f"Found {len(duplicate_titles)} duplicate titles across datasets")
    print(f"Total duplicate entries: {len(duplicates_df)}")
    print("=" * 80)
    # Group by title and show which sources contain each duplicate
    for title in sorted(duplicate_titles):
        title_data = duplicates_df[duplicates_df['title'] == title]
        sources = title_data['source'].tolist()
        count = len(sources)
        
        print(f"\nTitle: {title}")
        print(f"   Total occurrences: {count}")
        print(f"   Found in: {', '.join(sources)}")


Found 9 duplicate titles across datasets
Total duplicate entries: 19

Title: adapted large language models can outperform medical experts in clinical text summarization
   Total occurrences: 2
   Found in: PubMed, Springer

Title: assessing the effectiveness of automatic speech recognition technology in emergency medicine settings: a comparative study of four ai-powered engines
   Total occurrences: 3
   Found in: PubMed, PubMed, Springer

Title: evaluating the performance of artificial intelligence-based speech recognition for clinical documentation: a systematic review
   Total occurrences: 2
   Found in: PubMed, Springer

Title: expert evaluation of large language models for clinical dialogue summarization
   Total occurrences: 2
   Found in: PubMed, Springer

Title: exploring the potential of chatgpt in medical dialogue summarization: a study on consistency with human preferences
   Total occurrences: 2
   Found in: PubMed, Springer

Title: medchatzh: a tuning llm for traditional c

# Merging & Standardization

In [114]:
# we want ID, title, abstract, library, authors, doi, journal, date (year), (keywords 3/5)

In [115]:
# ACM
acm_df["library"] = "acm"
acm_df = acm_df.rename(columns={
    "ID": "id", "title": "title", "abstract": "abstract", "author": "authors",
    "doi": "doi", "journal": "journal"
})
acm_df["date"] = acm_df["year"].astype(str) + "-" + acm_df["month"].astype(str)
acm_df = acm_df[["id", "title", "abstract", "library", "authors", "doi", "journal", "date"]]

# Elsevier
elsevier_df["library"] = "elsevier"
elsevier_df = elsevier_df.rename(columns={
    "ID": "id", "title": "title", "abstract": "abstract", "author": "authors",
    "doi": "doi", "journal": "journal", "year": "date"
})
elsevier_df["date"] = elsevier_df["date"].astype(str)
elsevier_df = elsevier_df[["id", "title", "abstract", "library", "authors", "doi", "journal", "date"]]

# IEEE
ieee_df["library"] = "ieee"
ieee_df = ieee_df.rename(columns={
    "ISBNs": "id", "Document Title": "title", "Abstract": "abstract", "Authors": "authors",
    "DOI": "doi", "Publication Title": "journal", "Online Date": "date"
})
ieee_df = ieee_df[["id", "title", "abstract", "library", "authors", "doi", "journal", "date"]]

# PubMed
pubmed_df["library"] = "pubmed"
pubmed_df = pubmed_df.rename(columns={
    "PMID": "id", "Title": "title", "abstract": "abstract", "Authors": "authors",
    "DOI": "doi", "Journal/Book": "journal", "Create Date": "date"
})
pubmed_df = pubmed_df[["id", "title", "abstract", "library", "authors", "doi", "journal", "date"]]



In [116]:
combined_df = pd.concat([acm_df, elsevier_df, ieee_df, pubmed_df], ignore_index=True)
print(len(combined_df))

217


In [117]:
combined_df.head()

Unnamed: 0,id,title,abstract,library,authors,doi,journal,date
0,10.1145/3620675,Multi-aspect Understanding with Cooperative Gr...,Medical dialogue information extraction is an ...,acm,"Lin, Rui and Fan, Jing and Wu, Haifeng",10.1145/3620675,ACM Trans. Intell. Syst. Technol.,2023-November
1,10.1145/3712300,PALLM: Evaluating and Enhancing PALLiative Car...,Effective patient-provider communication is cr...,acm,"Wang, Zhiyuan and Yuan, Fangxu and LeBaron, Vi...",10.1145/3712300,ACM Trans. Comput. Healthcare,2025-January
2,10.1145/3731445,A Systematic Survey of Text Summarization: Fro...,Text summarization research has undergone seve...,acm,"Zhang, Haopeng and Yu, Philip S. and Zhang, Ji...",10.1145/3731445,ACM Comput. Surv.,2025-June
3,10.1145/3709365,MedInsight: A Multi-Source Context Augmentatio...,Providing contextual and comprehensive medical...,acm,"Neupane, Subash and Mitra, Shaswata and Mittal...",10.1145/3709365,ACM Trans. Comput. Healthcare,2025-April
4,10.1145/3715069,MedProm: Bridging Dialogue Gaps in Healthcare ...,"In medical dialogue systems, recent advancemen...",acm,"Varshney, Deeksha and Behera, Niranshu and Kat...",10.1145/3715069,ACM Trans. Comput. Healthcare,2025-January


# Assessing Incorrect Search Results

In [118]:
medical_terms = ["medical dialogue summarization", "medical dialogue", 
                 "clinical conversation summarization", "doctor patient dialogue", 
                 "medical transcription", "clinical conversation", "clinical dialogue"]

llm_terms = ["reasoning", "transfomer", "genarative ai", "llm", "large language models", 
             "llms", "gpt", "natural language processing", "nlp", "claude", "gemini", 
             "deepseek", "chatgpt", "transformer model", "bert", "llama", "mistral"]

# Extract individual words from each phrase
medical_words = set()
for phrase in medical_terms:
    medical_words.update(phrase.split())

llm_words = set()
for phrase in llm_terms:
    llm_words.update(phrase.split())

# Or combine into one set
all_words = medical_words | llm_words

# More concise version:
all_words = set(' '.join(medical_terms + llm_terms).split())


In [119]:
### IF NONE OF THE WORDS ARE EVEN IN TITLE, GET RID

lower_titles = combined_df["title"].str.lower()

# Create a regex pattern that matches any of the terms (case-insensitive)
pattern = '|'.join(all_words)

# Find records where NONE of the terms appear
filtered_titles = lower_titles[~lower_titles.str.contains(pattern, case=False, na=False, regex=True)]
print(f"{len(filtered_titles)} Incorrect Search Results")

21 Incorrect Search Results


In [120]:
combined_df = combined_df[combined_df["title"].str.contains(pattern, case=False, na=False, regex=True)]

In [121]:
len(combined_df)

196

In [122]:
### NEEDS TO HAVE AT LEAST 1 LLM WORD AND 1 HEALTHCARE WORD IN title
### DO THIS FOR ABSTRACT OR TITLE??? OR BOTH???

# Create regex patterns (escaped + case-insensitive)
medical_pattern = r'(' + '|'.join(map(re.escape, medical_words)) + r')'
llm_pattern = r'(' + '|'.join(map(re.escape, llm_words)) + r')'

lower_titles = combined_df["title"].str.lower()

# Boolean masks
has_medical = lower_titles.str.contains(medical_pattern, case=False, na=False, regex=True)
has_llm = lower_titles.str.contains(llm_pattern, case=False, na=False, regex=True)

# titles missing one side → these are incorrect results
incorrect_matches = lower_titles[~(has_medical & has_llm)]

print(f"{len(incorrect_matches)} Incorrect Search Results")


114 Incorrect Search Results


  has_medical = lower_titles.str.contains(medical_pattern, case=False, na=False, regex=True)
  has_llm = lower_titles.str.contains(llm_pattern, case=False, na=False, regex=True)


In [123]:
incorrect_matches[9]

'commsense: a wearable sensing computational framework for evaluating patient-clinician interactions'

In [124]:
# Keep only titles that have BOTH → these are correct matches
combined_df = combined_df[has_medical & has_llm]
len(combined_df)

82

In [125]:
### BOTH ABSTRACTS AND TITLE
lower_titles = combined_df["title"].str.lower()
lower_abstracts = combined_df["abstract"].str.lower()

has_medical = (
    lower_titles.str.contains(medical_pattern, na=False, regex=True) |
    lower_abstracts.str.contains(medical_pattern, na=False, regex=True)
)

has_llm = (
    lower_titles.str.contains(llm_pattern, na=False, regex=True) |
    lower_abstracts.str.contains(llm_pattern, na=False, regex=True)
)

combined_df = combined_df[has_medical & has_llm]

  lower_titles.str.contains(medical_pattern, na=False, regex=True) |
  lower_abstracts.str.contains(medical_pattern, na=False, regex=True)
  lower_titles.str.contains(llm_pattern, na=False, regex=True) |
  lower_abstracts.str.contains(llm_pattern, na=False, regex=True)


In [126]:
print(len(combined_df))

82


In [127]:
for i in incorrect_matches:
    print(i, "\n")

multi-aspect understanding with cooperative graph attention networks for medical dialogue information extraction 

shennongmgs: an llm-based chinese medication guidance system 

commsense: a wearable sensing computational framework for evaluating patient-clinician interactions 

drhouse: an llm-empowered diagnostic reasoning system through harnessing outcomes from sensor data and expert knowledge 

combination of loss-based active learning and semi-supervised learning for recognizing entities in chinese electronic medical records 

taxonomy of abstractive dialogue summarization: scenarios, approaches, and future directions 

document-level relation extraction with context guided mention integration and inter-pair reasoning 

medical question summarization with entity-driven contrastive learning 

let topic flow: a unified topic-guided segment-wise dialogue summarization framework 

conco-ernie: complex user intent detect model for smart healthcare cognitive bot 

enhancing conversation

In [128]:
# assuming your dataframe is called df and columns are: 'title' and 'library'

# create a normalized comparison key
combined_df['title_norm'] = combined_df['title'].str.lower().str.strip()

# group and filter to only duplicated titles
dupes = combined_df[combined_df.duplicated('title_norm', keep=False)]

# now show which libraries the duplicates belong to
result = dupes.groupby('title_norm')['library'].unique().reset_index()

print(result)


                                          title_norm             library
0  medka: a knowledge graph-augmented approach to...  [elsevier, pubmed]


In [129]:
# DROP DUPLICATES
combined_df['title_norm'] = combined_df['title'].str.lower().str.strip()

# find duplicated keys
dupe_keys = combined_df['title_norm'][combined_df['title_norm'].duplicated(keep=False)]

# remove every row with those keys
combined_df = combined_df[~combined_df['title_norm'].isin(dupe_keys)]

combined_df = combined_df.drop(columns='title_norm')


In [130]:
len(combined_df)

80

In [131]:
combined_df.iloc[55]["title"]

'Enhancing Chinese Medical Diagnostic Chatbot through Supervised Fine-Tuning of Large Language Models'

In [132]:
combined_df[combined_df["journal"] == "Sustainable Energy Technologies and Assessments"]["title"]

61    A role distinguishing Bert model for medical d...
Name: title, dtype: object

In [None]:
# combined_df.to_csv("medical_dialogue_summarization.csv")

In [134]:
# This exact method leaves me with 80 results. I think this is a great number of papers to work with!

In [135]:
("medical imaging recognition" OR "medical image analysis" OR "medical imaging" OR "radiology" OR "x ray" OR "x-ray" OR "ct scan" OR "mri" OR "ultrasound" OR "radiograph")
AND
("transformer" OR "genarative ai" OR "llm" OR "large language models" OR "llms" OR "gpt" OR "gemini" OR "claude" OR "deepseek" OR "chatgpt" OR "transformer model" OR "bert" OR "llama" OR "mistral" OR "vision language model" OR "multimodal model" OR "multimodal transformer" OR "foundation model" OR "multimodal ai")

SyntaxError: invalid syntax. Perhaps you forgot a comma? (4116687004.py, line 1)

In [None]:
("medical imaging" OR "radiology" OR "ct scan" OR "mri")
AND
("transformer" OR "large language models" OR "gpt" OR "multimodal model")


In [None]:
(("medical imaging recognition"[tiab] OR "medical image analysis"[tiab] OR "medical imaging"[tiab] OR "radiology"[tiab] OR "x ray"[tiab] OR "x-ray"[tiab] OR "ct scan"[tiab] OR "mri"[tiab] OR "ultrasound"[tiab] OR "radiograph"[tiab])
AND
("transformer"[tiab] OR "generative ai"[tiab] OR "llm"[tiab] OR "large language models"[tiab] OR "llms"[tiab] OR "gpt"[tiab] OR "gemini"[tiab] OR "claude"[tiab] OR "deepseek"[tiab] OR "chatgpt"[tiab] OR "transformer model"[tiab] OR "bert"[tiab] OR "llama"[tiab] OR "mistral"[tiab] OR "vision language model"[tiab] OR "multimodal model"[tiab] OR "multimodal transformer"[tiab] OR "foundation model"[tiab] OR "multimodal ai"[tiab]))


In [None]:
(("medical imaging recognition"[tiab] OR "medical image analysis"[tiab] OR "medical imaging"[tiab] OR "radiology"[tiab] OR "x ray"[tiab] OR "x-ray"[tiab] OR "ct scan"[tiab] OR "mri"[tiab] OR "ultrasound"[tiab] OR "radiograph"[tiab]) 
AND 
("transformer"[tiab] OR "generative ai"[tiab] OR "llm"[tiab] OR "large language models"[tiab] OR "llms"[tiab] OR "gpt"[tiab] OR "gemini"[tiab] OR "claude"[tiab] OR "deepseek"[tiab] OR "chatgpt"[tiab] OR "transformer model"[tiab] OR "bert"[tiab] OR "llama"[tiab] OR "mistral"[tiab] OR "vision language model"[tiab] OR "multimodal model"[tiab] OR "multimodal transformer"[tiab] OR "foundation model"[tiab] OR "multimodal ai"[tiab]))
AND "journal article"[pt]