### CIS 5930 - Project 6 - BERT Implementation
#### James Gray <br>Chashi Mahiul Islam <br>Renata Schama <br>Yagna Sree Bhavani Pendala

### IMPORTING LIBRARIES

In [1]:
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModel, pipeline

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

cuda_ = "cuda:0"
device = torch.device(cuda_ if torch.cuda.is_available() else "cpu")

Using cuda device


### IMPORTING DATA POST-PROCESSING

In [3]:
file = '/Users/jelvi/OneDrive/SCHOOL/2023 Q1 Spring/CIS 5930 Projects in Data Science/Project 6/cleaned_radiology.csv'

df = pd.read_csv(file, header=1)
array = df.to_numpy()

In [4]:
X = array[:, 7]

### IMPLEMENTING CLINICAL-BERT
https://arxiv.org/pdf/1904.05342.pdf#:~:text=create%20summaries%20of%20corpora.,task%20of%20hospital%20readmission%20prediction <br>
https://huggingface.co/emilyalsentzer/Bio_ClinicalBERT <br>
https://arxiv.org/abs/1901.08746v4

In [5]:
# Load pre-trained model
model_name = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

# Patient notes to summarize
patient_notes = """
Patient presented with complaints of chest pain and shortness of breath. 
ECG showed ST segment elevation in leads V1 to V4. 
The patient was diagnosed with acute myocardial infarction and was treated with thrombolytic therapy. 
Patient's condition improved over the next few days and was discharged on day 5 with follow-up appointments scheduled.
"""
patient_notes = X[100]

# Generate summary using the pre-trained model
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)
summary = summarizer(patient_notes, max_length=100, min_length=10, do_sample=False)

print("Original patient notes: \n", patient_notes)
print("\n")
print("Biomedical summary: \n", summary)

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
The model 'BertModel' is not supported for summarization. Supported models are ['BartForConditionalGeneration', 'BigBirdPegasusForConditional

TypeError: The current model class (BertModel) is not compatible with `.generate()`, as it doesn't have a language model head. Please use one of the following classes instead: {'BertLMHeadModel'}

### EXAMPLE MODIFICATION CODE FOR CLINICAL-BERT

In [None]:
# Load the Clinical-BERT model and tokenizer
model_name = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Add a language model head for text summarization
class SummaryHead(torch.nn.Module):
    def __init__(self, input_dim, output_dim):
        super(SummaryHead, self).__init__()
        self.linear = torch.nn.Linear(input_dim, output_dim)
    
    def forward(self, inputs):
        return self.linear(inputs)

# Replace the model's classifier with the summary head
input_dim = model.config.hidden_size
output_dim = 1  # or however many summary tokens you want to generate
summary_head = SummaryHead(input_dim, output_dim)
model.classifier = summary_head

# Fine-tune the modified model on a text summarization dataset
train_dataset = ... # load your training dataset here
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
criterion = ... # define your loss function here
for epoch in range(num_epochs):
    for batch in train_dataset:
        inputs, targets = batch
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

# Evaluate the modified model on a held-out test set
test_dataset = ... # load your test dataset here
for batch in test_dataset:
    inputs, targets = batch
    outputs = model(inputs)
    # compute ROUGE and BLEU metrics here


### IMPLEMENTING DISTILBART-CNN-12-6 MODEL

In [None]:
tokenizer = AutoTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")
model = AutoModelForSeq2SeqLM.from_pretrained("sshleifer/distilbart-cnn-12-6")

# Example patient medical notes
patient_notes = """
Patient presented with complaints of chest pain and shortness of breath. 
ECG showed ST segment elevation in leads V1 to V4. 
The patient was diagnosed with acute myocardial infarction and was treated with thrombolytic therapy. 
Patient's condition improved over the next few days and was discharged on day 5 with follow-up appointments scheduled.
"""
patient_notes = X[100]

# Tokenize patient notes
inputs = tokenizer(patient_notes, return_tensors="pt")

# Generate summary using BART
summary_ids = model.generate(inputs["input_ids"], 
                              num_beams=10, 
                              max_length=100, 
                              early_stopping=True)
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Print summary
print("Original patient notes: \n", patient_notes)
print("\n")
print("DISTILBART summary: \n", summary)

### IMPLEMENTING FACEBOOK BART MODEL

In [None]:
# Load pre-trained BART model
model_name = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Patient notes to summarize
patient_notes = """
Patient presented with complaints of chest pain and shortness of breath. 
ECG showed ST segment elevation in leads V1 to V4. 
The patient was diagnosed with acute myocardial infarction and was treated with thrombolytic therapy. 
Patient's condition improved over the next few days and was discharged on day 5 with follow-up appointments scheduled.
"""
patient_notes = X[100]

# Generate summary using the pre-trained BART model
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)
summary = summarizer(patient_notes, max_length=100, min_length=10, do_sample=False)

# Print summary
print("Original patient notes: \n", patient_notes)
print("\n")
print("Facebook BART summary: \n", summary[0]['summary_text'])

### IMPLEMENTING FINE-TUNED CONVERSATIONAL BART

In [None]:
# Load pre-trained BART model
tokenizer = AutoTokenizer.from_pretrained("kabita-choudhary/finetuned-bart-for-conversation-summary")
model = AutoModelForSeq2SeqLM.from_pretrained("kabita-choudhary/finetuned-bart-for-conversation-summary")

# Patient notes to summarize
patient_notes = """
Patient presented with complaints of chest pain and shortness of breath. 
ECG showed ST segment elevation in leads V1 to V4. 
The patient was diagnosed with acute myocardial infarction and was treated with thrombolytic therapy. 
Patient's condition improved over the next few days and was discharged on day 5 with follow-up appointments scheduled.
"""
patient_notes = X[100]

# Generate summary using the pre-trained BART model
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)
summary = summarizer(patient_notes, max_length=100, min_length=10, do_sample=False)

# Print summary
print("Original patient notes: \n", patient_notes)
print("\n")
print("Conversational BART summary: \n", summary[0]['summary_text'])