# **Fine-tuned LLM Comparison Notebook 4**

This notebook compares results of the LoRA (Low Rank Adaptation) fine-tuned alpaca-native (fine-tuned on custom text summarisation data) to the base pretrained Flan-T5-small, Flan-T5-base, Alpaca-native and the Vicuna-13b-v1.3 LLMs.

Few-shots prompting is used for all LLMs in this notebook to generate summaries

In [1]:
import os, warnings, torch, gc
import pandas as pd
from tqdm.notebook import tqdm
from transformers import (
    AutoTokenizer, 
    LlamaTokenizer,
    AutoModelForCausalLM,
    LlamaForCausalLM,
    AutoModelForSeq2SeqLM, 
    AutoConfig,
    pipeline
)
from datasets import load_dataset
from datasets.dataset_dict import DatasetDict
from peft import PeftConfig, PeftModel
from rouge import Rouge
from sentence_transformers import SentenceTransformer
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain.schema import Document
from sklearn.metrics.pairwise import cosine_similarity
from typing import Dict, Iterable, Union, Any, Optional

warnings.filterwarnings("ignore")

os.environ["EMBEDDINGS_MODEL"] = "all-MiniLM-L12-v2"
os.environ["MAX_TOKENS"] = "2048"
os.environ["DEVICE"] = "cuda:0" if torch.cuda.is_available() else "cpu"
os.environ["ALPACA_NATIVE_ID"] = "alpaca-native_finetuned_results"
os.environ["DATASET_PATH"] = "data/doc_summary_data"
os.environ["FLAN_T5_SMALL_LLM"] = "google/flan-t5-small"
os.environ["FLAN_T5_BASE_LLM"] = "google/flan-t5-base"
os.environ["VICUNA_LLM"] = "lmsys/vicuna-13b-v1.3"

In [2]:
def run_on_test_data(
    model: Union[AutoModelForSeq2SeqLM, AutoModelForCausalLM, LlamaForCausalLM], 
    tokenizer: Union[AutoTokenizer, LlamaTokenizer], 
    dataset_key: str="test",
    task="text2text-generation",
    n_docs: int = 5,
    log_summary: bool=False,
    log_metrics: bool=False,
    summary_chain_kwargs: Dict[str, Any]={},
    **kwargs):
    
    # switch model to eval mode
    model.eval()
    
    # Define model pipeline for inference with langchain
    kwargs = {**dict(temperature=0.1, top_p=0.15, top_k=0, repetition_penalty=1.1), **kwargs}

    # define model pipeline
    hgf_pipeline = pipeline(
        task=task, 
        model=model, 
        tokenizer=tokenizer,
        max_length=int(os.environ["MAX_TOKENS"]),
        **kwargs
    )
    llm = HuggingFacePipeline(pipeline=hgf_pipeline)
    
    # Define Summary chain
    summary_chain_kwargs = {"chain_type": "map_reduce", **summary_chain_kwargs}
    summary_chain = load_summarize_chain(llm, **summary_chain_kwargs)
    
    # Load dataset
    dataset: DatasetDict = load_dataset(path=os.environ["DATASET_PATH"])
    
    # Generate Summaries and Measure Performance (Rouge Metric and Cosine Similarity Metric)
    rouge = Rouge()  # rouge metric object
    embeddings_model = SentenceTransformer(os.environ["EMBEDDINGS_MODEL"]) # embeddings model object
    embeddings_model.to(os.environ["DEVICE"])
    document_ids = dataset[dataset_key]["id"][:n_docs]
    documents = dataset[dataset_key]["document"][:n_docs]
    target_summaries = dataset[dataset_key]["summary"][:n_docs]
    _zipped = zip(document_ids, documents, target_summaries)
    metrics_values: Iterable[Dict[str, Any]] = []

    for i, (document_id, document, target_summary) in enumerate(_zipped):
        document = Document(page_content=document)
        try:
            generated_summary = summary_chain.run([document])
        except ValueError as e:
            print(f"Error summarizing document-{i+1}: {e}")
            continue
            
        if log_summary:
            print(f"DOCUMENT {document_id}: {document}\n")
            print(f"GENERATED SUMARY: {generated_summary}\n")
            print(f"TARGET SUMARY: {target_summary}\n")

        generated_summary_embeddings, target_summary_embeddings = (
            embeddings_model.encode(generated_summary).reshape(1, -1),
            embeddings_model.encode(target_summary).reshape(1, -1)
        )
        cos_similarity = cosine_similarity(target_summary_embeddings, generated_summary_embeddings)
        rouge_scores = rouge.get_scores(generated_summary, target_summary)
        if log_metrics:
            print(f"Cosine similarity for summary {i+1}:", cos_similarity[0][0], "\n")
            print(f"Rouge scores for summary {i+1}:", rouge_scores[0], "\n")
            
        if log_metrics or log_summary:
            print("-"*120)
            print("\n")
        _metric = dict(semantic_similarity=cos_similarity, rouge_scores=rouge_scores)
        metrics_values.append(_metric)
        
    return metrics_values

In [3]:
INFERENCE_DATASET_KEY = "test"
N_INFERENCE_DOCS = 10
LOG_SUMMARY = True
LOG_METRICS = True
DELETE_LLM_AFTER_USE = True

## Define Few Shots Prompts

In [4]:
PROMPT_TEMPLATE = """:

Using the following document and summarisation pair examples below:

EXAMPLES:

Example 1:
-----------------------------------
"document": "Medical Center-Lakeside, Lubbock, Texas, 79410, United States|Texas Tech University Health Sciences \
Center-Lubbock, Lubbock, Texas, 79430, United States|MD Anderson Regional Care Center-Bay Area, Nassau Bay, Texas, \
77058, United States|University of Texas Health Science Center at San Antonio, San Antonio, Texas, 78229, United States|MD\
Anderson Regional Care Center-Sugar Land, Sugar Land, Texas, 77478, United States|MD Anderson Regional Care Center-The \
Woodlands, The Woodlands, Texas, 77384, United States|American Fork Hospital / Huntsman Intermountain Cancer Center, \
American Fork, Utah, 84003, United States|Sandra L Maxwell Cancer Center, Cedar City, Utah, 84720, United States|Logan\
Regional Hospital, Logan, Utah, 84321, United States|Intermountain Medical Center, Murray, Utah, 84107, United \
States|McKay-Dee Hospital Center, Ogden, Utah, 84403, United States|Utah Valley Regional Medical Center, Provo, Utah, \
84604, United States|Dixie Medical Center Regional Cancer Center, Saint George, Utah, 84770, United",

"summary": "There are medical and cancer centers in Texas and Utah in the US."


Example 2:
-----------------------------------
"document": "Wisconsin, 54449, United States|Marshfield Medical Center, Marshfield, Wisconsin, 54449, United \
States|Community Memorial Hospital, Menomonee Falls, Wisconsin, 53051, United States|Aurora Cancer Care-Milwaukee, \
Milwaukee, Wisconsin, 53209, United States|Aurora Saint Luke's Medical Center, Milwaukee, Wisconsin, 53215, United \
States|Froedtert and the Medical College of Wisconsin, Milwaukee, Wisconsin, 53226, United States|Aurora Sinai Medical Center, \
Milwaukee, Wisconsin, 53233, United States|Marshfield Clinic-Minocqua Center, Minocqua, Wisconsin, 54548, United \
States|ProHealth D N Greenwald Center, Mukwonago, Wisconsin, 53149, United States|Cancer Center of Western Wisconsin, \
New Richmond, Wisconsin, 54017, United States|ProHealth Oconomowoc Memorial Hospital, Oconomowoc, Wisconsin, 53066, \
United States|Vince Lombardi Cancer Clinic - Oshkosh, Oshkosh, Wisconsin, 54904, United States|Aurora Cancer Care-Racine, \
Racine, Wisconsin, 53406, United States|Marshfield Clinic at James Beck Cancer Center, Rhinelander,",

"summary": "The list provides information about medical centers and cancer clinics in different cities in Wisconsin, USA."


Generate a concise summary of the text below:

"{text}"


SUMMARY:"""

PROMPT = PromptTemplate.from_template(PROMPT_TEMPLATE)

## Evaluate Base Pretrained Alpaca Native

In [5]:
alpaca_config = PeftConfig.from_pretrained(os.environ["ALPACA_NATIVE_ID"])

# load alpaca base model and tokenizer
alpaca_model = LlamaForCausalLM.from_pretrained(
    alpaca_config.base_model_name_or_path,
    load_in_8bit=True,
    device_map="auto",
)

alpaca_tokenizer = LlamaTokenizer.from_pretrained(
    alpaca_config.base_model_name_or_path,
    max_length=os.environ["MAX_TOKENS"],
)

lora_alpaca_native_performance = run_on_test_data(
    alpaca_model, 
    alpaca_tokenizer, 
    INFERENCE_DATASET_KEY, 
    task="text-generation", 
    n_docs=N_INFERENCE_DOCS,
    log_summary=LOG_SUMMARY,
    log_metrics=LOG_METRICS,
    summary_chain_kwargs={"combine_prompt": PROMPT},
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

You are using the legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565
Token indices sequence length is longer than the specified maximum sequence length for this model (1163 > 512). Running this sequence through the model will result in indexing errors


DOCUMENT 4500: page_content="States|Helen F Graham Cancer Center, Newark, Delaware, 19713, United States|University of Florida Health Science Center - Gainesville, Gainesville, Florida, 32610, United States|Memorial Regional Hospital/Joe DiMaggio Children's Hospital, Hollywood, Florida, 33021, United States|Mayo Clinic in Florida, Jacksonville, Florida, 32224-9980, United States|Miami Cancer Institute, Miami, Florida, 33176, United States|Orlando Health Cancer Institute, Orlando, Florida, 32806, United States|Memorial Hospital West, Pembroke Pines, Florida, 33028, United States|Emory University Hospital Midtown, Atlanta, Georgia, 30308, United States|Piedmont Hospital, Atlanta, Georgia, 30309, United States|Emory University Hospital/Winship Cancer Institute, Atlanta, Georgia, 30322, United States|Emory Saint Joseph's Hospital, Atlanta, Georgia, 30342, United States|John B Amos Cancer Center, Columbus, Georgia, 31904, United States|CTCA at Southeastern Regional Medical Center, Newnan, G

## Evaluate LoRA Fine-tuned Alpaca Native

In [None]:
# load LoRA peft alpaca model
alpaca_model = PeftModel.from_pretrained(
    alpaca_model, 
    os.environ["ALPACA_NATIVE_ID"],
    device_map="auto"
)

alpaca_native_performance = run_on_test_data(
    alpaca_model, 
    alpaca_tokenizer, 
    INFERENCE_DATASET_KEY, 
    task="text-generation", 
    n_docs=N_INFERENCE_DOCS,
    log_summary=LOG_SUMMARY,
    log_metrics=LOG_METRICS,
    summary_chain_kwargs={"combine_prompt": PROMPT},
)

if DELETE_LLM_AFTER_USE:
    #model.to(torch.device("cpu"))
    del alpaca_model, alpaca_tokenizer
    gc.collect()
    torch.cuda.empty_cache()

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PLBartForCausalLM', 'ProphetNetForCausalLM', 'QDQBertLMHeadModel', 'ReformerModelWithLMHead', 'RemBertForCausal

DOCUMENT 4500: page_content="States|Helen F Graham Cancer Center, Newark, Delaware, 19713, United States|University of Florida Health Science Center - Gainesville, Gainesville, Florida, 32610, United States|Memorial Regional Hospital/Joe DiMaggio Children's Hospital, Hollywood, Florida, 33021, United States|Mayo Clinic in Florida, Jacksonville, Florida, 32224-9980, United States|Miami Cancer Institute, Miami, Florida, 33176, United States|Orlando Health Cancer Institute, Orlando, Florida, 32806, United States|Memorial Hospital West, Pembroke Pines, Florida, 33028, United States|Emory University Hospital Midtown, Atlanta, Georgia, 30308, United States|Piedmont Hospital, Atlanta, Georgia, 30309, United States|Emory University Hospital/Winship Cancer Institute, Atlanta, Georgia, 30322, United States|Emory Saint Joseph's Hospital, Atlanta, Georgia, 30342, United States|John B Amos Cancer Center, Columbus, Georgia, 31904, United States|CTCA at Southeastern Regional Medical Center, Newnan, G

Token indices sequence length is longer than the specified maximum sequence length for this model (2336 > 1024). Running this sequence through the model will result in indexing errors
Input length of input_ids is 2515, but `max_length` is set to 2048. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


DOCUMENT 4503: page_content='Bath, Maine, 04530, United States|MaineHealth/SMHC Cancer Care and Blood Disorders-Biddeford, Biddeford, Maine, 04005, United States|Maine Medical Center-Bramhall Campus, Portland, Maine, 04102, United States|MaineHealth Cancer Care Center of York County, Sanford, Maine, 04073, United States|MaineHealth/SMHC Cancer Care and Blood Disorders-Sanford, Sanford, Maine, 04073, United States|Maine Medical Center- Scarborough Campus, Scarborough, Maine, 04074, United States|University of Maryland/Greenebaum Cancer Center, Baltimore, Maryland, 21201, United States|Greater Baltimore Medical Center, Baltimore, Maryland, 21204, United States|UM Upper Chesapeake Medical Center, Bel Air, Maryland, 21014, United States|Central Maryland Radiation Oncology in Howard County, Columbia, Maryland, 21044, United States|Lahey Hospital and Medical Center, Burlington, Massachusetts, 01805, United States|Lowell General Hospital, Lowell, Massachusetts, 01854, United States|University

Token indices sequence length is longer than the specified maximum sequence length for this model (1049 > 1024). Running this sequence through the model will result in indexing errors


DOCUMENT 4507: page_content='of Rochester, Rochester, New York, 14642, United States|Dickstein Cancer Treatment Center, White Plains, New York, 10601, United States|Rex Hematology Oncology Associates-Cary, Cary, North Carolina, 27518, United States|UNC Lineberger Comprehensive Cancer Center, Chapel Hill, North Carolina, 27599, United States|Duke University Medical Center, Durham, North Carolina, 27710, United States|Rex Hematology Oncology Associates-Garner, Garner, North Carolina, 27529, United States|Rex Hematology Oncology Associates-Blue Ridge, Raleigh, North Carolina, 27607, United States|UNC Rex Healthcare, Raleigh, North Carolina, 27607, United States|UNC Rex Cancer Center of Wakefield, Raleigh, North Carolina, 27614, United States|Novant Cancer Institute Radiation Oncology - Supply, Supply, North Carolina, 28462, United States|Novant Health Cancer Institute Radiation Oncology - Wilmington, Wilmington, North Carolina, 28401, United States|Novant Health New Hanover Regional Medic

## Evaluate Base Pre-trained FLAN-T5-SMALL

In [7]:
flan_t5_small_model = AutoModelForSeq2SeqLM.from_pretrained(
    os.environ["FLAN_T5_SMALL_LLM"],  
    load_in_8bit=True,  
    device_map="auto",
)
flan_t5_small_tokenizer = AutoTokenizer.from_pretrained(
    os.environ["FLAN_T5_SMALL_LLM"], 
    max_length=os.environ["MAX_TOKENS"],
)

flan_t5_small_performance = run_on_test_data(
    flan_t5_small_model, 
    flan_t5_small_tokenizer, 
    INFERENCE_DATASET_KEY, 
    task="text2text-generation", 
    n_docs=N_INFERENCE_DOCS,
    log_summary=LOG_SUMMARY,
    log_metrics=LOG_METRICS,
    summary_chain_kwargs={"combine_prompt": PROMPT},
)

if DELETE_LLM_AFTER_USE:
    #model.to(torch.device("cpu"))
    del flan_t5_small_model, flan_t5_small_tokenizer
    gc.collect()
    torch.cuda.empty_cache()

Token indices sequence length is longer than the specified maximum sequence length for this model (2547 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2759 > 512). Running this sequence through the model will result in indexing errors


DOCUMENT 4500: page_content="States|Helen F Graham Cancer Center, Newark, Delaware, 19713, United States|University of Florida Health Science Center - Gainesville, Gainesville, Florida, 32610, United States|Memorial Regional Hospital/Joe DiMaggio Children's Hospital, Hollywood, Florida, 33021, United States|Mayo Clinic in Florida, Jacksonville, Florida, 32224-9980, United States|Miami Cancer Institute, Miami, Florida, 33176, United States|Orlando Health Cancer Institute, Orlando, Florida, 32806, United States|Memorial Hospital West, Pembroke Pines, Florida, 33028, United States|Emory University Hospital Midtown, Atlanta, Georgia, 30308, United States|Piedmont Hospital, Atlanta, Georgia, 30309, United States|Emory University Hospital/Winship Cancer Institute, Atlanta, Georgia, 30322, United States|Emory Saint Joseph's Hospital, Atlanta, Georgia, 30342, United States|John B Amos Cancer Center, Columbus, Georgia, 31904, United States|CTCA at Southeastern Regional Medical Center, Newnan, G

Token indices sequence length is longer than the specified maximum sequence length for this model (2417 > 1024). Running this sequence through the model will result in indexing errors


DOCUMENT 4501: page_content="Southeastern Regional Medical Center, Newnan, Georgia, 30265, United States|Lewis Hall Singletary Oncology Center, Thomasville, Georgia, 31792, United States|Queen's Medical Center, Honolulu, Hawaii, 96813, United States|The Cancer Center of Hawaii-Liliha, Honolulu, Hawaii, 96817, United States|Northwestern University, Chicago, Illinois, 60611, United States|Rush University Medical Center, Chicago, Illinois, 60612, United States|University of Illinois, Chicago, Illinois, 60612, United States|University of Chicago Comprehensive Cancer Center, Chicago, Illinois, 60637, United States|Decatur Memorial Hospital, Decatur, Illinois, 62526, United States|Northwestern Medicine Cancer Center Delnor, Geneva, Illinois, 60134, United States|Loyola University Medical Center, Maywood, Illinois, 60153, United States|Methodist Medical Center of Illinois, Peoria, Illinois, 61636, United States|Memorial Medical Center, Springfield, Illinois, 62781, United States|Southwest Ill

Token indices sequence length is longer than the specified maximum sequence length for this model (2596 > 1024). Running this sequence through the model will result in indexing errors


DOCUMENT 4504: page_content="Massachusetts, 01854, United States|University of Michigan Comprehensive Cancer Center, Ann Arbor, Michigan, 48109, United States|Henry Ford Cancer Institute-Downriver, Brownstown, Michigan, 48183, United States|GenesisCare USA - Clarkston, Clarkston, Michigan, 48346, United States|Henry Ford Macomb Hospital-Clinton Township, Clinton Township, Michigan, 48038, United States|Henry Ford Hospital, Detroit, Michigan, 48202, United States|GenesisCare USA - Farmington Hills, Farmington Hills, Michigan, 48334, United States|West Michigan Cancer Center, Kalamazoo, Michigan, 49007, United States|Saint Joseph Mercy Oakland, Pontiac, Michigan, 48341, United States|William Beaumont Hospital-Royal Oak, Royal Oak, Michigan, 48073, United States|GenesisCare USA - Troy, Troy, Michigan, 48098, United States|Henry Ford West Bloomfield Hospital, West Bloomfield, Michigan, 48322, United States|Mercy Hospital, Coon Rapids, Minnesota, 55433, United States|Saint Luke's Hospital o

Token indices sequence length is longer than the specified maximum sequence length for this model (1708 > 1024). Running this sequence through the model will result in indexing errors


DOCUMENT 4505: page_content="Minnesota, 55433, United States|Saint Luke's Hospital of Duluth, Duluth, Minnesota, 55805, United States|Mayo Clinic in Rochester, Rochester, Minnesota, 55905, United States|Regions Hospital, Saint Paul, Minnesota, 55101, United States|Saint Francis Medical Center, Cape Girardeau, Missouri, 63703, United States|Washington University School of Medicine, Saint Louis, Missouri, 63110, United States|Mercy Hospital South, Saint Louis, Missouri, 63128, United States|Missouri Baptist Medical Center, Saint Louis, Missouri, 63131, United States|Benefis Healthcare- Sletten Cancer Institute, Great Falls, Montana, 59405, United States|Kalispell Regional Medical Center, Kalispell, Montana, 59901, United States|University of Nebraska Medical Center, Omaha, Nebraska, 68198, United States|Renown Regional Medical Center, Reno, Nevada, 89502, United States|Wentworth-Douglass Hospital, Dover, New Hampshire, 03820, United States|Dartmouth Hitchcock Medical Center/Dartmouth Can

Token indices sequence length is longer than the specified maximum sequence length for this model (2472 > 1024). Running this sequence through the model will result in indexing errors


DOCUMENT 4508: page_content='States|Novant Health New Hanover Regional Medical Center, Wilmington, North Carolina, 28401, United States|Wake Forest University Health Sciences, Winston-Salem, North Carolina, 27157, United States|Sanford Bismarck Medical Center, Bismarck, North Dakota, 58501, United States|Cleveland Clinic Akron General, Akron, Ohio, 44307, United States|Case Western Reserve University, Cleveland, Ohio, 44106, United States|Ohio State University Comprehensive Cancer Center, Columbus, Ohio, 43210, United States|ProMedica Flower Hospital, Sylvania, Ohio, 43560, United States|University of Oklahoma Health Sciences Center, Oklahoma City, Oklahoma, 73104, United States|Legacy Mount Hood Medical Center, Gresham, Oregon, 97030, United States|Legacy Good Samaritan Hospital and Medical Center, Portland, Oregon, 97210, United States|Providence Portland Medical Center, Portland, Oregon, 97213, United States|Providence Saint Vincent Medical Center, Portland, Oregon, 97225, United St

## Evaluate Base Pretrained FLAN-T5-BASE

In [8]:
flan_t5_base_model = AutoModelForSeq2SeqLM.from_pretrained(
    os.environ["FLAN_T5_BASE_LLM"], 
    load_in_8bit=True,  
    device_map="auto",
)

flan_t5_base_tokenizer = AutoTokenizer.from_pretrained(
    os.environ["FLAN_T5_BASE_LLM"],
    max_length=os.environ["MAX_TOKENS"],
)

flan_t5_base_performance = run_on_test_data(
    flan_t5_base_model, 
    flan_t5_base_tokenizer, 
    INFERENCE_DATASET_KEY, 
    task="text2text-generation", 
    n_docs=N_INFERENCE_DOCS,
    log_summary=LOG_SUMMARY,
    log_metrics=LOG_METRICS,
    summary_chain_kwargs={"combine_prompt": PROMPT},
)

if DELETE_LLM_AFTER_USE:
    #model.to(torch.device("cpu"))
    del flan_t5_base_model, flan_t5_base_tokenizer
    gc.collect()
    torch.cuda.empty_cache()

Some weights of T5ForConditionalGeneration were not initialized from the model checkpoint at google/flan-t5-base and are newly initialized: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Token indices sequence length is longer than the specified maximum sequence length for this model (727 > 512). Running this sequence through the model will result in indexing errors


DOCUMENT 4500: page_content="States|Helen F Graham Cancer Center, Newark, Delaware, 19713, United States|University of Florida Health Science Center - Gainesville, Gainesville, Florida, 32610, United States|Memorial Regional Hospital/Joe DiMaggio Children's Hospital, Hollywood, Florida, 33021, United States|Mayo Clinic in Florida, Jacksonville, Florida, 32224-9980, United States|Miami Cancer Institute, Miami, Florida, 33176, United States|Orlando Health Cancer Institute, Orlando, Florida, 32806, United States|Memorial Hospital West, Pembroke Pines, Florida, 33028, United States|Emory University Hospital Midtown, Atlanta, Georgia, 30308, United States|Piedmont Hospital, Atlanta, Georgia, 30309, United States|Emory University Hospital/Winship Cancer Institute, Atlanta, Georgia, 30322, United States|Emory Saint Joseph's Hospital, Atlanta, Georgia, 30342, United States|John B Amos Cancer Center, Columbus, Georgia, 31904, United States|CTCA at Southeastern Regional Medical Center, Newnan, G

Token indices sequence length is longer than the specified maximum sequence length for this model (2491 > 1024). Running this sequence through the model will result in indexing errors


DOCUMENT 4503: page_content='Bath, Maine, 04530, United States|MaineHealth/SMHC Cancer Care and Blood Disorders-Biddeford, Biddeford, Maine, 04005, United States|Maine Medical Center-Bramhall Campus, Portland, Maine, 04102, United States|MaineHealth Cancer Care Center of York County, Sanford, Maine, 04073, United States|MaineHealth/SMHC Cancer Care and Blood Disorders-Sanford, Sanford, Maine, 04073, United States|Maine Medical Center- Scarborough Campus, Scarborough, Maine, 04074, United States|University of Maryland/Greenebaum Cancer Center, Baltimore, Maryland, 21201, United States|Greater Baltimore Medical Center, Baltimore, Maryland, 21204, United States|UM Upper Chesapeake Medical Center, Bel Air, Maryland, 21014, United States|Central Maryland Radiation Oncology in Howard County, Columbia, Maryland, 21044, United States|Lahey Hospital and Medical Center, Burlington, Massachusetts, 01805, United States|Lowell General Hospital, Lowell, Massachusetts, 01854, United States|University

## Evaluate Base Pretrained Vicuna 13b v1.3

In [10]:
vicuna_model = AutoModelForCausalLM.from_pretrained(
    os.environ["VICUNA_LLM"],
    load_in_8bit=True,
    device_map="auto",
)

vicuna_tokenizer = AutoTokenizer.from_pretrained(
    os.environ["VICUNA_LLM"], 
    max_length=os.environ["MAX_TOKENS"],
)

vicuna_13b_performance = run_on_test_data(
    vicuna_model, 
    vicuna_tokenizer, 
    INFERENCE_DATASET_KEY, 
    task="text-generation", 
    n_docs=N_INFERENCE_DOCS,
    log_summary=LOG_SUMMARY,
    log_metrics=LOG_METRICS,
    summary_chain_kwargs={"combine_prompt": PROMPT},
)

if DELETE_LLM_AFTER_USE:
    #model.to(torch.device("cpu"))
    del vicuna_model, vicuna_tokenizer
    gc.collect()
    torch.cuda.empty_cache()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

DOCUMENT 4500: page_content="States|Helen F Graham Cancer Center, Newark, Delaware, 19713, United States|University of Florida Health Science Center - Gainesville, Gainesville, Florida, 32610, United States|Memorial Regional Hospital/Joe DiMaggio Children's Hospital, Hollywood, Florida, 33021, United States|Mayo Clinic in Florida, Jacksonville, Florida, 32224-9980, United States|Miami Cancer Institute, Miami, Florida, 33176, United States|Orlando Health Cancer Institute, Orlando, Florida, 32806, United States|Memorial Hospital West, Pembroke Pines, Florida, 33028, United States|Emory University Hospital Midtown, Atlanta, Georgia, 30308, United States|Piedmont Hospital, Atlanta, Georgia, 30309, United States|Emory University Hospital/Winship Cancer Institute, Atlanta, Georgia, 30322, United States|Emory Saint Joseph's Hospital, Atlanta, Georgia, 30342, United States|John B Amos Cancer Center, Columbus, Georgia, 31904, United States|CTCA at Southeastern Regional Medical Center, Newnan, G

In [11]:
def compile_metrics(performance: Iterable[Dict[str, Any]], outer_key: Optional[str]=None) -> Dict[str, Iterable]:
    performance_table_dict = dict()
    performance_dict = dict()
    join_keys = lambda inner_key : (outer_key, inner_key) if outer_key else inner_key
    
    for i, item in enumerate(performance):
        similarity_score = item["semantic_similarity"][0][0]
        rouge_scores = item["rouge_scores"][0]
        if i == 0:
            performance_dict[join_keys("similarity_score")]=[similarity_score]
        else:
            performance_dict[join_keys("similarity_score")].append(similarity_score)
        for key in rouge_scores.keys():
            if i == 0:
                performance_dict = {
                    **performance_dict, 
                    **{join_keys(f"{key}_{k}"):[v] for k, v in rouge_scores[key].items()}
                }
            else:
                for k, v in rouge_scores[key].items():
                    performance_dict[join_keys(f"{key}_{k}")].append(v)
    return performance_dict

In [12]:
performances = {
    "alpaca-7b-native": alpaca_native_performance,
    "lora-alpaca-7b-native": lora_alpaca_native_performance,
    "flan-t5-small": flan_t5_small_performance,
    "flan-t5-base": flan_t5_base_performance,
    "vicuna-13b-v1.3": vicuna_13b_performance,
}
metrics_dict = dict()
for key, performance_values in performances.items():
    metrics_dict = {**metrics_dict, **compile_metrics(performance_values, outer_key=key)}
    
outer_cols = [k for k in performances.keys()]
inner_cols = [k[1] for k in metrics_dict.keys() if k[0]=="flan-t5-small"]
columns = pd.MultiIndex.from_product([outer_cols, inner_cols])
metrics_df = pd.DataFrame(metrics_dict, columns=columns).round(2)

## Performance Comparison Table

In [13]:
pd.set_option('display.max_columns', None)
metrics_df

Unnamed: 0_level_0,alpaca-7b-native,alpaca-7b-native,alpaca-7b-native,alpaca-7b-native,alpaca-7b-native,alpaca-7b-native,alpaca-7b-native,alpaca-7b-native,alpaca-7b-native,alpaca-7b-native,lora-alpaca-7b-native,lora-alpaca-7b-native,lora-alpaca-7b-native,lora-alpaca-7b-native,lora-alpaca-7b-native,lora-alpaca-7b-native,lora-alpaca-7b-native,lora-alpaca-7b-native,lora-alpaca-7b-native,lora-alpaca-7b-native,flan-t5-small,flan-t5-small,flan-t5-small,flan-t5-small,flan-t5-small,flan-t5-small,flan-t5-small,flan-t5-small,flan-t5-small,flan-t5-small,flan-t5-base,flan-t5-base,flan-t5-base,flan-t5-base,flan-t5-base,flan-t5-base,flan-t5-base,flan-t5-base,flan-t5-base,flan-t5-base,vicuna-13b-v1.3,vicuna-13b-v1.3,vicuna-13b-v1.3,vicuna-13b-v1.3,vicuna-13b-v1.3,vicuna-13b-v1.3,vicuna-13b-v1.3,vicuna-13b-v1.3,vicuna-13b-v1.3,vicuna-13b-v1.3
Unnamed: 0_level_1,similarity_score,rouge-1_r,rouge-1_p,rouge-1_f,rouge-2_r,rouge-2_p,rouge-2_f,rouge-l_r,rouge-l_p,rouge-l_f,similarity_score,rouge-1_r,rouge-1_p,rouge-1_f,rouge-2_r,rouge-2_p,rouge-2_f,rouge-l_r,rouge-l_p,rouge-l_f,similarity_score,rouge-1_r,rouge-1_p,rouge-1_f,rouge-2_r,rouge-2_p,rouge-2_f,rouge-l_r,rouge-l_p,rouge-l_f,similarity_score,rouge-1_r,rouge-1_p,rouge-1_f,rouge-2_r,rouge-2_p,rouge-2_f,rouge-l_r,rouge-l_p,rouge-l_f,similarity_score,rouge-1_r,rouge-1_p,rouge-1_f,rouge-2_r,rouge-2_p,rouge-2_f,rouge-l_r,rouge-l_p,rouge-l_f
0,0.73,0.4,0.26,0.31,0.21,0.13,0.16,0.36,0.23,0.28,0.69,0.32,0.3,0.31,0.04,0.03,0.03,0.28,0.26,0.27,0.6,0.08,0.2,0.11,0.0,0.0,0.0,0.08,0.2,0.11,0.83,0.32,0.67,0.43,0.18,0.45,0.26,0.32,0.67,0.43,0.84,0.4,0.53,0.45,0.14,0.21,0.17,0.4,0.53,0.45
1,0.63,0.52,0.16,0.24,0.27,0.05,0.09,0.52,0.16,0.24,0.83,0.38,0.47,0.42,0.18,0.24,0.21,0.24,0.29,0.26,0.53,0.1,0.17,0.12,0.0,0.0,0.0,0.1,0.17,0.12,0.48,0.14,0.33,0.2,0.0,0.0,0.0,0.14,0.33,0.2,0.71,0.38,0.38,0.38,0.14,0.14,0.14,0.33,0.33,0.33
2,0.78,0.6,0.18,0.27,0.11,0.02,0.03,0.4,0.12,0.18,0.69,0.25,0.2,0.22,0.0,0.0,0.0,0.2,0.16,0.18,0.55,0.05,0.02,0.03,0.0,0.0,0.0,0.05,0.02,0.03,0.6,0.3,0.6,0.4,0.11,0.22,0.14,0.25,0.5,0.33,0.68,0.35,0.37,0.36,0.0,0.0,0.0,0.2,0.21,0.21
3,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.69,0.39,0.27,0.32,0.08,0.06,0.07,0.26,0.18,0.21,0.45,0.17,0.33,0.23,0.0,0.0,0.0,0.13,0.25,0.17,0.44,0.09,0.04,0.06,0.0,0.0,0.0,0.09,0.04,0.06,0.74,0.43,0.48,0.45,0.17,0.18,0.17,0.26,0.29,0.27
4,0.55,0.64,0.11,0.19,0.14,0.02,0.03,0.57,0.1,0.17,0.76,0.43,0.13,0.2,0.21,0.05,0.08,0.36,0.11,0.17,0.74,0.21,0.19,0.2,0.0,0.0,0.0,0.21,0.19,0.2,0.7,0.21,0.05,0.08,0.0,0.0,0.0,0.14,0.03,0.05,0.65,0.43,0.32,0.36,0.14,0.1,0.12,0.43,0.32,0.36
5,0.6,0.78,0.12,0.21,0.18,0.02,0.04,0.67,0.11,0.18,0.56,0.33,0.24,0.28,0.06,0.04,0.05,0.28,0.2,0.23,0.58,0.33,0.15,0.21,0.06,0.02,0.03,0.28,0.12,0.17,0.52,0.22,0.06,0.09,0.06,0.01,0.02,0.22,0.06,0.09,0.75,0.44,0.47,0.46,0.06,0.06,0.06,0.33,0.35,0.34
6,0.69,0.48,0.21,0.29,0.2,0.08,0.12,0.48,0.21,0.29,0.79,0.38,0.5,0.43,0.2,0.31,0.24,0.38,0.5,0.43,0.37,0.19,0.36,0.25,0.04,0.1,0.06,0.19,0.36,0.25,0.74,0.24,0.56,0.33,0.0,0.0,0.0,0.19,0.44,0.27,0.79,0.43,0.23,0.3,0.2,0.12,0.15,0.43,0.23,0.3
7,0.18,0.38,0.05,0.09,0.08,0.01,0.01,0.38,0.05,0.09,0.67,0.54,0.11,0.19,0.23,0.04,0.06,0.46,0.1,0.16,0.53,0.31,0.33,0.32,0.08,0.09,0.08,0.23,0.25,0.24,0.47,0.08,0.06,0.06,0.0,0.0,0.0,0.08,0.06,0.06,0.75,0.15,0.04,0.07,0.0,0.0,0.0,0.15,0.04,0.07
8,0.73,0.79,0.22,0.34,0.38,0.08,0.13,0.71,0.2,0.31,0.52,0.29,0.1,0.15,0.0,0.0,0.0,0.29,0.1,0.15,0.42,0.14,0.17,0.15,0.0,0.0,0.0,0.14,0.17,0.15,0.49,0.29,0.12,0.17,0.0,0.0,0.0,0.29,0.12,0.17,0.91,0.43,0.38,0.4,0.23,0.2,0.21,0.43,0.38,0.4
9,0.72,0.52,0.27,0.36,0.24,0.1,0.15,0.41,0.22,0.28,0.71,0.26,0.58,0.36,0.07,0.17,0.1,0.26,0.58,0.36,0.58,0.15,0.4,0.22,0.03,0.11,0.05,0.15,0.4,0.22,0.81,0.33,0.56,0.42,0.14,0.25,0.18,0.33,0.56,0.42,0.81,0.56,0.37,0.44,0.28,0.18,0.22,0.48,0.32,0.38


## Mean Performance Comparison Table

In [26]:
metrics_mean = metrics_df.mean(axis=0)
mean_metrics_df = pd.DataFrame(metrics_mean.values.reshape(1, -1), columns=metrics_mean.index)
mean_metrics_df

Unnamed: 0_level_0,alpaca-7b-native,alpaca-7b-native,alpaca-7b-native,alpaca-7b-native,alpaca-7b-native,alpaca-7b-native,alpaca-7b-native,alpaca-7b-native,alpaca-7b-native,alpaca-7b-native,lora-alpaca-7b-native,lora-alpaca-7b-native,lora-alpaca-7b-native,lora-alpaca-7b-native,lora-alpaca-7b-native,lora-alpaca-7b-native,lora-alpaca-7b-native,lora-alpaca-7b-native,lora-alpaca-7b-native,lora-alpaca-7b-native,flan-t5-small,flan-t5-small,flan-t5-small,flan-t5-small,flan-t5-small,flan-t5-small,flan-t5-small,flan-t5-small,flan-t5-small,flan-t5-small,flan-t5-base,flan-t5-base,flan-t5-base,flan-t5-base,flan-t5-base,flan-t5-base,flan-t5-base,flan-t5-base,flan-t5-base,flan-t5-base,vicuna-13b-v1.3,vicuna-13b-v1.3,vicuna-13b-v1.3,vicuna-13b-v1.3,vicuna-13b-v1.3,vicuna-13b-v1.3,vicuna-13b-v1.3,vicuna-13b-v1.3,vicuna-13b-v1.3,vicuna-13b-v1.3
Unnamed: 0_level_1,similarity_score,rouge-1_r,rouge-1_p,rouge-1_f,rouge-2_r,rouge-2_p,rouge-2_f,rouge-l_r,rouge-l_p,rouge-l_f,similarity_score,rouge-1_r,rouge-1_p,rouge-1_f,rouge-2_r,rouge-2_p,rouge-2_f,rouge-l_r,rouge-l_p,rouge-l_f,similarity_score,rouge-1_r,rouge-1_p,rouge-1_f,rouge-2_r,rouge-2_p,rouge-2_f,rouge-l_r,rouge-l_p,rouge-l_f,similarity_score,rouge-1_r,rouge-1_p,rouge-1_f,rouge-2_r,rouge-2_p,rouge-2_f,rouge-l_r,rouge-l_p,rouge-l_f,similarity_score,rouge-1_r,rouge-1_p,rouge-1_f,rouge-2_r,rouge-2_p,rouge-2_f,rouge-l_r,rouge-l_p,rouge-l_f
0,0.565,0.511,0.158,0.23,0.181,0.051,0.076,0.45,0.14,0.202,0.691,0.357,0.29,0.288,0.107,0.094,0.084,0.301,0.248,0.242,0.535,0.173,0.232,0.184,0.021,0.032,0.022,0.156,0.213,0.166,0.608,0.222,0.305,0.224,0.049,0.093,0.06,0.205,0.281,0.208,0.763,0.4,0.357,0.367,0.136,0.119,0.124,0.344,0.3,0.311
