# **Fine-tuned LLM Comparison Notebook 3**

This notebook compares results of the LoRA (Low Rank Adaptation) fine-tuned alpaca-native (fine-tuned on custom text summarisation data) to the base pretrained Flan-T5-small, Flan-T5-base, Alpaca-native and the Vicuna-13b-v1.3 LLMs

In [1]:
import os, warnings, torch, gc
import pandas as pd
from tqdm.notebook import tqdm
from transformers import (
    AutoTokenizer, 
    LlamaTokenizer,
    AutoModelForCausalLM,
    LlamaForCausalLM,
    AutoModelForSeq2SeqLM, 
    AutoConfig,
    pipeline
)
from datasets import load_dataset
from datasets.dataset_dict import DatasetDict
from peft import PeftConfig, PeftModel
from rouge import Rouge
from sentence_transformers import SentenceTransformer
from langchain.llms import HuggingFacePipeline
from langchain.chains.summarize import load_summarize_chain
from langchain.schema import Document
from sklearn.metrics.pairwise import cosine_similarity
from typing import Dict, Iterable, Union, Any, Optional

warnings.filterwarnings("ignore")

os.environ["EMBEDDINGS_MODEL"] = "all-MiniLM-L12-v2"
os.environ["MAX_TOKENS"] = "2048"
os.environ["DEVICE"] = "cuda:0" if torch.cuda.is_available() else "cpu"
os.environ["ALPACA_NATIVE_ID"] = "alpaca-native_finetuned_results"
os.environ["DATASET_PATH"] = "data/doc_summary_data"
os.environ["FLAN_T5_SMALL_LLM"] = "google/flan-t5-small"
os.environ["FLAN_T5_BASE_LLM"] = "google/flan-t5-base"
os.environ["VICUNA_LLM"] = "lmsys/vicuna-13b-v1.3"

In [2]:
def run_on_test_data(
    model: Union[AutoModelForSeq2SeqLM, AutoModelForCausalLM, LlamaForCausalLM], 
    tokenizer: Union[AutoTokenizer, LlamaTokenizer], 
    dataset_key: str="test",
    task="text2text-generation",
    n_docs: int = 5,
    log_summary: bool=False,
    log_metrics: bool=False,
    **kwargs):
    
    # switch model to eval mode
    model.eval()
    
    # Define model pipeline for inference with langchain
    kwargs = {**dict(temperature=0.1, top_p=0.15, top_k=0, repetition_penalty=1.1), **kwargs}

    # define model pipeline
    hgf_pipeline = pipeline(
        task=task, 
        model=model, 
        tokenizer=tokenizer,
        max_length=int(os.environ["MAX_TOKENS"]),
        **kwargs
    )
    llm = HuggingFacePipeline(pipeline=hgf_pipeline)
    
    # Define Summary chain
    summary_chain = load_summarize_chain(llm, chain_type="map_reduce")
    
    # Load dataset
    dataset: DatasetDict = load_dataset(path=os.environ["DATASET_PATH"])
    
    # Generate Summaries and Measure Performance (Rouge Metric and Cosine Similarity Metric)
    rouge = Rouge()  # rouge metric object
    embeddings_model = SentenceTransformer(os.environ["EMBEDDINGS_MODEL"]) # embeddings model object
    embeddings_model.to(os.environ["DEVICE"])
    document_ids = dataset[dataset_key]["id"][:n_docs]
    documents = dataset[dataset_key]["document"][:n_docs]
    target_summaries = dataset[dataset_key]["summary"][:n_docs]
    _zipped = zip(document_ids, documents, target_summaries)
    metrics_values: Iterable[Dict[str, Any]] = []

    for i, (document_id, document, target_summary) in enumerate(_zipped):
        document = Document(page_content=document)
        try:
            generated_summary = summary_chain.run([document])
        except ValueError as e:
            print(f"Error summarizing document-{i+1}: {e}")
            continue
            
        if log_summary:
            print(f"DOCUMENT {document_id}: {document}\n")
            print(f"GENERATED SUMARY: {generated_summary}\n")
            print(f"TARGET SUMARY: {target_summary}\n")

        generated_summary_embeddings, target_summary_embeddings = (
            embeddings_model.encode(generated_summary).reshape(1, -1),
            embeddings_model.encode(target_summary).reshape(1, -1)
        )
        cos_similarity = cosine_similarity(target_summary_embeddings, generated_summary_embeddings)
        rouge_scores = rouge.get_scores(generated_summary, target_summary)
        if log_metrics:
            print(f"Cosine similarity for summary {i+1}:", cos_similarity[0][0], "\n")
            print(f"Rouge scores for summary {i+1}:", rouge_scores[0], "\n")
            
        if log_metrics or log_summary:
            print("-"*120)
            print("\n")
        _metric = dict(semantic_similarity=cos_similarity, rouge_scores=rouge_scores)
        metrics_values.append(_metric)
        
    return metrics_values

In [3]:
INFERENCE_DATASET_KEY = "test"
N_INFERENCE_DOCS = 10
LOG_SUMMARY = True
LOG_METRICS = True
DELETE_LLM_AFTER_USE = True

## Evaluate Base Pretrained Alpaca Native

In [4]:
alpaca_config = PeftConfig.from_pretrained(os.environ["ALPACA_NATIVE_ID"])

# load alpaca base model and tokenizer
alpaca_model = LlamaForCausalLM.from_pretrained(
    alpaca_config.base_model_name_or_path,
    load_in_8bit=True,
    device_map="auto",
)

alpaca_tokenizer = LlamaTokenizer.from_pretrained(
    alpaca_config.base_model_name_or_path,
    max_length=os.environ["MAX_TOKENS"],
)

lora_alpaca_native_performance = run_on_test_data(
    alpaca_model, 
    alpaca_tokenizer, 
    INFERENCE_DATASET_KEY, 
    task="text-generation", 
    n_docs=N_INFERENCE_DOCS,
    log_summary=LOG_SUMMARY,
    log_metrics=LOG_METRICS,
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

You are using the legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


DOCUMENT 4500: page_content="States|Helen F Graham Cancer Center, Newark, Delaware, 19713, United States|University of Florida Health Science Center - Gainesville, Gainesville, Florida, 32610, United States|Memorial Regional Hospital/Joe DiMaggio Children's Hospital, Hollywood, Florida, 33021, United States|Mayo Clinic in Florida, Jacksonville, Florida, 32224-9980, United States|Miami Cancer Institute, Miami, Florida, 33176, United States|Orlando Health Cancer Institute, Orlando, Florida, 32806, United States|Memorial Hospital West, Pembroke Pines, Florida, 33028, United States|Emory University Hospital Midtown, Atlanta, Georgia, 30308, United States|Piedmont Hospital, Atlanta, Georgia, 30309, United States|Emory University Hospital/Winship Cancer Institute, Atlanta, Georgia, 30322, United States|Emory Saint Joseph's Hospital, Atlanta, Georgia, 30342, United States|John B Amos Cancer Center, Columbus, Georgia, 31904, United States|CTCA at Southeastern Regional Medical Center, Newnan, G

## Evaluate LoRA Fine-tuned Alpaca Native

In [5]:
# load LoRA peft alpaca model
alpaca_model = PeftModel.from_pretrained(
    alpaca_model, 
    os.environ["ALPACA_NATIVE_ID"],
    device_map="auto"
)

alpaca_native_performance = run_on_test_data(
    alpaca_model, 
    alpaca_tokenizer, 
    INFERENCE_DATASET_KEY, 
    task="text-generation", 
    n_docs=N_INFERENCE_DOCS,
    log_summary=LOG_SUMMARY,
    log_metrics=LOG_METRICS,
)

if DELETE_LLM_AFTER_USE:
    #model.to(torch.device("cpu"))
    del alpaca_model, alpaca_tokenizer
    gc.collect()
    torch.cuda.empty_cache()

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PLBartForCausalLM', 'ProphetNetForCausalLM', 'QDQBertLMHeadModel', 'ReformerModelWithLMHead', 'RemBertForCausal

DOCUMENT 4500: page_content="States|Helen F Graham Cancer Center, Newark, Delaware, 19713, United States|University of Florida Health Science Center - Gainesville, Gainesville, Florida, 32610, United States|Memorial Regional Hospital/Joe DiMaggio Children's Hospital, Hollywood, Florida, 33021, United States|Mayo Clinic in Florida, Jacksonville, Florida, 32224-9980, United States|Miami Cancer Institute, Miami, Florida, 33176, United States|Orlando Health Cancer Institute, Orlando, Florida, 32806, United States|Memorial Hospital West, Pembroke Pines, Florida, 33028, United States|Emory University Hospital Midtown, Atlanta, Georgia, 30308, United States|Piedmont Hospital, Atlanta, Georgia, 30309, United States|Emory University Hospital/Winship Cancer Institute, Atlanta, Georgia, 30322, United States|Emory Saint Joseph's Hospital, Atlanta, Georgia, 30342, United States|John B Amos Cancer Center, Columbus, Georgia, 31904, United States|CTCA at Southeastern Regional Medical Center, Newnan, G

Token indices sequence length is longer than the specified maximum sequence length for this model (1721 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1727 > 512). Running this sequence through the model will result in indexing errors


DOCUMENT 4503: page_content='Bath, Maine, 04530, United States|MaineHealth/SMHC Cancer Care and Blood Disorders-Biddeford, Biddeford, Maine, 04005, United States|Maine Medical Center-Bramhall Campus, Portland, Maine, 04102, United States|MaineHealth Cancer Care Center of York County, Sanford, Maine, 04073, United States|MaineHealth/SMHC Cancer Care and Blood Disorders-Sanford, Sanford, Maine, 04073, United States|Maine Medical Center- Scarborough Campus, Scarborough, Maine, 04074, United States|University of Maryland/Greenebaum Cancer Center, Baltimore, Maryland, 21201, United States|Greater Baltimore Medical Center, Baltimore, Maryland, 21204, United States|UM Upper Chesapeake Medical Center, Bel Air, Maryland, 21014, United States|Central Maryland Radiation Oncology in Howard County, Columbia, Maryland, 21044, United States|Lahey Hospital and Medical Center, Burlington, Massachusetts, 01805, United States|Lowell General Hospital, Lowell, Massachusetts, 01854, United States|University

Token indices sequence length is longer than the specified maximum sequence length for this model (1695 > 1024). Running this sequence through the model will result in indexing errors


DOCUMENT 4508: page_content='States|Novant Health New Hanover Regional Medical Center, Wilmington, North Carolina, 28401, United States|Wake Forest University Health Sciences, Winston-Salem, North Carolina, 27157, United States|Sanford Bismarck Medical Center, Bismarck, North Dakota, 58501, United States|Cleveland Clinic Akron General, Akron, Ohio, 44307, United States|Case Western Reserve University, Cleveland, Ohio, 44106, United States|Ohio State University Comprehensive Cancer Center, Columbus, Ohio, 43210, United States|ProMedica Flower Hospital, Sylvania, Ohio, 43560, United States|University of Oklahoma Health Sciences Center, Oklahoma City, Oklahoma, 73104, United States|Legacy Mount Hood Medical Center, Gresham, Oregon, 97030, United States|Legacy Good Samaritan Hospital and Medical Center, Portland, Oregon, 97210, United States|Providence Portland Medical Center, Portland, Oregon, 97213, United States|Providence Saint Vincent Medical Center, Portland, Oregon, 97225, United St

## Evaluate Base Pre-trained FLAN-T5-SMALL

In [6]:
flan_t5_small_model = AutoModelForSeq2SeqLM.from_pretrained(
    os.environ["FLAN_T5_SMALL_LLM"],  
    load_in_8bit=True,  
    device_map="auto",
)
flan_t5_small_tokenizer = AutoTokenizer.from_pretrained(
    os.environ["FLAN_T5_SMALL_LLM"], 
    max_length=os.environ["MAX_TOKENS"],
)

flan_t5_small_performance = run_on_test_data(
    flan_t5_small_model, 
    flan_t5_small_tokenizer, 
    INFERENCE_DATASET_KEY, 
    task="text2text-generation", 
    n_docs=N_INFERENCE_DOCS,
    log_summary=LOG_SUMMARY,
    log_metrics=LOG_METRICS,
)

if DELETE_LLM_AFTER_USE:
    #model.to(torch.device("cpu"))
    del flan_t5_small_model, flan_t5_small_tokenizer
    gc.collect()
    torch.cuda.empty_cache()

Token indices sequence length is longer than the specified maximum sequence length for this model (1932 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2066 > 512). Running this sequence through the model will result in indexing errors


DOCUMENT 4500: page_content="States|Helen F Graham Cancer Center, Newark, Delaware, 19713, United States|University of Florida Health Science Center - Gainesville, Gainesville, Florida, 32610, United States|Memorial Regional Hospital/Joe DiMaggio Children's Hospital, Hollywood, Florida, 33021, United States|Mayo Clinic in Florida, Jacksonville, Florida, 32224-9980, United States|Miami Cancer Institute, Miami, Florida, 33176, United States|Orlando Health Cancer Institute, Orlando, Florida, 32806, United States|Memorial Hospital West, Pembroke Pines, Florida, 33028, United States|Emory University Hospital Midtown, Atlanta, Georgia, 30308, United States|Piedmont Hospital, Atlanta, Georgia, 30309, United States|Emory University Hospital/Winship Cancer Institute, Atlanta, Georgia, 30322, United States|Emory Saint Joseph's Hospital, Atlanta, Georgia, 30342, United States|John B Amos Cancer Center, Columbus, Georgia, 31904, United States|CTCA at Southeastern Regional Medical Center, Newnan, G

Token indices sequence length is longer than the specified maximum sequence length for this model (1802 > 1024). Running this sequence through the model will result in indexing errors


DOCUMENT 4501: page_content="Southeastern Regional Medical Center, Newnan, Georgia, 30265, United States|Lewis Hall Singletary Oncology Center, Thomasville, Georgia, 31792, United States|Queen's Medical Center, Honolulu, Hawaii, 96813, United States|The Cancer Center of Hawaii-Liliha, Honolulu, Hawaii, 96817, United States|Northwestern University, Chicago, Illinois, 60611, United States|Rush University Medical Center, Chicago, Illinois, 60612, United States|University of Illinois, Chicago, Illinois, 60612, United States|University of Chicago Comprehensive Cancer Center, Chicago, Illinois, 60637, United States|Decatur Memorial Hospital, Decatur, Illinois, 62526, United States|Northwestern Medicine Cancer Center Delnor, Geneva, Illinois, 60134, United States|Loyola University Medical Center, Maywood, Illinois, 60153, United States|Methodist Medical Center of Illinois, Peoria, Illinois, 61636, United States|Memorial Medical Center, Springfield, Illinois, 62781, United States|Southwest Ill

Token indices sequence length is longer than the specified maximum sequence length for this model (1981 > 1024). Running this sequence through the model will result in indexing errors


DOCUMENT 4504: page_content="Massachusetts, 01854, United States|University of Michigan Comprehensive Cancer Center, Ann Arbor, Michigan, 48109, United States|Henry Ford Cancer Institute-Downriver, Brownstown, Michigan, 48183, United States|GenesisCare USA - Clarkston, Clarkston, Michigan, 48346, United States|Henry Ford Macomb Hospital-Clinton Township, Clinton Township, Michigan, 48038, United States|Henry Ford Hospital, Detroit, Michigan, 48202, United States|GenesisCare USA - Farmington Hills, Farmington Hills, Michigan, 48334, United States|West Michigan Cancer Center, Kalamazoo, Michigan, 49007, United States|Saint Joseph Mercy Oakland, Pontiac, Michigan, 48341, United States|William Beaumont Hospital-Royal Oak, Royal Oak, Michigan, 48073, United States|GenesisCare USA - Troy, Troy, Michigan, 48098, United States|Henry Ford West Bloomfield Hospital, West Bloomfield, Michigan, 48322, United States|Mercy Hospital, Coon Rapids, Minnesota, 55433, United States|Saint Luke's Hospital o

Token indices sequence length is longer than the specified maximum sequence length for this model (1093 > 1024). Running this sequence through the model will result in indexing errors


DOCUMENT 4505: page_content="Minnesota, 55433, United States|Saint Luke's Hospital of Duluth, Duluth, Minnesota, 55805, United States|Mayo Clinic in Rochester, Rochester, Minnesota, 55905, United States|Regions Hospital, Saint Paul, Minnesota, 55101, United States|Saint Francis Medical Center, Cape Girardeau, Missouri, 63703, United States|Washington University School of Medicine, Saint Louis, Missouri, 63110, United States|Mercy Hospital South, Saint Louis, Missouri, 63128, United States|Missouri Baptist Medical Center, Saint Louis, Missouri, 63131, United States|Benefis Healthcare- Sletten Cancer Institute, Great Falls, Montana, 59405, United States|Kalispell Regional Medical Center, Kalispell, Montana, 59901, United States|University of Nebraska Medical Center, Omaha, Nebraska, 68198, United States|Renown Regional Medical Center, Reno, Nevada, 89502, United States|Wentworth-Douglass Hospital, Dover, New Hampshire, 03820, United States|Dartmouth Hitchcock Medical Center/Dartmouth Can

Token indices sequence length is longer than the specified maximum sequence length for this model (1857 > 1024). Running this sequence through the model will result in indexing errors


DOCUMENT 4508: page_content='States|Novant Health New Hanover Regional Medical Center, Wilmington, North Carolina, 28401, United States|Wake Forest University Health Sciences, Winston-Salem, North Carolina, 27157, United States|Sanford Bismarck Medical Center, Bismarck, North Dakota, 58501, United States|Cleveland Clinic Akron General, Akron, Ohio, 44307, United States|Case Western Reserve University, Cleveland, Ohio, 44106, United States|Ohio State University Comprehensive Cancer Center, Columbus, Ohio, 43210, United States|ProMedica Flower Hospital, Sylvania, Ohio, 43560, United States|University of Oklahoma Health Sciences Center, Oklahoma City, Oklahoma, 73104, United States|Legacy Mount Hood Medical Center, Gresham, Oregon, 97030, United States|Legacy Good Samaritan Hospital and Medical Center, Portland, Oregon, 97210, United States|Providence Portland Medical Center, Portland, Oregon, 97213, United States|Providence Saint Vincent Medical Center, Portland, Oregon, 97225, United St

## Evaluate Base Pretrained FLAN-T5-BASE

In [7]:
flan_t5_base_model = AutoModelForSeq2SeqLM.from_pretrained(
    os.environ["FLAN_T5_BASE_LLM"], 
    load_in_8bit=True,  
    device_map="auto",
)

flan_t5_base_tokenizer = AutoTokenizer.from_pretrained(
    os.environ["FLAN_T5_BASE_LLM"],
    max_length=os.environ["MAX_TOKENS"],
)

flan_t5_base_performance = run_on_test_data(
    flan_t5_base_model, 
    flan_t5_base_tokenizer, 
    INFERENCE_DATASET_KEY, 
    task="text2text-generation", 
    n_docs=N_INFERENCE_DOCS,
    log_summary=LOG_SUMMARY,
    log_metrics=LOG_METRICS,
)

if DELETE_LLM_AFTER_USE:
    #model.to(torch.device("cpu"))
    del flan_t5_base_model, flan_t5_base_tokenizer
    gc.collect()
    torch.cuda.empty_cache()

Some weights of T5ForConditionalGeneration were not initialized from the model checkpoint at google/flan-t5-base and are newly initialized: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DOCUMENT 4500: page_content="States|Helen F Graham Cancer Center, Newark, Delaware, 19713, United States|University of Florida Health Science Center - Gainesville, Gainesville, Florida, 32610, United States|Memorial Regional Hospital/Joe DiMaggio Children's Hospital, Hollywood, Florida, 33021, United States|Mayo Clinic in Florida, Jacksonville, Florida, 32224-9980, United States|Miami Cancer Institute, Miami, Florida, 33176, United States|Orlando Health Cancer Institute, Orlando, Florida, 32806, United States|Memorial Hospital West, Pembroke Pines, Florida, 33028, United States|Emory University Hospital Midtown, Atlanta, Georgia, 30308, United States|Piedmont Hospital, Atlanta, Georgia, 30309, United States|Emory University Hospital/Winship Cancer Institute, Atlanta, Georgia, 30322, United States|Emory Saint Joseph's Hospital, Atlanta, Georgia, 30342, United States|John B Amos Cancer Center, Columbus, Georgia, 31904, United States|CTCA at Southeastern Regional Medical Center, Newnan, G

Token indices sequence length is longer than the specified maximum sequence length for this model (1876 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2068 > 512). Running this sequence through the model will result in indexing errors


DOCUMENT 4503: page_content='Bath, Maine, 04530, United States|MaineHealth/SMHC Cancer Care and Blood Disorders-Biddeford, Biddeford, Maine, 04005, United States|Maine Medical Center-Bramhall Campus, Portland, Maine, 04102, United States|MaineHealth Cancer Care Center of York County, Sanford, Maine, 04073, United States|MaineHealth/SMHC Cancer Care and Blood Disorders-Sanford, Sanford, Maine, 04073, United States|Maine Medical Center- Scarborough Campus, Scarborough, Maine, 04074, United States|University of Maryland/Greenebaum Cancer Center, Baltimore, Maryland, 21201, United States|Greater Baltimore Medical Center, Baltimore, Maryland, 21204, United States|UM Upper Chesapeake Medical Center, Bel Air, Maryland, 21014, United States|Central Maryland Radiation Oncology in Howard County, Columbia, Maryland, 21044, United States|Lahey Hospital and Medical Center, Burlington, Massachusetts, 01805, United States|Lowell General Hospital, Lowell, Massachusetts, 01854, United States|University

## Evaluate Base Pretrained Vicuna 13b v1.3

In [8]:
vicuna_model = AutoModelForCausalLM.from_pretrained(
    os.environ["VICUNA_LLM"],
    load_in_8bit=True,
    device_map="auto",
)

vicuna_tokenizer = AutoTokenizer.from_pretrained(
    os.environ["VICUNA_LLM"], 
    max_length=os.environ["MAX_TOKENS"],
)

vicuna_13b_performance = run_on_test_data(
    vicuna_model, 
    vicuna_tokenizer, 
    INFERENCE_DATASET_KEY, 
    task="text-generation", 
    n_docs=N_INFERENCE_DOCS,
    log_summary=LOG_SUMMARY,
    log_metrics=LOG_METRICS,
)

if DELETE_LLM_AFTER_USE:
    #model.to(torch.device("cpu"))
    del vicuna_model, vicuna_tokenizer
    gc.collect()
    torch.cuda.empty_cache()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

DOCUMENT 4500: page_content="States|Helen F Graham Cancer Center, Newark, Delaware, 19713, United States|University of Florida Health Science Center - Gainesville, Gainesville, Florida, 32610, United States|Memorial Regional Hospital/Joe DiMaggio Children's Hospital, Hollywood, Florida, 33021, United States|Mayo Clinic in Florida, Jacksonville, Florida, 32224-9980, United States|Miami Cancer Institute, Miami, Florida, 33176, United States|Orlando Health Cancer Institute, Orlando, Florida, 32806, United States|Memorial Hospital West, Pembroke Pines, Florida, 33028, United States|Emory University Hospital Midtown, Atlanta, Georgia, 30308, United States|Piedmont Hospital, Atlanta, Georgia, 30309, United States|Emory University Hospital/Winship Cancer Institute, Atlanta, Georgia, 30322, United States|Emory Saint Joseph's Hospital, Atlanta, Georgia, 30342, United States|John B Amos Cancer Center, Columbus, Georgia, 31904, United States|CTCA at Southeastern Regional Medical Center, Newnan, G

In [9]:
def compile_metrics(performance: Iterable[Dict[str, Any]], outer_key: Optional[str]=None) -> Dict[str, Iterable]:
    performance_table_dict = dict()
    performance_dict = dict()
    join_keys = lambda inner_key : (outer_key, inner_key) if outer_key else inner_key
    
    for i, item in enumerate(performance):
        similarity_score = item["semantic_similarity"][0][0]
        rouge_scores = item["rouge_scores"][0]
        if i == 0:
            performance_dict[join_keys("similarity_score")]=[similarity_score]
        else:
            performance_dict[join_keys("similarity_score")].append(similarity_score)
        for key in rouge_scores.keys():
            if i == 0:
                performance_dict = {
                    **performance_dict, 
                    **{join_keys(f"{key}_{k}"):[v] for k, v in rouge_scores[key].items()}
                }
            else:
                for k, v in rouge_scores[key].items():
                    performance_dict[join_keys(f"{key}_{k}")].append(v)
    return performance_dict

In [10]:
performances = {
    "alpaca-7b-native": alpaca_native_performance,
    "lora-alpaca-7b-native": lora_alpaca_native_performance,
    "flan-t5-small": flan_t5_small_performance,
    "flan-t5-base": flan_t5_base_performance,
    "vicuna-13b-v1.3": vicuna_13b_performance,
}
metrics_dict = dict()
for key, performance_values in performances.items():
    metrics_dict = {**metrics_dict, **compile_metrics(performance_values, outer_key=key)}
    
outer_cols = [k for k in performances.keys()]
inner_cols = [k[1] for k in metrics_dict.keys() if k[0]=="flan-t5-small"]
columns = pd.MultiIndex.from_product([outer_cols, inner_cols])
metrics_df = pd.DataFrame(metrics_dict, columns=columns).round(2)

## Performance Comparison Table

In [11]:
pd.set_option('display.max_columns', None)
metrics_df

Unnamed: 0_level_0,alpaca-7b-native,alpaca-7b-native,alpaca-7b-native,alpaca-7b-native,alpaca-7b-native,alpaca-7b-native,alpaca-7b-native,alpaca-7b-native,alpaca-7b-native,alpaca-7b-native,lora-alpaca-7b-native,lora-alpaca-7b-native,lora-alpaca-7b-native,lora-alpaca-7b-native,lora-alpaca-7b-native,lora-alpaca-7b-native,lora-alpaca-7b-native,lora-alpaca-7b-native,lora-alpaca-7b-native,lora-alpaca-7b-native,flan-t5-small,flan-t5-small,flan-t5-small,flan-t5-small,flan-t5-small,flan-t5-small,flan-t5-small,flan-t5-small,flan-t5-small,flan-t5-small,flan-t5-base,flan-t5-base,flan-t5-base,flan-t5-base,flan-t5-base,flan-t5-base,flan-t5-base,flan-t5-base,flan-t5-base,flan-t5-base,vicuna-13b-v1.3,vicuna-13b-v1.3,vicuna-13b-v1.3,vicuna-13b-v1.3,vicuna-13b-v1.3,vicuna-13b-v1.3,vicuna-13b-v1.3,vicuna-13b-v1.3,vicuna-13b-v1.3,vicuna-13b-v1.3
Unnamed: 0_level_1,similarity_score,rouge-1_r,rouge-1_p,rouge-1_f,rouge-2_r,rouge-2_p,rouge-2_f,rouge-l_r,rouge-l_p,rouge-l_f,similarity_score,rouge-1_r,rouge-1_p,rouge-1_f,rouge-2_r,rouge-2_p,rouge-2_f,rouge-l_r,rouge-l_p,rouge-l_f,similarity_score,rouge-1_r,rouge-1_p,rouge-1_f,rouge-2_r,rouge-2_p,rouge-2_f,rouge-l_r,rouge-l_p,rouge-l_f,similarity_score,rouge-1_r,rouge-1_p,rouge-1_f,rouge-2_r,rouge-2_p,rouge-2_f,rouge-l_r,rouge-l_p,rouge-l_f,similarity_score,rouge-1_r,rouge-1_p,rouge-1_f,rouge-2_r,rouge-2_p,rouge-2_f,rouge-l_r,rouge-l_p,rouge-l_f
0,0.57,0.4,0.29,0.34,0.18,0.13,0.15,0.4,0.29,0.34,0.68,0.4,0.12,0.19,0.07,0.02,0.03,0.4,0.12,0.19,0.6,0.08,0.2,0.11,0.0,0.0,0.0,0.08,0.2,0.11,0.83,0.32,0.67,0.43,0.18,0.45,0.26,0.32,0.67,0.43,0.86,0.68,0.42,0.52,0.29,0.19,0.23,0.6,0.38,0.46
1,0.56,0.48,0.17,0.25,0.14,0.04,0.06,0.43,0.15,0.22,0.71,0.05,0.02,0.03,0.0,0.0,0.0,0.05,0.02,0.03,0.53,0.1,0.17,0.12,0.0,0.0,0.0,0.1,0.17,0.12,0.48,0.14,0.33,0.2,0.0,0.0,0.0,0.14,0.33,0.2,0.78,0.52,0.2,0.29,0.27,0.07,0.12,0.52,0.2,0.29
2,0.76,0.55,0.29,0.38,0.11,0.04,0.06,0.35,0.18,0.24,0.68,0.2,0.22,0.21,0.0,0.0,0.0,0.15,0.17,0.16,0.53,0.05,0.02,0.03,0.0,0.0,0.0,0.05,0.02,0.03,0.6,0.3,0.6,0.4,0.11,0.22,0.14,0.25,0.5,0.33,0.66,0.3,0.18,0.22,0.0,0.0,0.0,0.2,0.12,0.15
3,0.46,0.22,0.17,0.19,0.04,0.03,0.03,0.17,0.14,0.15,0.72,0.39,0.27,0.32,0.08,0.05,0.07,0.3,0.21,0.25,0.43,0.17,0.36,0.24,0.04,0.07,0.05,0.13,0.27,0.18,0.58,0.09,0.05,0.06,0.0,0.0,0.0,0.09,0.05,0.06,0.7,0.61,0.37,0.46,0.25,0.13,0.17,0.43,0.26,0.33
4,0.7,0.64,0.18,0.29,0.14,0.03,0.05,0.64,0.18,0.29,0.69,0.43,0.1,0.17,0.0,0.0,0.0,0.29,0.07,0.11,0.74,0.21,0.19,0.2,0.0,0.0,0.0,0.21,0.19,0.2,0.74,0.5,0.58,0.54,0.29,0.36,0.32,0.5,0.58,0.54,0.7,0.57,0.15,0.23,0.14,0.03,0.05,0.57,0.15,0.23
5,0.66,0.72,0.3,0.43,0.24,0.07,0.11,0.56,0.23,0.33,0.63,0.44,0.14,0.22,0.24,0.05,0.09,0.44,0.14,0.22,0.5,0.28,0.36,0.31,0.0,0.0,0.0,0.28,0.36,0.31,0.52,0.22,0.06,0.09,0.06,0.01,0.02,0.22,0.06,0.09,0.75,0.39,0.19,0.26,0.0,0.0,0.0,0.39,0.19,0.26
6,0.73,0.48,0.22,0.3,0.12,0.06,0.08,0.48,0.22,0.3,0.63,0.05,0.02,0.03,0.0,0.0,0.0,0.05,0.02,0.03,0.37,0.19,0.36,0.25,0.04,0.1,0.06,0.19,0.36,0.25,0.74,0.24,0.56,0.33,0.0,0.0,0.0,0.19,0.44,0.27,0.78,0.48,0.21,0.29,0.2,0.09,0.13,0.43,0.19,0.26
7,0.73,0.38,0.1,0.16,0.08,0.02,0.03,0.38,0.1,0.16,0.67,0.54,0.11,0.19,0.23,0.04,0.06,0.46,0.1,0.16,0.53,0.23,0.21,0.22,0.0,0.0,0.0,0.15,0.14,0.15,0.48,0.08,0.06,0.06,0.0,0.0,0.0,0.08,0.06,0.06,0.8,0.38,0.09,0.14,0.08,0.01,0.02,0.38,0.09,0.14
8,0.45,0.43,0.32,0.36,0.08,0.04,0.06,0.36,0.26,0.3,0.5,0.29,0.1,0.14,0.0,0.0,0.0,0.29,0.1,0.14,0.45,0.14,0.14,0.14,0.0,0.0,0.0,0.14,0.14,0.14,0.52,0.29,0.1,0.15,0.0,0.0,0.0,0.29,0.1,0.15,0.73,0.71,0.14,0.23,0.54,0.07,0.12,0.71,0.14,0.23
9,0.8,0.56,0.48,0.52,0.28,0.22,0.25,0.48,0.42,0.45,0.67,0.37,0.33,0.35,0.1,0.09,0.1,0.3,0.27,0.28,0.58,0.15,0.4,0.22,0.03,0.11,0.05,0.15,0.4,0.22,0.81,0.33,0.56,0.42,0.14,0.25,0.18,0.33,0.56,0.42,0.8,0.67,0.36,0.47,0.24,0.12,0.16,0.52,0.28,0.36


## Mean Performance Comparison Table

In [12]:
metrics_mean = metrics_df.mean(axis=0)
mean_metrics_df = pd.DataFrame(metrics_mean.values.reshape(1, -1), columns=metrics_mean.index)
mean_metrics_df

Unnamed: 0_level_0,alpaca-7b-native,alpaca-7b-native,alpaca-7b-native,alpaca-7b-native,alpaca-7b-native,alpaca-7b-native,alpaca-7b-native,alpaca-7b-native,alpaca-7b-native,alpaca-7b-native,lora-alpaca-7b-native,lora-alpaca-7b-native,lora-alpaca-7b-native,lora-alpaca-7b-native,lora-alpaca-7b-native,lora-alpaca-7b-native,lora-alpaca-7b-native,lora-alpaca-7b-native,lora-alpaca-7b-native,lora-alpaca-7b-native,flan-t5-small,flan-t5-small,flan-t5-small,flan-t5-small,flan-t5-small,flan-t5-small,flan-t5-small,flan-t5-small,flan-t5-small,flan-t5-small,flan-t5-base,flan-t5-base,flan-t5-base,flan-t5-base,flan-t5-base,flan-t5-base,flan-t5-base,flan-t5-base,flan-t5-base,flan-t5-base,vicuna-13b-v1.3,vicuna-13b-v1.3,vicuna-13b-v1.3,vicuna-13b-v1.3,vicuna-13b-v1.3,vicuna-13b-v1.3,vicuna-13b-v1.3,vicuna-13b-v1.3,vicuna-13b-v1.3,vicuna-13b-v1.3
Unnamed: 0_level_1,similarity_score,rouge-1_r,rouge-1_p,rouge-1_f,rouge-2_r,rouge-2_p,rouge-2_f,rouge-l_r,rouge-l_p,rouge-l_f,similarity_score,rouge-1_r,rouge-1_p,rouge-1_f,rouge-2_r,rouge-2_p,rouge-2_f,rouge-l_r,rouge-l_p,rouge-l_f,similarity_score,rouge-1_r,rouge-1_p,rouge-1_f,rouge-2_r,rouge-2_p,rouge-2_f,rouge-l_r,rouge-l_p,rouge-l_f,similarity_score,rouge-1_r,rouge-1_p,rouge-1_f,rouge-2_r,rouge-2_p,rouge-2_f,rouge-l_r,rouge-l_p,rouge-l_f,similarity_score,rouge-1_r,rouge-1_p,rouge-1_f,rouge-2_r,rouge-2_p,rouge-2_f,rouge-l_r,rouge-l_p,rouge-l_f
0,0.642,0.486,0.252,0.322,0.141,0.068,0.088,0.425,0.217,0.278,0.658,0.316,0.143,0.185,0.072,0.025,0.035,0.273,0.122,0.157,0.526,0.16,0.241,0.184,0.011,0.028,0.016,0.148,0.225,0.171,0.63,0.251,0.357,0.268,0.078,0.129,0.092,0.241,0.335,0.255,0.756,0.531,0.231,0.311,0.201,0.071,0.1,0.475,0.2,0.271
