# Installations of packages required

In [1]:
!pip install -U bitsandbytes transformers peft datasets accelerate nltk rouge-score trl==0.12.0
# !pip install -U
# !pip install -q -U git+https://github.com/huggingface/peft.git
# !pip install -U peft
# !pip install datasets
# !pip install trl==0.12.0
# ! pip install -U accelerate
# !pip install nltk rouge-score



# Huggingface and wanddb login

In [2]:
!huggingface-cli login --token xxxxxxxxxxxxxxxxxxxxxxx

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
The token `Huggingface_store_token` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `Huggingface_store_token`


In [3]:
!huggingface-cli whoami

Existance


In [4]:
import wandb
!wandb login xxxxxxxxxxxxxxxx

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


# Import Neccesary packages for finetuning

In [5]:
import torch
import time
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt

from datasets import Dataset, load_dataset
from datasets import load_dataset#, load_metric
from transformers import pipeline, set_seed
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import warnings
warnings.filterwarnings("ignore")

# import dataset from huggingface

In [6]:
huggingface_dataset_name = "ccdv/pubmed-summarization"

# dataset = load_dataset(huggingface_dataset_name, "3.0.0")
dataset = load_dataset(huggingface_dataset_name)
dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'abstract'],
        num_rows: 119924
    })
    validation: Dataset({
        features: ['article', 'abstract'],
        num_rows: 6633
    })
    test: Dataset({
        features: ['article', 'abstract'],
        num_rows: 6658
    })
})

# View One sample from dataset
using this we will get what kind of data is and what kind of structure it holds

In [7]:
sample = dataset["train"][1]
print(f"""Article (excerpt of 500 characters, total length: {len(sample["article"])}):""")
print(sample["article"][:500])
print(f'\nSummary (length: {len(sample["abstract"])}):')
print(sample["abstract"])

Article (excerpt of 500 characters, total length: 18281):
it occurs in more than 50% of patients and may reach 90% in certain types of cancers , especially in patients undergoing chemotherapy and/or radiation therapy.1 anemia is defined as an inadequate circulating level of hemoglobin ( hb ) ( hb < 12 g / dl ) and may arise as a result of the underlying disease , bleeding , poor nutrition , chemotherapy , or radiation therapy . 
 preliminary studies suggest that survival and loco - regional control after radiation therapy , especially in head and neck 

Summary (length: 2010):
backgroundanemia in patients with cancer who are undergoing active therapy is commonly encountered and may worsen quality of life in these patients . the effect of blood transfusion is often temporary and may be associated with serious adverse events . 
 erythropoiesis - stimulating agents are not effective in 30%50% of patients and may have a negative effect on overall survival.aimsto assess the efficacy and fea

# convert to Instruct finetune dataset

In [8]:
def format_instruction(dialogue: str, summary: str):
    return f"""### Instruction:
Summarize the following conversation.

### Input:
{dialogue.strip()}

### Summary:
{summary}
""".strip()

def generate_instruction_dataset(data_point):

    return {
        "article": data_point["article"],
        "abstract": data_point["abstract"],
        "text": format_instruction(data_point["article"],data_point["abstract"])
    }

def process_dataset(data: Dataset):
    return (
        data.shuffle(seed=42)
        .map(generate_instruction_dataset)
    )

# apply preprocessing onto process_dataset



In [9]:
dataset["train"] = process_dataset(dataset["train"])
dataset["test"] = process_dataset(dataset["validation"])
dataset["validation"] = process_dataset(dataset["validation"])

# Select 1000 rows from the training split
train_data = dataset['train'].shuffle(seed=42).select([i for i in range(1000)])

# Select 100 rows from the test and validation splits
test_data = dataset['test'].shuffle(seed=42).select([i for i in range(100)])
validation_data = dataset['validation'].shuffle(seed=42).select([i for i in range(100)])

train_data,test_data,validation_data

(Dataset({
     features: ['article', 'abstract', 'text'],
     num_rows: 1000
 }),
 Dataset({
     features: ['article', 'abstract', 'text'],
     num_rows: 100
 }),
 Dataset({
     features: ['article', 'abstract', 'text'],
     num_rows: 100
 }))

# Load model in 4-bit quant

In [10]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_id =  "NousResearch/Llama-2-7b-hf"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto")
# model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="cuda:0")

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

# ZERO-SHOT INFERENCE WITH LLAMA-2 7B

In [11]:
index = 2

dialogue = test_data['article'][index]
summary = test_data['abstract'][index]

prompt = f"""
Summarize the following conversation.

### Input:
{dialogue}

### Summary:
"""

inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
output = tokenizer.decode(
    model.generate(
        inputs["input_ids"],
        max_new_tokens=100,
    )[0],
    skip_special_tokens=True
)

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')

---------------------------------------------------------------------------------------------------
INPUT PROMPT:

Summarize the following conversation.

### Input:
the north american association of central cancer registries reported in the period from 1992 to 1997 more than 1.6 million females with the diagnosis of cancer . of these , 67,746 were ovarian cancers , 1563 ( or 2.3% ) of which occurred in patients younger than 25 . 
 close to 50% ( 780 cases ) were reported in women aged 20 to 24 , 481 cases in the 15- to 19-year - old group , and 302 cases in the neonates to the 14-year - old group.1 the three major types of ovarian tumors are epithelial , sex cord , and germ cell . 
 epithelial cell tumors represent the majority of all ovarian neoplasms ( 82%).conversely , germ cell tumors ( gcts ) are rare , comprising approximately 20% of all ovarian tumors , both benign and malignant . 
 the most commonly occurring gct is the dysgerminoma , which accounts for only 1 - 5% of all ovari

# TRAINING STEP (FINE TUNING

In [12]:
from peft import prepare_model_for_kbit_training

def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():

        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )


model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): Lla

# Peft Model define

In [13]:
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=16,
    lora_alpha=64,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], #specific to Llama models.
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
print_trainable_parameters(model)

trainable params: 16777216 || all params: 3517190144 || trainable%: 0.477006226934315


# Defining the training arguements

In [14]:
from transformers import TrainingArguments
import time


OUTPUT_DIR = f"llama2-summarizations-adapter-models-{time.time()}"

training_arguments = TrainingArguments(
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    optim="paged_adamw_32bit",
    logging_steps=1,
    learning_rate=1e-4,
    fp16=True,
    max_grad_norm=0.3,
    num_train_epochs=3,
    evaluation_strategy="steps",
    eval_steps=0.2,
    warmup_ratio=0.05,
    save_strategy="epoch",
    group_by_length=True,
    output_dir=OUTPUT_DIR,
    # report_to="tensorboard",
    report_to=["wandb", "tensorboard"],
    push_to_hub=True,
    save_safetensors=True,
    lr_scheduler_type="cosine",
    seed=42,
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!

In [15]:
from trl import SFTTrainer

trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=validation_data,
    peft_config=lora_config,
    dataset_text_field="text",
    max_seq_length=1024,
    tokenizer=tokenizer,
    args=training_arguments,
)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [16]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mvipulgote4[0m ([33mexistence[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss
38,1.6864,1.592302
76,1.5727,1.577435
114,1.5508,1.570868
152,1.4845,1.570034


TrainOutput(global_step=186, training_loss=1.6106457511583965, metrics={'train_runtime': 9189.1049, 'train_samples_per_second': 0.326, 'train_steps_per_second': 0.02, 'total_flos': 1.1902384589134234e+17, 'train_loss': 1.6106457511583965, 'epoch': 2.96})

In [17]:
peft_model_path="./peft-summary"

trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)

('./peft-summary/tokenizer_config.json',
 './peft-summary/special_tokens_map.json',
 './peft-summary/tokenizer.model',
 './peft-summary/added_tokens.json',
 './peft-summary/tokenizer.json')

# Inferance

In [18]:
from transformers import TextStreamer

model.config.use_cache = True
model.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096, padding_idx=0)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              

In [19]:
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer

trained_model = AutoPeftModelForCausalLM.from_pretrained(
    peft_model_path,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    load_in_4bit=True,
)

tokenizer = AutoTokenizer.from_pretrained(peft_model_path)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

# One Sample based Infereance after finetuning

In [20]:
index = 51

dialogue = train_data['article'][index][:10000]
summary = train_data['abstract'][index]

prompt = f"""
Summarize the following conversation.

### Input:
{dialogue}

### Summary:
"""

input_ids = tokenizer(prompt, return_tensors='pt',truncation=True).input_ids.cuda()
outputs = trained_model.generate(input_ids=input_ids, max_new_tokens=200, )
output= tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0][len(prompt):]

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
print(dash_line)
print(f'TRAINED MODEL GENERATED TEXT :\n{output}')

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


---------------------------------------------------------------------------------------------------
INPUT PROMPT:

Summarize the following conversation.

### Input:
in the course of treating cancer patients hospitalized for surgical procedures , health providers variably utilize multiple processes of care . 
 processes of care , defined by the centers for medicare and medicaid services ( cms ) as a  series of interrelated activities undertaken to achieve objectives ,  are believed to be integral components of safe and effective care received by surgery patients , based on a belief that they prevent or ameliorate the impact of complications 1 . for the most part , these processes are optional and under the control of the surgical and anesthetic teams . 
 the cmsmonitored processes are an important component of treatment to study , because they are readily quantified and more accepted by providers as indicators of their individual quality compared to outcome measures attributable to pati

# Calculate Rouge score and BLEU score
  - generally Rouge score is often used for summarization task.
  - BLEU score is calculated for next word prediction realted task....it shows how good the generation accuracy.

In [22]:
import random
import numpy as np
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from tqdm import tqdm
random.seed(42)

num_samples = 30
indices = random.sample(range(len(train_data)), num_samples)

smoothie = SmoothingFunction().method4
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)


bleu_scores = []
rouge1_scores = []
rouge2_scores = []
rougeL_scores = []

for idx in tqdm(indices):
    dialogue = train_data['article'][idx][:10000]
    reference_summary = train_data['abstract'][idx]

    prompt = f"""
    Summarize the following conversation.

    ### Input:
    {dialogue}

    ### Summary:
    """
    input_ids = tokenizer(prompt, return_tensors='pt', truncation=True).input_ids.cuda()
    outputs = trained_model.generate(input_ids=input_ids, max_new_tokens=200)
    generated_summary = tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0][len(prompt):]

    bleu = sentence_bleu([reference_summary.split()], generated_summary.split(), smoothing_function=smoothie)
    bleu_scores.append(bleu)

    rouge_scores = scorer.score(reference_summary, generated_summary)
    rouge1_scores.append(rouge_scores['rouge1'].fmeasure)
    rouge2_scores.append(rouge_scores['rouge2'].fmeasure)
    rougeL_scores.append(rouge_scores['rougeL'].fmeasure)

avg_bleu = np.mean(bleu_scores)
avg_rouge1 = np.mean(rouge1_scores)
avg_rouge2 = np.mean(rouge2_scores)
avg_rougeL = np.mean(rougeL_scores)

print(f"Average BLEU Score: {avg_bleu:.4f}")
print(f"Average ROUGE-1 Score: {avg_rouge1:.4f}")
print(f"Average ROUGE-2 Score: {avg_rouge2:.4f}")
print(f"Average ROUGE-L Score: {avg_rougeL:.4f}")

100%|██████████| 30/30 [15:10<00:00, 30.35s/it]

Average BLEU Score: 0.0544
Average ROUGE-1 Score: 0.2790
Average ROUGE-2 Score: 0.0866
Average ROUGE-L Score: 0.1581



