In [None]:
# Mount Google Drive so files stored there are readable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Location of the chat dataset on the mounted Drive; the configuration cell reads this variable.
dataset_path = "/content/drive/My Drive/llama_chat_dataset.json"
# List the file to confirm it exists (and see its size) before going any further.
!ls -l "{dataset_path}"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
-rw------- 1 root root 449373 Jan 26  2025 '/content/drive/My Drive/llama_chat_dataset.json'


In [None]:
!pip install -q -U transformers datasets accelerate peft trl bitsandbytes

In [None]:
# Interactive Hugging Face Hub login (renders a widget in the notebook output).
# NOTE(review): presumably required because the base model repo is gated — confirm.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer, SFTConfig
from transformers.trainer_utils import get_last_checkpoint

In [None]:
# --- Model / data identifiers ---
base_model = "meta-llama/Llama-3.1-8B-Instruct"
dataset_name = dataset_path  # JSON dataset on the mounted Drive (defined in the first cell)
new_model_name = "llama-3.1-8b-finance-chat"

# --- QLoRA Configuration ---
lora_r = 64         # passed to LoraConfig as `r`
lora_alpha = 16
lora_dropout = 0.1

# --- bitsandbytes Configuration (4-bit Quantization) ---
use_4bit = True
bnb_4bit_compute_dtype = "bfloat16"  # resolved to a torch dtype via getattr(torch, ...)
bnb_4bit_quant_type = "nf4"
use_nested_quant = False

# --- TrainingArguments Configuration ---
output_dir = "/content/drive/My Drive/llama-finance-checkpoints"
num_train_epochs = 3
fp16 = False
bf16 = True # Enable bfloat16
per_device_train_batch_size = 4
gradient_accumulation_steps = 1
gradient_checkpointing = True
max_grad_norm = 0.3
learning_rate = 2e-4
weight_decay = 0.001
optim = "paged_adamw_32bit"
# The plain "constant" schedule has no warmup phase, so `warmup_ratio` below was
# silently ignored. "constant_with_warmup" keeps the constant learning rate after
# a linear warmup, which is what setting `warmup_ratio` asks for.
lr_scheduler_type = "constant_with_warmup"
max_steps = -1  # -1 => the run length is governed by num_train_epochs
warmup_ratio = 0.03
group_by_length = True
save_steps = 50     # also reused as the evaluation interval
logging_steps = 10

# --- SFTTrainer Configuration ---
max_seq_length = 512
packing = False
device_map = {"": 0}  # place the whole model on device 0

In [None]:
# --- Dataset: load the JSON file and hold out 10% for validation ---
print("Loading dataset...")
full_dataset = load_dataset("json", data_files=dataset_name, split="train")

# Fixed seed keeps the train/validation split identical across runs.
split_dataset = full_dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)

print(f"Total examples: {len(full_dataset)}")
print(f"Training examples: {len(split_dataset['train'])}")
print(f"Validation examples: {len(split_dataset['test'])}")
print("-" * 30)

# --- Quantization: build the bitsandbytes config from the settings cell ---
print("Setting up 4-bit quantization...")
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)  # "bfloat16" -> torch.bfloat16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# --- Base model: loaded quantized onto the device given by `device_map` ---
print("Loading base model...")
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map=device_map,
    token=True
)
# Disable the generation cache on the config for training.
model.config.use_cache = False
# NOTE(review): pretraining_tp=1 looks carried over from common Llama fine-tuning recipes — confirm it is still needed.
model.config.pretraining_tp = 1

# --- Tokenizer ---
print("Loading tokenizer...")
# `trust_remote_code=True` was dropped: it permits executing code shipped in the
# model repo and is not needed for a tokenizer supported natively by transformers.
# `token=True` mirrors the model load above so both use the stored Hub credentials.
tokenizer = AutoTokenizer.from_pretrained(base_model, token=True)
# Reuse EOS as the padding token and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading dataset...


Generating train split: 0 examples [00:00, ? examples/s]

Total examples: 3325
Training examples: 2992
Validation examples: 333
------------------------------
Setting up 4-bit quantization...
Loading base model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

Loading tokenizer...


tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [None]:
print("Configuring PEFT (LoRA)...")

# Names of the projection modules that receive LoRA adapters:
# the q/k/v/o projections followed by the gate/up/down projections.
lora_target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"]
lora_target_modules += ["gate_proj", "up_proj", "down_proj"]

# Adapter hyperparameters come from the configuration cell; biases are left untouched.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
    target_modules=lora_target_modules,
)

Configuring PEFT (LoRA)...


In [None]:
print("Setting up SFTConfig...")
# Training configuration for SFTTrainer; every value comes from the configuration cell.
training_arguments = SFTConfig(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    # Previously `gradient_checkpointing` was defined in the configuration cell but
    # never forwarded, so changing it there had no effect on training.
    gradient_checkpointing=gradient_checkpointing,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard",
    # Evaluate on the validation split at the same cadence as checkpoint saves.
    eval_strategy="steps",
    eval_steps=save_steps,

    # SFT-specific options: which dataset column holds the text, and sequence handling.
    dataset_text_field="text",
    max_length=max_seq_length,
    packing=packing,
)

Setting up SFTConfig...


In [None]:
print("Initializing SFTTrainer...")

# Collect the constructor arguments first so the call site stays short:
# the quantized base model, both dataset splits, the LoRA config, the tokenizer
# and the SFTConfig built in the previous cell.
sft_trainer_kwargs = dict(
    model=model,
    args=training_arguments,
    train_dataset=split_dataset['train'],
    eval_dataset=split_dataset['test'],
    processing_class=tokenizer,
    peft_config=peft_config,
)
trainer = SFTTrainer(**sft_trainer_kwargs)


Initializing SFTTrainer...


Adding EOS to train dataset:   0%|          | 0/2992 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/2992 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/2992 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/333 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/333 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/333 [00:00<?, ? examples/s]

In [None]:
print("Starting training...")

# `get_last_checkpoint` lists the directory, so it fails on a first run when the
# output directory has not been created yet. Treat "no directory" as "no checkpoint".
checkpoint_root = training_arguments.output_dir
if os.path.isdir(checkpoint_root):
    last_checkpoint = get_last_checkpoint(checkpoint_root)
else:
    last_checkpoint = None

if last_checkpoint is None:
    print("No checkpoint found. Starting training from scratch.")
    trainer.train()
else:
    print(f"Checkpoint found at {last_checkpoint}. Resuming training.")
    try:
        # Resume from the exact checkpoint that was reported above.
        trainer.train(resume_from_checkpoint=last_checkpoint)
    except Exception as e:
        # Report the failure, then re-raise: swallowing it would let the cell print
        # "Training finished." even though no training happened (e.g. when the newest
        # checkpoint on Drive is truncated or otherwise unreadable).
        print(f"Error resuming from checkpoint: {e}")
        raise

print("Training finished.")

Initializing SFTTrainer...


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 128009, 'pad_token_id': 128009}.


Starting training...


SafetensorError: Error while deserializing header: incomplete metadata, file not fully covered

In [None]:
print("Loading model for inference...")
import gc

# Drop the training-time objects before loading a second copy of the base model.
# `globals().pop(..., None)` instead of `del` keeps this cell re-runnable, and usable
# in a session where training was skipped and these names were never defined.
for _stale_name in ("model", "trainer"):
    globals().pop(_stale_name, None)
gc.collect()
torch.cuda.empty_cache()

# Fresh quantized base model, loaded with the same settings as for training.
reloaded_model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map=device_map,
    token=True
)

# Checkpoint whose LoRA adapter is evaluated; derived from `output_dir` so the
# path cannot drift from the training configuration.
adapter_path_for_loading = os.path.join(output_dir, "checkpoint-1500")

# Attach the adapter weights to the base model.
reloaded_model = PeftModel.from_pretrained(reloaded_model, adapter_path_for_loading)

# Load the tokenizer from the same checkpoint directory.
# (The original line started with a stray "=", which was a syntax error.)
reloaded_tokenizer = AutoTokenizer.from_pretrained(adapter_path_for_loading)

print("Setting up inference pipeline...")
pipe = pipeline(task="text-generation", model=reloaded_model, tokenizer=reloaded_tokenizer, max_length=200)

# --- Test Prompts ---
test_prompts = [
    "What did Geojit Financial Services recommend for Avanti Feeds?",
    "What is the upside potential of InterGlobe Aviation stock?",
]

for prompt_number, prompt in enumerate(test_prompts, start=1):
    messages = [
        {"role": "user", "content": prompt}
    ]
    # Render the conversation with the tokenizer's chat template, ending with the
    # assistant header so the model continues with its answer.
    # NOTE(review): the rendered text may already contain the BOS token — confirm the
    # pipeline does not add a second one when tokenizing it.
    prompt_template = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Generate response
    result = pipe(prompt_template)

    # The first result keeps its unnumbered header; later ones are numbered.
    header = "--- INFERENCE RESULT ---" if prompt_number == 1 else f"--- INFERENCE RESULT {prompt_number} ---"
    print(f"\n{header}")
    print(result[0]['generated_text'])
    print("-" * len(header))

Loading model for inference...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:0


Setting up inference pipeline...

--- INFERENCE RESULT ---
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

What did Geojit Financial Services recommend for Avanti Feeds?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

accumulate rating
------------------------

--- INFERENCE RESULT 2 ---
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

What is the upside potential of InterGlobe Aviation stock?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

26 percent
--------------------------


In [None]:
!pip install -q evaluate rouge_score bert_score

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


In [None]:
import evaluate
from tqdm.auto import tqdm

# Text that precedes the model's answer in the generated output.
_ASSISTANT_HEADER = "<|start_header_id|>assistant<|end_header_id|>\n\n"


def _split_question_and_reference(text):
    """Return (question, reference) parsed from one dataset `text` entry.

    The question is the text between '[INST]' and '[/INST]'; the reference is the
    text between '[/INST]' and '</s>'. Raises IndexError when a marker is missing.
    """
    question = text.split('[INST]')[1].split('[/INST]')[0].strip()
    reference = text.split('[/INST]')[1].split('</s>')[0].strip()
    return question, reference


def _extract_assistant_answer(full_output):
    """Return the text after the last assistant header, cut at '<|eot_id|>'.

    Falls back to the unmodified output when the header is not present.
    """
    header_index = full_output.rfind(_ASSISTANT_HEADER)
    if header_index == -1:
        return full_output
    answer = full_output[header_index + len(_ASSISTANT_HEADER):].strip()
    return answer.split('<|eot_id|>')[0].strip()


predictions = []
references = []

print("Generating predictions on validation set...")
for example in tqdm(split_dataset['test']):
    text = example['text']

    try:
        question, reference = _split_question_and_reference(text)

        messages = [
            {"role": "user", "content": question}
        ]
        prompt_template = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

        result = pipe(prompt_template, max_new_tokens=100, num_return_sequences=1)
        model_answer = _extract_assistant_answer(result[0]['generated_text'])
    except Exception as e:
        print(f"Skipping an example due to parsing error: {e}")
        continue

    # Append both only after the whole example succeeded. Previously the reference
    # was appended first, so any later failure left the two lists misaligned and
    # every subsequent prediction was scored against the wrong reference.
    predictions.append(model_answer)
    references.append(reference)

# The metrics pair items by position, so the lists must stay the same length.
assert len(predictions) == len(references)

print("Generation complete.")
print(f"Total predictions: {len(predictions)}")
print(f"Total references: {len(references)}")

Generating predictions on validation set...


  0%|          | 0/333 [00:00<?, ?it/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Generation complete.
Total predictions: 333
Total references: 333


In [None]:
import numpy as np

# Both metrics score the same prediction/reference pairs, so bundle them once.
scoring_inputs = {"predictions": predictions, "references": references}

# --- ROUGE Score ---
print("\n--- Computing ROUGE Score ---")
rouge = evaluate.load('rouge')
rouge_results = rouge.compute(**scoring_inputs)
print(rouge_results)

# --- BERTScore ---
print("\n--- Computing BERTScore ---")
bertscore = evaluate.load('bertscore')
bert_results = bertscore.compute(lang="en", **scoring_inputs)

# Reduce the per-example F1 values to a single average for reporting.
bert_f1 = np.mean(bert_results['f1'])
print(f"\nBERTScore (Average F1): {bert_f1:.4f}")


--- Computing ROUGE Score ---


Downloading builder script: 0.00B [00:00, ?B/s]

{'rouge1': np.float64(0.1246989866849243), 'rouge2': np.float64(0.032310794279809776), 'rougeL': np.float64(0.12261033216459753), 'rougeLsum': np.float64(0.12237679969668844)}

--- Computing BERTScore ---


Downloading builder script: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



BERTScore (Average F1): 0.8486
