In [1]:
!pip install -q -U transformers peft bitsandbytes datasets accelerate

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m108.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.7/60.7 MB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m520.7/520.7 kB[0m [31m48.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.6/47.6 MB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
from huggingface_hub import notebook_login
notebook_login()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer

# -------- Config --------
MODEL_ID = "google/gemma-3-4b-it"
DATA_PATH = "/content/insurance_sales_data_expanded.jsonl"
OUTPUT_DIR = "./gemma-3-finetuned"

CONFIG = {
    "max_length": 1024,
    "batch_size": 1,
    "grad_acc": 8,
    "lr": 2e-4,
    "epochs": 1,
    "lora_r": 8,
    "lora_alpha": 16,
}

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Running on: {device}")

Running on: cuda


In [4]:
# -------- Tokenizer --------
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# -------- Parse dialogue into user/agent messages --------
def parse_dialogue_to_messages(dialogue_str):
    messages = []
    lines = dialogue_str.strip().split("\n")
    for line in lines:
        if line.startswith("User:"):
            content = line.replace("User:", "").strip()
            messages.append({"role": "user", "content": content})
        elif line.startswith("Agent:"):
            content = line.replace("Agent:", "").strip()
            messages.append({"role": "assistant", "content": content})
    return messages

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

In [5]:
def build_example(example):
    messages = parse_dialogue_to_messages(example["dialogue"])
    assert messages and messages[-1]["role"] == "assistant"

    context_messages = messages[:-1]
    target_text = messages[-1]["content"] + tokenizer.eos_token

    prompt_text = tokenizer.apply_chat_template(
        context_messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    prompt_ids = tokenizer(prompt_text, add_special_tokens=False)["input_ids"]
    target_ids = tokenizer(target_text, add_special_tokens=False)["input_ids"]

    input_ids = prompt_ids + target_ids
    max_len = CONFIG["max_length"]

    if len(input_ids) > max_len:
        input_ids = input_ids[-max_len:]
        target_start = max(0, len(input_ids) - len(target_ids))
    else:
        target_start = len(prompt_ids)

    labels = [-100] * target_start + input_ids[target_start:]

    pad_id = tokenizer.pad_token_id
    attention_mask = [1] * len(input_ids)
    token_type_ids = [0] * len(input_ids)   # NEW: all zeros

    while len(input_ids) < max_len:
        input_ids.append(pad_id)
        labels.append(-100)
        attention_mask.append(0)
        token_type_ids.append(0)           # pad with 0 as well

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
        "token_type_ids": token_type_ids,  # NEW
    }

dataset = load_dataset("json", data_files=DATA_PATH, split="train")
dataset = dataset.map(build_example, remove_columns=dataset.column_names)
dataset.set_format("torch")

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [6]:
from peft import LoraConfig, get_peft_model, TaskType
from transformers import TrainingArguments, Trainer, AutoModelForCausalLM, BitsAndBytesConfig

# 4-bit base model
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

from peft import prepare_model_for_kbit_training
model = prepare_model_for_kbit_training(model)
model.gradient_checkpointing_enable()
model.config.use_cache = False

# LoRA
lora_config = LoraConfig(
    r=CONFIG["lora_r"],
    lora_alpha=CONFIG["lora_alpha"],
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Training args
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=CONFIG["batch_size"],
    gradient_accumulation_steps=CONFIG["grad_acc"],
    learning_rate=CONFIG["lr"],
    num_train_epochs=CONFIG["epochs"],
    bf16=True,
    logging_steps=10,
    save_strategy="epoch",
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    processing_class=tokenizer,
)

print("\n==== STARTING TRAINING ====\n")
trainer.train()

model.save_pretrained(OUTPUT_DIR)
print("Done! Model saved to:", OUTPUT_DIR)

model.safetensors.index.json:   0%|          | 0.00/90.6k [00:00<?, ?B/s]

Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/883 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 1, 'bos_token_id': 2, 'pad_token_id': 0}.


trainable params: 16,394,240 || all params: 4,316,473,712 || trainable%: 0.3798

==== STARTING TRAINING ====



`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
10,4.166555
20,2.502126
30,2.103988


Done! Model saved to: ./gemma-3-finetuned


In [12]:
import torch
import gc

# 1. Clear out the variables from the training session
if 'trainer' in locals():
    del trainer
if 'model' in locals():
    del model
if 'base_model' in locals():
    del base_model

# 2. Force Python's garbage collector to run
gc.collect()

# 3. Clear the CUDA cache (The most important step for the GPU)
torch.cuda.empty_cache()

# 4. (Optional) Check how much memory you just recovered
free_memory = torch.cuda.get_device_properties(0).total_memory - torch.cuda.memory_reserved(0)
print(f"Memory Cleared! Current Free VRAM: {free_memory / 1024**3:.2f} GB")

Memory Cleared! Current Free VRAM: 7.75 GB


In [13]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

# 1. Paths (Ensure these match your training)
MODEL_ID = "google/gemma-3-4b-it"
ADAPTER_PATH = "./gemma-3-finetuned"

# 2. Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# 3. Load Base Model in 4-bit (Essential for Colab T4)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

# 4. Load your Adapter
model = PeftModel.from_pretrained(base_model, ADAPTER_PATH)
model.eval()

def agent_response(user_input):
    messages = [
        {"role": "user", "content": f"Answer as a helpful insurance agent: {user_input}"}
    ]

    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to("cuda")

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=128,
            temperature=0.4,
            repetition_penalty=1.2,
            no_repeat_ngram_size=3,
            do_sample=True,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )

    input_length = inputs.input_ids.shape[1]
    response = tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True)
    return response.strip()

# --- THE MISSING PART: CHAT LOOP ---
print("--- Insurance Agent Bot Active (Type 'quit' to stop) ---")
while True:
    user_query = input("You: ")
    if user_query.lower() in ["quit", "exit", "stop"]:
        break

    bot_reply = agent_response(user_query)
    print(f"Agent: {bot_reply}\n")

Loading weights:   0%|          | 0/883 [00:00<?, ?it/s]

--- Insurance Agent Bot Active (Type 'quit' to stop) ---
You: "क्या Hospitalization के बाद के खर्चे भी Cover होते हैं?"
Agent: जी हाँ, अगर यह Pre-existing condition न हो तो hospitalization expenses cover किए जाते हैं।

You: "क्या Nursing charges अलग से देने होंगे?
Agent: जी हाँ, Hospitalization के दौरान आपके खर्च किए गए सभी expenses का पूरा cover मिलता है। क्या मैं आपकी family details नोट करूँ?”

You: अगर मैं Claim नहीं करता तो क्या फायदा मिलेगा?
Agent: जी हाँ, यह आपके लिए एक बहुत ही अच्छा विकल्प है।

You: "क्या मैं Installments में payment कर सकता हूँ?
Agent: जी हाँ, आप installment के हिसाब से payment कर सकते हैं। क्या मैं process शुरू करूँ?

You: Waiting period कितना होता है?
Agent: जी हाँ, यह आपके age और health condition पर निर्भर करता है।

You: Plus benefit के बारे में बताइए।
Agent: जी हाँ, क्या मैं आपको इसे समझाऊँ?

You: hello
Agent: Hi there, thanks for reaching out! I'm [Your Name], and I’d be happy to help you find the right coverage. Can we start with your details?

You: "क्या

KeyboardInterrupt: Interrupted by user

You: exit
