In [1]:
!pip install -U pypdf2 bitsandbytes



In [2]:
import re
import json
from pathlib import Path
from PyPDF2 import PdfReader

# Load PDF
pdf_path = "New_QuestionBank.pdf"
reader = PdfReader(pdf_path)

# Extract text from PDF
text = ""
for page in reader.pages:
    text += page.extract_text() + "\n"

# Regex to capture Q/A pairs
qa_pairs = re.findall(r"Q:\s*(.*?)\s*A:\s*(.*?)(?=Q:|$)", text, re.DOTALL)

dataset = []
for q, a in qa_pairs:
    q = q.strip().replace("\n", " ")
    a = a.strip().replace("\n", " ")

    # Format according to your FormatDataset style
    record = {
        "instruction": q,
        "output": a
    }
    dataset.append(record)

# Save as JSONL
out_path = Path("tirumala_dataset.jsonl")
with open(out_path, "w", encoding="utf-8") as f:
    for item in dataset:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print(f"Converted {len(dataset)} Q/A pairs into {out_path}")


Converted 135 Q/A pairs into tirumala_dataset.jsonl


In [5]:
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model

In [25]:
# ==============================
# 1. Load Dataset
# ==============================
dataset = load_dataset("json", data_files="tirumala_dataset.jsonl")

# Split into train/validation
dataset = dataset["train"].train_test_split(test_size=0.1, seed=42)

Generating train split: 135 examples [00:00, 14944.86 examples/s]


In [26]:
dataset['test']

Dataset({
    features: ['instruction', 'output'],
    num_rows: 14
})

In [27]:
dataset['test']['instruction']

Column(['How   can   I   get   a   Laddu   prasadam?', 'How   many   steps   are   there   in   Alipiri   Mettu   and   Srivari   Mettu?', 'Where   can   I   get   laddu   prasadam   after   darshan?', 'What   are   the   options   for   public   transport   from   Tirupati   to   Tirumala?', 'What   sevas   and   rituals   can   devotees   participate   in?'])

In [28]:
dataset['test']['output']

Column(['After   darshan,   you   can   purchase   the   famous   Tirupati   Laddu   at   the   designated   counters.   Each   darshan   token   often   includes   a   free   Laddu   as   well.', 'Alipiri   Mettu   has   about   3,550   steps   covering   a   distance   of   9   km,   taking   3–4   hours   to   climb.   Srivari   Mettu   has   about   2,388   steps   with   a   shorter   distance   of   2.1   km,   taking   around   2–3   hours   to   climb.', 'After   darshan,   every   pilgrim   receives   laddu   prasadam   at   the   Laddu   Counter   inside   the   temple   complex .   Additional   laddus   can   be   purchased.', 'APSRTC   runs   frequent   bus   services   from   various   points   in   Tirupati,   including   the   railway   station   and   bus   stand,   to   Tirumala.', 'Important   sevas   include:   Daily   Sevas :   Suprabhatam,   Thomala   Seva,   Archana,   Ekantha   Seva,   Weekly   Sevas :   Kalyanotsavam,   Sahasra   Deepalankarana,   Annual   Event

In [29]:
from huggingface_hub import login
login("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")

In [30]:
##optional
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"

bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,  # or load_in_4bit=True
    llm_int8_enable_fp32_cpu_offload=True
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto"   # lets HF place layers across GPU/CPU
)


Loading checkpoint shards: 100%|██████████| 4/4 [00:28<00:00,  7.06s/it]


In [31]:
# ==============================
# 2. Model & Tokenizer
# ==============================
# MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"   # <-- Changed to a public model



# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
# tokenizer.pad_token = tokenizer.eos_token  # ensure pad token is defined

In [32]:
# ==============================
# 3. Tokenize Function
# ==============================
def tokenize_function(example):
    tokenized_example = tokenizer(
        str(example["instruction"]) + " " + str(example["output"]), # Ensure values are strings before concatenation
        truncation=True,
        max_length=1024,
        padding="max_length" # Add padding to ensure uniform length
    )
    return {
        "input_ids": tokenized_example["input_ids"],
        "attention_mask": tokenized_example["attention_mask"],
        "labels": tokenized_example["input_ids"].copy(),
        "instruction": example["instruction"], # Keep original instruction
        "output": example["output"] # Keep original output
    }

tokenized_datasets = dataset.map(tokenize_function, batched=False) # Process without batching

# Set dataset format to torch to help data collator
tokenized_datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels', 'instruction', 'output'])

Map: 100%|██████████| 121/121 [00:00<00:00, 1235.22 examples/s]
Map: 100%|██████████| 14/14 [00:00<00:00, 736.89 examples/s]


In [33]:
!pip install huggingface_hub[hf_xet] 
!pip install hf_xet



In [34]:
# ==============================
# 4. Load Base Model + LoRA
# ==============================
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model

# 1. Define the quantization configuration
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=quantization_config,
    device_map="auto"
)

# LoRA config
lora_config = LoraConfig(
    r=32,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, lora_config)

ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 

In [None]:
# ==============================
# 5. Training Arguments
# ==============================
training_args = TrainingArguments(
    output_dir="./finetuned_model",
    eval_strategy="epoch", # Replaced evaluation_strategy with eval_strategy
    save_strategy="epoch",
    learning_rate=2e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=2,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=25,
    save_total_limit=2,
    push_to_hub=False,
    fp16=True,
    gradient_accumulation_steps=4,
    warmup_steps=100,
    lr_scheduler_type="cosine",
    load_best_model_at_end=True
)

In [None]:
# ==============================
# 6. Trainer
# ==============================
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    processing_class=tokenizer, # Use processing_class instead of tokenizer
    data_collator=data_collator, # Add data collator for padding
)

In [None]:
# ==============================
# 7. Train Model
# ==============================
trainer.train()
# d674e8f3b33ef23520bf99b3a5a7504c03fdaa2a - wandb api key

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 2}.


Epoch,Training Loss,Validation Loss
1,No log,2.42045
2,No log,2.332589
3,No log,2.171201
4,2.272900,1.946551
5,2.272900,1.713401
6,2.272900,1.478219
7,1.726300,1.329298
8,1.726300,1.197964
9,1.726300,1.140367
10,1.068500,1.092266


TrainOutput(global_step=160, training_loss=0.996976638585329, metrics={'train_runtime': 2216.2264, 'train_samples_per_second': 1.092, 'train_steps_per_second': 0.072, 'total_flos': 1.551560953823232e+16, 'train_loss': 0.996976638585329, 'epoch': 20.0})

In [17]:
# ==============================
# 8. Save Model
# ==============================
model.save_pretrained("./finetuned_model")
tokenizer.save_pretrained("./finetuned_model")

print("✅ Fine-tuning complete! Model saved at ./finetuned_model")

✅ Fine-tuning complete! Model saved at ./finetuned_model




---

# Loading and testing the model

# Evaluating the model with test data

In [18]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Define the path where you saved the model and tokenizer
saved_model_path = "./finetuned_model"

loaded_tokenizer = AutoTokenizer.from_pretrained(saved_model_path)
loaded_model = AutoModelForCausalLM.from_pretrained(saved_model_path)

# Ensure the model is in evaluation mode and on the correct device
loaded_model.eval()
if torch.cuda.is_available():
    loaded_model.to("cuda")

print("✅ Model and tokenizer loaded successfully!")

# Example of how to use the loaded model for inference
def generate_response(prompt, model, tokenizer, max_length=1024):
    # Encode the prompt
    input_ids = tokenizer.encode(prompt, return_tensors="pt")

    # Move input_ids to the same device as the model
    if torch.cuda.is_available():
        input_ids = input_ids.to("cuda")

    # Generate a response
    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_length=max_length,
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id # Use eos_token_id for padding
        )

    # Decode the generated output
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response

✅ Model and tokenizer loaded successfully!


In [19]:
prompt = "Are there buses from Alipiri to Tirumala for those who get tired?"

# Generate a response using the fine-tuned model
response = generate_response(prompt, loaded_model, loaded_tokenizer)

print("Prompt:")
print(prompt)
print("\nGenerated Response:")
print(response)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Prompt:
Are there buses from Alipiri to Tirumala for those who get tired?

Generated Response:
Are there buses from Alipiri to Tirumala for those who get tired? Yes. There are buses from Alipiri to Tirumala that run regularly. However, the journey can be long and arduous, and it's advisable to take breaks and rest along the way. Many people opt for the TTD buses, which are cheaper and faster. It's also possible to hire a taxi or a rickshaw.  

Buses from Alipiri usually depart every 30 minutes or so, depending on traffic. The journey takes around 2-3 hours, depending on the route. The TTD buses are also frequent and run continuously. The journey is usually shorter, but it can be more crowded.  

It's always a good idea to arrive at the bus stand early, especially during peak hours, to avoid waiting in long queues. You can also book your ticket online in advance.  

Remember to carry your ID, cash, and a small bag for your belongings. Also, dress appropriately, as the buses are often cr

In [21]:
prompt = "Is drinking water available on the steps route?"

# Generate a response using the fine-tuned model
response = generate_response(prompt, loaded_model, loaded_tokenizer)

print("Prompt:")
print(prompt)
print("\nGenerated Response:")
print(response)

Prompt:
Is drinking water available on the steps route?

Generated Response:
Is drinking water available on the steps route? Yes, there are water points along the steps route. However, it is advisable to bring your own water bottle. You can also buy bottled water at the starting point.  

Is it safe to walk on the steps route at night? Yes, it is safe to walk on the steps route at night. The steps are lit up with LED lights, and there are security cameras to deter any potential theft. However, it is recommended to walk with a group or carry a flashlight.  

Is it possible to walk the steps route in a wheelchair or with a stroller? Yes, the steps are designed to accommodate wheelchairs and strollers. However, it may be more challenging to climb the steps due to their steepness. It is recommended to hire a stroller or use a ramp to make the climb easier.  

Is it possible to walk the steps route with a dog? Yes, dogs are allowed on the steps route. However, they should be kept on a leash