<a href="https://colab.research.google.com/github/vrandamanihar/Question-Answer-Generator-Bot/blob/main/llama_q_a.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# 1. Update the "bitsandbytes" library to the latest version (Fixes errors)
!pip uninstall -y bitsandbytes
!pip install -U bitsandbytes

# 2. Install the rest of the AI tools
# langchain: For splitting the PDF smartly
# transformers/peft: For running the Llama-3 model
# gradio: For the App UI
!pip install -q -U torch transformers peft datasets trl accelerate
!pip install -q pypdf python-docx gradio
!pip install -q langchain langchain-community faiss-cpu sentence-transformers langchain_text_splitters

import os
import torch
import pandas as pd
from google.colab import drive

# 3. Connect to Google Drive
drive.mount('/content/drive')

# 4. Define Paths (Where your data lives)
# Make sure your CSV is inside a folder named 'QA_Dataset' in your Drive
DATASET_PATH = "/content/drive/MyDrive/QA_Dataset/final_balanced_QA_dataset.csv"
MODEL_SAVE_PATH = "/content/drive/MyDrive/QA_Dataset/My_FineTuned_Llama3_Model"

# 5. Check if GPU is ready
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"✅ Setup Complete! Using device: {device}")
if device == "cpu":
    print("⚠️ Warning: You are on CPU. Go to Runtime > Change runtime type > T4 GPU")

[0mCollecting bitsandbytes
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl (59.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.48.2
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m899.7/899.7 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m594.3/594.3 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m146.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.0/88.0 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m

In [4]:
from datasets import Dataset
from transformers import AutoTokenizer

# We use the Unsloth version of Llama-3 because it is faster and memory-efficient
MODEL_ID = "unsloth/llama-3-8b-Instruct-bnb-4bit"

# Load the Tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
tokenizer.pad_token = tokenizer.eos_token

class QADataProcessor:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def clean_text(self, text):
        if not isinstance(text, str): return ""
        return " ".join(text.split())

    def format_row_to_prompt(self, row):
        # 1. Clean inputs
        context = self.clean_text(row['context'])
        question = self.clean_text(row['question'])
        answer = self.clean_text(row['answer'])

        # 2. Standardize Type (MCQ, True/False, Short)
        raw_type = str(row['type']).lower()
        if 'mcq' in raw_type: q_type = 'mcq'
        elif 'true' in raw_type: q_type = 'true_false'
        else: q_type = 'short'

        difficulty = str(row['difficulty']).lower()

        # 3. Create Specific Instructions
        if q_type == 'mcq':
            instruction = f"Create a {difficulty} Multiple Choice Question (MCQ) based on the text. Format: Question, Options (A-D), Correct Answer."
        elif q_type == 'true_false':
            instruction = f"Create a {difficulty} True/False question based on the text. Format: Question, Answer (True/False)."
        else:
            instruction = f"Create a {difficulty} Short Answer question based on the text."

        # 4. Build the Prompt (Input)
        prompt_input = (
            f"### Instruction:\n{instruction}\n\n"
            f"### Context:\n{context}\n\n"
            f"### Response:\n"
        )

        # 5. Build the Target (Output)
        response_text = f"Question: {question}"
        if q_type == 'mcq' and isinstance(row.get('options'), str):
            response_text += f"\nOptions:\n{row['options']}"
        response_text += f"\nAnswer: {answer}<|endoftext|>"

        return {"text": prompt_input + response_text}

    def process_csv(self, file_path):
        print("Reading CSV...")
        df = pd.read_csv(file_path)
        # Remove bad rows
        df = df.dropna(subset=['context', 'question', 'answer'])

        # Convert to AI Dataset
        dataset = Dataset.from_pandas(df)
        dataset = dataset.map(self.format_row_to_prompt)
        return dataset

# Process Data
if os.path.exists(DATASET_PATH):
    processor = QADataProcessor(tokenizer)
    full_dataset = processor.process_csv(DATASET_PATH)

    # Split 90% Training, 10% Testing
    dataset_split = full_dataset.train_test_split(test_size=0.1)
    train_dataset = dataset_split['train']
    eval_dataset = dataset_split['test']
    print(f"✅ Data Ready! Training examples: {len(train_dataset)}")
else:
    print(f"❌ Error: File not found at {DATASET_PATH}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

❌ Error: File not found at /content/drive/MyDrive/QA_Dataset/final_balanced_QA_dataset.csv


In [None]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, TaskType

# 1. Configure 4-Bit Loading (Magic step to fit Llama-3 in Colab)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# 2. Load Model
print("Loading Llama-3-8B (This may take a minute)...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config, # Use 4-bit config
    device_map="auto",
    trust_remote_code=True
)

# 3. Configure LoRA (Efficient Training)
peft_config = LoraConfig(
    r=16, # Rank (Higher = smarter but slower, 16 is good)
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"] # Train more layers for better logic
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# 4. Tokenize Function
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=512
    )

tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_eval = eval_dataset.map(tokenize_function, batched=True)

# 5. Training Settings
training_args = TrainingArguments(
    output_dir="./llama3_temp_checkpoints",
    per_device_train_batch_size=2, # Lower batch size for bigger model
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    num_train_epochs=1,
    fp16=True,
    logging_steps=10,
    save_strategy="no",
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
)

# 6. Start Training
print("🚀 Starting Training... (Grab a coffee, this takes ~10 mins)")
trainer.train()

# 7. Save to Drive
print(f"💾 Saving model to {MODEL_SAVE_PATH}...")
model.save_pretrained(MODEL_SAVE_PATH)
tokenizer.save_pretrained(MODEL_SAVE_PATH)
print("✅ Llama-3 Model Saved Successfully!")

Loading Llama-3-8B (This may take a minute)...




trainable params: 13,631,488 || all params: 8,043,892,736 || trainable%: 0.1695


Map:   0%|          | 0/2250 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

🚀 Starting Training... (Grab a coffee, this takes ~10 mins)


Step,Training Loss
10,2.0051
20,1.4043
30,1.2924
40,1.1614
50,1.1455
60,1.212
70,1.1954
80,1.0449
90,1.0528
100,0.9603


💾 Saving model to /content/drive/MyDrive/QA_Dataset/My_FineTuned_Llama3_Model...
✅ Llama-3 Model Saved Successfully!


In [7]:
import pypdf
import random
from docx import Document
from peft import PeftModel
from langchain_text_splitters import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

class SmartQAEngine:
    def __init__(self, saved_model_path, base_model_id):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.tokenizer = AutoTokenizer.from_pretrained(base_model_id)

        # Load Base Model (4-bit)
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
        )
        base_model = AutoModelForCausalLM.from_pretrained(
            base_model_id,
            quantization_config=bnb_config,
            device_map="auto"
        )

        # Load Fine-Tuned Weights
        if os.path.exists(saved_model_path):
            print(f"Loading Fine-Tuned Model from {saved_model_path}")
            self.model = PeftModel.from_pretrained(base_model, saved_model_path)
        else:
            print("⚠️ Fine-tuned model not found. Using Base Model.")
            self.model = base_model
        self.model.eval()

        # --- SMART CHUNKING SETUP ---
        # This splits the PDF into chunks of 800 characters
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=800,
            chunk_overlap=100,
            length_function=len
        )

    def extract_text(self, file_obj):
        if file_obj is None: return ""
        text = ""
        try:
            name = file_obj.name.lower()
            if name.endswith('.pdf'):
                reader = pypdf.PdfReader(file_obj.name)
                for page in reader.pages:
                    text += page.extract_text() + "\n"
            elif name.endswith('.docx'):
                doc = Document(file_obj.name)
                for para in doc.paragraphs:
                    text += para.text + "\n"
            elif name.endswith('.txt'):
                with open(file_obj.name, 'r') as f:
                    text = f.read()
        except Exception as e:
            return f"Error reading file: {e}"
        return text

    def get_diverse_chunks(self, full_text, num_chunks):
        """Pick different parts of the PDF for different questions"""
        chunks = self.text_splitter.split_text(full_text)
        if not chunks: return []

        # If text is short, return what we have. If long, pick random unique parts.
        if len(chunks) <= num_chunks:
            return chunks
        return random.sample(chunks, num_chunks)

    def generate(self, full_text, count, difficulty, q_type, show_ans):
        # 1. Get Chunks (RAG Lite)
        contexts = self.get_diverse_chunks(full_text, int(count))

        results = []
        print(f"Generating {count} questions using {len(contexts)} text chunks...")

        for i, context_chunk in enumerate(contexts):
            # 2. Strict Prompting
            if q_type == 'mcq':
                instr = f"Create a {difficulty} Multiple Choice Question (MCQ) based on the context. Provide 4 options (A,B,C,D) and the Correct Answer."
            elif q_type == 'true_false':
                instr = f"Create a {difficulty} True or False question based on the context. State the correct answer."
            else:
                instr = f"Create a {difficulty} Short Answer question based on the context. Provide the answer."

            prompt = (
                f"### Instruction:\n{instr}\n\n"
                f"### Context:\n{context_chunk}\n\n"
                f"### Response:\n"
            )

            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)

            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=150,
                    temperature=0.6, # Keep it strictly factual
                    top_p=0.9,
                    do_sample=True,
                    eos_token_id=self.tokenizer.eos_token_id
                )

            full_out = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Extract response
            if "### Response:" in full_out:
                response = full_out.split("### Response:")[-1].strip()
            else:
                response = full_out

            # Hide Answer Logic
            display_text = f"**Q{i+1} ({q_type.upper()}):**\n{response}"
            if not show_ans:
                for keyword in ["Answer:", "Correct Answer:", "Ans:"]:
                    if keyword in display_text:
                        display_text = display_text.split(keyword)[0].strip()
                        break

            results.append(display_text)

        return "\n\n---\n\n".join(results)

# Initialize Engine
# NOTE: If you just finished training, this will use the model you just saved
engine = SmartQAEngine(MODEL_SAVE_PATH, MODEL_ID)

config.json: 0.00B [00:00, ?B/s]



model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/220 [00:00<?, ?B/s]

⚠️ Fine-tuned model not found. Using Base Model.


In [8]:
import gradio as gr

def run_app(file, text_in, count, diff, q_type, show_ans):
    # 1. Get Text
    if file:
        raw_text = engine.extract_text(file)
    elif text_in:
        raw_text = text_in
    else:
        return "Please upload a document or paste text."

    if len(raw_text) < 50:
        return "⚠️ Text is too short. Please provide more content."

    # 2. Generate
    return engine.generate(raw_text, count, diff, q_type, show_ans)

# UI Layout
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🦙 Llama-3 Smart Question Generator")
    gr.Markdown("Uses RAG (Chunking) + Llama-3-8B to generate unique, high-quality questions.")

    with gr.Row():
        with gr.Column():
            file_in = gr.File(label="Upload PDF / Docx")
            text_in = gr.Textbox(label="Or Paste Text", lines=5)

            count_sl = gr.Slider(1, 10, value=3, step=1, label="Number of Questions")
            diff_drp = gr.Dropdown(["easy", "medium", "hard"], value="medium", label="Difficulty")
            type_drp = gr.Dropdown(["mcq", "true_false", "short", "long"], value="mcq", label="Question Type")
            ans_chk = gr.Checkbox(label="Show Answers", value=True)

            btn = gr.Button("Generate Questions", variant="primary")

        with gr.Column():
            out = gr.Markdown(label="Generated Output")

    btn.click(run_app, [file_in, text_in, count_sl, diff_drp, type_drp, ans_chk], out)

print("Launching App...")
demo.launch(debug=True)

  with gr.Blocks(theme=gr.themes.Soft()) as demo:


Launching App...
It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://d7e0e7db6acf5297a6.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Generating 3 questions using 3 text chunks...
Generating 6 questions using 6 text chunks...
Generating 6 questions using 6 text chunks...
Generating 6 questions using 6 text chunks...
Generating 6 questions using 6 text chunks...
Generating 6 questions using 6 text chunks...
Generating 6 questions using 6 text chunks...
Generating 6 questions using 6 text chunks...
Generating 6 questions using 6 text chunks...
Generating 6 questions using 6 text chunks...
Generating 7 questions using 7 text chunks...
Generating 7 questions using 7 text chunks...
Generating 7 questions using 7 text chunks...
Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://d7e0e7db6acf5297a6.gradio.live


