# Adaptive RAG MCQ Generator

This notebook contains:

- AI2-ARC dataset processing
- LoRA fine-tuning of Flan-T5
- FAISS index building
- Model saving for deployment


# INSTALL DEPENDENCIES

In [None]:
!pip install -q transformers datasets peft accelerate sentence-transformers faiss-cpu evaluate


# LOAD ARC DATASET (Hugging Face)

In [None]:
from datasets import load_dataset
import pandas as pd

# Fetch both ARC subsets. The harder subset is requested under the config
# name "ARC-Challenge" (an earlier revision of this cell used 'ARC-Hard').
easy = load_dataset("ai2_arc", "ARC-Easy")
hard = load_dataset("ai2_arc", "ARC-Challenge")

# Only the training split of each subset is used below.
df_easy, df_hard = (pd.DataFrame(subset["train"]) for subset in (easy, hard))

# Stack the two frames, drop rows without a question, and keep just the
# three columns the rest of the notebook reads.
df = (
    pd.concat([df_easy, df_hard], ignore_index=True)
    .dropna(subset=["question"])
    .loc[:, ["question", "choices", "answerKey"]]
)
df.head()

# CONVERT TO PROMPT FORMAT

In [None]:
def format_mcq(example):
    """Render one ARC row as a single prompt string.

    Args:
        example: Mapping with a ``"question"`` string, an ``"answerKey"``
            string and a ``"choices"`` mapping that holds parallel
            ``"label"`` and ``"text"`` sequences.

    Returns:
        A dict ``{"text": prompt}`` where ``prompt`` contains the instruction
        line, the question, one ``"<label>. <choice>"`` line per option and a
        final ``"Answer: <answerKey>"`` line.
    """
    choice_block = example["choices"]

    # One "<label>. <choice text>" line per option, in dataset order.
    option_lines = []
    for label, choice_text in zip(choice_block["label"], choice_block["text"]):
        option_lines.append(f"{label}. {choice_text}")
    options = "\n".join(option_lines)

    # The prompt starts and ends with a newline, exactly as the template does.
    prompt = (
        "\n"
        "Create a multiple choice question.\n"
        "\n"
        f"Question: {example['question']}\n"
        "\n"
        "Options:\n"
        f"{options}\n"
        "\n"
        f"Answer: {example['answerKey']}\n"
    )
    return {"text": prompt}

from datasets import Dataset

# Wrap the cleaned frame as a Hugging Face Dataset and add the formatted
# "text" prompt column produced by format_mcq to every row.
dataset = Dataset.from_pandas(df).map(format_mcq)



# LOAD MODEL + LoRA

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import LoraConfig, get_peft_model

# Checkpoint used for both the tokenizer and the base seq2seq model.
model_name = "google/flan-t5-base"

# LoRA settings: rank-8 adapters (alpha 16, dropout 0.1) on the modules named
# "q" and "v", with no bias parameters trained.
lora_config = LoraConfig(
    task_type="SEQ_2_SEQ_LM",
    target_modules=["q", "v"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the base model and wrap it with the LoRA adapters in one step.
model = get_peft_model(
    AutoModelForSeq2SeqLM.from_pretrained(model_name),
    lora_config,
)
model.print_trainable_parameters()


# TOKENIZE

In [None]:
def tokenize(example, max_input_length=512, max_label_length=8):
    """Tokenize a batch of formatted MCQ prompts into encoder inputs and labels.

    Each text is split at its last ``"Answer:"`` marker. Everything up to and
    including the marker becomes the encoder input; the stripped remainder
    becomes the target. Keeping the answer out of the encoder input prevents
    the model from simply copying the label it is being trained to predict.

    Args:
        example: Batched mapping (as passed by ``Dataset.map(batched=True)``)
            whose ``"text"`` entry is a list of prompt strings.
        max_input_length: Fixed length the encoder inputs are padded or
            truncated to.
        max_label_length: Fixed length the labels are padded or truncated to.
            Answers are short, so a small value avoids computing over
            hundreds of padding positions.

    Returns:
        The tokenizer output for the inputs with an added ``"labels"`` entry:
        one fixed-length list of token ids per example, with padding
        positions set to ``-100`` so the loss ignores them.

    Raises:
        ValueError: If a text contains no ``"Answer:"`` marker.
    """
    answer_marker = "Answer:"

    prompts = []
    answers = []
    for text_item in example["text"]:
        marker_pos = text_item.rfind(answer_marker)
        if marker_pos == -1:
            # rfind's -1 would otherwise slice an arbitrary part of the text
            # and silently train on a meaningless label.
            raise ValueError(f"No {answer_marker!r} marker found in text: {text_item!r}")
        split_at = marker_pos + len(answer_marker)
        prompts.append(text_item[:split_at])
        answers.append(text_item[split_at:].strip())

    tokenized_inputs = tokenizer(
        prompts,
        truncation=True,
        padding="max_length",
        max_length=max_input_length,
    )

    tokenized_labels = tokenizer(
        answers,
        truncation=True,
        padding="max_length",
        max_length=max_label_length,
    ).input_ids

    # -100 is the label value the loss skips; real pad ids would be scored
    # as if they were target tokens.
    pad_id = tokenizer.pad_token_id
    tokenized_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in tokenized_labels
    ]
    return tokenized_inputs

# Tokenize in batches (tokenize expects example["text"] to be a list), then
# have the dataset return torch tensors when indexed.
dataset = dataset.map(function=tokenize, batched=True)
dataset.set_format(type="torch")

# TRAINING

In [None]:
from transformers import Trainer, TrainingArguments

# Two epochs at batch size 4 per device; a checkpoint is written to
# ./mcq-model at the end of every epoch and metrics are logged every 50 steps.
training_args = TrainingArguments(
    output_dir="./mcq-model",
    num_train_epochs=2,
    per_device_train_batch_size=4,
    logging_steps=50,
    save_strategy="epoch",
)

# No evaluation set is supplied; the trainer only sees the tokenized dataset.
trainer = Trainer(model=model, args=training_args, train_dataset=dataset)

trainer.train()


# BUILD FAISS INDEX

In [None]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Embed every question; position i in `questions` / `embeddings` matches
# position i of the rows written to arc_data.csv below.
questions = df["question"].to_list()
embeddings = embedder.encode(questions, convert_to_numpy=True)

# Flat L2 index sized to the width of the embedding vectors.
dimension = embeddings.shape[-1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

# Persist the index together with the rows it was built from.
# NOTE(review): the "choices" column is written through to_csv as text, so a
# consumer presumably has to parse it back into a structure - confirm.
faiss.write_index(index, "arc_faiss.index")
df.to_csv("arc_data.csv", index=False)


# SAVE MODEL

In [None]:
# Write the LoRA-wrapped model and its tokenizer into the same directory so
# they can be loaded together later.
model.save_pretrained(save_directory="mcq_lora_model")
tokenizer.save_pretrained(save_directory="mcq_lora_model")


In [None]:
import shutil

# Bundle the saved model directory into mcq_lora_model.zip in the working directory.
shutil.make_archive("mcq_lora_model", "zip", "mcq_lora_model")

# The browser-download helper only exists inside Google Colab. Elsewhere the
# archive is left on disk so the cell still runs to completion.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; archive left at mcq_lora_model.zip")
else:
    files.download("mcq_lora_model.zip")
