<a href="https://colab.research.google.com/github/swati-mishra07/mcq-rag-app/blob/main/training_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# INSTALL DEPENDENCIES

In [1]:
!pip install -q transformers datasets peft accelerate sentence-transformers faiss-cpu evaluate


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m62.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25h

# LOAD ARC DATASET(Hugging face)

In [2]:
from datasets import load_dataset
import pandas as pd

easy = load_dataset("ai2_arc", "ARC-Easy")
hard = load_dataset("ai2_arc", "ARC-Challenge") # Changed 'ARC-Hard' to 'ARC-Challenge'

df_easy = pd.DataFrame(easy["train"])
df_hard = pd.DataFrame(hard["train"])

df = pd.concat([df_easy, df_hard], ignore_index=True)
df = df.dropna(subset=["question"])

df = df[["question", "choices", "answerKey"]]
df.head()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]



ARC-Easy/train-00000-of-00001.parquet:   0%|          | 0.00/331k [00:00<?, ?B/s]

ARC-Easy/test-00000-of-00001.parquet:   0%|          | 0.00/346k [00:00<?, ?B/s]

ARC-Easy/validation-00000-of-00001.parqu(…):   0%|          | 0.00/86.1k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2251 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2376 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/570 [00:00<?, ? examples/s]

ARC-Challenge/train-00000-of-00001.parqu(…):   0%|          | 0.00/190k [00:00<?, ?B/s]

ARC-Challenge/test-00000-of-00001.parque(…):   0%|          | 0.00/204k [00:00<?, ?B/s]

ARC-Challenge/validation-00000-of-00001.(…):   0%|          | 0.00/55.7k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1119 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1172 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/299 [00:00<?, ? examples/s]

Unnamed: 0,question,choices,answerKey
0,Which factor will most likely cause a person t...,{'text': ['a leg muscle relaxing after exercis...,B
1,Lichens are symbiotic organisms made of green ...,"{'text': ['carbon dioxide', 'food', 'protectio...",B
2,When a switch is used in an electrical circuit...,"{'text': ['cause the charge to build.', 'incre...",D
3,Which of the following is an example of an ass...,"{'text': ['contact lens', 'motorcycle', 'rainc...",A
4,"Rocks are classified as igneous, metamorphic, ...","{'text': ['their color', 'their shape', 'how t...",3


# CONVERT TO PROMPT FORMAT

In [3]:
def format_mcq(example):
    choices = example["choices"]["text"]
    labels = example["choices"]["label"]

    options = "\n".join([f"{l}. {c}" for l, c in zip(labels, choices)])

    prompt = f"""
Create a multiple choice question.

Question: {example['question']}

Options:
{options}

Answer: {example['answerKey']}
"""
    return {"text": prompt}

from datasets import Dataset

dataset = Dataset.from_pandas(df)
dataset = dataset.map(format_mcq)



Map:   0%|          | 0/3370 [00:00<?, ? examples/s]

# LOAD MODEL + LoRA

In [5]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import LoraConfig, get_peft_model

model_name = "google/flan-t5-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q", "v"],
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_2_SEQ_LM"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


Loading weights:   0%|          | 0/282 [00:00<?, ?it/s]



trainable params: 884,736 || all params: 248,462,592 || trainable%: 0.3561


# TOKENIZE

In [6]:
def tokenize(example):
    # Tokenize the input text (which is a list of strings when batched=True)
    tokenized_inputs = tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=512
    )

    # Process each text item in the batch to extract and tokenize its answer
    tokenized_labels_batch = []
    for text_item in example["text"]:
        answer_start = text_item.rfind("Answer: ") + len("Answer: ")
        answer = text_item[answer_start:].strip()

        tokenized_labels = tokenizer(
            answer,
            truncation=True,
            padding="max_length",
            max_length=512
        ).input_ids
        tokenized_labels_batch.append(tokenized_labels)

    tokenized_inputs["labels"] = tokenized_labels_batch
    return tokenized_inputs

dataset = dataset.map(tokenize, batched=True)
dataset.set_format("torch")

Map:   0%|          | 0/3370 [00:00<?, ? examples/s]

# TRAINING

In [7]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./mcq-model",
    per_device_train_batch_size=4,
    num_train_epochs=2,
    save_strategy="epoch",
    logging_steps=50
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset
)

trainer.train()


Step,Training Loss
50,11.471827
100,11.286775
150,11.07705
200,10.875662
250,10.645449
300,10.493445
350,10.447219
400,10.383374
450,10.247034
500,10.069771


TrainOutput(global_step=1686, training_loss=9.209605116295503, metrics={'train_runtime': 1788.9935, 'train_samples_per_second': 3.767, 'train_steps_per_second': 0.942, 'total_flos': 4633583262105600.0, 'train_loss': 9.209605116295503, 'epoch': 2.0})

# BUILD FAISS INDEX

In [8]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

embedder = SentenceTransformer("all-MiniLM-L6-v2")

questions = df["question"].tolist()
embeddings = embedder.encode(questions, convert_to_numpy=True)

dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

faiss.write_index(index, "arc_faiss.index")
df.to_csv("arc_data.csv", index=False)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# SAVE MODEL

In [9]:
model.save_pretrained("mcq_lora_model")
tokenizer.save_pretrained("mcq_lora_model")


('mcq_lora_model/tokenizer_config.json', 'mcq_lora_model/tokenizer.json')

In [10]:
import shutil
shutil.make_archive("mcq_lora_model", 'zip', "mcq_lora_model")

from google.colab import files
files.download("mcq_lora_model.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>