# Finetuning LLM for SAT Reading section

## 1. Install and import necessary libaries

In [None]:
!pip install -q -U bitsandbytes
!pip install -q -U datasets
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q -U loralib
!pip install -q -U einops

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [1]:
import torch
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
)
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training

In [2]:
from peft import PeftModelForCausalLM

PeftModelForCausalLM.__mro__

(peft.peft_model.PeftModelForCausalLM,
 peft.peft_model.PeftModel,
 transformers.utils.hub.PushToHubMixin,
 torch.nn.modules.module.Module,
 object)

In [None]:
# import os
# from google.colab import userdata

# os.environ['HF_TOKEN'] =  userdata.get('HF_TOKEN')

## 2. Load pre-trained LLM

In [None]:
from transformers.utils.quantization_config import BitsAndBytesConfig

MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    ## Use NormalFloat4b or FP4b
    bnb_4bit_quant_type="nf4",
    ## dtype to dequantize weights into
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [4]:
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="cuda:0",
    offload_state_dict=True,
    quantization_config=bnb_config,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((3072,)

In [6]:
## 1- Cast the layernorm in fp32
## 2- Making output embedding layer require grads (no need for model.enable_input_require_grads())
## 3- Add the upcasting of the lm head to fp32
## 4- Freezing the base model layers to ensure they are not updated during training
## use_gradient_checkpointing: Forward computation in checkpointed regions omits saving tensors for backward and recomputes them during the backward pass
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

In [7]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((3072,)

In [9]:
config = LoraConfig(
    r=16,
    lora_alpha=32,
    ## output = MHA(q_proj x q, k_proj x k, v_proj x v)
    ## output = o_proj x output
    ## output = down_proj( act_fn(gate_proj(input)) x up_proj(input) )
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    ## Use Rank-Stablized LoRA, setting scaling to alpha/sqrt(r) instead of alpha/r
    # use_rslora=True,
    lora_dropout=0.05,
    ## ["none", "all", "lora_only"]
    bias="none",
    ## "SEQ_CLS": PeftModelForSequenceClassification,
    ## "SEQ_2_SEQ_LM": PeftModelForSeq2SeqLM,
    ## "CAUSAL_LM": PeftModelForCausalLM,
    ## "TOKEN_CLS": PeftModelForTokenClassification,
    ## "QUESTION_ANS": PeftModelForQuestionAnswering,
    ## "FEATURE_EXTRACTION": PeftModelForFeatureExtraction,
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
)

## Freeze the base model parameters
model = get_peft_model(model, config, adapter_name="lora")



In [10]:
model.print_trainable_parameters()

trainable params: 24,313,856 || all params: 3,237,063,680 || trainable%: 0.7511


In [11]:
mem_free, mem_total = torch.cuda.mem_get_info()
print(f"Free memory: {mem_free / 1024**3:.2f} GB")
print(f"Total memory: {mem_total / 1024**3:.2f} GB")

Free memory: 9.74 GB
Total memory: 15.99 GB


In [12]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 3072)
        (layers): ModuleList(
          (0-27): 28 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3072, out_features=3072, bias=False)
                (lora_dropout): ModuleDict(
                  (lora): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (lora): Linear(in_features=3072, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (lora): Linear(in_features=16, out_features=3072, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear4

## 3. Test pre-trained model performance

In [None]:
from transformers.generation.configuration_utils import GenerationConfig

In [None]:
# Llama-3's official system prompt structure
LLAMA3_SYSTEM_PROMPT = """You are a helpful AI assistant developed by Meta. Respond safely and accurately."""
prompt = [
    {"role": "system", "content": LLAMA3_SYSTEM_PROMPT},
    {
        "role": "user",
        "content": """Analyze the given passage and question. Choose the best answer from the options below.

### Passage:
[This passage is adapted from George Eliot, Silas Marner. Originally published in 1861. Silas was a weaver and a notorious miser, but then the gold he had hoarded was stolen. Shortly after, Silas adopted a young child, Eppie, the daughter of an impoverished woman who had died suddenly.

    Unlike the gold which needed nothing, and must
be worshipped in close-locked solitude—which was
hidden away from the daylight, was deaf to the song
of birds, and started to no human tones—Eppie was a
creature of endless claims and ever-growing desires,
seeking and loving sunshine, and living sounds, and
living movements; making trial of everything, with
trust in new joy, and stirring the human kindness in
all eyes that looked on her. The gold had kept his
thoughts in an ever-repeated circle, leading to
nothing beyond itself; but Eppie was an object
compacted of changes and hopes that forced his
thoughts onward, and carried them far away from
their old eager pacing towards the same blank
limit—carried them away to the new things that
would come with the coming years, when Eppie
would have learned to understand how her father
Silas cared for her; and made him look for images of
that time in the ties and charities that bound together
the families of his neighbors. The gold had asked that
he should sit weaving longer and longer, deafened
and blinded more and more to all things except the
monotony of his loom and the repetition of his web;
but Eppie called him away from his weaving, and
made him think all its pauses a holiday, reawakening
his senses with her fresh life, even to the old
winter-flies that came crawling forth in the early
spring sunshine, and warming him into joy because
she had joy.
    And when the sunshine grew strong and lasting,
so that the buttercups were thick in the meadows,
Silas might be seen in the sunny mid-day, or in the
late afternoon when the shadows were lengthening
under the hedgerows, strolling out with uncovered
head to carry Eppie beyond the Stone-pits to where
the flowers grew, till they reached some favorite bank
where he could sit down, while Eppie toddled to
pluck the flowers, and make remarks to the winged
things that murmured happily above the bright
petals, calling “Dad-dad’s” attention continually by
bringing him the flowers. Then she would turn her
ear to some sudden bird-note, and Silas learned to
please her by making signs of hushed stillness, that
they might listen for the note to come again: so that
when it came, she set up her small back and laughed
with gurgling triumph. Sitting on the banks in this
way, Silas began to look for the once familiar herbs
again; and as the leaves, with their unchanged outline
and markings, lay on his palm, there was a sense of
crowding remembrances from which he turned away
timidly, taking refuge in Eppie’s little world, that lay
lightly on his enfeebled spirit.
    As the child’s mind was growing into knowledge,
his mind was growing into memory: as her life
unfolded, his soul, long stupefied in a cold narrow
prison, was unfolding too, and trembling gradually
into full consciousness.
    It was an influence which must gather force with
every new year: the tones that stirred Silas’ heart
grew articulate, and called for more distinct answers;
shapes and sounds grew clearer for Eppie’s eyes and
ears, and there was more that “Dad-dad” was
imperatively required to notice and account for.
Also, by the time Eppie was three years old, she
developed a fine capacity for mischief, and for
devising ingenious ways of being troublesome, which
found much exercise, not only for Silas’ patience, but
for his watchfulness and penetration. Sorely was poor
Silas puzzled on such occasions by the incompatible
demands of love.]

### Question:
Which statement best describes a technique the narrator uses to represent Silas's character before he adopted Eppie?

### Choices:
A) The narrator emphasizes Silas's former obsession with wealth by depicting his gold as requiring certain behaviors on his part.
B) The narrator underscores Silas's former greed by describing his gold as seeming to reproduce on its own.
C) The narrator hints at Silas's former antisocial attitude by contrasting his present behavior toward his neighbors with his past behavior toward them.
D) The narrator demonstrates Silas's former lack of self-awareness by implying that he is unable to recall life before Eppie.

Respond ONLY with the letter and full text of the correct answer choice.""",
    },
]

In [None]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Apply chat template (text only)
chat_text = tokenizer.apply_chat_template(
    prompt, add_generation_prompt=True, tokenize=False
)
print(chat_text)

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 05 May 2025

You are a helpful AI assistant developed by Meta. Respond safely and accurately.<|eot_id|><|start_header_id|>user<|end_header_id|>

Analyze the given passage and question. Choose the best answer from the options below.

### Passage:
[This passage is adapted from George Eliot, Silas Marner. Originally published in 1861. Silas was a weaver and a notorious miser, but then the gold he had hoarded was stolen. Shortly after, Silas adopted a young child, Eppie, the daughter of an impoverished woman who had died suddenly.

    Unlike the gold which needed nothing, and must
be worshipped in close-locked solitude—which was
hidden away from the daylight, was deaf to the song
of birds, and started to no human tones—Eppie was a
creature of endless claims and ever-growing desires,
seeking and loving sunshine, and living sounds, and
living movements; making trial of everything, 

In [None]:
tokenizer("<|begin_of_text|><|end_of_text|>")

{'input_ids': [128000, 128000, 128001], 'attention_mask': [1, 1, 1]}

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"

# Tokenize
inputs = tokenizer(chat_text, return_tensors="pt").to(device)

for k, v in inputs.items():
    print(f"{k}: {v.shape}")
v.all()

input_ids: torch.Size([1, 1104])
attention_mask: torch.Size([1, 1104])


tensor(True, device='cuda:0')

In [None]:
generation_config = GenerationConfig(
    max_new_tokens=50,
    temperature=0.01,
    do_sample=True,
    num_return_sequences=1,
    pad_token_id=tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    repetition_penalty=1.3,
)

# Generate output
with torch.no_grad():
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        generation_config=generation_config,
    )

    # Decode output
output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)


# print
if "<|assistant|>" in output_text:
    print(output_text.split("<|assistant|>")[-1].strip())
else:
    print(output_text.strip())

system

Cutting Knowledge Date: December 2023
Today Date: 05 May 2025

You are a helpful AI assistant developed by Meta. Respond safely and accurately.user

Analyze the given passage and question. Choose the best answer from the options below.

### Passage:
[This passage is adapted from George Eliot, Silas Marner. Originally published in 1861. Silas was a weaver and a notorious miser, but then the gold he had hoarded was stolen. Shortly after, Silas adopted a young child, Eppie, the daughter of an impoverished woman who had died suddenly.

    Unlike the gold which needed nothing, and must
be worshipped in close-locked solitude—which was
hidden away from the daylight, was deaf to the song
of birds, and started to no human tones—Eppie was a
creature of endless claims and ever-growing desires,
seeking and loving sunshine, and living sounds, and
living movements; making trial of everything, with
trust in new joy, and stirring the human kindness in
all eyes that looked on her. The gold had

## 4. Fine-tuning LLM

### 4.1. Prepare dataset

In [14]:
# Random dataset for demonstration purposes
dataset_name = "wikitext"
dataset_version = "wikitext-2-raw-v1"
# Only load 1% of the dataset
train_dataset = load_dataset(dataset_name, dataset_version, split="train[:1%]")
eval_dataset = load_dataset(dataset_name, dataset_version, split="validation[:1%]")

In [15]:
train_dataset["text"][:5]

['',
 ' = Valkyria Chronicles III = \n',
 '',
 ' Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " Calamaty Raven " . \n',
 " The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more f

In [16]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token  # Set pad token to eos token for Llama-3

In [None]:
def tokenize_function(examples):
    output = tokenizer.apply_chat_template(
        [
            {
                "role": "system",
                "content": "Please capitalize the first letter of each sentence.",
            },
            {"role": "user", "content": text},
        ],
        add_generation_prompt=True,
        tokenize=True,
    )
    return output
    # return tokenizer(
    #     examples["text"],
    #     truncation=True,
    #     padding=False,
    #     # padding="max_length",
    #     max_length=500,
    #     return_tensors="pt",
    # )


tokenized_train_dataset = train_dataset.map(
    tokenize_function, batched=True, remove_columns=["text"]
)
tokenized_eval_dataset = eval_dataset.map(
    tokenize_function, batched=True, remove_columns=["text"]
)

Map:   0%|          | 0/367 [00:00<?, ? examples/s]

TypeError: Provided `function` which is applied to all elements of table returns a variable of type <class 'list'>. Make sure provided `function` returns a variable of type `dict` (or a pyarrow table) to update the dataset or `None` if you are only interested in side effects.

In [27]:
tokenized_train_dataset["input_ids"]

[[128000,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,


### 4.2. Training

In [19]:
model.config

LlamaConfig {
  "_attn_implementation_autoset": true,
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 24,
  "num_hidden_layers": 28,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "bfloat16",
    "bnb_4bit_quant_storage": "uint8",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": null,
    "llm_int8_threshold": 6.0,
    "load_in_4bit": true,
    "load_in_8bit": false,
 

In [None]:
# https://huggingface.co/docs/transformers/main_classes/trainer
from transformers.training_args import TrainingArguments
from transformers.trainer import Trainer
from transformers.data.data_collator import DataCollatorForLanguageModeling

training_args = TrainingArguments(
    per_device_train_batch_size=1,
    ## Use gradient accumulation to simulate larger batch size
    ## (instead of batch_size=16, use batch_size=8 and gradient_accumulation_steps=2)
    gradient_accumulation_steps=2,
    num_train_epochs=2,
    learning_rate=2e-4,
    ## Use fp16 to train (5b exponent + 10b fraction)
    fp16=True,
    logging_steps=10,
    output_dir="llama3-8b-sat-reading",
    optim="paged_adamw_8bit",
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    ## Already handled by prepare_model_for_kbit_training
    # gradient_checkpointing=True,
    ## Avoid warning
    gradient_checkpointing_kwargs={'use_reentrant':False},
    ## Evaluate every 50 steps
    eval_strategy="steps",
    eval_steps=50,
    ## Save every 50 steps
    save_strategy="steps",
    save_steps=50,
    ## Save the 3 most recent checkpoints, including the best one (load_best_model_at_end=True)
    save_total_limit=3,
    load_best_model_at_end=True,
    ## Loss function is embedded in the model
    metric_for_best_model="loss",
    greater_is_better=False,
    ## May report to WandB, tensorboard, etc.
    report_to="none",
    remove_unused_columns=False,
)

## Inputs are dynamically padded to the maximum length of a batch if they are not all of the same length.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False, pad_to_multiple_of=8
)

In [21]:
# Quantization-aware training settings
## Disable the meaningless KV cache to save memory during training
model.config.use_cache = False
## Already enabled in prepare_model_for_kbit_training
# model.gradient_checkpointing_enable()
# model.enable_input_require_grads()

trainer = Trainer(
    model=model,
    train_dataset=tokenized_train_dataset,  # type: ignore
    eval_dataset=tokenized_eval_dataset,  # type: ignore
    args=training_args,
    data_collator=data_collator,
    ## May define subclasses from TrainerCallback
    # callbacks=[LogLossCallback()],
)


trainer.train()

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss
50,1.755,2.446278
100,2.4912,2.391737
150,2.4882,2.333528
200,1.5356,2.337354
250,1.1315,2.396464
300,1.3949,2.440839
350,1.348,2.437834


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
Could not locate the best model at llama3-8b-sat-reading\checkpoint-150\pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


TrainOutput(global_step=366, training_loss=1.9095426569870912, metrics={'train_runtime': 252.4295, 'train_samples_per_second': 2.908, 'train_steps_per_second': 1.45, 'total_flos': 1596117329117184.0, 'train_loss': 1.9095426569870912, 'epoch': 1.991825613079019})

### 4.3. Test prediction

In [None]:
print(data["test"]["text"][1])

SAT READING COMPREHENSION TEST

This passage is adapted from Mary Helen Stefaniak, The
Cailiffs of Baghdad, Georgia: A Novel. ©2010 by Mary Helen
Stefaniak.

    Miss Grace Spivey arrived in Threestep, Georgia,
in August 1938. She stepped off the train wearing a
pair of thick-soled boots suitable for hiking, a navy
Line blue dress, and a little white tam that rode the waves
of her red hair at a gravity-defying angle. August was
a hellish month to step off the train in Georgia,
although it was nothing, she said, compared to the
119 degrees that greeted her when she arrived one
time in Timbuktu, which, she assured us, was a real
place in Africa. I believe her remark irritated some of
the people gathered to welcome her on the burned
grass alongside the tracks. When folks are sweating
through their shorts, they don’t like to hear that this
is nothing compared to someplace else. Irritated or
not, the majority of those present were inclined to see
the arrival of the new schoolteacher in a po

In [None]:
print("Ground truth: ", data["test"]["answer"][1])

Ground truth:  D


In [None]:
import torch
import re
from transformers import GenerationConfig

generation_config = GenerationConfig(
    max_new_tokens=64,
    # temperature=0.0,
    top_p=1.0,
    do_sample=False,
    repetition_penalty=1.0,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)


def extract_answer(output_text):
    if "<|assistant|>" in output_text:
        answer_part = output_text.split("<|assistant|>")[-1].strip()
    else:
        answer_part = output_text.split("assistant")[-1].strip()

    match = re.search(r"^([A-D])\)\s*([^\n\(]+)", answer_part, re.MULTILINE)

    if match:
        return f"{match.group(1)}) {match.group(2).strip()}"
    else:
        clean_lines = [
            line for line in answer_part.split("\n") if not line.startswith("**")
        ]
        return clean_lines[0].strip() if clean_lines else answer_part


def format_test_prompt(text, answer_letter=None):
    """Format input text as chat conversation (for prediction or test)"""
    sections = extract_sections(text)

    # Build choices block
    choices_text = "\n".join(sections["choices"])

    user_prompt = f"""Read the passage and answer the question.

### Passage:
{sections['passage']}

### Question:
{sections['question']}

### Choices:
{choices_text}

Respond with ONLY the letter and full text of the correct answer."""

    messages = [
        {"role": "system", "content": LLAMA3_SYSTEM_PROMPT},
        {"role": "user", "content": user_prompt},
    ]

    if answer_letter:
        messages.append(
            {"role": "assistant", "content": map_answer(text, answer_letter)}
        )

    return messages


def predict(text):
    messages = format_test_prompt(text)

    prompt_text = tokenizer.apply_chat_template(
        messages, add_generation_prompt=False, tokenize=False
    )

    inputs = tokenizer(prompt_text, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            generation_config=generation_config,
        )

    # Decode
    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Only answer
    return extract_answer(output_text), output_text

In [None]:
test_sample_idx = 4
input_text = data["test"]["text"][test_sample_idx]
true_answer = data["test"]["answer"][test_sample_idx]

predicted_answer, output_text = predict(input_text)

true_answer_full = map_answer(input_text, true_answer)

print("=== Final Result ===")
print(f"[Model Prediction]\n{predicted_answer}")
print(f"\n[Ground Truth]\n{true_answer_full}")
print(f"\n[Output Text]\n{output_text}")

=== Final Result ===
[Model Prediction]
B) small rural town.

[Ground Truth]
B) small rural town.

[Output Text]
system

Cutting Knowledge Date: December 2023
Today Date: 05 May 2025

You are a helpful AI assistant developed by Meta. Respond safely and accurately.user

Read the passage and answer the question.

### Passage:
This passage is adapted from Mary Helen Stefaniak, The
Cailiffs of Baghdad, Georgia: A Novel. ©2010 by Mary Helen
Stefaniak.
Miss Grace Spivey arrived in Threestep, Georgia,
in August 1938. She stepped off the train wearing a
pair of thick-soled boots suitable for hiking, a navy
Line blue dress, and a little white tam that rode the waves
of her red hair at a gravity-defying angle. August was
a hellish month to step off the train in Georgia,
although it was nothing, she said, compared to the
119 degrees that greeted her when she arrived one
time in Timbuktu, which, she assured us, was a real
place in Africa. I believe her remark irritated some of
the people gathered 

## 5. Evaluation

In [None]:
from tqdm import tqdm


def evaluate(test_dataset, max_samples=None):
    """
    Evaluate model accuracy on test set
    Args:
        test_dataset: Dataset object containing 'text' and 'answer'
        max_samples: Optional limit for quick testing
    """
    correct = 0
    total = 0
    wrong_samples = []

    # Process samples with progress bar
    for idx in tqdm(range(len(test_dataset[:max_samples]["text"]))):
        try:
            text = test_dataset["text"][idx]
            true_answer = test_dataset["answer"][idx].strip().upper()

            # Get model prediction
            predicted, _ = predict(text)

            # Extract first valid choice letter from prediction
            predicted_letter = re.search(r"\b([A-D])\b", predicted.upper())
            if predicted_letter:
                predicted_letter = predicted_letter.group(1)
            else:
                predicted_letter = None

            # Compare with ground truth
            if predicted_letter == true_answer:
                correct += 1
            else:
                wrong_samples.append(
                    {"text": text, "predicted": predicted, "true": true_answer}
                )

            total += 1

        except Exception as e:
            print(f"Error processing sample {idx}: {str(e)}")

    # Calculate metrics
    accuracy = correct / total if total > 0 else 0

    # Print summary
    print(f"\n=== Evaluation Results ===")
    print(f"Correct: {correct}/{total}")
    print(f"Accuracy: {accuracy:.2%}")
    print(f"Wrong samples saved in 'wrong_samples' list")

    return {"accuracy": accuracy, "total": total, "wrong_samples": wrong_samples}


# Usage
test_results = evaluate(data["test"])

100%|██████████| 38/38 [02:23<00:00,  3.78s/it]


=== Evaluation Results ===
Correct: 14/38
Accuracy: 36.84%
Wrong samples saved in 'wrong_samples' list





In [None]:
for i, sample in enumerate(test_results["wrong_samples"][:5]):
    print(f"### Sample {i+1}")
    print(f"Ground Truth: {sample['true']}")
    print(f"Predicted: {sample['predicted']}")
    print("Passage:")
    print(extract_sections(sample["text"])["passage"][:200] + "...")
    print("\n" + "-" * 50)

### Sample 1
Ground Truth: D
Predicted: A) sympathy, because they assume that she is experiencing intense heat for the first time.
Passage:
This passage is adapted from Mary Helen Stefaniak, The
Cailiffs of Baghdad, Georgia: A Novel. ©2010 by Mary Helen
Stefaniak.
Miss Grace Spivey arrived in Threestep, Georgia,
in August 1938. She steppe...

--------------------------------------------------
### Sample 2
Ground Truth: A
Predicted: C) an anonymous member of the community.
Passage:
This passage is adapted from Mary Helen Stefaniak, The
Cailiffs of Baghdad, Georgia: A Novel. ©2010 by Mary Helen
Stefaniak.
Miss Grace Spivey arrived in Threestep, Georgia,
in August 1938. She steppe...

--------------------------------------------------
### Sample 3
Ground Truth: C
Predicted: A) delighted.
Passage:
This passage is adapted from Mary Helen Stefaniak, The
Cailiffs of Baghdad, Georgia: A Novel. ©2010 by Mary Helen
Stefaniak.
Miss Grace Spivey arrived in Threestep, Georgia,
in August 1938. She s

## 6. Save model to huggingface

In [None]:
model.save_pretrained("trained-model")

In [None]:
PEFT_MODEL = "tiviluson/Llama-3.2-3B-SAT"

model.push_to_hub(PEFT_MODEL, use_auth_token=True)



README.md:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/97.3M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/tiviluson/Llama-3.2-3B-SAT/commit/d01ea047020b762afabd473947827cce07d34986', commit_message='Upload model', commit_description='', oid='d01ea047020b762afabd473947827cce07d34986', pr_url=None, repo_url=RepoUrl('https://huggingface.co/tiviluson/Llama-3.2-3B-SAT', endpoint='https://huggingface.co', repo_type='model', repo_id='tiviluson/Llama-3.2-3B-SAT'), pr_revision=None, pr_num=None)

## 7. Inference

In [None]:
from peft import PeftConfig, PeftModel


def format_inference_prompt(text):
    sections = extract_sections(text)
    choices_text = "\n".join(sections["choices"])

    return [
        {"role": "system", "content": LLAMA3_SYSTEM_PROMPT},
        {
            "role": "user",
            "content": f"""Read the passage and answer the question.

### Passage: {sections['passage']}
### Question: {sections['question']}
### Choices: {choices_text}

Respond with ONLY the letter and full text of the correct answer.""",
        },
    ]


PEFT_MODEL = "tiviluson/Llama-3.2-3B-SAT"

# Load config v& model
config = PeftConfig.from_pretrained(PEFT_MODEL)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)
model = PeftModel.from_pretrained(model, PEFT_MODEL)

# Tokenizer & generation config
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
tokenizer.pad_token = tokenizer.eos_token

generation_config = GenerationConfig(
    max_new_tokens=128,
    # temperature=0.01,
    do_sample=False,
    repetition_penalty=1.15,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
    forced_eos_token_id=tokenizer.eos_token_id,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"

# Inference loop
for i in range(5):
    print("=" * 100)

    input_text = data["test"]["text"][i]
    true_answer = data["test"]["answer"][i]

    messages = format_inference_prompt(input_text)
    prompt = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, tokenize=False
    )

    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            generation_config=generation_config,
        )

    full_output = tokenizer.decode(outputs[0], skip_special_tokens=False)

    if "<|assistant|>" in full_output:
        response = (
            full_output.split("<|assistant|>")[1].replace("<|eot_id|>", "").strip()
        )
    else:
        response = full_output.replace(prompt, "").strip()

    print(f"=== Sample {i+1} ===")
    print(f"[Question]\n{messages[1]['content']}")
    print(f"\n[Ground Truth] {true_answer}")
    print(f"[Prediction] {response}")
    print("\n" + "-" * 50)

=== Sample 1 ===
[Question]
Read the passage and answer the question.

### Passage: This passage is adapted from Mary Helen Stefaniak, The
Cailiffs of Baghdad, Georgia: A Novel. ©2010 by Mary Helen
Stefaniak.
Miss Grace Spivey arrived in Threestep, Georgia,
in August 1938. She stepped off the train wearing a
pair of thick-soled boots suitable for hiking, a navy
Line blue dress, and a little white tam that rode the waves
of her red hair at a gravity-defying angle. August was
a hellish month to step off the train in Georgia,
although it was nothing, she said, compared to the
119 degrees that greeted her when she arrived one
time in Timbuktu, which, she assured us, was a real
place in Africa. I believe her remark irritated some of
the people gathered to welcome her on the burned
grass alongside the tracks. When folks are sweating
through their shorts, they don’t like to hear that this
is nothing compared to someplace else. Irritated or
not, the majority of those present were inclined to s

In [None]:
def custom_predict(passage: str, question: str, choices: list):
    choices_text = "\n".join(choices)

    messages = [
        {"role": "system", "content": LLAMA3_SYSTEM_PROMPT},
        {
            "role": "user",
            "content": f"""Read the passage and answer the question.

### Passage:
{passage}

### Question:
{question}

### Choices:
{choices_text}

Respond with ONLY the letter and full text of the correct answer.""",
        },
    ]

    prompt = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, tokenize=False
    )

    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            generation_config=generation_config,
        )

    full_output = tokenizer.decode(outputs[0], skip_special_tokens=False)

    if "<|assistant|>" in full_output:
        response = (
            full_output.split("<|assistant|>")[1].replace("<|eot_id|>", "").strip()
        )
    else:
        response = full_output.replace(prompt, "").strip()

    return response

In [None]:
custom_passage = """
This passage is adapted from F. Scott Fitzgerald, The Great Gatsby.
"In my younger and more vulnerable years my father gave me some advice that I’ve been turning over in my mind ever since. ‘Whenever you feel like criticizing anyone,’ he told me, ‘just remember that all the people in this world haven’t had the advantages that you’ve had.’ He didn’t say any more, but we’ve always been unusually communicative in a reserved way, and I understood that he meant a great deal more than that. In consequence, I’m inclined to reserve all judgments, a habit that has opened up many curious natures to me and also made me the victim of not a few veteran bores."
"""

custom_question = (
    "What is the primary purpose of the narrator’s recollection of his father’s advice?"
)
custom_choices = [
    "A) To explain his reluctance to judge others",  # A is correct, just test
    "B) To highlight his privileged upbringing",
    "C) To criticize his father’s moral values",
    "D) To foreshadow future conflicts in the story",
]

prediction = custom_predict(custom_passage, custom_question, custom_choices)
print("\n=== Custom Test Result ===")
print("[Prediction]", prediction)


=== Custom Test Result ===
[Prediction] <|begin_of_text|>A) To explain his reluctance to judge others<|eot_id|>
