# ML Midterm version 2_3

## Step 1. Install Necessary Libraries

In [1]:
# %%capture
!pip install numpy==1.26.4
!pip install "transformers==4.41.2" "tokenizers==0.19.1" "accelerate==0.31.0" "peft==0.11.1" "bitsandbytes==0.43.3"
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

Collecting transformers==4.41.2
  Using cached transformers-4.41.2-py3-none-any.whl.metadata (43 kB)
Collecting tokenizers==0.19.1
  Using cached tokenizers-0.19.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting accelerate==0.31.0
  Using cached accelerate-0.31.0-py3-none-any.whl.metadata (19 kB)
Collecting peft==0.11.1
  Using cached peft-0.11.1-py3-none-any.whl.metadata (13 kB)
Collecting bitsandbytes==0.43.3
  Using cached bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Using cached transformers-4.41.2-py3-none-any.whl (9.1 MB)
Using cached tokenizers-0.19.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
Using cached accelerate-0.31.0-py3-none-any.whl (309 kB)
Downloading peft-0.11.1-py3-none-any.whl (251 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.6/251.6 kB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.wh

## Step 2: Load the Model and Tokenizer

In [2]:
from unsloth import FastLanguageModel
import torch

max_seq_length = 1024  # Choose any sequence length
dtype = None  # This will auto-detect the best data type for your GPU
load_in_4bit = True  # Use 4-bit quantization to save memory

model, tokenizer = FastLanguageModel.from_pretrained(
  model_name = "unsloth/Meta-Llama-3.1-8B", # Competition-approved model
  max_seq_length = max_seq_length,
  dtype = dtype,
  load_in_4bit = load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.




🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.10.12: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 8.0. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.96G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/235 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

## Step 3: Prepare the Dataset

In [3]:
from datasets import load_dataset

# Load the full training dataset
full_dataset = load_dataset("ad6398/nyu-dl-teach-maths-comp", split="train")

# Shuffle the dataset for randomness and create our smaller splits
shuffled_dataset = full_dataset.shuffle(seed=42)
# Increase the amount of training data
train_dataset = shuffled_dataset.select(range(20000))
validation_dataset = shuffled_dataset.select(range(20000, 22000))

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00002.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

data/train-00001-of-00002.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/3.65M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [4]:
# Optimize prompt word template
# More detailed and explicit instructions
training_prompt_v3 = """Task: Verify this mathematical solution step-by-step.

Question: {}

Solution to verify:
{}

Verification process:
1. Read the question
2. Check each calculation step
3. Verify the logic flow
4. Confirm the final answer

Decision: Is this solution correct?
Output only: True or False
Decision: {}"""

# We must add an End Of Sequence (EOS) token to tell the model when a completion is finished.
EOS_TOKEN = tokenizer.eos_token

# This function formats our data samples into the prompt template.
def formatting_prompts_func(examples):
  questions = examples["question"]
  solutions = examples["solution"]
  outputs = examples["is_correct"]
  texts = []
  for question, solution, output in zip(questions, solutions, outputs):
    # Format the prompt and add the EOS token
    text = training_prompt_v3.format(question, str(solution), str(output)) + EOS_TOKEN
    texts.append(text)
  return { "text" : texts }

# Apply the formatting function to our training and validation dataset
formatted_train_dataset = train_dataset.map(formatting_prompts_func, batched=True)
formatted_validation_dataset = validation_dataset.map(formatting_prompts_func, batched=True)

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

## Step 4: Configure LoRA and Set Up the Trainer

In [5]:
# Increase LoRA rank
model = FastLanguageModel.get_peft_model(
  model,
  r = 16,
  target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
  lora_alpha = 32,
  lora_dropout = 0,
  bias = "none",
  use_gradient_checkpointing = "unsloth",
  random_state = 42,
)

Unsloth 2025.10.12 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


### SFTTrainer Setup

In [9]:
from trl import SFTTrainer
from transformers import TrainingArguments

trainer = SFTTrainer(
  model = model,
  tokenizer = tokenizer,
  train_dataset = formatted_train_dataset,
  eval_dataset = formatted_validation_dataset,
  dataset_text_field = "text",
  max_seq_length = max_seq_length,
  args = TrainingArguments(
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,

    warmup_steps = 50,
    max_steps = 500,
    learning_rate = 2e-4,

    fp16 = not torch.cuda.is_bf16_supported(),
    bf16 = torch.cuda.is_bf16_supported(),

    logging_steps = 50,
    eval_strategy = "steps",
    eval_steps = 50,
    save_strategy = "steps",
    save_steps = 50,
    save_total_limit = 2,
    load_best_model_at_end = True,
    metric_for_best_model = "eval_loss",

    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 42,
    output_dir = "outputs",
    report_to = "none",
  ),
)

## Step 5: Start Training

In [10]:
trainer.train()

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 20,000 | Num Epochs = 1 | Total steps = 500
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040 of 8,072,204,288 (0.52% trained)


Step,Training Loss,Validation Loss
50,0.65,0.714753
100,0.7274,0.68043
150,0.6854,0.673563
200,0.6724,0.667321
250,0.6866,0.661877
300,0.6733,0.656295
350,0.6774,0.65266
400,0.6605,0.649074
450,0.6675,0.646862
500,0.659,0.645468


TrainOutput(global_step=500, training_loss=0.6759539031982422, metrics={'train_runtime': 2133.3032, 'train_samples_per_second': 1.875, 'train_steps_per_second': 0.234, 'total_flos': 6.602470799002829e+16, 'train_loss': 0.6759539031982422, 'epoch': 0.2})

## Step 6: Inference and Evaluation

In [11]:
# Prepare the model for faster inference
FastLanguageModel.for_inference(model)
model.eval()
inference_prompt = """Task: Verify this mathematical solution step-by-step.

Question: {}

Solution to verify:
{}

Verification process:
1. Read the question
2. Check each calculation step
3. Verify the logic flow
4. Confirm the final answer

Decision: Is this solution correct?
Output only: True or False
Decision: """

import re
def parse_output_strict(text: str) -> bool:
  m = re.search(r"Your\s+verdict\s*:\s*(.+)", text, flags=re.IGNORECASE | re.DOTALL)
  tail = m.group(1).strip() if m else text.strip()

  if not tail:
    return False

  lines = tail.splitlines()
  if not lines:
    return False

  first_line = lines[0].strip()

  tokens = re.split(r"\s+", re.sub(r"[^\w]", "", first_line))
  first_token = tokens[0].lower() if tokens and tokens[0] else ""

  if first_token.startswith("t"):   # true/TRUE/True.
    return True
  if first_token.startswith("f"):   # false/FALSE/False.
    return False

  if re.search(r"\btrue\b", first_line, flags=re.IGNORECASE):
    return True
  if re.search(r"\bfalse\b", first_line, flags=re.IGNORECASE):
    return False

  return False

# ------------------------ Validation Inference & Metrics ------------------------
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import torch

print("\nStarting inference on validation set...")

predictions = []
ground_truths = []

gen_kwargs = dict(
  max_new_tokens=4,
  do_sample=False,
  temperature=0.0,
  top_p=1.0,
  use_cache=True,
  eos_token_id=tokenizer.eos_token_id,
  pad_token_id=tokenizer.eos_token_id,
)

for example in tqdm(validation_dataset, desc="Evaluating validation set"):
  question = example["question"]
  solution = example["solution"]
  true_label = bool(example["is_correct"])

  prompt = inference_prompt.format(question, str(solution))
  inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

  with torch.no_grad():
    outputs = model.generate(**inputs, **gen_kwargs)

  gen_only = outputs[:, inputs["input_ids"].shape[-1]:]
  response_text = tokenizer.decode(gen_only[0], skip_special_tokens=True)

  pred = parse_output_strict(response_text)
  predictions.append(pred)
  ground_truths.append(true_label)

print("\n" + "=" * 80)
print("Computing evaluation metrics")
print("=" * 80)

accuracy = accuracy_score(ground_truths, predictions)
precision = precision_score(ground_truths, predictions, zero_division=0)
recall = recall_score(ground_truths, predictions, zero_division=0)
f1 = f1_score(ground_truths, predictions, zero_division=0)

cm = confusion_matrix(ground_truths, predictions)
tn, fp, fn, tp = cm.ravel()

print("Validation Set Evaluation Results")

print(f"\nMain Metrics:")
print(f"  Accuracy:   {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"  Precision:  {precision:.4f} ({precision*100:.2f}%)")
print(f"  Recall:     {recall:.4f} ({recall*100:.2f}%)")
print(f"  F1-Score:   {f1:.4f}")

print(f"\nConfusion Matrix:")
print(f"                Predicted True    Predicted False")
print(f"  Actual True    {tp:6d}           {fn:6d}")
print(f"  Actual False   {fp:6d}           {tn:6d}")

print(f"\nDetailed Statistics:")
print(f"  Total samples:       {len(predictions)}")
print(f"  Correct predictions: {tp + tn} ({(tp+tn)/len(predictions)*100:.2f}%)")
print(f"  Wrong predictions:   {fp + fn} ({(fp+fn)/len(predictions)*100:.2f}%)")
print(f"  Predicted as True:   {sum(predictions)} ({sum(predictions)/len(predictions)*100:.2f}%)")
print(f"  Actually True:       {sum(ground_truths)} ({sum(ground_truths)/len(ground_truths)*100:.2f}%)")


Starting inference on validation set...


Evaluating validation set:   9%|▉         | 186/2000 [00:37<06:00,  5.04it/s]Unsloth: Input IDs of shape torch.Size([1, 1087]) with length 1087 > the model's max sequence length of 1024.
We shall truncate it ourselves. It's imperative if you correct this issue first.
Evaluating validation set:  10%|█         | 206/2000 [00:41<05:54,  5.05it/s]Unsloth: Input IDs of shape torch.Size([1, 1182]) with length 1182 > the model's max sequence length of 1024.
We shall truncate it ourselves. It's imperative if you correct this issue first.
Evaluating validation set:  19%|█▉        | 379/2000 [01:16<05:25,  4.98it/s]Unsloth: Input IDs of shape torch.Size([1, 1269]) with length 1269 > the model's max sequence length of 1024.
We shall truncate it ourselves. It's imperative if you correct this issue first.
Evaluating validation set:  54%|█████▍    | 1076/2000 [03:36<03:07,  4.94it/s]Unsloth: Input IDs of shape torch.Size([1, 1041]) with length 1041 > the model's max sequence length of 1024.
We shall


Computing evaluation metrics
Validation Set Evaluation Results

Main Metrics:
  Accuracy:   0.7910 (79.10%)
  Precision:  0.7239 (72.39%)
  Recall:     0.8389 (83.89%)
  F1-Score:   0.7772

Confusion Matrix:
                Predicted True    Predicted False
  Actual True       729              140
  Actual False      278              853

Detailed Statistics:
  Total samples:       2000
  Correct predictions: 1582 (79.10%)
  Wrong predictions:   418 (20.90%)
  Predicted as True:   1007 (50.35%)
  Actually True:       869 (43.45%)





## SAVE THE MODEL TO DRIVE AND RUN INFERENCE

In [12]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [13]:
import os

# Define the path to save the model checkpoint in Google Drive
save_path = "/content/drive/MyDrive/llama3_8b_math_verifier_checkpoint_version2_3"

# Create the directory if it doesn't exist
os.makedirs(save_path, exist_ok=True)

# Save the model and tokenizer
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print(f"Model checkpoint and tokenizer saved to: {save_path}")

Model checkpoint and tokenizer saved to: /content/drive/MyDrive/llama3_8b_math_verifier_checkpoint_version2_3


In [14]:
import pandas as pd

history = trainer.state.log_history
df = pd.DataFrame(history)
df.to_csv("loss_history_2_3.csv", index=False)
df.head()

Unnamed: 0,loss,grad_norm,learning_rate,epoch,step,eval_loss,eval_runtime,eval_samples_per_second,eval_steps_per_second,train_runtime,train_samples_per_second,train_steps_per_second,total_flos,train_loss
0,0.65,0.81899,0.000196,0.02,50,,,,,,,,,
1,,,,0.02,50,0.714753,130.7516,15.296,7.648,,,,,
2,0.7274,0.41993,0.000178,0.04,100,,,,,,,,,
3,,,,0.04,100,0.68043,131.0329,15.263,7.632,,,,,
4,0.6854,0.400741,0.000156,0.06,150,,,,,,,,,
