# Math Verification Model - Binary Classification


## Step 1: Install Dependencies

In [None]:
%%capture
!pip install transformers accelerate peft bitsandbytes datasets pandas


## Step 2: Import Libraries and Configuration

In [None]:
import torch
from transformers import AutoTokenizer, LlamaForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_dataset
from torch.utils.data import Dataset
import pandas as pd

# Configuration
BASE_MODEL = "unsloth/Llama-3.2-3B-Instruct" 
OUTPUT_DIR = "lora_binary_classifier"
MAX_LENGTH = 2048
DEFAULT_DTYPE = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16

print(" Configuration complete")

Configuration complete


## Step 3: Define Dataset Class

In [3]:
class BinaryDataset(Dataset):
    """Binary classification dataset"""
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]

        # Build input text
        text = f"""Verify if the following solution is correct.

Question: {item['question']}
Answer: {item['answer']}
Solution: {item['solution']}

Is this solution correct? Answer only 'yes' or 'no'."""

        # Tokenize
        encoding = self.tokenizer(
            text,
            max_length=MAX_LENGTH,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Label: 0=no, 1=yes
        label = 1 if item['is_correct'] else 0

        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

print(" Dataset class defined")

Dataset class defined


## Step 4: Load and Clean Data

In [4]:
print(" Loading dataset...")
dataset = load_dataset("ad6398/nyu-dl-teach-maths-comp", split="train")

print(f"Original size: {len(dataset):,}")

# Data cleaning function
def is_valid(example):
    try:
        text = str(example['question']) + str(example['solution'])
        if len(text.strip()) < 20:
            return False
        solution_len = len(str(example['solution']))
        if solution_len < 10 or solution_len > 3000:
            return False
        return True
    except:
        return False

# Clean data
dataset = dataset.filter(is_valid)
print(f"After cleaning: {len(dataset):,}")

# Split data
shuffled = dataset.shuffle(seed=42)
train_size = min(30000, len(shuffled) - 2000)

train_data = shuffled.select(range(train_size))
val_data = shuffled.select(range(train_size, train_size + 2000))

print(f"\n Data preparation complete:")
print(f"  Training samples: {len(train_data):,}")
print(f"  Validation samples: {len(val_data):,}")

 Loading dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00002.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

data/train-00001-of-00002.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/3.65M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Original size: 1,000,000


Filter:   0%|          | 0/1000000 [00:00<?, ? examples/s]

After cleaning: 999,515

Data preparation complete:
  Training samples: 30,000
  Validation samples: 2,000


## Step 5: Load Model and Convert to Binary Classification

In [5]:
print(f" Loading model: {BASE_MODEL}")

# Load model
model = LlamaForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=DEFAULT_DTYPE,
    device_map="auto",

)
# Key: Convert lm_head to 2-class classifier
print(" Converting model to binary classification...")
model.lm_head = torch.nn.Linear(
    model.config.hidden_size,
    2,  # yes=1, no=0
    bias=False,
    device=model.lm_head.weight.device,
    dtype=model.lm_head.weight.dtype
)
model.config.vocab_size = 2

# Freeze all base model parameters
for param in model.parameters():
    param.requires_grad = False

print(" Model modification complete")

`torch_dtype` is deprecated! Use `dtype` instead!


 Loading model: unsloth/Llama-3.2-3B-Instruct


config.json:   0%|          | 0.00/890 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

Converting model to binary classification...
Model modification complete


## Step 6: Add LoRA

In [None]:
print(" Adding LoRA layers...")

lora_config = LoraConfig(
    r=64,  # LoRA rank
    inference_mode=False,
    target_modules=["q_proj", "k_proj", "v_proj"],  # Only fine-tune attention
    task_type=TaskType.CAUSAL_LM,
    lora_alpha=32,
    lora_dropout=0,
    modules_to_save=["lm_head"],  
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

print(" LoRA added")

Adding LoRA layers...
trainable params: 25,696,256 || all params: 3,238,452,224 || trainable%: 0.7935
LoRA added


## Step 7: Prepare Tokenizer and Datasets

In [7]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Create datasets
train_dataset = BinaryDataset(train_data, tokenizer)
val_dataset = BinaryDataset(val_data, tokenizer)

print(f" Tokenizer and datasets ready")
print(f"  Training set size: {len(train_dataset):,}")
print(f"  Validation set size: {len(val_dataset):,}")

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

Tokenizer and datasets ready
  Training set size: 30,000
  Validation set size: 2,000


## Step 8: Define Custom Trainer

In [8]:
class BinaryClassificationTrainer(Trainer):
    """Binary classification trainer"""
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")

        # Forward pass
        outputs = model(**inputs)
        logits = outputs.logits

        # Take logits from last token
        logits = logits[:, -1, :]  # [batch_size, 2]

        # Compute cross-entropy loss
        loss_fct = torch.nn.CrossEntropyLoss()
        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss


def compute_metrics(eval_pred):
    """Compute accuracy"""
    logits, labels = eval_pred
    # Take predictions from last token
    logits = logits[:, -1, :]
    predictions = torch.argmax(torch.tensor(logits), dim=-1)
    accuracy = (predictions == torch.tensor(labels)).float().mean().item()
    return {"accuracy": accuracy}


print(" Trainer class defined")

Trainer class defined


## Step 9: Configure Training Parameters

In [9]:
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=1,  # Proven configuration: 1 epoch
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    warmup_steps=200,
    logging_steps=50,
    eval_strategy="steps",
    eval_steps=200,
    save_strategy="steps",
    save_steps=200,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    fp16=not torch.cuda.is_bf16_supported(),
    bf16=torch.cuda.is_bf16_supported(),
    optim="adamw_8bit",
    report_to="none",
    remove_unused_columns=False,
)

trainer = BinaryClassificationTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

print(" Trainer configured")
print(f"  Estimated training time: ~60-70 minutes")

The model is already on multiple devices. Skipping the move to device specified in `args`.


Trainer configured
  Estimated training time: ~60-70 minutes


## Step 10: Start Training

⏱ **Estimated time: 60-70 minutes**

In [10]:
print("\n Starting training...")
print("=" * 60)

trainer.train()

print("=" * 60)
print(" Training complete!\n")

# Save model
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print(f" Model saved to: {OUTPUT_DIR}")


Starting training...



Step,Training Loss,Validation Loss,Accuracy
200,1.9885,0.587018,0.7755
400,1.6391,0.476464,0.8335
600,1.3837,0.350486,0.843
800,1.555,0.325677,0.864
1000,1.1779,0.285539,0.876




Step,Training Loss,Validation Loss,Accuracy
200,1.9885,0.587018,0.7755
400,1.6391,0.476464,0.8335
600,1.3837,0.350486,0.843
800,1.555,0.325677,0.864
1000,1.1779,0.285539,0.876
1200,1.1635,0.286691,0.8755
1400,0.9805,0.275313,0.887
1600,1.0235,0.253447,0.896
1800,0.963,0.240893,0.9005





Training complete.





Model saved to: lora_binary_classifier


## Step 11: Generate Test Set Predictions

In [11]:
print(" Generating test set predictions...\n")

# Load test set
test_dataset = load_dataset("ad6398/nyu-dl-teach-maths-comp", split="test")
print(f"Test set size: {len(test_dataset):,}")

# Predict
predictions = []
model.eval()

for i, item in enumerate(test_dataset):
    if i % 100 == 0:
        print(f"Processing: {i}/{len(test_dataset)}")

    # Build input
    text = f"""Verify if the following solution is correct.

Question: {item['question']}
Answer: {item['answer']}
Solution: {item['solution']}

Is this solution correct? Answer only 'yes' or 'no'."""

    # Tokenize
    inputs = tokenizer(text, return_tensors="pt", max_length=MAX_LENGTH, truncation=True)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Predict
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits[:, -1, :]  # Take last token
        prediction = torch.argmax(logits, dim=-1).item()

    predictions.append(prediction)

print(f"\n Predictions complete!")

Generating test set predictions...

Test set size: 10,000
Processing: 0/10000
Processing: 100/10000
Processing: 200/10000
Processing: 300/10000
Processing: 400/10000
Processing: 500/10000
Processing: 600/10000
Processing: 700/10000
Processing: 800/10000
Processing: 900/10000
Processing: 1000/10000
Processing: 1100/10000
Processing: 1200/10000
Processing: 1300/10000
Processing: 1400/10000
Processing: 1500/10000
Processing: 1600/10000
Processing: 1700/10000
Processing: 1800/10000
Processing: 1900/10000
Processing: 2000/10000
Processing: 2100/10000
Processing: 2200/10000
Processing: 2300/10000
Processing: 2400/10000
Processing: 2500/10000
Processing: 2600/10000
Processing: 2700/10000
Processing: 2800/10000
Processing: 2900/10000
Processing: 3000/10000
Processing: 3100/10000
Processing: 3200/10000
Processing: 3300/10000
Processing: 3400/10000
Processing: 3500/10000
Processing: 3600/10000
Processing: 3700/10000
Processing: 3800/10000
Processing: 3900/10000
Processing: 4000/10000
Processing:

## Step 12: Create Submission File

In [12]:
# Create submission file
submission = pd.DataFrame({
    'ID': range(len(predictions)),  # Uppercase ID for Kaggle
    'is_correct': [bool(p) for p in predictions]  # True/False format
})
submission.to_csv('submission.csv', index=False)

print(" Submission file saved: submission.csv")
print(f"\n Prediction statistics:")
print(f"  Predicted as correct: {sum(predictions):,}")
print(f"  Predicted as incorrect: {len(predictions) - sum(predictions):,}")
print(f"  Correct ratio: {sum(predictions)/len(predictions)*100:.1f}%")

# Display preview
print(f"\nSubmission file preview:")
print(submission.head(10))

print("\n" + "=" * 60)
print(" All complete!")
print("=" * 60)
print("\nDownload submission.csv and submit to Kaggle")

Submission file saved: submission.csv

Prediction statistics:
  Predicted as correct: 3,659
  Predicted as incorrect: 6,341
  Correct ratio: 36.6%

Submission file preview:
   ID  is_correct
0   0       False
1   1       False
2   2       False
3   3        True
4   4       False
5   5       False
6   6       False
7   7       False
8   8       False
9   9       False

Download submission.csv and submit to Kaggle
