In [1]:
# Math Answer Verification - Llama3-8B Fine-Tuning
# DL-Fall-25 Kaggle Competition

# ============================================================================
# SETUP AND INSTALLATIONS
# ============================================================================

# Install required packages
!pip install -q transformers datasets accelerate peft bitsandbytes scipy

# Import libraries
import os
import torch
import pandas as pd
import numpy as np
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    EarlyStoppingCallback
)
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support
import warnings
warnings.filterwarnings('ignore')

# Check GPU availability
print(f"GPU Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU Device: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

# ============================================================================
# CONFIGURATION
# ============================================================================

class Config:
    # Model settings
    model_name = "meta-llama/Meta-Llama-3-8B"
    max_length = 512
    
    # LoRA settings
    lora_r = 16
    lora_alpha = 32
    lora_dropout = 0.05
    
    # Training settings
    output_dir = "./results"
    num_epochs = 3
    batch_size = 4
    gradient_accumulation_steps = 4
    learning_rate = 2e-5
    weight_decay = 0.01
    warmup_ratio = 0.1
    
    # Other settings
    validation_split = 0.1
    seed = 42
    save_steps = 500
    eval_steps = 500
    logging_steps = 100

config = Config()

# Set seed for reproducibility
torch.manual_seed(config.seed)
np.random.seed(config.seed)

# ============================================================================
# LOAD DATASET
# ============================================================================

print("Loading dataset from Hugging Face...")
dataset = load_dataset("ad6398/nyu-dl-teach-maths-comp")

train_dataset = dataset['train']
test_dataset = dataset['test']

print(f"Train samples: {len(train_dataset)}")
print(f"Test samples: {len(test_dataset)}")
print("\nSample from training data:")
print(train_dataset[0])

# ============================================================================
# PREPARE DATA
# ============================================================================

# Load tokenizer
print("\nLoading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(config.model_name)

# Set padding token if not already set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def format_prompt(question, answer, solution):
    """Format the input prompt for the model"""
    prompt = f"""Verify if the provided answer is correct for the given question.

Question: {question}

Provided Answer: {answer}

Correct Solution: {solution}

Is the provided answer correct?"""
    return prompt

def preprocess_function(examples):
    """Preprocess and tokenize the data"""
    # Format prompts
    texts = [
        format_prompt(q, a, s) 
        for q, a, s in zip(
            examples['question'], 
            examples['answer'], 
            examples['solution']
        )
    ]
    
    # Tokenize
    tokenized = tokenizer(
        texts,
        truncation=True,
        max_length=config.max_length,
        padding=False  # We'll use dynamic padding
    )
    
    # Add labels for training data
    if 'is_correct' in examples:
        tokenized['labels'] = [1 if x else 0 for x in examples['is_correct']]
    
    return tokenized

# Preprocess datasets
print("\nPreprocessing training data...")
train_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=train_dataset.column_names
)

print("Preprocessing test data...")
test_dataset = test_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=test_dataset.column_names
)

# Split training data for validation
train_test_split = train_dataset.train_test_split(
    test_size=config.validation_split,
    seed=config.seed
)
train_data = train_test_split['train']
eval_data = train_test_split['test']

print(f"\nFinal splits:")
print(f"Training samples: {len(train_data)}")
print(f"Validation samples: {len(eval_data)}")
print(f"Test samples: {len(test_dataset)}")

# ============================================================================
# LOAD MODEL WITH LORA
# ============================================================================

print("\nLoading model...")
model = AutoModelForSequenceClassification.from_pretrained(
    config.model_name,
    num_labels=2,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True
)

# Set pad token id
if model.config.pad_token_id is None:
    model.config.pad_token_id = tokenizer.pad_token_id

# Configure LoRA
print("Configuring LoRA...")
lora_config = LoraConfig(
    r=config.lora_r,
    lora_alpha=config.lora_alpha,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    lora_dropout=config.lora_dropout,
    bias="none",
    task_type=TaskType.SEQ_CLS
)

# Apply LoRA to model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# ============================================================================
# EVALUATION METRICS
# ============================================================================

def compute_metrics(eval_pred):
    """Compute metrics for evaluation"""
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    
    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='binary'
    )
    
    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# ============================================================================
# TRAINING CONFIGURATION
# ============================================================================

# Data collator for dynamic padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Training arguments
training_args = TrainingArguments(
    output_dir=config.output_dir,
    num_train_epochs=config.num_epochs,
    per_device_train_batch_size=config.batch_size,
    per_device_eval_batch_size=config.batch_size,
    gradient_accumulation_steps=config.gradient_accumulation_steps,
    learning_rate=config.learning_rate,
    weight_decay=config.weight_decay,
    warmup_ratio=config.warmup_ratio,
    logging_steps=config.logging_steps,
    evaluation_strategy="steps",
    eval_steps=config.eval_steps,
    save_steps=config.save_steps,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    fp16=False,
    bf16=torch.cuda.is_available(),
    report_to="none",
    seed=config.seed,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# ============================================================================
# TRAIN MODEL
# ============================================================================

print("\n" + "="*80)
print("STARTING TRAINING")
print("="*80 + "\n")

train_result = trainer.train()

print("\n" + "="*80)
print("TRAINING COMPLETED")
print("="*80 + "\n")

# Print training metrics
print("Training Metrics:")
for key, value in train_result.metrics.items():
    print(f"  {key}: {value}")

# ============================================================================
# EVALUATE ON VALIDATION SET
# ============================================================================

print("\n" + "="*80)
print("EVALUATING ON VALIDATION SET")
print("="*80 + "\n")

eval_results = trainer.evaluate()

print("Validation Metrics:")
for key, value in eval_results.items():
    print(f"  {key}: {value}")

# ============================================================================
# GENERATE PREDICTIONS FOR TEST SET
# ============================================================================

print("\n" + "="*80)
print("GENERATING PREDICTIONS FOR TEST SET")
print("="*80 + "\n")

# Get predictions
predictions = trainer.predict(test_dataset)
pred_labels = np.argmax(predictions.predictions, axis=1)

# Convert to boolean (0 -> False, 1 -> True)
pred_bool = [bool(label) for label in pred_labels]

# Create submission dataframe
submission_df = pd.DataFrame({
    'ID': range(len(pred_bool)),
    'is_correct': pred_bool
})

print("\nPrediction distribution:")
print(submission_df['is_correct'].value_counts())

# Save submission file
submission_path = "submission.csv"
submission_df.to_csv(submission_path, index=False)
print(f"\nSubmission file saved to: {submission_path}")

# Display first few predictions
print("\nFirst 10 predictions:")
print(submission_df.head(10))

# ============================================================================
# SAVE MODEL
# ============================================================================

print("\nSaving model...")
model.save_pretrained("./final_model")
tokenizer.save_pretrained("./final_model")
print("Model saved to: ./final_model")

print("\n" + "="*80)
print("PIPELINE COMPLETED SUCCESSFULLY!")
print("="*80)
print("\nNext steps:")
print("1. Review the submission.csv file")
print("2. Upload to Kaggle competition")
print("3. Check leaderboard score")
print("\nTo improve:")
print("- Adjust hyperparameters in Config class")
print("- Try different prompt formats")
print("- Experiment with max_length")
print("- Increase epochs or adjust learning rate")

[33mDEPRECATION: Python 2.7 reached the end of its life on January 1st, 2020. Please upgrade your Python as Python 2.7 is no longer maintained. pip 21.0 will drop support for Python 2.7 in January 2021. More details about Python 2 support in pip can be found at https://pip.pypa.io/en/latest/development/release-process/#python-2-support pip 21.0 will remove support for this functionality.[0m
[31m    ERROR: Command errored out with exit status 1:
     command: /Library/Frameworks/Python.framework/Versions/2.7/Resources/Python.app/Contents/MacOS/Python -c 'import sys, setuptools, tokenize; sys.argv[0] = '"'"'/private/var/folders/39/h509vzss5cb4ftjj6z648wsm0000gn/T/pip-install-aeJkjH/datasets/setup.py'"'"'; __file__='"'"'/private/var/folders/39/h509vzss5cb4ftjj6z648wsm0000gn/T/pip-install-aeJkjH/datasets/setup.py'"'"';f=getattr(tokenize, '"'"'open'"'"', open)(__file__);code=f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file__, '"'"'exec'"'"'))' egg_info -

  from .autonotebook import tqdm as notebook_tqdm


ValueError: Your currently installed version of Keras is Keras 3, but this is not yet supported in Transformers. Please install the backwards-compatible tf-keras package with `pip install tf-keras`.

In [2]:
   from huggingface_hub import notebook_login
   notebook_login()


ImportError: The `notebook_login` function can only be used in a notebook (Jupyter or Colab) and you need the `ipywidgets` module: `pip install ipywidgets`.