In [1]:
!nvidia-smi

Fri Nov 15 19:29:14 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA H100 80GB HBM3          On  |   00000000:04:00.0 Off |                    0 |
| N/A   34C    P0             67W /  700W |       1MiB /  81559MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
!pip install unsloth
!pip install wandb

[0m

In [3]:
# # %%capture
# # This cell will take time
# !pip install unsloth
# # Also get the latest nightly Unsloth!
# !pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

In [4]:
import pandas as pd 

In [5]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [6]:
# Load the model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

==((====))==  Unsloth 2024.11.7: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: NVIDIA H100 80GB HBM3. Max memory: 79.209 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 9.0. CUDA Toolkit = 12.4.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [7]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = True,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.11.7 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [8]:
# download and load competition dataset

from datasets import load_dataset
dataset = load_dataset("ad6398/nyu-dl-teach-maths-comp")
# print and see dataset
dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'is_correct', 'answer', 'solution'],
        num_rows: 1000000
    })
    test: Dataset({
        features: ['question', 'is_correct', 'answer', 'solution'],
        num_rows: 10000
    })
})

In [9]:
prompt = """
You are an expert math teacher and problem solver. Your task is to carefully review a math question, the provided answer, and the detailed solution steps.
Verify if the provided answer is correct based strictly on the solution steps.
Do not include any explanations or additional text, and do not provide anything other than the words 'True' or 'False'.
### Math Question:
{}

### Provided Answer:
{}

### Detailed Solution Steps:
{}

### Correctness:
(True or False)
{}"""


EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN

def formatting_prompts_func(examples):
    question = examples["question"]
    ans = examples["answer"]
    solution = examples["solution"]
    output = examples["is_correct"]
    texts = []
    for ques, ans_text, sol, correct in zip(question, ans, solution, output):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = prompt.format(ques, ans_text, sol, correct) + EOS_TOKEN
        texts.append(text)
    return {"text": texts}

In [10]:
# Apply the formatting function to the sampled dataset
train_dataset = dataset['train'].map(formatting_prompts_func, batched=True)

In [11]:
from datasets import Dataset
import random
from datasets import concatenate_datasets

# Define the number of samples for training and evaluation
train_size_per_class = 20000  # 15,000 'True' and 15,000 'False' for training (total 30,000)
eval_size_per_class = 4000    # 1,500 'True' and 1,500 'False' for evaluation (total 3,000)

# Filter the dataset into 'True' and 'False' samples
true_samples = train_dataset.filter(lambda example: example['is_correct'] == True)
false_samples = train_dataset.filter(lambda example: example['is_correct'] == False)

# Ensure we have enough samples of each class
assert len(true_samples) >= train_size_per_class + eval_size_per_class, "Not enough 'True' samples"
assert len(false_samples) >= train_size_per_class + eval_size_per_class, "Not enough 'False' samples"

# Shuffle and sample the training data
true_train_sampled = true_samples.shuffle(seed=3407).select(range(train_size_per_class))
false_train_sampled = false_samples.shuffle(seed=3407).select(range(train_size_per_class))

# Shuffle and sample the evaluation data
true_eval_sampled = true_samples.shuffle(seed=3407).select(range(train_size_per_class, train_size_per_class + eval_size_per_class))
false_eval_sampled = false_samples.shuffle(seed=3407).select(range(train_size_per_class, train_size_per_class + eval_size_per_class))

balanced_train_dataset = concatenate_datasets([true_train_sampled, false_train_sampled]).shuffle(seed=3407)
balanced_eval_dataset = concatenate_datasets([true_eval_sampled, false_eval_sampled]).shuffle(seed=3407)


In [12]:
# Convert the datasets to Pandas DataFrames
train_distribution_df = balanced_train_dataset.to_pandas()
eval_distribution_df = balanced_eval_dataset.to_pandas()

# Print the distribution of the 'is_correct' column
train_distribution = train_distribution_df['is_correct'].value_counts()
eval_distribution = eval_distribution_df['is_correct'].value_counts()

print("Balanced Train Dataset Distribution:")
print(train_distribution)

print("\nBalanced Eval Dataset Distribution:")
print(eval_distribution)


Balanced Train Dataset Distribution:
is_correct
True     20000
False    20000
Name: count, dtype: int64

Balanced Eval Dataset Distribution:
is_correct
False    4000
True     4000
Name: count, dtype: int64


In [13]:
from transformers import EarlyStoppingCallback

early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=5)

In [14]:
!lscpu

Architecture:             x86_64
  CPU op-mode(s):         32-bit, 64-bit
  Address sizes:          52 bits physical, 57 bits virtual
  Byte Order:             Little Endian
CPU(s):                   64
  On-line CPU(s) list:    0-63
Vendor ID:                AuthenticAMD
  Model name:             AMD EPYC 9124 16-Core Processor
    CPU family:           25
    Model:                17
    Thread(s) per core:   2
    Core(s) per socket:   16
    Socket(s):            2
    Stepping:             1
    Frequency boost:      enabled
    CPU max MHz:          3711.9141
    CPU min MHz:          1500.0000
    BogoMIPS:             5990.97
    Flags:                fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge m
                          ca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall
                           nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep
                          _good nopl nonstop_tsc cpuid extd_apicid aperfmperf ra
                          pl pni p

In [15]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
import random

# Define training arguments
training_args = TrainingArguments(
    per_device_train_batch_size=32, # smaller for better perf?  # higher batch size?
    gradient_accumulation_steps=8, # 8 
    # warmup_steps=50,    # 30
    warmup_ratio=0.1,
    num_train_epochs=2,
    learning_rate=2e-4, #1e-4 might be too large for fine tun   # 2e-4
    fp16=not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    optim="adamw_hf",    # adamw_torch
    weight_decay=0.01,  # 0.03 too high  # 0.01
    lr_scheduler_type="cosine",    # cosine_with_restarts
    seed=3407,
    output_dir="outputs",
    dataloader_num_workers=32,
    evaluation_strategy="steps",
    eval_steps=10,
    logging_dir="./logs",  # Directory to save TensorBoard logs
    logging_steps=1,
    # report_to="tensorboard",  # Enable TensorBoard logging
    report_to="wandb",
    dataloader_pin_memory=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,    # Set to False since lower loss is better
    load_best_model_at_end=True,
    save_total_limit=2,
    max_grad_norm=0.5,               # Add gradient clipping  # try 0.5
)

model.gradient_checkpointing_enable()


# Initialize the Trainer
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=balanced_train_dataset,
    eval_dataset=balanced_eval_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=32,
    packing=False,  # Can make training 5x faster for short sequences
    args=training_args,
    callbacks=[early_stopping_callback],
)



In [16]:
import wandb

In [17]:
!wandb login 06e028f429168761bc1fcee0499e25c9e432267e

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [18]:
### Initialize a new W&B run
wandb.init(
    project="DL-Fall-24 Kaggle Contest",   # Set your project name
    name="final run large lora i do not give a fuck2",          # Optional: Set a custom run name
    config=training_args,          # Pass in the training arguments
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mbietabigbit[0m ([33mbietabigbit-new-york-university[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [19]:
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 40,000 | Num Epochs = 2
O^O/ \_/ \    Batch size per device = 32 | Gradient Accumulation steps = 8
\        /    Total batch size = 256 | Total steps = 312
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
10,1.1049,1.002021
20,0.6787,0.663405
30,0.6271,0.620605
40,0.6113,0.604267
50,0.6099,0.593916
60,0.5791,0.586251
70,0.6033,0.579356
80,0.5669,0.573634
90,0.6054,0.567766
100,0.5747,0.561682


TrainOutput(global_step=312, training_loss=0.5611275769770145, metrics={'train_runtime': 11812.4217, 'train_samples_per_second': 6.773, 'train_steps_per_second': 0.026, 'total_flos': 2.6785260773509693e+18, 'train_loss': 0.5611275769770145, 'epoch': 1.9968})

In [21]:
# model.train()
# model.gradient_checkpointing_enable()

In [22]:
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 40,000 | Num Epochs = 2
O^O/ \_/ \    Batch size per device = 32 | Gradient Accumulation steps = 8
\        /    Total batch size = 256 | Total steps = 312
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
10,0.4548,0.483406
20,0.4701,0.48286
30,0.4654,0.486612
40,0.4598,0.490479
50,0.4618,0.488419
60,0.4389,0.484585
70,0.4631,0.48061
80,0.4388,0.475399
90,0.4727,0.469415
100,0.4422,0.465048


TrainOutput(global_step=312, training_loss=0.3984998298379091, metrics={'train_runtime': 11795.0911, 'train_samples_per_second': 6.782, 'train_steps_per_second': 0.026, 'total_flos': 2.6785260773509693e+18, 'train_loss': 0.3984998298379091, 'epoch': 1.9968})

In [23]:
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 40,000 | Num Epochs = 2
O^O/ \_/ \    Batch size per device = 32 | Gradient Accumulation steps = 8
\        /    Total batch size = 256 | Total steps = 312
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
10,0.3519,0.411528
20,0.3577,0.412967
30,0.3506,0.417545
40,0.3446,0.424376
50,0.3552,0.430502
60,0.3408,0.433188
70,0.3725,0.432162
80,0.3546,0.431295
84,0.3663,0.43139


KeyboardInterrupt: 

In [24]:
print(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096, padding_idx=128004)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lor

In [25]:
eval_metrics = trainer.evaluate()
print(f"Validation Loss: {eval_metrics['eval_loss']}")

Validation Loss: 0.43139007687568665


In [26]:
from tqdm.auto import tqdm

def generate_kaggle_predictions(
    model,
    tokenizer,
    test_dataset,
    batch_size=8,
    max_new_tokens=64
):
    """
    Generate predictions for Kaggle submission and calculate accuracy.

    Args:
        model: The trained model
        tokenizer: The tokenizer
        test_dataset: Test dataset containing questions and answers
        batch_size: Batch size for inference
        max_new_tokens: Maximum number of new tokens to generate

    Returns:
        list: List of boolean predictions
    """
    model.eval()
    all_predictions = []
    all_labels = test_dataset['is_correct']  # Ground truth labels
    correct_predictions = 0

    # Process test data in batches
    for i in tqdm(range(0, len(test_dataset), batch_size), desc="Generating predictions"):
        batch_questions = test_dataset['question'][i:i + batch_size]
        batch_answers = test_dataset['answer'][i:i + batch_size]
        # batch_solutions = test_dataset.get('solution', [""] * len(batch_questions))
        batch_solutions = test_dataset['solution'][i:i + batch_size] if 'solution' in test_dataset.column_names else [""] * len(batch_questions)
        batch_labels = all_labels[i:i + batch_size]

        # Prepare prompts for each question-answer pair
        prompts = []
        for question, answer, solution in zip(batch_questions, batch_answers, batch_solutions):
            prompt = f"""
You are an expert math teacher and problem solver. Your task is to carefully review a math question, the provided answer, and the detailed solution steps.
Verify if the provided answer is correct based strictly on the solution steps.
Do not include any explanations or additional text, and do not provide anything other than the words 'True' or 'False'.
### Question:
{question}

### Given Answer:
{answer}

### Given Solution:
{solution}


### Correctness:
(True or False)
"""
            prompts.append(prompt)

        # Tokenize
        inputs = tokenizer(
            prompts,
            padding=True,
            return_tensors="pt",
            truncation=True
        ).to(model.device)

        # Generate predictions
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id
            )

            # Process each output in the batch
            for j, output in enumerate(outputs):
                # Get only the generated part (exclude input)
                input_length = inputs.input_ids[j].shape[0]
                prediction = tokenizer.decode(output[input_length:], skip_special_tokens=True)
                # Convert to boolean based on presence of "true" in prediction
                bool_pred = "true" in prediction.strip().lower()

                all_predictions.append(bool_pred)

                # Compare prediction with ground truth and count correct predictions
                if bool_pred == batch_labels[j]:
                    correct_predictions += 1

    # Calculate accuracy
    accuracy = correct_predictions / len(test_dataset)
    print(f"Accuracy: {accuracy:.4f}")

    return all_predictions

def create_submission_file(predictions, output_file="submission.csv"):
    """
    Create Kaggle submission file with 'ID' and 'is_correct' columns.

    Args:
        predictions: List of boolean predictions
        output_file: Name of output CSV file
    """
    # Create DataFrame with 'ID' and 'is_correct' columns
    # bool_predictions = [pred.strip().lower() == "true" for pred in predictions]
    bool_predictions = [pred for pred in predictions]
    submission_df = pd.DataFrame({
        'ID': list(range(len(predictions))),  # ID column from 0 to len(predictions) - 1
        'is_correct': bool_predictions
    })

    # Save to CSV
    submission_df.to_csv(output_file, index=False)
    print(f"Submission file saved to {output_file}")

    # Print distribution of predictions
    true_count = sum(predictions)
    false_count = len(predictions) - true_count
    print(f"\nPrediction distribution:")
    print(f"True: {true_count} ({true_count/len(predictions)*100:.2f}%)")
    print(f"False: {false_count} ({false_count/len(predictions)*100:.2f}%)")


# Main execution function
def generate_submission(model, tokenizer, test_dataset, batch_size=8, output_file="submission.csv"):
    """
    Generate and save Kaggle submission file.

    Args:
        model: The trained model
        tokenizer: The tokenizer
        test_dataset: Test dataset
        batch_size: Batch size for inference
        output_file: Output file path
    """
    print("Generating predictions...")
    predictions = generate_kaggle_predictions(
        model=model,
        tokenizer=tokenizer,
        test_dataset=test_dataset,
        batch_size=batch_size
    )

    print("\nCreating submission file...")
    create_submission_file(predictions, output_file)

    return predictions

In [27]:
# Make sure model is in evaluation mode and using fast inference
FastLanguageModel.for_inference(model)

# Generate submission
predictions = generate_submission(
    model=model,
    tokenizer=tokenizer,
    test_dataset=dataset['test'],
    batch_size=64,  # Adjust based on your GPU memory
    output_file="submission.csv"
)

Generating predictions...


Generating predictions:   0%|          | 0/157 [00:00<?, ?it/s]

Accuracy: 0.2939

Creating submission file...
Submission file saved to submission.csv

Prediction distribution:
True: 2939 (29.39%)
False: 7061 (70.61%)


In [28]:
model.save_pretrained("outputs/saved_model")
tokenizer.save_pretrained("outputs/saved_model")

('outputs/saved_model/tokenizer_config.json',
 'outputs/saved_model/special_tokens_map.json',
 'outputs/saved_model/tokenizer.json')

In [30]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("outputs/saved_model")
tokenizer = AutoTokenizer.from_pretrained("outputs/saved_model")


`low_cpu_mem_usage` was None, now default to True since model is quantized.
