# Set up / Load Model and Data

In [None]:
# Set up Code
# This cell will take time
# Installs the released `unsloth` package, then uninstalls it and reinstalls
# `unsloth[colab-new]` straight from the GitHub repository without the pip cache.
# NOTE(review): neither install is version-pinned, so a re-run at a later date
# may pull different code — consider pinning a release or commit.
!pip install unsloth
# Also get the latest nightly Unsloth!
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

In [None]:
# Shared settings read by both model-loading cells below and by the trainer.
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

In [None]:
# load original model (Run this cell to start from scratch)
# Binds `model` and `tokenizer`; the "load pre-trained model" cell below rebinds
# the same two names, so run only the one you need.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# NOTE(review): LoraConfig is imported here but not used anywhere in this notebook.
from peft import LoftQConfig, LoraConfig

# Wrap the base model with LoRA adapters on the attention and MLP projections
# listed in target_modules; `model` is rebound to the wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r = 64, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128 # 16 default
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 64,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    # NOTE(review): a LoftQ config is passed while the base model is loaded with
    # load_in_4bit = True and no LoftQ-specific init option is set in this call —
    # confirm this combination has the intended effect.
    loftq_config = LoftQConfig(loftq_bits=4), # And LoftQ

)

In [None]:
# load pre-trained model (Run this cell to load already trained version)
# Alternative to the "load original model" cell: it rebinds the same `model` and
# `tokenizer` names, so whichever of the two cells ran last decides what the
# rest of the notebook uses.

# load fine-tuned model from huggingface
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "suzoliver/Nov14_r64", # Name of HuggingFace directory
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

In [None]:
# download and load competition dataset

from datasets import load_dataset
# Later cells read the 'train' split (for training/validation) and the 'test'
# split (for the submission) of this object.
dataset = load_dataset("ad6398/nyu-dl-teach-maths-comp")
# print and see dataset
dataset

# Partition dataset and prep for training

In [None]:
# Set prompt for LLM
# The template has four positional `{}` slots, filled in this order by the
# formatting functions: question, answer, explanation (solution), output.
# NOTE(review): the Explanation header uses '##' while the other headers use
# '###'. It is left untouched because the template is the exact text the model
# is trained and queried with; changing it would change model inputs.

prompt = """Tell me if the answer to the following math question is correct or not. You can only respond 'True' or 'False'. Below is Question and Answer and an Explanation of the Answer.
### Question:
{}

### Answer:
{}

## Explanation:
{}

### Output:
{}"""

# Requires `tokenizer` from one of the model-loading cells.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    """Render one training prompt per example of a batched `examples` mapping.

    Reads the "question", "answer", "solution" and "is_correct" columns, fills
    the module-level `prompt` template with them in that order, and appends
    `EOS_TOKEN` to every rendered prompt.

    Returns:
        dict with a single "text" key holding the list of rendered prompts.
    """
    columns = zip(
        examples["question"],
        examples["answer"],
        examples["solution"],
        examples["is_correct"],
    )
    # EOS_TOKEN must terminate every example, otherwise generation goes on forever.
    return {
        "text": [
            prompt.format(q, a, sol, label) + EOS_TOKEN
            for q, a, sol, label in columns
        ],
    }

In [None]:
# Partition dataset into train and validation

# adjust train_size depending on how many points are needed for training
# shuffle=False keeps the rows in dataset order, so the training split is the
# same on every run.
split_data = dataset['train'].train_test_split(train_size = 30000, shuffle=False)

# get val_data from test split of split_data, adjust train_size for the size of the validation set
# The validation rows are drawn at random from the held-out rows; the shuffle is
# seeded (same seed as the LoRA and trainer settings) so that the validation set,
# and therefore the logged validation loss, is reproducible across runs.
val_data = split_data['test'].train_test_split(train_size = 1000, shuffle=True, seed = 3407)

In [None]:
# Process the training dataset and generate prompt for each datapoint
# batched = True: formatting_prompts_func receives each column as a list and
# returns the rendered prompts under the "text" key, which the trainer reads.

train_dataset = split_data['train'].map(formatting_prompts_func, batched = True)
# val_data['train'] is the 1000-row validation sample made in the partition cell.
val_dataset = val_data['train'].map(formatting_prompts_func, batched = True,)

# Garbage Collector

In [None]:
# garbage collector to run when facing out-of-memory issues
# Optional utility cell: nothing later in the notebook depends on it.

import gc

# Collect unreachable Python objects first, then ask PyTorch to release its
# cached, currently unused GPU memory.
gc.collect()
torch.cuda.empty_cache()

# Training

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Set training arguments

training_args = TrainingArguments(
        per_device_train_batch_size = 2,
        per_device_eval_batch_size = 8, # highest possible without tripping colab memory error
        gradient_accumulation_steps = 16, # with batch size 2: 2 x 16 = 32 examples per optimizer step per device
        warmup_steps = 5,
        num_train_epochs = 1, # Set this for 1 full training run.
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(), # exactly one of fp16 / bf16 is switched on
        bf16 = is_bfloat16_supported(),
        logging_steps = 1, # log every step; the entries are read back from trainer.state.log_history later
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # no external logging; change this to report to WandB etc
        eval_steps = 20, # only do validation every 20 steps because it slows down training
        eval_strategy="steps"
    )

# Supervised fine-tuning on the "text" column produced by formatting_prompts_func;
# val_dataset is evaluated every `eval_steps` steps.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    eval_dataset = val_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 4,
    packing = False, # Can make training 5x faster for short sequences.
    args = training_args,
)

In [None]:
# Run code and store training and validation losses
# The return value of train() is kept in trainer_stats; the per-step training
# and validation losses are read from trainer.state.log_history in the next cell.

trainer_stats = trainer.train()

In [None]:
# Save training and validation losses to csv
import csv

# Each output file receives the log_history entries that contain its marker key:
#   'loss'      -> training-loss entries   -> trainHist.csv
#   'eval_loss' -> validation-loss entries -> valHist.csv
# Rows hold the entry's values only (in the entry's own key order); no header
# row is written.
for csv_path, marker_key in (('trainHist.csv', 'loss'), ('valHist.csv', 'eval_loss')):
  with open(csv_path, mode='w', newline='') as history_file:
    csv.writer(history_file).writerows(
        entry.values()
        for entry in trainer.state.log_history
        if marker_key in entry
    )

# Inference

In [None]:
# create prompt without output so that model can provide answer

def formatting_prompts_func2(examples):
    """Render one inference prompt per example, leaving the output slot empty.

    Reads the "question", "answer" and "solution" columns of a batched
    `examples` mapping and fills the module-level `prompt` template with them
    in that order; the fourth (output) slot is filled with an empty string.

    Returns:
        dict with a single "text" key holding the list of rendered prompts.
    """
    rows = zip(examples["question"], examples["answer"], examples["solution"])
    # Unlike the training prompts, no EOS token is appended: the prompt ends at
    # the empty output slot so that the model provides the answer.
    return {"text": [prompt.format(q, a, sol, "") for q, a, sol in rows]}

# Choose the data to run inference on by keeping exactly one `val_temp`
# assignment active; in both cases the 'test' split of `val_temp` is used.

# use this line for validation
#val_temp = split_data['test'].train_test_split(test_size = 1000, shuffle=False)

# use this line to predict on test set
val_temp = dataset

# NOTE: this rebinds `val_dataset`, replacing the validation set that was built
# for the trainer with the inference prompts (which have an empty output slot).
val_dataset = val_temp['test'].map(formatting_prompts_func2, batched = True)

In [None]:
# Create pipeline for inferencing

# Switch the model to Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

from transformers import pipeline
# max_new_tokens=4 caps every generation at 4 new tokens; the prompt asks for
# nothing more than 'True' or 'False'.
pl = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=4)

In [None]:
# do inference with pipeline and store in ans variable

nData = 10000 # 10000 for testing, 1000 for validation set

# Pre-size the result list; slot k is replaced by [k, generated text] for the
# k-th prompt as the pipeline yields its outputs.
ans = [0] * nData
prompts = val_dataset["text"][0:nData]
for row, generations in enumerate(pl(prompts, return_full_text=False, batch_size=8)):
  ans[row] = [row, generations[0]['generated_text']]


In [None]:
# convert inference output to booleans

# Build a NEW list rather than aliasing `ans` (`ansBool = ans` made both names
# point at the same rows, so converting in place destroyed the raw model
# output: the validation cell then compared booleans against strings, and
# re-running this cell sent every already-converted value through the fallback
# branch, turning every False into True). `ans` is left untouched here, which
# also makes this cell safe to re-run.
ansBool = []
for row_id, generated in ans[:nData]:
  # Ignore surrounding whitespace/newlines the model may emit around the label.
  label = str(generated).strip()
  # 'False' is the only output mapped to False. 'True' maps to True, and any
  # other output is guessed as True as well.
  # note, the guess should only happen on models that aren't sufficiently trained
  ansBool.append([row_id, label != 'False'])

# Show a short preview instead of dumping every row into the cell output.
print(ansBool[:10])

In [None]:
# create csv for submission to Kaggle contest (for test data)

import csv

# One row per prediction, written as [index, boolean]; no header row is written.
with open('output.csv', mode='w', newline='') as submission_file:
  csv.writer(submission_file).writerows(ansBool)

In [None]:
# for validation, compare model output to is_correct field
# note: does not work for test data as is_correct is TRUE for those data

# Only score rows that exist in the dataset, so a leftover nData = 10000 with a
# 1000-row validation set does not run past the end of the data.
n_eval = min(nData, len(val_dataset))
# Read the label column once instead of fetching a full row per iteration.
true_labels = val_dataset['is_correct'][0:n_eval]

correct = 0

for i in range(n_eval):

  # Normalise the prediction to text before comparing: `ans` may hold the raw
  # generated strings or, if the boolean-conversion cell rewrote it in place,
  # booleans. Comparing a boolean with a string never matches, which made the
  # count always 0.
  pred_answer = str(ans[i][1]).strip()
  true_answer = str(true_labels[i])
  if pred_answer == true_answer:
    correct += 1

print(correct)


# Save model to HuggingFace

In [None]:
# login to huggingface to get write access
# Interactive: the token is entered in the notebook prompt, so no credential is
# stored in this file.

from huggingface_hub import notebook_login
notebook_login()


In [None]:
# save model and tokenizer to huggingface data base
# Both are pushed to the same repository so that they can be loaded together
# by name, as the "load pre-trained model" cell does for "suzoliver/Nov14_r64".
# Requires the login from the previous cell.

model.push_to_hub("suzoliver/Nov14_r64_2")
tokenizer.push_to_hub("suzoliver/Nov14_r64_2")