# Lightweight Fine-Tuning Project

TODO: In this cell, describe your choices for each of the following

* PEFT technique: 
* Model: 
* Evaluation approach: 
* Fine-tuning dataset: 

## Loading and Evaluating a Foundation Model

TODO: In the cells below, load your chosen pre-trained Hugging Face model and evaluate its performance prior to fine-tuning. This step includes loading an appropriate tokenizer and dataset.

In [1]:
!pip install torch transformers datasets peft accelerate 
!pip install huggingface_hub scikit-learn
!pip install evaluate
!pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.5/13.5 MB[0m [31m57.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting joblib>=1.2.0
  Downloading joblib-1.4.2-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m301.8/301.8 kB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m
Collecting threadpoolctl>=3.1.0
  Downloading threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, joblib, scikit-learn
Successfully installed joblib-1.4.2 scikit-learn-1.6.1 threadpoolctl-3.6.0
Defaulting to user installation because normal site-packages is not writeable
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl (84

Installing collected packages: evaluate
[0mSuccessfully installed evaluate-0.4.3
Defaulting to user installation because normal site-packages is not writeable


In [2]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, PeftModel, AutoPeftModelForSequenceClassification, TaskType
import numpy as np
import torch
import os
import shutil
import sys
import time
import traceback
import json
from datasets import DatasetDict, load_metric

# aka big bang
planck0 = time.time()

def p(text, width=80):
    print("\n"*3+"="*width+"\n"+text.upper().center(width)+"\n"+"="*width)
    
def secToHuman(elapsed_time):
    hours, rem = divmod(elapsed_time, 3600)
    minutes, seconds = divmod(rem, 60)
    # return hours, minutes, seconds
    return f"{int(hours):02}:{int(minutes):02}:{seconds:.2f}"    

# variables
CHECKPOINTS = "./checkpoints"
PEFT_MODEL = "./peft_model"

resultset = []

label_names = ["not spam", "spam"]
id2label = {idx: label for idx, label in enumerate(label_names)}
label2id = {label: idx for idx, label in enumerate(label_names)}
model = None
peft_model = None
tokenizer = None
tokenized_ds = {} 
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

###############################################################################
# Loading and Evaluating a Foundation Model
###############################################################################
split = ['train', 'test']

raw_dataset = load_dataset("sms_spam")
full_dataset = raw_dataset['train'].train_test_split(test_size=0.2, seed=42, shuffle=True)

# dataset =  DatasetDict({"train": full_dataset["train"].shuffle(seed=42).select(range(1000)),  # Keep only 1000 samples
#             "test":  full_dataset["test"].shuffle(seed=42).select(range(200))  # Keep only 200 samples
#             })
dataset =  DatasetDict({"train": full_dataset["train"].shuffle(seed=42),  
            "test":  full_dataset["test"].shuffle(seed=42)
            })

print("Dataset loaded.")


Downloading readme:   0%|          | 0.00/4.98k [00:00<?, ?B/s]

Downloading data: 100%|██████████| 359k/359k [00:00<00:00, 1.38MB/s]


Generating train split:   0%|          | 0/5574 [00:00<?, ? examples/s]

Dataset loaded.


In [3]:
def gpt2_compute_metrics(eval_pred):
    logits, labels = eval_pred
    if isinstance(logits, tuple):  # Ensure it's not a dictionary
        logits = logits[0]
    predictions = np.argmax(logits, axis=1)
    return {"accuracy": (predictions == labels).mean()}

def tokenize_fn(examples):
    global tokenizer
    return tokenizer(examples["sms"], padding="max_length", truncation=True, max_length=256)



In [4]:
def evaluate_gpt2_model(dataset, with_train=False):
    global tokenizer
    # Load GPT-2 tokenizer and model
    # from transformers import BitsAndBytesConfig
    # bnb_config = BitsAndBytesConfig(load_in_8bit=True)  # 8-bit quantization

    model_name = "gpt2"
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=2, id2label=id2label, label2id=label2id,
    #     quantization_config=bnb_config
    ).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # GPT-2 doesn't have a padding token, so use eos_token and set padding_side to left
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"  # Ensure left padding for GPT-2

    for s in split:
        tokenized_ds[s] = dataset[s].map(tokenize_fn, batched=True)

    tokenized_ds["train"] = tokenized_ds["train"].map(
        lambda e: {'labels': e['label']},  
        batched=True,
        remove_columns=['label']
    )
    tokenized_ds["test"] = tokenized_ds["test"].map(
        lambda e: {'labels': e['label']},  
        batched=True,
        remove_columns=['label']
    )

    tokenized_ds["train"].set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    tokenized_ds["test"].set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

    print("===========================================================")
    print(tokenized_ds["train"].column_names)
    print("===========================================================")


    ###############################################################################
    # Load model and freeze base parameters
    ###############################################################################

    model.config.pad_token_id = tokenizer.pad_token_id
    model.resize_token_embeddings(len(tokenizer))  # Adjust embedding size for new tokens

    for name, param in model.named_parameters():
        if "score" not in name:  # Keep classification head trainable
            param.requires_grad = True

    print(model)

    for name, param in model.named_parameters():
        print(name, param.requires_grad)


    trainer = Trainer(
        model=model,
        args=TrainingArguments(
            output_dir=CHECKPOINTS, 
            resume_from_checkpoint=True,
            learning_rate=2e-5, 
            per_device_train_batch_size=16, 
            per_device_eval_batch_size=16, 
            num_train_epochs=2, 
            weight_decay=0.01, 
            evaluation_strategy="epoch",
            save_strategy="epoch", 
            metric_for_best_model="accuracy",  # Change from "eval_loss" to "accuracy"
            load_best_model_at_end=True, 
        ),
        train_dataset=tokenized_ds["train"],
        eval_dataset=tokenized_ds["test"], 
        tokenizer=tokenizer,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt"),
        compute_metrics=gpt2_compute_metrics, 
    )

    metrics = trainer.evaluate()
    result = f"GPT2 Evaluation metrics before everything: {metrics}"
    if (with_train):
        trainer.train()
        metrics = trainer.evaluate()
        result+=f"\nEvaluation metrics after gpt2 train: {metrics}"
    return model, result

In [5]:
start_time = time.time()

model, result = evaluate_gpt2_model(dataset, True)

end_time = time.time()
elapsed_time = secToHuman(end_time - start_time)
r = ["Run GPT2 Evaluation and Traing", elapsed_time, result]
resultset.append(r)
print(f"{r}")

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/4459 [00:00<?, ? examples/s]

Map:   0%|          | 0/1115 [00:00<?, ? examples/s]

Map:   0%|          | 0/4459 [00:00<?, ? examples/s]

Map:   0%|          | 0/1115 [00:00<?, ? examples/s]

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


['sms', 'input_ids', 'attention_mask', 'labels']
GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=2, bias=False)
)
transformer.wte.weight True
transformer.wpe.weight True

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.069521,0.98565
2,0.144600,0.072138,0.984753


Checkpoint destination directory ./checkpoints/checkpoint-279 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./checkpoints/checkpoint-558 already exists and is non-empty.Saving will proceed but saved results may be invalid.


['Run GPT2 Evaluation and Traing', '00:08:50.99', "GPT2 Evaluation metrics before everything: {'eval_loss': 0.4561626613140106, 'eval_accuracy': 0.7793721973094171, 'eval_runtime': 17.0024, 'eval_samples_per_second': 65.579, 'eval_steps_per_second': 4.117}\nEvaluation metrics after gpt2 train: {'eval_loss': 0.06952129304409027, 'eval_accuracy': 0.9856502242152466, 'eval_runtime': 18.3855, 'eval_samples_per_second': 60.645, 'eval_steps_per_second': 3.807, 'epoch': 2.0}"]


In [6]:
start_time = time.time()

model, result = evaluate_gpt2_model(dataset, False)

end_time = time.time()
elapsed_time = secToHuman(end_time - start_time)
r = ["Run GPT2 Evaluation", elapsed_time, result]
resultset.append(r)
print(f"{r}")


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1115 [00:00<?, ? examples/s]

Map:   0%|          | 0/1115 [00:00<?, ? examples/s]

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


['sms', 'input_ids', 'attention_mask', 'labels']
GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=2, bias=False)
)
transformer.wte.weight True
transformer.wpe.weight True

['Run GPT2 Evaluation', '00:00:24.46', "GPT2 Evaluation metrics before everything: {'eval_loss': 0.6713482737541199, 'eval_accuracy': 0.6098654708520179, 'eval_runtime': 18.8693, 'eval_samples_per_second': 59.091, 'eval_steps_per_second': 3.71}"]


## Performing Parameter-Efficient Fine-Tuning

TODO: In the cells below, create a PEFT model from your loaded model, run a training loop, and save the PEFT model weights.

In [7]:
###############################################################################
# Performing Parameter-Efficient Fine-Tuning
###############################################################################
# tokenizer = AutoTokenizer.from_pretrained(GPT2_FINETUNED_MODEL)
# model = AutoModelForSequenceClassification.from_pretrained(GPT2_FINETUNED_MODEL, ignore_mismatched_sizes=True).to(device)
torch.cuda.empty_cache()



In [8]:
def lora_compute_metrics(eval_pred):
    logits, labels = eval_pred
    
    print("Logits Type:", type(logits))
    print("Logits Shape:", np.array(logits, dtype=object).shape)
    print("Labels Type:", type(labels))
    print("Labels Shape:", np.array(labels, dtype=object).shape)

    # Extract logits if they are inside a tuple
    if isinstance(logits, tuple):
        logits = logits[0]  # Take the first element of the tuple

    # Ensure logits is a NumPy array
    logits = np.array(logits)

    # Ensure labels is a NumPy array
    labels = np.array(labels)

    # Compute predictions
    predictions = np.argmax(logits, axis=-1)

    accuracy = load_metric("accuracy", trust_remote_code=True) 
    acc = accuracy.compute(predictions=predictions, references=labels)["accuracy"]

    print("Computed Accuracy:", acc)
    return {"eval_accuracy": acc}


start_time = time.time()
result = []

tokenized_ds = dataset.map(
    lambda x: tokenizer(x["sms"], padding="max_length", truncation=True, max_length=512),
    batched=True
)
tokenized_ds = tokenized_ds.rename_columns({"label": "labels"})
tokenized_ds["train"].set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
tokenized_ds["test"].set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])


lora_config = LoraConfig(r=8, lora_alpha=16, lora_dropout=0.1, bias="none", task_type=TaskType.SEQ_CLS)
peft_model = get_peft_model(model, lora_config)
peft_model.print_trainable_parameters()
peft_model.to(device)

peft_model.save_pretrained(PEFT_MODEL)
print("peft model saved")


trainer = Trainer(
    model=peft_model,  # Make sure to pass the PEFT model here
    args=TrainingArguments(
        output_dir=CHECKPOINTS,
        resume_from_checkpoint=True,
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        logging_steps=1,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        metric_for_best_model="eval_accuracy",  
        save_strategy="epoch",
        load_best_model_at_end=True,
        label_names=["labels"],
    ),
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt"),
    compute_metrics=lora_compute_metrics,
)
trainer.compute_metrics=lora_compute_metrics



Map:   0%|          | 0/4459 [00:00<?, ? examples/s]

Map:   0%|          | 0/1115 [00:00<?, ? examples/s]



trainable params: 297,984 || all params: 124,737,792 || trainable%: 0.23888830740245906
peft model saved


In [9]:
from datasets import DatasetDict, load_metric

# Run evaluation manually
eval_output = trainer.predict(tokenized_ds["test"])
logits = eval_output.predictions
labels = eval_output.label_ids  
metrics = trainer.evaluate()
result.append(f"Lora Evaluation metrics before training: {metrics}")


Logits Type: <class 'numpy.ndarray'>
Logits Shape: (1115, 2)
Labels Type: <class 'numpy.ndarray'>
Labels Shape: (1115,)


  accuracy = load_metric("accuracy", trust_remote_code=True)


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

Computed Accuracy: 0.431390134529148


Logits Type: <class 'numpy.ndarray'>
Logits Shape: (1115, 2)
Labels Type: <class 'numpy.ndarray'>
Labels Shape: (1115,)
Computed Accuracy: 0.431390134529148


In [10]:

print(tokenized_ds["test"].column_names)  # Should include "labels"
trainer.train() # resume_from_checkpoint=CHECKPOINTS+"/checkpoint-last")
metrics = trainer.evaluate()
result.append(f"Lora Evaluation metrics after training: {metrics}")


end_time = time.time()
elapsed_time = secToHuman(end_time - start_time)
r = ["Run Lora Evaluation and Traing", elapsed_time, result]
resultset.append(r)
print(f"{r}")


['sms', 'labels', 'input_ids', 'attention_mask']


Epoch,Training Loss,Validation Loss,Accuracy
1,0.7591,0.336463,0.865471
2,0.0983,0.307216,0.868161
3,0.062,0.296908,0.868161


Logits Type: <class 'numpy.ndarray'>
Logits Shape: (1115, 2)
Labels Type: <class 'numpy.ndarray'>
Labels Shape: (1115,)
Computed Accuracy: 0.8654708520179372


Checkpoint destination directory ./checkpoints/checkpoint-558 already exists and is non-empty.Saving will proceed but saved results may be invalid.


Logits Type: <class 'numpy.ndarray'>
Logits Shape: (1115, 2)
Labels Type: <class 'numpy.ndarray'>
Labels Shape: (1115,)
Computed Accuracy: 0.8681614349775785


Checkpoint destination directory ./checkpoints/checkpoint-1116 already exists and is non-empty.Saving will proceed but saved results may be invalid.


Logits Type: <class 'numpy.ndarray'>
Logits Shape: (1115, 2)
Labels Type: <class 'numpy.ndarray'>
Labels Shape: (1115,)
Computed Accuracy: 0.8681614349775785


Checkpoint destination directory ./checkpoints/checkpoint-1674 already exists and is non-empty.Saving will proceed but saved results may be invalid.


Logits Type: <class 'numpy.ndarray'>
Logits Shape: (1115, 2)
Labels Type: <class 'numpy.ndarray'>
Labels Shape: (1115,)
Computed Accuracy: 0.8681614349775785
['Run Lora Evaluation and Traing', '00:23:19.39', ["Lora Evaluation metrics before training: {'eval_accuracy': 0.431390134529148, 'eval_loss': 0.8402268290519714, 'eval_runtime': 42.4556, 'eval_samples_per_second': 26.263, 'eval_steps_per_second': 3.298}", "Lora Evaluation metrics after training: {'eval_accuracy': 0.8681614349775785, 'eval_loss': 0.30721625685691833, 'eval_runtime': 42.5423, 'eval_samples_per_second': 26.209, 'eval_steps_per_second': 3.291, 'epoch': 3.0}"]]


## Performing Inference with a PEFT Model

TODO: In the cells below, load the saved PEFT model weights and evaluate the performance of the trained PEFT model. Be sure to compare the results to the results from prior to fine-tuning.

In [15]:


###############################################################################
# Performing Inference with a PEFT Model
###############################################################################

def hf_compute_metrics(eval_pred):
    logits, labels = eval_pred

    if isinstance(logits, tuple):
        logits = logits[0]  # Extract logits array if it's a tuple

    predictions = np.argmax(logits, axis=-1)

    accuracy = load_metric("accuracy", trust_remote_code=True) 
    acc = accuracy.compute(predictions=predictions, references=labels)["accuracy"]

    return {"accuracy": acc}

start_time = time.time()
result = [] 
NUM_LABELS = 2

model = AutoModelForSequenceClassification.from_pretrained(PEFT_MODEL, ignore_mismatched_sizes=True).to(device)
print("peft model loaded")

model.to(device)
model.config.pad_token_id = tokenizer.pad_token_id

trainer = Trainer(
    model=peft_model,  # PEFT model
    args=TrainingArguments(
        output_dir=CHECKPOINTS, 
        resume_from_checkpoint=True,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss", 
    ),
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    compute_metrics=hf_compute_metrics,
)

# Evaluate the fine-tuned model on the test set
hf_results = trainer.evaluate()
result.append(f"Hugging Face Evaluation metrics: {hf_results}")

end_time = time.time()
elapsed_time = secToHuman(end_time - start_time)
r = ["Run Hugging Face Evaluation", elapsed_time, result]
resultset.append(r)
print(f"{r}")



Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


peft model loaded


['Run Hugging Face Evaluation', '00:00:54.72', ["Hugging Face Evaluation metrics: {'eval_loss': 0.30721625685691833, 'eval_accuracy': 0.8681614349775785, 'eval_runtime': 49.0418, 'eval_samples_per_second': 22.736, 'eval_steps_per_second': 2.855}"]]


In [13]:
from datetime import datetime

big_crunch = time.time()
elapsed_time = secToHuman(big_crunch - planck0)
start_d = datetime.fromtimestamp(planck0 / 1000).strftime('%Y-%m-%d %H:%M:%S')
end_d = datetime.fromtimestamp(big_crunch / 1000).strftime('%Y-%m-%d %H:%M:%S')

r = {"Runtime Summary",  elapsed_time, ""}
resultset.append(r)


In [14]:
import pandas as pd

# Convert to DataFrame
df = pd.DataFrame(resultset, columns=["Task", "Elapsed Time", "Result"])
df_styled = df.style.set_properties(**{"text-align": "left", "white-space": "pre-wrap"})  # Preserve formatting

col_space_dict = {"Task": 30, "Elapsed Time": 15, "Result": 15}

# Display in Jupyter Notebook
display(df_styled)

# Write to a fixed-width formatted text file
output_file = "resultset.txt"
text = df.to_string(index=False, col_space=col_space_dict, justify="left")
with open(output_file, "w") as f:
     f.write(text)  # Fixed-width columns

print("\n\n\nSo Long, and Thanks for All the Fish\n\n")


Unnamed: 0,Task,Elapsed Time,Result
0,Run GPT2 Evaluation and Traing,00:08:50.99,"GPT2 Evaluation metrics before everything: {'eval_loss': 0.4561626613140106, 'eval_accuracy': 0.7793721973094171, 'eval_runtime': 17.0024, 'eval_samples_per_second': 65.579, 'eval_steps_per_second': 4.117} Evaluation metrics after gpt2 train: {'eval_loss': 0.06952129304409027, 'eval_accuracy': 0.9856502242152466, 'eval_runtime': 18.3855, 'eval_samples_per_second': 60.645, 'eval_steps_per_second': 3.807, 'epoch': 2.0}"
1,Run GPT2 Evaluation,00:00:24.46,"GPT2 Evaluation metrics before everything: {'eval_loss': 0.6713482737541199, 'eval_accuracy': 0.6098654708520179, 'eval_runtime': 18.8693, 'eval_samples_per_second': 59.091, 'eval_steps_per_second': 3.71}"
2,Run Lora Evaluation and Traing,00:23:19.39,"[""Lora Evaluation metrics before training: {'eval_accuracy': 0.431390134529148, 'eval_loss': 0.8402268290519714, 'eval_runtime': 42.4556, 'eval_samples_per_second': 26.263, 'eval_steps_per_second': 3.298}"", ""Lora Evaluation metrics after training: {'eval_accuracy': 0.8681614349775785, 'eval_loss': 0.30721625685691833, 'eval_runtime': 42.5423, 'eval_samples_per_second': 26.209, 'eval_steps_per_second': 3.291, 'epoch': 3.0}""]"
3,Run Hugging Face Evaluation,00:00:42.62,"[""Hugging Face Evaluation metrics: {'eval_loss': 0.30721625685691833, 'eval_accuracy': 0.8681614349775785, 'eval_runtime': 42.6018, 'eval_samples_per_second': 26.173, 'eval_steps_per_second': 3.286}""]"
4,,Runtime Summary,00:33:30.01





So Long, and Thanks for All the Fish


