In [1]:
# Enable auto-reload for imported modules
%load_ext autoreload
%autoreload 2

import sys
from pathlib import Path

# Get project root (from training/notebooks/ go up 2 levels)
project_root = Path.cwd().parent.parent  

# Add paths
sys.path.insert(0, str(project_root))

# Verify paths
print("✓ Paths added to sys.path")

from training import settings
import os

os.environ["TRANSFORMERS_CACHE"] = str(settings.TRANSFORMER_CACHE_DIR)
os.environ["HF_HOME"] = str(settings.TRANSFORMER_DATASETS_DIR)

✓ Paths added to sys.path


In [None]:
import os

class Settings:
    """Minimal in-notebook stand-in for the project's settings object.

    Fill in the values below when the notebook runs without the project
    package. Only the attributes read by this notebook are defined.
    """

    # Directory exported as TRANSFORMERS_CACHE; leave empty to keep the library default.
    TRANSFORMER_CACHE_DIR = ""
    # Directory exported as HF_HOME; leave empty to keep the library default.
    TRANSFORMER_DATASETS_DIR = ""
    # Hub access token. Read from the HF_TOKEN environment variable when set, so the
    # secret does not have to be pasted into the notebook; falls back to "" as before.
    HUGGINGFACE_HUB_TOKEN = os.environ.get("HF_TOKEN", "")

settings = Settings()

# Export only the directories that were actually configured. Exporting an empty
# string would replace the libraries' default cache location with an empty path.
if str(settings.TRANSFORMER_CACHE_DIR):
    os.environ["TRANSFORMERS_CACHE"] = str(settings.TRANSFORMER_CACHE_DIR)
if str(settings.TRANSFORMER_DATASETS_DIR):
    os.environ["HF_HOME"] = str(settings.TRANSFORMER_DATASETS_DIR)

### Fine Tune Configuration

In [9]:
from peft import LoraConfig, TaskType
from transformers import TrainingArguments

# Base checkpoint to fine-tune and the dtype forwarded to `from_pretrained`.
HUGGINGFACE_MODEL_ID = "prajjwal1/bert-tiny"
DTYPE = "auto"

# Two-class task: label name <-> class id mappings handed to the model config.
LABEL_NUMS = 2
LABEL2IDS = {"payment": 1, "not payment": 0}
ID2LABELS = {class_id: class_name for class_name, class_id in LABEL2IDS.items()}

# LoRA adapter settings for sequence classification.
# Optional knobs left at their defaults: lora_dropout=0.1, bias="none".
LORA_CONFIG = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=8,
    lora_alpha=32,
)

# Where the Trainer writes its checkpoints.
CHECKPOINTS_OUTPUT_DIR = "./.checkpoints/"

# Evaluate, save and log once per epoch; reload the best checkpoint when training ends.
TRAINING_ARGS = TrainingArguments(
    output_dir=CHECKPOINTS_OUTPUT_DIR,
    num_train_epochs=20,
    learning_rate=2e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True,
)


### Load Base Model

In [10]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from peft import get_peft_model
import torch

# Pick the device to run on.
# `torch.accelerator.current_accelerator()` can return None (no accelerator), so the
# result is checked before reading `.type`; on older torch builds without
# `torch.accelerator`, availability is probed explicitly instead of assuming CUDA.
if hasattr(torch, "accelerator"):
    _accelerator = torch.accelerator.current_accelerator()
    if _accelerator is not None and torch.accelerator.is_available():
        device = _accelerator.type
    else:
        device = "cpu"
elif torch.cuda.is_available():
    device = "cuda"
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Load the base checkpoint with the task's label mappings and move it to the device.
model = AutoModelForSequenceClassification.from_pretrained(
    HUGGINGFACE_MODEL_ID,
    dtype=DTYPE,
    label2id=LABEL2IDS,
    id2label=ID2LABELS,
).to(device)

# `dtype` is a model-weights option and has no meaning for a tokenizer, so it is not passed here.
tokenizer = AutoTokenizer.from_pretrained(HUGGINGFACE_MODEL_ID)

# Wrap the base model with the LoRA adapters and report how many parameters will train.
peft_model = get_peft_model(model, LORA_CONFIG)
peft_model.print_trainable_parameters()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 8,450 || all params: 4,394,628 || trainable%: 0.1923


### Dataset Load
The code cell below must define the `Dataset` splits used by the rest of the notebook: `train_ds` (training) and `test_ds` (held-out; its tokenized form is passed to the `Trainer` as `eval_dataset`).

In [11]:
# from transformers import Trainer, TrainingArguments
from datasets import Dataset, Features, Value, ClassLabel, List, load_dataset

# Fetch the labelled dataset from the Hugging Face Hub.
ds = load_dataset("maroon14/payment-related-llm-generated")

# Disabled alternative: build the dataset from a local JSON file instead.
# ds_path = os.path.join(settings.BASE_DIR, "training/data/dataset.json")

# labels = [0, 1]
# dataset_features = Features({
#     'text': Value('string'),
#     'label': ClassLabel(names=labels),
#     'tags': List(Value('string'))
# })

# dataset = Dataset.from_json(ds_path, features=dataset_features)
# dataset = dataset.train_test_split(test_size=0.1, seed=12)

# Hold out 10% of the "train" split, with a fixed seed so the split is repeatable.
dataset = ds["train"].train_test_split(seed=20, test_size=0.1)
train_ds, test_ds = dataset["train"], dataset["test"]

# # Use for fine-tuning
# train_dataset = dataset['train']
# test_dataset = dataset['test']
# print(f"✓ Loaded dataset with {len(train_dataset)} training samples and {len(test_dataset)} test samples")


### Metric Computation Function

In [12]:
# load metrics
import evaluate
import numpy as np

# Metric objects are loaded once here and reused by `compute_metrics` on every evaluation:
# `accuracy` scores the predicted class ids, `auc_score` scores the positive-class probabilities.
accuracy = evaluate.load("accuracy")
auc_score = evaluate.load("roc_auc")

def compute_metrics(eval_pred):
    """Compute accuracy and ROC AUC from raw classifier logits.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` is indexed as
            ``[sample, class]`` and column 1 is treated as the positive class;
            ``labels`` holds the reference class ids.

    Returns:
        dict: ``{"Accuracy": ..., "AUC": ...}``, each rounded to 3 decimals.
    """
    predictions, labels = eval_pred
    logits = np.asarray(predictions)

    # Numerically stable softmax: subtracting the per-row maximum leaves the result
    # unchanged mathematically but keeps np.exp from overflowing to inf (and the
    # division from producing NaN) when logits are large.
    shifted = logits - logits.max(axis=-1, keepdims=True)
    exp_shifted = np.exp(shifted)
    probabilities = exp_shifted / exp_shifted.sum(axis=-1, keepdims=True)

    # ROC AUC is scored on the probability assigned to the positive class (column 1).
    positive_class_probs = probabilities[:, 1]
    auc = np.round(
        auc_score.compute(prediction_scores=positive_class_probs, references=labels)["roc_auc"],
        3,
    )

    # Accuracy is scored on the most probable class per sample.
    predicted_classes = np.argmax(logits, axis=1)
    acc = np.round(
        accuracy.compute(predictions=predicted_classes, references=labels)["accuracy"],
        3,
    )

    return {"Accuracy": acc, "AUC": auc}

### Preprocessing Sample Method
The function should map the correct dataset field(s) to the inputs the model expects.

In [13]:
def preprocess_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Every sequence is truncated and padded to exactly 128 tokens.
    Returns whatever the module-level `tokenizer` returns for the batch.
    """
    texts = examples["text"]
    return tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=128,
    )

# Tokenize both splits in batched mode, training split first and then the held-out split.
train_tokenized_data, test_tokenized_data = (
    split.map(preprocess_function, batched=True) for split in (train_ds, test_ds)
)

print("✓ Tokenized the dataset")

Map: 100%|██████████| 1339/1339 [00:00<00:00, 22520.81 examples/s]

✓ Tokenized the dataset





In [14]:
# hyperparameters
from transformers import Trainer

# Fine-tune the LoRA-wrapped model, evaluating on the held-out split with `compute_metrics`.
trainer = Trainer(
    model=peft_model,
    args=TRAINING_ARGS,
    train_dataset=train_tokenized_data,
    eval_dataset=test_tokenized_data,
    # `tokenizer=` is deprecated on Trainer and triggers a warning at construction;
    # `processing_class=` is its replacement and takes the same tokenizer object.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

## Test Fine-Tuned Model

In [51]:
from transformers import pipeline

# Wrap the trained model in a text-classification pipeline for a quick smoke test.
# Device index 0 is requested only when `device` is "cuda"; every other value maps to -1.
# NOTE(review): the recorded output shows the pipeline running on "mps:0" despite -1 being
# passed, so it appears to fall back to the model's own device. Confirm this is intended.
classifier = pipeline(
    task="text-classification",
    model=trainer.model,
    tokenizer=tokenizer,
    device=-1 if device != "cuda" else 0,
)

# A single sample sentence to classify.
text_tests = ["Hi there, just sent you $1"]

outputs = classifier(text_tests)
print(outputs)

Device set to use mps:0


[{'label': 'payment', 'score': 0.9587512016296387}]


In [None]:
# Push the fine-tuned model to the Hugging Face Hub.
# `save_pretrained` only writes to a local directory of that name, so the upload is done
# with `push_to_hub`. `token=` replaces the deprecated `use_auth_token=`; an empty token
# is passed as None so the locally stored login (if any) is used instead.
trainer.model.push_to_hub(
    "maroon14/payment-related-seq-cls",
    token=settings.HUGGINGFACE_HUB_TOKEN or None,
)