In [26]:
from dotenv import load_dotenv
import os
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, BertForSequenceClassification
from datasets import Dataset, DatasetDict
from peft import get_peft_model, LoraConfig, TaskType


In [12]:
# Pull variables from a local .env file into the process environment, then
# read the Hugging Face access token (None when the variable is not set).
load_dotenv()
hugging_face_token = os.environ.get("HUGGING_FACE_TOKEN")

In [48]:
# Checkpoint identifier shared by the tokenizer and the classification model.
model_name = "bert-base-uncased"

# Tokenizer matching the checkpoint; the token authenticates the Hub download.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    token=hugging_face_token,
)

In [45]:
from datasets import load_dataset

# Path of the CSV file holding the records, relative to the notebook.
dataset_name = "data/balanced_record_without_llm.csv"

# Use the generic "csv" builder to read the file into a DatasetDict.
dataset = load_dataset(
    path="csv",
    data_files=dataset_name,
)

# Batch-level preprocessing applied to the dataset through `Dataset.map`.
def preprocess_function(examples):
    """Tokenize a batch of rows and attach the training labels.

    The ``default/domain`` column is passed to the module-level ``tokenizer``
    with truncation enabled and padding to ``max_length`` (30), and the
    ``default/class`` column is copied into the ``label`` key of the result.

    Args:
        examples: Batch of rows indexed by column name.

    Returns:
        The tokenizer output with an added ``label`` entry.
    """
    tokenized = tokenizer(
        examples["default/domain"],
        truncation=True,
        padding="max_length",
        max_length=30,
    )
    tokenized["label"] = examples["default/class"]
    return tokenized

# Tokenize every row in batches.
dataset = dataset.map(preprocess_function, batched=True)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
splited_dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)
train_dataset = splited_dataset["train"]
test_dataset = splited_dataset["test"]


Map:   0%|          | 0/300000 [00:00<?, ? examples/s]

In [29]:
# BERT encoder with a classification head sized for the 3 target labels.
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
    token=hugging_face_token,
)

model.safetensors:  62%|######1   | 273M/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [57]:
# Hyper-parameters and bookkeeping locations for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="./bert_results",      # checkpoints are written here
    num_train_epochs=3,               # 3 epochs
    per_device_train_batch_size=16,   # batch size
    per_device_eval_batch_size=16,
    # `evaluation_strategy` is the deprecated spelling of this argument;
    # `eval_strategy` is the supported name. Evaluate once per epoch.
    eval_strategy="epoch",
    learning_rate=5e-5,
    logging_dir="./bert_logs",
    logging_steps=10,                 # log the training loss every 10 steps
)

# Wire the model, the training arguments and both dataset splits into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # `tokenizer=` is deprecated for Trainer.__init__ (it triggers a
    # FutureWarning); `processing_class=` is the replacement keyword.
    processing_class=tokenizer,
)

  trainer = Trainer(


In [58]:
# Run the fine-tuning loop. The call is left as the cell's last expression so
# that the returned TrainOutput (step count, loss, runtime metrics) is displayed.
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.1431,0.081626
2,0.1782,0.077624
3,0.0434,0.069964


TrainOutput(global_step=45000, training_loss=0.07704883529939172, metrics={'train_runtime': 3830.9665, 'train_samples_per_second': 187.942, 'train_steps_per_second': 11.746, 'total_flos': 1.11000973104e+16, 'train_loss': 0.07704883529939172, 'epoch': 3.0})

In [59]:
# Persist the fine-tuned weights and the tokenizer files side by side so the
# directory can be reloaded as a single checkpoint.
model.save_pretrained(save_directory="./finetuned_model")
# Kept as the last expression: the tuple of written tokenizer files is displayed.
tokenizer.save_pretrained(save_directory="./finetuned_model")


('./finetuned_model\\tokenizer_config.json',
 './finetuned_model\\special_tokens_map.json',
 './finetuned_model\\vocab.txt',
 './finetuned_model\\added_tokens.json',
 './finetuned_model\\tokenizer.json')