In this notebook we're going to test the hypothesis that pretraining a model on domain data results in better performance of the model when finetuning a model

The three models we'll be using are:
1. FinBERT 2 v_1
2. FinBERT 2 v_2
3. DistilBERT

Note that the FinBERT's are based on the DistilBERT models
The evaluation metric we'll use are Accuracy, cross entropy loss and macro F1 average.

In [1]:
pip install -U datasets transformers

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting transformers
  Downloading transformers-4.53.0-py3-none-any.whl.metadata (39 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading transformers-4.53.0-py3-none-any.whl (10.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.

In [2]:
from datasets import Dataset, load_dataset, DatasetDict
from transformers import AutoModelForMaskedLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling, Pipeline
from huggingface_hub import login
import os

#login(os.getenv("MACBOOK_HF_KEY"))

In [13]:

finetune_ds = load_dataset('takala/financial_phrasebank', 'sentences_50agree', trust_remote_code=True)

tokenized_dataset = {}

#Setting up the basic info for each model
configs = [
    {'name': 'bert_base', 'model': 'distilbert/distilbert-base-uncased', 'tokenizer': AutoTokenizer.from_pretrained('distilbert/distilbert-base-uncased')},
    {'name': 'finBERT2_v2', 'model': 'Czunzun/finBERT2_v2', 'tokenizer': AutoTokenizer.from_pretrained('Czunzun/finBERT2_v2')}
]


#Tokenizing each dataset
for config in configs:
    tokenized_dataset[config['name']] = finetune_ds.map(lambda train: config['tokenizer'](train['sentence'], padding="max_length", truncation=True))




#Splitting each tokenized dataset for test,train,eval split with standard 80% train
for dataset in tokenized_dataset:
    traintest_split = tokenized_dataset[dataset]['train'].train_test_split(test_size=.2)
    eval_split = traintest_split['test'].train_test_split(test_size=.5)
    ds = DatasetDict({
        'train': traintest_split['train'],
        'test': traintest_split['test'],
        'eval': eval_split['train']
    })
    tokenized_dataset[dataset] = ds


In [6]:
#computing loss metrics
def compute_metrics(pred):
    from sklearn.metrics import accuracy_score, log_loss, f1_score
    import numpy as np

    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    probs = pred.predictions

    if probs.max() > 1 or probs.min() < 0:
        probs = np.exp(probs) / np.sum(np.exp(probs), axis=-1, keepdims=True)  # softmax


    accuracy = accuracy_score(labels, preds)
    cross_entropy = log_loss(labels, probs)
    f1 = f1_score(labels, preds, average='macro')
    return {
        'accuracy': accuracy,
        'cross_entropy': cross_entropy,
        'f1': f1
    }




#Setting up the training arguments
def training_step(config):
    from transformers import DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer


    data_collator = DataCollatorWithPadding(tokenizer=config['tokenizer'])
    model = AutoModelForSequenceClassification.from_pretrained(config['model'], num_labels=3)

    training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    tpu_num_cores=1,
    dataloader_pin_memory=True,
    dataloader_num_workers=4
    )

    trainer=Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        processing_class=config['tokenizer'],
        compute_metrics=compute_metrics,
        train_dataset=tokenized_dataset[config['name']]['train'],
        eval_dataset=tokenized_dataset[config['name']]['test']
        )


    trainer.train()
    results = trainer.evaluate()

    print(results)

In [14]:
#Initialize training for both datasets
for config in configs:
    #Training
    training_step(config=config)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.6918
1000,0.5508
1500,0.3676
2000,0.3642
2500,0.1587


{'eval_loss': 0.8303661346435547, 'eval_accuracy': 0.8402061855670103, 'eval_cross_entropy': 0.823719762512187, 'eval_f1': 0.8083342887148938, 'eval_runtime': 3.8259, 'eval_samples_per_second': 254.057, 'eval_steps_per_second': 63.514, 'epoch': 3.0}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at Czunzun/finBERT2_v2 and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.6273
1000,0.5449
1500,0.3514
2000,0.3117
2500,0.1621


{'eval_loss': 0.6951795220375061, 'eval_accuracy': 0.8597938144329897, 'eval_cross_entropy': 0.6966099646543678, 'eval_f1': 0.8424264741159958, 'eval_runtime': 3.8924, 'eval_samples_per_second': 249.716, 'eval_steps_per_second': 62.429, 'epoch': 3.0}
