In this notebook we're going to test the hypothesis that pretraining a model on domain data results in better performance of the model when finetuning a model

The three models we'll be using are:
1. FinBERT 2 v_1
2. FinBERT 2 v_2
3. DistilBERT

Note that the FinBERT's are based on the DistilBERT models
The evaluation metric we'll use are Accuracy, cross entropy loss and macro F1 average.

In [None]:
from datasets import Dataset, load_dataset, DatasetDict
from transformers import AutoModelForMaskedLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling, Pipeline
from huggingface_hub import login
import os
from dotenv import load_dotenv

load_dotenv()
login(os.getenv("MACBOOK_HF_KEY"))

In [31]:

finetune_ds = load_dataset('takala/financial_phrasebank', 'sentences_50agree', trust_remote_code=True)

tokenized_dataset = {}

#Setting up the basic info for each model
configs = [
    {'name': 'bert_base', 'model': 'google-bert/bert-base-uncased', 'tokenizer': AutoTokenizer.from_pretrained('distilbert/distilbert-base-uncased')},
    {'name': 'finBERT2_v2', 'model': 'Czunzun/finBERT2_v2', 'tokenizer': AutoTokenizer.from_pretrained('Czunzun/finBERT2_v2')}
]


#Tokenizing each dataset
for config in configs:
    tokenized_dataset[config['name']] = finetune_ds.map(lambda train: config['tokenizer'](train['sentence'], padding="max_length", truncation=True))




#Splitting each tokenized dataset for test,train,eval split with standard 80% train
for dataset in tokenized_dataset:
    traintest_split = tokenized_dataset[dataset]['train'].train_test_split(test_size=.2)
    eval_split = traintest_split['test'].train_test_split(test_size=.5)
    ds = DatasetDict({
        'train': traintest_split['train'],
        'test': traintest_split['test'],
        'eval': eval_split['train']
    })
    tokenized_dataset[dataset] = ds


Map:   0%|          | 0/4846 [00:00<?, ? examples/s]

Map:   0%|          | 0/4846 [00:00<?, ? examples/s]

In [32]:
tokenized_dataset['bert_base']

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'input_ids', 'attention_mask'],
        num_rows: 3876
    })
    test: Dataset({
        features: ['sentence', 'label', 'input_ids', 'attention_mask'],
        num_rows: 970
    })
    eval: Dataset({
        features: ['sentence', 'label', 'input_ids', 'attention_mask'],
        num_rows: 485
    })
})

In [None]:
#computing loss metrics
def compute_metrics(pred):
    from sklearn.metrics import accuracy_score, log_loss, f1_score
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    
    accuracy = accuracy_score(labels, preds, average='weighted')
    cross_entropy = log_loss(labels, preds)
    f1 = f1_score(labels, preds, average='weighted')
    return {
        'accuracy': accuracy,
        'cross_entropy': cross_entropy,
        'f1': f1    
    }
   
    
    
    
#Setting up the training arguments 
def training_step(config):
    from transformers import DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer
    
    
    data_collator = DataCollatorWithPadding(tokenizer=config['tokenizer'])
    model = AutoModelForSequenceClassification.from_pretrained(config['model'], num_labels=3)
    
    training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    tpu_num_cores=1,
    dataloader_pin_memory=True,
    dataloader_num_workers=4,

    )
    
    trainer=Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        tokenizer=config['tokenizer'],
        compute_metrics=compute_metrics,
        train_dataset=tokenized_dataset[config['name']]['train'],
        eval_dataset=tokenized_dataset[config['name']]['test']
        )
    
    
    trainer.train()
    results = trainer.evaluate()
    
    print(results)

In [37]:
#Initialize training for both datasets
for config in configs:

    #Training
    training_step(config=config)

    #computing loss metrics

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer=Trainer(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disablin

KeyboardInterrupt: 

Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/opt/homebrew/Cellar/python@3.10/3.10.17_1/Frameworks/Python.framework/Versions/3.10/lib/python3.10/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/opt/homebrew/Cellar/python@3.10/3.10.17_1/Frameworks/Python.framework/Versions/3.10/lib/python3.10/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
  File "/opt/homebrew/lib/python3.10/site-packages/transformers/data/__init__.py", line 29, in <module>
    from .processors import (
  File "/opt/homebrew/lib/python3.10/site-packages/transformers/data/processors/__init__.py", line 15, in <module>
    from .glue import glue_convert_examples_to_features, glue_output_modes, glue_processors, glue_tasks_num_labels
  File "/opt/homebrew/lib/python3.10/site-packages/transformers/data/processors/glue.py", line 30, in <module>
    import tensorflow as tf
  File "/Users/cristia

In [None]:
#Coimpute lo